diff options
Diffstat (limited to 'convert.c')
| -rw-r--r-- | convert.c | 276 | 
1 files changed, 275 insertions, 1 deletions
@@ -7,6 +7,7 @@
 #include "sigchain.h"
 #include "pkt-line.h"
 #include "sub-process.h"
+#include "utf8.h"
 
 /*
  * convert.c - convert a file when checking it out and checking it in.
@@ -265,6 +266,241 @@ static int will_convert_lf_to_crlf(size_t len, struct text_stat *stats,
 
 }
 
+static int validate_encoding(const char *path, const char *enc,
+		      const char *data, size_t len, int die_on_error)
+{
+	/* We only check for UTF here as UTF?? can be an alias for UTF-?? */
+	if (istarts_with(enc, "UTF")) {
+		/*
+		 * Check for detectable errors in UTF encodings
+		 */
+		if (has_prohibited_utf_bom(enc, data, len)) {
+			const char *error_msg = _(
+				"BOM is prohibited in '%s' if encoded as %s");
+			/*
+			 * This advice is shown for UTF-??BE and UTF-??LE encodings.
+			 * We cut off the last two characters of the encoding name
+			 * to generate the encoding name suitable for BOMs.
+			 */
+			const char *advise_msg = _(
+				"The file '%s' contains a byte order "
+				"mark (BOM). Please use UTF-%s as "
+				"working-tree-encoding.");
+			const char *stripped = NULL;
+			char *upper = xstrdup_toupper(enc);
+			upper[strlen(upper)-2] = '\0';
+			if (!skip_prefix(upper, "UTF-", &stripped))
+				skip_prefix(upper, "UTF", &stripped); /* dash-less alias; "stripped" is still NULL here */
+			advise(advise_msg, path, stripped);
+			free(upper);
+			if (die_on_error)
+				die(error_msg, path, enc);
+			else {
+				return error(error_msg, path, enc);
+			}
+
+		} else if (is_missing_required_utf_bom(enc, data, len)) {
+			const char *error_msg = _(
+				"BOM is required in '%s' if encoded as %s");
+			const char *advise_msg = _(
+				"The file '%s' is missing a byte order "
+				"mark (BOM). Please use UTF-%sBE or UTF-%sLE "
+				"(depending on the byte order) as "
+				"working-tree-encoding.");
+			const char *stripped = NULL;
+			char *upper = xstrdup_toupper(enc);
+			if (!skip_prefix(upper, "UTF-", &stripped))
+				skip_prefix(upper, "UTF", &stripped); /* dash-less alias; "stripped" is still NULL here */
+			advise(advise_msg, path, stripped, stripped);
+			free(upper);
+			if (die_on_error)
+				die(error_msg, path, enc);
+			else {
+				return error(error_msg, path, enc);
+			}
+		}
+
+	}
+	return 0;
+}
+
+static void trace_encoding(const char *context, const char *path,
+			   const char *encoding, const char *buf, size_t len)
+{
+	static struct trace_key coe = TRACE_KEY_INIT(WORKING_TREE_ENCODING);
+	struct strbuf trace = STRBUF_INIT;
+	int i;
+
+	strbuf_addf(&trace, "%s (%s, considered %s):\n", context, path, encoding);
+	for (i = 0; i < len && buf; ++i) {
+		strbuf_addf( /* "\033" is the portable spelling of the non-standard "\e" */
+			&trace, "| \033[2m%2i:\033[0m %2x \033[2m%c\033[0m%c",
+			i, /* byte offset */
+			(unsigned char) buf[i], /* value in hex */
+			(buf[i] > 32 && buf[i] < 127 ? buf[i] : ' '), /* printable ASCII, else blank */
+			((i+1) % 8 && (i+1) < len ? ' ' : '\n') /* newline after every 8th and after the last byte */
+		);
+	}
+	strbuf_addchars(&trace, '\n', 1);
+
+	trace_strbuf(&coe, &trace);
+	strbuf_release(&trace);
+}
+
+static int check_roundtrip(const char *enc_name)
+{
+	/*
+	 * check_roundtrip_encoding contains a string of comma and/or
+	 * space separated encodings (eg. "UTF-16, ASCII, CP1125").
+	 * Search for the given encoding in that string.
+	 */
+	const char *found = strcasestr(check_roundtrip_encoding, enc_name);
+	const char *next;
+	int len;
+	if (!found)
+		return 0;
+	next = found + strlen(enc_name);
+	len = strlen(check_roundtrip_encoding);
+	return (found && (
+			/*
+			 * check that the found encoding is at the
+			 * beginning of check_roundtrip_encoding or
+			 * that it is prefixed with a space or comma
+			 */
+			found == check_roundtrip_encoding || (
+				(isspace(found[-1]) || found[-1] == ',')
+			)
+		) && (
+			/*
+			 * check that the found encoding is at the
+			 * end of check_roundtrip_encoding or
+			 * that it is suffixed with a space or comma
+			 */
+			next == check_roundtrip_encoding + len || (
+				next < check_roundtrip_encoding + len &&
+				(isspace(next[0]) || next[0] == ',')
+			)
+		));
+}
+
+static const char *default_encoding = "UTF-8";
+
+static int encode_to_git(const char *path, const char *src, size_t src_len,
+			 struct strbuf *buf, const char *enc, int conv_flags)
+{
+	char *dst;
+	int dst_len;
+	int die_on_error = conv_flags & CONV_WRITE_OBJECT;
+
+	/*
+	 * No encoding is specified or there is nothing to encode.
+	 * Tell the caller that the content was not modified.
+	 */
+	if (!enc || (src && !src_len))
+		return 0;
+
+	/*
+	 * Looks like we got called from "would_convert_to_git()".
+	 * This means Git wants to know if it would encode (= modify!)
+	 * the content. Let's answer with "yes", since an encoding was
+	 * specified.
+	 */
+	if (!buf && !src)
+		return 1;
+
+	if (validate_encoding(path, enc, src, src_len, die_on_error))
+		return 0;
+
+	trace_encoding("source", path, enc, src, src_len);
+	dst = reencode_string_len(src, src_len, default_encoding, enc,
+				  &dst_len);
+	if (!dst) {
+		/*
+		 * We could add the blob "as-is" to Git. However, on checkout
+		 * we would try to reencode to the original encoding. This
+		 * would fail and we would leave the user with a messed-up
+		 * working tree. Let's try to avoid this by screaming loud.
+		 */
+		const char *msg = _("failed to encode '%s' from %s to %s");
+		if (die_on_error)
+			die(msg, path, enc, default_encoding);
+		else {
+			error(msg, path, enc, default_encoding);
+			return 0;
+		}
+	}
+	trace_encoding("destination", path, default_encoding, dst, dst_len);
+
+	/*
+	 * UTF supports lossless conversion round tripping [1] and conversions
+	 * between UTF and other encodings are mostly round trip safe as
+	 * Unicode aims to be a superset of all other character encodings.
+	 * However, certain encodings (e.g. SHIFT-JIS) are known to have round
+	 * trip issues [2]. Check the round trip conversion for all encodings
+	 * listed in core.checkRoundtripEncoding.
+	 *
+	 * The round trip check is only performed if content is written to Git.
+	 * This ensures that no information is lost during conversion to/from
+	 * the internal UTF-8 representation.
+	 *
+	 * Please note, the code below is not tested because I was not able to
+	 * generate a faulty round trip without an iconv error. Iconv errors
+	 * are already caught above.
+	 *
+	 * [1] http://unicode.org/faq/utf_bom.html#gen2
+	 * [2] https://support.microsoft.com/en-us/help/170559/prb-conversion-problem-between-shift-jis-and-unicode
+	 */
+	if (die_on_error && check_roundtrip(enc)) {
+		char *re_src;
+		int re_src_len;
+
+		re_src = reencode_string_len(dst, dst_len,
+					     enc, default_encoding,
+					     &re_src_len);
+
+		trace_printf("Checking roundtrip encoding for %s...\n", enc);
+		trace_encoding("reencoded source", path, enc,
+			       re_src, re_src_len);
+
+		if (!re_src || src_len != re_src_len ||
+		    memcmp(src, re_src, src_len)) {
+			const char *msg = _("encoding '%s' from %s to %s and "
+					    "back is not the same");
+			die(msg, path, enc, default_encoding);
+		}
+
+		free(re_src);
+	}
+
+	strbuf_attach(buf, dst, dst_len, dst_len + 1);
+	return 1;
+}
+
+static int encode_to_worktree(const char *path, const char *src, size_t src_len,
+			      struct strbuf *buf, const char *enc)
+{
+	char *dst;
+	int dst_len;
+
+	/*
+	 * No encoding is specified or there is nothing to encode.
+	 * Tell the caller that the content was not modified.
+	 */
+	if (!enc || (src && !src_len))
+		return 0;
+
+	dst = reencode_string_len(src, src_len, enc, default_encoding,
+				  &dst_len);
+	if (!dst) {
+		error(_("failed to encode '%s' from %s to %s"),
+			path, default_encoding, enc);
+		return 0;
+	}
+
+	strbuf_attach(buf, dst, dst_len, dst_len + 1);
+	return 1;
+}
+
 static int crlf_to_git(const struct index_state *istate,
 		       const char *path, const char *src, size_t len,
 		       struct strbuf *buf,
@@ -978,6 +1214,24 @@ static int ident_to_worktree(const char *path, const char *src, size_t len,
 	return 1;
 }
 
+static const char *git_path_check_encoding(struct attr_check_item *check)
+{
+	const char *value = check->value;
+
+	if (ATTR_UNSET(value) || !strlen(value))
+		return NULL;
+
+	if (ATTR_TRUE(value) || ATTR_FALSE(value)) {
+		die(_("true/false are no valid working-tree-encodings"));
+	}
+
+	/* Don't encode to the default encoding */
+	if (same_encoding(value, default_encoding))
+		return NULL;
+
+	return value;
+}
+
 static enum crlf_action git_path_check_crlf(struct attr_check_item *check)
 {
 	const char *value = check->value;
@@ -1033,6 +1287,7 @@ struct conv_attrs {
 	enum crlf_action attr_action; /* What attr says */
 	enum crlf_action crlf_action; /* When no attr is set, use core.autocrlf */
 	int ident;
+	const char *working_tree_encoding; /* Supported encoding or default encoding if NULL */
 };
 
 static void convert_attrs(struct conv_attrs *ca, const char *path)
@@ -1041,7 +1296,8 @@ static void convert_attrs(struct conv_attrs *ca, const char *path)
 
 	if (!check) {
 		check = attr_check_initl("crlf", "ident", "filter",
-					 "eol", "text", NULL);
+					 "eol", "text", "working-tree-encoding",
+					 NULL);
 		user_convert_tail = &user_convert;
 		git_config(read_convert_config, NULL);
 	}
@@ -1064,6 +1320,7 @@ static void convert_attrs(struct conv_attrs *ca, const char *path)
 			else if (eol_attr == EOL_CRLF)
 				ca->crlf_action = CRLF_TEXT_CRLF;
 		}
+		ca->working_tree_encoding = git_path_check_encoding(ccheck + 5);
 	} else {
 		ca->drv = NULL;
 		ca->crlf_action = CRLF_UNDEFINED;
@@ -1144,6 +1401,13 @@ int convert_to_git(const struct index_state *istate,
 		src = dst->buf;
 		len = dst->len;
 	}
+
+	ret |= encode_to_git(path, src, len, dst, ca.working_tree_encoding, conv_flags);
+	if (ret && dst) {
+		src = dst->buf;
+		len = dst->len;
+	}
+
 	if (!(conv_flags & CONV_EOL_KEEP_CRLF)) {
 		ret |= crlf_to_git(istate, path, src, len, dst, ca.crlf_action, conv_flags);
 		if (ret && dst) {
@@ -1167,6 +1431,7 @@ void convert_to_git_filter_fd(const struct index_state *istate,
 	if (!apply_filter(path, NULL, 0, fd, dst, ca.drv, CAP_CLEAN, NULL))
 		die("%s: clean filter '%s' failed", path, ca.drv->name);
 
+	encode_to_git(path, dst->buf, dst->len, dst, ca.working_tree_encoding, conv_flags);
 	crlf_to_git(istate, path, dst->buf, dst->len, dst, ca.crlf_action, conv_flags);
 	ident_to_git(path, dst->buf, dst->len, dst, ca.ident);
 }
@@ -1198,6 +1463,12 @@ static int convert_to_working_tree_internal(const char *path, const char *src,
 		}
 	}
 
+	ret |= encode_to_worktree(path, src, len, dst, ca.working_tree_encoding);
+	if (ret) {
+		src = dst->buf;
+		len = dst->len;
+	}
+
 	ret_filter = apply_filter(
 		path, src, len, -1, dst, ca.drv, CAP_SMUDGE, dco);
 	if (!ret_filter && ca.drv && ca.drv->required)
@@ -1664,6 +1935,9 @@ struct stream_filter *get_stream_filter(const char *path, const struct object_id
 	if (ca.drv && (ca.drv->process || ca.drv->smudge || ca.drv->clean))
 		return NULL;
 
+	if (ca.working_tree_encoding)
+		return NULL;
+
 	if (ca.crlf_action == CRLF_AUTO || ca.crlf_action == CRLF_AUTO_CRLF)
 		return NULL;
 
