Skip to content

Commit 66dc7b6

Browse files
committed
Merge branch 'en/fast-export-encoding'
The "git fast-export/import" pair has been taught to better handle commits whose log messages are in an encoding other than UTF-8.

* en/fast-export-encoding:
  fast-export: do automatic reencoding of commit messages only if requested
  fast-export: differentiate between explicitly UTF-8 and implicitly UTF-8
  fast-export: avoid stripping encoding header if we cannot reencode
  fast-import: support 'encoding' commit header
  t9350: fix encoding test to actually test reencoding
2 parents c0e78f7 + e80001f commit 66dc7b6

File tree

8 files changed

+163
-17
lines changed

8 files changed

+163
-17
lines changed

Documentation/git-fast-export.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,13 @@ marks the same across runs.
129129
for intermediary filters (e.g. for rewriting commit messages
130130
which refer to older commits, or for stripping blobs by id).
131131

132+
--reencode=(yes|no|abort)::
133+
Specify how to handle the `encoding` header in commit objects. When
134+
asking to 'abort' (which is the default), this program will die
135+
when encountering such a commit object. With 'yes', the commit
136+
message will be reencoded into UTF-8. With 'no', the original
137+
encoding will be preserved.
138+
132139
--refspec::
133140
Apply the specified refspec to each ref exported. Multiple of them can
134141
be specified.

Documentation/git-fast-import.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -388,6 +388,7 @@ change to the project.
388388
original-oid?
389389
('author' (SP <name>)? SP LT <email> GT SP <when> LF)?
390390
'committer' (SP <name>)? SP LT <email> GT SP <when> LF
391+
('encoding' SP <encoding> LF)?
391392
data
392393
('from' SP <commit-ish> LF)?
393394
('merge' SP <commit-ish> LF)?
@@ -455,6 +456,12 @@ that was selected by the --date-format=<fmt> command-line option.
455456
See ``Date Formats'' above for the set of supported formats, and
456457
their syntax.
457458

459+
`encoding`
460+
^^^^^^^^^^
461+
The optional `encoding` command indicates the encoding of the commit
462+
message. Most commits are UTF-8 and the encoding is omitted, but this
463+
allows importing commit messages into git without first reencoding them.
464+
458465
`from`
459466
^^^^^^
460467
The `from` command is used to specify the commit to initialize

builtin/fast-export.c

Lines changed: 49 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ static const char *fast_export_usage[] = {
3333
static int progress;
3434
static enum { SIGNED_TAG_ABORT, VERBATIM, WARN, WARN_STRIP, STRIP } signed_tag_mode = SIGNED_TAG_ABORT;
3535
static enum { TAG_FILTERING_ABORT, DROP, REWRITE } tag_of_filtered_mode = TAG_FILTERING_ABORT;
36+
static enum { REENCODE_ABORT, REENCODE_YES, REENCODE_NO } reencode_mode = REENCODE_ABORT;
3637
static int fake_missing_tagger;
3738
static int use_done_feature;
3839
static int no_data;
@@ -77,6 +78,31 @@ static int parse_opt_tag_of_filtered_mode(const struct option *opt,
7778
return 0;
7879
}
7980

81+
static int parse_opt_reencode_mode(const struct option *opt,
82+
const char *arg, int unset)
83+
{
84+
if (unset) {
85+
reencode_mode = REENCODE_ABORT;
86+
return 0;
87+
}
88+
89+
switch (git_parse_maybe_bool(arg)) {
90+
case 0:
91+
reencode_mode = REENCODE_NO;
92+
break;
93+
case 1:
94+
reencode_mode = REENCODE_YES;
95+
break;
96+
default:
97+
if (!strcasecmp(arg, "abort"))
98+
reencode_mode = REENCODE_ABORT;
99+
else
100+
return error("Unknown reencoding mode: %s", arg);
101+
}
102+
103+
return 0;
104+
}
105+
80106
static struct decoration idnums;
81107
static uint32_t last_idnum;
82108

@@ -453,7 +479,7 @@ static const char *find_encoding(const char *begin, const char *end)
453479
bol = memmem(begin, end ? end - begin : strlen(begin),
454480
needle, strlen(needle));
455481
if (!bol)
456-
return git_commit_encoding;
482+
return NULL;
457483
bol += strlen(needle);
458484
eol = strchrnul(bol, '\n');
459485
*eol = '\0';
@@ -633,18 +659,32 @@ static void handle_commit(struct commit *commit, struct rev_info *rev,
633659
}
634660

635661
mark_next_object(&commit->object);
636-
if (anonymize)
662+
if (anonymize) {
637663
reencoded = anonymize_commit_message(message);
638-
else if (!is_encoding_utf8(encoding))
639-
reencoded = reencode_string(message, "UTF-8", encoding);
664+
} else if (encoding) {
665+
switch(reencode_mode) {
666+
case REENCODE_YES:
667+
reencoded = reencode_string(message, "UTF-8", encoding);
668+
break;
669+
case REENCODE_NO:
670+
break;
671+
case REENCODE_ABORT:
672+
die("Encountered commit-specific encoding %s in commit "
673+
"%s; use --reencode=[yes|no] to handle it",
674+
encoding, oid_to_hex(&commit->object.oid));
675+
}
676+
}
640677
if (!commit->parents)
641678
printf("reset %s\n", refname);
642679
printf("commit %s\nmark :%"PRIu32"\n", refname, last_idnum);
643680
if (show_original_ids)
644681
printf("original-oid %s\n", oid_to_hex(&commit->object.oid));
645-
printf("%.*s\n%.*s\ndata %u\n%s",
682+
printf("%.*s\n%.*s\n",
646683
(int)(author_end - author), author,
647-
(int)(committer_end - committer), committer,
684+
(int)(committer_end - committer), committer);
685+
if (!reencoded && encoding)
686+
printf("encoding %s\n", encoding);
687+
printf("data %u\n%s",
648688
(unsigned)(reencoded
649689
? strlen(reencoded) : message
650690
? strlen(message) : 0),
@@ -1088,6 +1128,9 @@ int cmd_fast_export(int argc, const char **argv, const char *prefix)
10881128
OPT_CALLBACK(0, "tag-of-filtered-object", &tag_of_filtered_mode, N_("mode"),
10891129
N_("select handling of tags that tag filtered objects"),
10901130
parse_opt_tag_of_filtered_mode),
1131+
OPT_CALLBACK(0, "reencode", &reencode_mode, N_("mode"),
1132+
N_("select handling of commit messages in an alternate encoding"),
1133+
parse_opt_reencode_mode),
10911134
OPT_STRING(0, "export-marks", &export_filename, N_("file"),
10921135
N_("Dump marks to this file")),
10931136
OPT_STRING(0, "import-marks", &import_filename, N_("file"),

fast-import.c

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2585,6 +2585,7 @@ static void parse_new_commit(const char *arg)
25852585
struct branch *b;
25862586
char *author = NULL;
25872587
char *committer = NULL;
2588+
const char *encoding = NULL;
25882589
struct hash_list *merge_list = NULL;
25892590
unsigned int merge_count;
25902591
unsigned char prev_fanout, new_fanout;
@@ -2607,6 +2608,8 @@ static void parse_new_commit(const char *arg)
26072608
}
26082609
if (!committer)
26092610
die("Expected committer but didn't get one");
2611+
if (skip_prefix(command_buf.buf, "encoding ", &encoding))
2612+
read_next_command();
26102613
parse_data(&msg, 0, NULL);
26112614
read_next_command();
26122615
parse_from(b);
@@ -2670,9 +2673,13 @@ static void parse_new_commit(const char *arg)
26702673
}
26712674
strbuf_addf(&new_data,
26722675
"author %s\n"
2673-
"committer %s\n"
2674-
"\n",
2676+
"committer %s\n",
26752677
author ? author : committer, committer);
2678+
if (encoding)
2679+
strbuf_addf(&new_data,
2680+
"encoding %s\n",
2681+
encoding);
2682+
strbuf_addch(&new_data, '\n');
26762683
strbuf_addbuf(&new_data, &msg);
26772684
free(author);
26782685
free(committer);

t/t9300-fast-import.sh

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3299,4 +3299,24 @@ test_expect_success !MINGW 'W: get-mark & empty orphan commit with erroneous thi
32993299
sed -e s/LFs/LLL/ W-input | tr L "\n" | test_must_fail git fast-import
33003300
'
33013301

3302+
###
3303+
### series X (other new features)
3304+
###
3305+
3306+
test_expect_success 'X: handling encoding' '
3307+
test_tick &&
3308+
cat >input <<-INPUT_END &&
3309+
commit refs/heads/encoding
3310+
committer $GIT_COMMITTER_NAME <$GIT_COMMITTER_EMAIL> $GIT_COMMITTER_DATE
3311+
encoding iso-8859-7
3312+
data <<COMMIT
3313+
INPUT_END
3314+
3315+
printf "Pi: \360\nCOMMIT\n" >>input &&
3316+
3317+
git fast-import <input &&
3318+
git cat-file -p encoding | grep $(printf "\360") &&
3319+
git log -1 --format=%B encoding | grep $(printf "\317\200")
3320+
'
3321+
33023322
test_done

t/t9350-fast-export.sh

Lines changed: 69 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -94,22 +94,83 @@ test_expect_success 'fast-export --show-original-ids | git fast-import' '
9494
test $MUSS = $(git rev-parse --verify refs/tags/muss)
9595
'
9696

97-
test_expect_success 'iso-8859-1' '
97+
test_expect_success 'reencoding iso-8859-7' '
9898
99-
git config i18n.commitencoding ISO8859-1 &&
100-
# use author and committer name in ISO-8859-1 to match it.
101-
. "$TEST_DIRECTORY"/t3901/8859-1.txt &&
99+
test_when_finished "git reset --hard HEAD~1" &&
100+
test_config i18n.commitencoding iso-8859-7 &&
102101
test_tick &&
103102
echo rosten >file &&
104-
git commit -s -m den file &&
105-
git fast-export wer^..wer >iso8859-1.fi &&
106-
sed "s/wer/i18n/" iso8859-1.fi |
103+
git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
104+
git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
105+
sed "s/wer/i18n/" iso-8859-7.fi |
107106
(cd new &&
108107
git fast-import &&
108+
# The commit object, if not re-encoded, would be 240 bytes.
109+
# Removing the "encoding iso-8859-7\n" header drops 20 bytes.
110+
# Re-encoding the Pi character from \xF0 (\360) in iso-8859-7
111+
# to \xCF\x80 (\317\200) in UTF-8 adds a byte. Check for
112+
# the expected size.
113+
test 221 -eq "$(git cat-file -s i18n)" &&
114+
# ...and for the expected translation of bytes.
109115
git cat-file commit i18n >actual &&
110-
grep "Áéí óú" actual)
116+
grep $(printf "\317\200") actual &&
117+
# Also make sure the commit does not have the "encoding" header
118+
! grep ^encoding actual)
119+
'
120+
121+
test_expect_success 'aborting on iso-8859-7' '
111122
123+
test_when_finished "git reset --hard HEAD~1" &&
124+
test_config i18n.commitencoding iso-8859-7 &&
125+
echo rosten >file &&
126+
git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
127+
test_must_fail git fast-export --reencode=abort wer^..wer >iso-8859-7.fi
112128
'
129+
130+
test_expect_success 'preserving iso-8859-7' '
131+
132+
test_when_finished "git reset --hard HEAD~1" &&
133+
test_config i18n.commitencoding iso-8859-7 &&
134+
echo rosten >file &&
135+
git commit -s -F "$TEST_DIRECTORY/t9350/simple-iso-8859-7-commit-message.txt" file &&
136+
git fast-export --reencode=no wer^..wer >iso-8859-7.fi &&
137+
sed "s/wer/i18n-no-recoding/" iso-8859-7.fi |
138+
(cd new &&
139+
git fast-import &&
140+
# The commit object, if not re-encoded, is 240 bytes.
141+
# Removing the "encoding iso-8859-7\n" header would drop 20
142+
# bytes. Re-encoding the Pi character from \xF0 (\360) in
143+
# iso-8859-7 to \xCF\x80 (\317\200) in UTF-8 adds a byte.
144+
# Check for the expected size...
145+
test 240 -eq "$(git cat-file -s i18n-no-recoding)" &&
146+
# ...as well as the expected byte.
147+
git cat-file commit i18n-no-recoding >actual &&
148+
grep $(printf "\360") actual &&
149+
# Also make sure the commit has the "encoding" header
150+
grep ^encoding actual)
151+
'
152+
153+
test_expect_success 'encoding preserved if reencoding fails' '
154+
155+
test_when_finished "git reset --hard HEAD~1" &&
156+
test_config i18n.commitencoding iso-8859-7 &&
157+
echo rosten >file &&
158+
git commit -s -F "$TEST_DIRECTORY/t9350/broken-iso-8859-7-commit-message.txt" file &&
159+
git fast-export --reencode=yes wer^..wer >iso-8859-7.fi &&
160+
sed "s/wer/i18n-invalid/" iso-8859-7.fi |
161+
(cd new &&
162+
git fast-import &&
163+
git cat-file commit i18n-invalid >actual &&
164+
# Make sure the commit still has the encoding header
165+
grep ^encoding actual &&
166+
# Verify that the commit has the expected size; i.e.
167+
# that no bytes were re-encoded to a different encoding.
168+
test 252 -eq "$(git cat-file -s i18n-invalid)" &&
169+
# ...and check for the original special bytes
170+
grep $(printf "\360") actual &&
171+
grep $(printf "\377") actual)
172+
'
173+
113174
test_expect_success 'import/export-marks' '
114175
115176
git checkout -b marks master &&
@@ -224,7 +285,6 @@ GIT_COMMITTER_NAME='C O Mitter'; export GIT_COMMITTER_NAME
224285

225286
test_expect_success 'setup copies' '
226287
227-
git config --unset i18n.commitencoding &&
228288
git checkout -b copy rein &&
229289
git mv file file3 &&
230290
git commit -m move1 &&
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Pi: �; Invalid: �
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Pi: �

0 commit comments

Comments
 (0)