From 00f429af7bfaa6a9141bc0ac30995d4fae24836a Mon Sep 17 00:00:00 2001 From: Martin Koegler Date: Sun, 3 Jun 2007 17:42:44 +0200 Subject: [PATCH] gitweb: Handle non UTF-8 text better MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit gitweb assumes that everything is in UTF-8. If a text contains invalid UTF-8 character sequences, the text must be in a different encoding. This commit introduces $fallback_encoding which would be used as input encoding if gitweb encounters text with is not valid UTF-8. Add basic test for this in t/t9500-gitweb-standalone-no-errors.sh Signed-off-by: Martin Koegler Signed-off-by: Jakub Narebski Tested-by: Alexandre Julliard Tested-by: Ismail Dönmez Signed-off-by: Junio C Hamano --- gitweb/gitweb.perl | 41 +++++++++++++++++++------- t/t9500-gitweb-standalone-no-errors.sh | 28 ++++++++++++++++++ 2 files changed, 59 insertions(+), 10 deletions(-) diff --git a/gitweb/gitweb.perl b/gitweb/gitweb.perl index c3921cb0b..e92596c29 100755 --- a/gitweb/gitweb.perl +++ b/gitweb/gitweb.perl @@ -94,6 +94,13 @@ our $default_text_plain_charset = undef; # (relative to the current git repository) our $mimetypes_file = undef; +# assume this charset if line contains non-UTF-8 characters; +# it should be valid encoding (see Encoding::Supported(3pm) for list), +# for which encoding all byte sequences are valid, for example +# 'iso-8859-1' aka 'latin1' (it is decoded without checking, so it +# could be even 'utf-8' for the old behavior) +our $fallback_encoding = 'latin1'; + # You define site-wide feature defaults here; override them with # $GITWEB_CONFIG as necessary. our %feature = ( @@ -602,6 +609,20 @@ sub validate_refname { return $input; } +# decode sequences of octets in utf8 into Perl's internal form, +# which is utf-8 with utf8 flag set if needed. gitweb writes out +# in utf-8 thanks to "binmode STDOUT, ':utf8'" at beginning +sub to_utf8 { + my $str = shift; + my $res; + eval { $res = decode_utf8($str, Encode::FB_CROAK); }; + if (defined $res) { + return $res; + } else { + return decode($fallback_encoding, $str, Encode::FB_DEFAULT); + } +} + # quote unsafe chars, but keep the slash, even when it's not # correct, but quoted slashes look too horrible in bookmarks sub esc_param { @@ -626,7 +647,7 @@ sub esc_html ($;%) { my $str = shift; my %opts = @_; - $str = decode_utf8($str); + $str = to_utf8($str); $str = $cgi->escapeHTML($str); if ($opts{'-nbsp'}) { $str =~ s/ / /g; @@ -640,7 +661,7 @@ sub esc_path { my $str = shift; my %opts = @_; - $str = decode_utf8($str); + $str = to_utf8($str); $str = $cgi->escapeHTML($str); if ($opts{'-nbsp'}) { $str =~ s/ / /g; @@ -925,7 +946,7 @@ sub format_subject_html { if (length($short) < length($long)) { return $cgi->a({-href => $href, -class => "list subject", - -title => decode_utf8($long)}, + -title => to_utf8($long)}, esc_html($short) . $extra); } else { return $cgi->a({-href => $href, -class => "list subject"}, @@ -1239,7 +1260,7 @@ sub git_get_projects_list { if (check_export_ok("$projectroot/$path")) { my $pr = { path => $path, - owner => decode_utf8($owner), + owner => to_utf8($owner), }; push @list, $pr; (my $forks_path = $path) =~ s/\.git$//; @@ -1269,7 +1290,7 @@ sub git_get_project_owner { $pr = unescape($pr); $ow = unescape($ow); if ($pr eq $project) { - $owner = decode_utf8($ow); + $owner = to_utf8($ow); last; } } @@ -1759,7 +1780,7 @@ sub get_file_owner { } my $owner = $gcos; $owner =~ s/[,;].*$//; - return decode_utf8($owner); + return to_utf8($owner); } ## ...................................................................... @@ -1842,7 +1863,7 @@ sub git_header_html { my $title = "$site_name"; if (defined $project) { - $title .= " - " . decode_utf8($project); + $title .= " - " . to_utf8($project); if (defined $action) { $title .= "/$action"; if (defined $file_name) { @@ -2116,7 +2137,7 @@ sub git_print_page_path { print "
"; print $cgi->a({-href => href(action=>"tree", hash_base=>$hb), - -title => 'tree root'}, decode_utf8("[$project]")); + -title => 'tree root'}, to_utf8("[$project]")); print " / "; if (defined $name) { my @dirname = split '/', $name; @@ -2936,7 +2957,7 @@ sub git_project_list_body { ($pr->{'age'}, $pr->{'age_string'}) = @aa; if (!defined $pr->{'descr'}) { my $descr = git_get_project_description($pr->{'path'}) || ""; - $pr->{'descr_long'} = decode_utf8($descr); + $pr->{'descr_long'} = to_utf8($descr); $pr->{'descr'} = chop_str($descr, 25, 5); } if (!defined $pr->{'owner'}) { @@ -3981,7 +4002,7 @@ sub git_snapshot { my $git = git_cmd_str(); my $name = $project; $name =~ s/\047/\047\\\047\047/g; - my $filename = decode_utf8(basename($project)); + my $filename = to_utf8(basename($project)); my $cmd; if ($suffix eq 'zip') { $filename .= "-$hash.$suffix"; diff --git a/t/t9500-gitweb-standalone-no-errors.sh b/t/t9500-gitweb-standalone-no-errors.sh index b92ab6331..44ae503b9 100755 --- a/t/t9500-gitweb-standalone-no-errors.sh +++ b/t/t9500-gitweb-standalone-no-errors.sh @@ -487,4 +487,32 @@ test_expect_success \ 'gitweb_run "p=.git;a=atom"' test_debug 'cat gitweb.log' +# ---------------------------------------------------------------------- +# encoding/decoding + +test_expect_success \ + 'encode(commit): utf8' \ + '. ../t3901-utf8.txt && + echo "UTF-8" >> file && + git add file && + git commit -F ../t3900/1-UTF-8.txt && + gitweb_run "p=.git;a=commit"' +test_debug 'cat gitweb.log' + +test_expect_success \ + 'encode(commit): iso-8859-1' \ + '. ../t3901-8859-1.txt && + echo "ISO-8859-1" >> file && + git add file && + git config i18n.commitencoding ISO-8859-1 && + git commit -F ../t3900/ISO-8859-1.txt && + git config --unset i18n.commitencoding && + gitweb_run "p=.git;a=commit"' +test_debug 'cat gitweb.log' + +test_expect_success \ + 'encode(log): utf-8 and iso-8859-1' \ + 'gitweb_run "p=.git;a=log"' +test_debug 'cat gitweb.log' + test_done -- 2.30.2