Do not require HTML::Scrubber to correctly strip unsafe tags

2014-08-20 18:12:20 +04:00 · 2014-08-20 18:12:20 +04:00 · c005274130
parent 6c03a67bc7
commit c005274130
3 changed files with 29 additions and 87 deletions
--- a/Bugzilla/Install/Requirements.pm
+++ b/Bugzilla/Install/Requirements.pm
@ -258,19 +258,6 @@ sub OPTIONAL_MODULES {
        version => 0,
        feature => ['jsonrpc', 'xmlrpc'],
    },
    {
        # We need the 'utf8_mode' method of HTML::Parser, for HTML::Scrubber.
        package => 'HTML-Parser',
        module  => 'HTML::Parser',
        version => '3.40',
        feature => ['html_desc'],
    },
    {
        package => 'HTML-Scrubber',
        module  => 'HTML::Scrubber',
        version => 0,
        feature => ['html_desc'],
    },
    # Inbound Email
    {
--- a/Bugzilla/Util.pm
+++ b/Bugzilla/Util.pm
@ -160,80 +160,36 @@ sub html_quote {
    return $var;
 }
 # Rebuild one whitelisted HTML tag, keeping only attributes considered safe.
 #
 # Params:
 #   $tag   - tag name, with a leading '/' for a closing tag; lowercased here.
 #   $attrs - raw attribute text that followed the tag name; may be undef
 #            when the tag carried no attributes.
 # Returns: the rebuilt tag as a string.
 #   * Closing tags are returned bare ("</b>"), without any attributes.
 #   * Opening tags keep name/id/class/style/title, plus href (for <a>) or
 #     cite (for <blockquote> and <q>) unless that value looks like a
 #     javascript: URL. Every other attribute is dropped.
 #   * Attribute names are matched case-sensitively; values are emitted
 #     exactly as written (including their quotes), in sorted name order so
 #     the output is deterministic.
 #   * A trailing '/' in $attrs is preserved as ' /' (self-closing form).
 sub _skip_attrs
 {
    my ($tag, $attrs) = @_;
    $tag = lc $tag;
    return "<$tag>" if $tag =~ m!^/!;
    # Attribute-less tags pass undef here; normalise it to avoid
    # "uninitialized value" warnings in the matches below.
    $attrs = '' unless defined $attrs;
    my $enclosed = $attrs =~ m!/$! ? ' /' : '';
    # In list context the /g match yields a flat (name, value, ...) list.
    my %given = $attrs =~ /([^\s=]+)=([^\s=\'\"]+|\"[^\"]*\"|\'[^\']*\')/gs;
    my %kept;
    for my $name (qw(name id class style title))
    {
        # Test definedness, not truth, so a literal value of 0 survives.
        $kept{$name} = $given{$name} if defined $given{$name};
    }
    # The single URL-carrying attribute allowed for each tag, if any.
    my %link_attr_for = (a => 'href', blockquote => 'cite', q => 'cite');
    my $link_attr = $link_attr_for{$tag};
    if (defined $link_attr && defined $given{$link_attr})
    {
        # Check a normalised copy: drop the opening quote, then every
        # whitespace/control character, so values such as " javascript:..."
        # or "java<TAB>script:..." cannot slip past the scheme test.
        # NOTE(review): character references (e.g. &#106;avascript:) are not
        # decoded here - confirm whether callers need that covered as well.
        my $url = $given{$link_attr};
        $url =~ s/^[\"\']//;
        $url =~ tr/\x00-\x20//d;
        $kept{$link_attr} = $given{$link_attr} if $url !~ /^javascript/i;
    }
    return "<$tag".join("", map { " $_=".$kept{$_} } sort keys %kept).$enclosed.">";
 }
 sub html_light_quote {
    my ($text) = @_;
    # List of allowed HTML elements having no attributes.
-    my @allow = qw(b strong em i u p br abbr acronym ins del cite code var
+    my @allow = qw(
-                   dfn samp kbd big small sub sup tt dd dt dl ul li ol
+        a b big blockquote strong em i u p br abbr acronym ins del cite code var
-                   fieldset legend);
+        dfn samp kbd q small span sub sup tt dd dt dl ul li ol fieldset legend
-
+    );
-    if (!Bugzilla->feature('html_desc')) {
+    my $safe = join('|', @allow);
-        my $safe = join('|', @allow);
+    $text =~ s{(<(/?(?:$safe))(\s+(?:[^>"']+|"[^"]*"|'[^']*')*)?>)|(<)|(>)}{($1 ? _skip_attrs($2, $3) : ($4 ? '&lt;' : '&gt;'))}egiso;
-        my $chr = chr(1);
+    return $text;
        # First, escape safe elements.
        $text =~ s#<($safe)>#$chr$1$chr#go;
        $text =~ s#</($safe)>#$chr/$1$chr#go;
        # Now filter < and >.
        $text =~ s#<#&lt;#g;
        $text =~ s#>#&gt;#g;
        # Restore safe elements.
        $text =~ s#$chr/($safe)$chr#</$1>#go;
        $text =~ s#$chr($safe)$chr#<$1>#go;
        return $text;
    }
    else {
        # We can be less restrictive. We can accept elements with attributes.
        push(@allow, qw(a blockquote q span));
        # Allowed protocols.
        my $safe_protocols = join('|', SAFE_PROTOCOLS);
        my $protocol_regexp = qr{(^(?:$safe_protocols):|^[^:]+$)}i;
        # Deny all elements and attributes unless explicitly authorized.
        my @default = (0 => {
                             id    => 1,
                             name  => 1,
                             class => 1,
                             '*'   => 0, # Reject all other attributes.
                            }
                       );
        # Specific rules for allowed elements. If no specific rule is set
        # for a given element, then the default is used.
        my @rules = (a => {
                           href  => $protocol_regexp,
                           title => 1,
                           id    => 1,
                           name  => 1,
                           class => 1,
                           '*'   => 0, # Reject all other attributes.
                          },
                     blockquote => {
                                    cite => $protocol_regexp,
                                    id    => 1,
                                    name  => 1,
                                    class => 1,
                                    '*'  => 0, # Reject all other attributes.
                                   },
                     'q' => {
                             cite => $protocol_regexp,
                             id    => 1,
                             name  => 1,
                             class => 1,
                             '*'  => 0, # Reject all other attributes.
                          },
                    );
        my $scrubber = HTML::Scrubber->new(default => \@default,
                                           allow   => \@allow,
                                           rules   => \@rules,
                                           comment => 0,
                                           process => 0);
        return $scrubber->scrub($text);
    }
 }
 sub email_filter {
@ -1150,8 +1106,8 @@ deleted.
 =item C<html_light_quote($val)>
 Returns a string where only explicitly allowed HTML elements and attributes
-are kept. All HTML elements and attributes not being in the whitelist are either
+are kept. All HTML elements not being in the whitelist are escaped; all HTML
-escaped (if HTML::Scrubber is not installed) or removed.
+attributes not being in the whitelist are removed.
 =item C<url_quote($val)>
--- a/template/en/default/setup/strings.txt.pl
+++ b/template/en/default/setup/strings.txt.pl
@ -53,7 +53,6 @@ END
    feature_auth_ldap         => 'LDAP Authentication',
    feature_auth_radius       => 'RADIUS Authentication',
    feature_graphical_reports => 'Graphical Reports',
    feature_html_desc         => 'More HTML in Product/Group Descriptions',
    feature_inbound_email     => 'Inbound Email',
    feature_jobqueue          => 'Mail Queueing',
    feature_jsonrpc           => 'JSON-RPC Interface',