bugzilla-4intranet/Bugzilla/Migrate/Gnats.pm

# -*- Mode: perl; indent-tabs-mode: nil -*-
#
# The contents of this file are subject to the Mozilla Public
# License Version 1.1 (the "License"); you may not use this file
# except in compliance with the License. You may obtain a copy of
# the License at http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS
# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
# implied. See the License for the specific language governing
# rights and limitations under the License.
#
# The Original Code is The Bugzilla Migration Tool.
#
# The Initial Developer of the Original Code is Lambda Research
# Corporation. Portions created by the Initial Developer are Copyright
# (C) 2009 the Initial Developer. All Rights Reserved.
#
# Contributor(s):
#   Max Kanat-Alexander <mkanat@bugzilla.org>

package Bugzilla::Migrate::Gnats;
use strict;
use base qw(Bugzilla::Migrate);

use Bugzilla::Constants;
use Bugzilla::Install::Util qw(indicate_progress);
use Bugzilla::Util qw(format_time trim generate_random_password lsearch);

use Email::Address;
use Email::MIME;
use File::Basename;
use IO::File;
use List::Util qw(first);

use constant REQUIRED_MODULES => [
    {
        package => 'Email-Simple-FromHandle',
        module  => 'Email::Simple::FromHandle',
        # This version added seekable handles.
        version => 0.050,
    },
];

use constant FIELD_MAP => {
    'Number'         => 'bug_id',
    'Category'       => 'product',
    'Synopsis'       => 'short_desc',
    'Responsible'    => 'assigned_to',
    'State'          => 'bug_status',
    'Class'          => 'cf_type',
    'Classification' => '',
    'Originator'     => 'reporter',
    'Arrival-Date'   => 'creation_ts',
    'Last-Modified'  => 'delta_ts',
    'Release'        => 'version',
    'Severity'       => 'bug_severity',
    'Description'    => 'comment',
};

use constant VALUE_MAP => {
    bug_severity => {
        'serious'      => 'major',
        'cosmetic'     => 'trivial',
        'new-feature'  => 'enhancement',
        'non-critical' => 'normal',
    },
    bug_status => {
        'open'      => 'NEW',
        'analyzed'  => 'ASSIGNED',
        'suspended' => 'RESOLVED',
        'feedback'  => 'RESOLVED',
        'released'  => 'VERIFIED',
    },
    bug_status_resolution => {
        'feedback'  => 'FIXED',
        'released'  => 'FIXED',
        'closed'    => 'FIXED',
        'suspended' => 'LATER',
    },
    priority => {
        'medium' => 'Normal',
    },
};

use constant GNATS_CONFIG_VARS => (
    {
        name    => 'gnats_path',
        default => '/var/lib/gnats',
        desc    => <<END,
# The path to the directory that contains the GNATS database.
END
    },
    {
        name    => 'default_email_domain',
        default => 'example.com',
        desc    => <<'END',
# Some GNATS users do not have full email addresses, but Bugzilla requires
# every user to have an email address. What domain should be appended to
# usernames that don't have emails, to make them into email addresses?
# (For example, if you leave this at the default, "unknown" would become
# "unknown@example.com".)
END
    },
    {
        name    => 'component_name',
        default => 'General',
        desc    => <<'END',
# GNATS has only "Category" to classify bugs. However, Bugzilla has a
# multi-level system of Products that contain Components. When importing
# GNATS categories, they become a Product with one Component. What should
# the name of that Component be?
END
    },
    {
        name    => 'version_regex',
        default => '',
        desc    => <<'END',
# In GNATS, the "version" field can contain almost anything. However, in
# Bugzilla, it's a drop-down, so you don't want too many choices in there.
# If you specify a regular expression here, versions will be tested against
# this regular expression, and if they match, the first match (the first set
# of parentheses in the regular expression, also called "$1") will be used
# as the version value for the bug instead of the full version value specified
# in GNATS.
END
    },
    {
        name    => 'default_originator',
        default => 'gnats-admin',
        desc    => <<'END',
# Sometimes, a PR has no valid Originator, so we fall back to the From
# header of the email. If the From header also isn't a valid username
# (is just a name with spaces in it--we can't convert that to an email
# address) then this username (which can either be a GNATS username or an
# email address) will be considered to be the Originator of the PR.
END
    }
);

sub CONFIG_VARS {
    my $self = shift;
    my @vars = (GNATS_CONFIG_VARS, $self->SUPER::CONFIG_VARS);
    my $field_map = first { $_->{name} eq 'translate_fields' } @vars;
    $field_map->{default} = FIELD_MAP;
    my $value_map = first { $_->{name} eq 'translate_values' } @vars;
    $value_map->{default} = VALUE_MAP;
    return @vars;
}

# Directories that aren't projects, or that we shouldn't be parsing
use constant SKIP_DIRECTORIES => qw(
    gnats-adm
    gnats-queue
    pending
);

use constant NON_COMMENT_FIELDS => qw(
    Audit-Trail
    Closed-Date
    Confidential
    Unformatted
    attachments
);

# Certain fields can contain things that look like fields in them,
# because they might contain quoted emails. To avoid mis-parsing,
# we list out here the exact order of fields at the end of a PR
# and wait for the next field to consider that we actually have
# a field to parse.
use constant END_FIELD_ORDER => [qw(
    Description
    How-To-Repeat
    Fix
    Release-Note
    Audit-Trail
    Unformatted
)];

use constant CUSTOM_FIELDS => {
    cf_type => {
        type        => FIELD_TYPE_SINGLE_SELECT,
        description => 'Type',
    },
};

use constant FIELD_REGEX => qr/^>(\S+):\s*(.*)$/;

# Used for bugs that have no Synopsis.
use constant NO_SUBJECT => "(no subject)";

# This is the divider that GNATS uses between attachments in its database
# files. It's missign two hyphens at the beginning because MIME Emails use
# -- to start boundaries.
use constant GNATS_BOUNDARY => '----gnatsweb-attachment----';

use constant LONG_VERSION_LENGTH => 32;

#########
# Hooks #
#########

sub before_insert {
    my $self = shift;

    # gnats_id isn't a valid User::create field, and we don't need it
    # anymore now.
    delete $_->{gnats_id} foreach @{ $self->users };

    # Grab a version out of a bug for each product, so that there is a
    # valid "version" argument for Bugzilla::Product->create.
    foreach my $product (@{ $self->products }) {
        my $bug = first { $_->{product} eq $product->{name} and $_->{version} }
                        @{ $self->bugs };
        if (defined $bug) {
            $product->{version} = $bug->{version};
        }
        else {
            $product->{version} = 'unspecified';
        }
    }
}

#########
# Users #
#########

sub _read_users {
    my $self = shift;
    my $path = $self->config('gnats_path');
    my $file =  "$path/gnats-adm/responsible";
    $self->debug("Reading users from $file");
    my $default_domain = $self->config('default_email_domain');
    open(my $users_fh, '<', $file) || die "$file: $!";
    my @users;
    foreach my $line (<$users_fh>) {
        $line = trim($line);
        next if $line =~ /^#/;
        my ($id, $name, $email) = split(':', $line, 3);
        $email ||= "$id\@$default_domain";
        # We can't call our own translate_value, because that depends on
        # the existence of user_map, which doesn't exist until after
        # this method. However, we still want to translate any users found.
        $email = $self->SUPER::translate_value('user', $email);
        push(@users, { realname => $name, login_name => $email,
                       gnats_id => $id });
    }
    close($users_fh);
    return \@users;
}

sub user_map {
    my $self = shift;
    $self->{user_map} ||= { map { $_->{gnats_id} => $_->{login_name} }
                                @{ $self->users } };
    return $self->{user_map};
}

sub add_user {
    my ($self, $id, $email) = @_;
    return if defined $self->user_map->{$id};
    $self->user_map->{$id} = $email;
    push(@{ $self->users }, { login_name => $email, gnats_id => $id });
}

sub user_to_email {
    my ($self, $value) = @_;
    if (defined $self->user_map->{$value}) {
        $value = $self->user_map->{$value};
    }
    elsif ($value !~ /@/) {
        my $domain = $self->config('default_email_domain');
        $value = "$value\@$domain";
    }
    return $value;
}

############
# Products #
############

sub _read_products {
    my $self = shift;
    my $path = $self->config('gnats_path');
    my $file =  "$path/gnats-adm/categories";
    $self->debug("Reading categories from $file");

    open(my $categories_fh, '<', $file) || die "$file: $!";
    my @products;
    foreach my $line (<$categories_fh>) {
        $line = trim($line);
        next if $line =~ /^#/;
        my ($name, $description, $assigned_to, $cc) = split(':', $line, 4);
        my %product = ( name => $name, description => $description );

        my @initial_cc = split(',', $cc);
        @initial_cc = @{ $self->translate_value('user', \@initial_cc) };
        $assigned_to = $self->translate_value('user', $assigned_to);
        my %component = ( name         => $self->config('component_name'),
                          description  => $description,
                          initialowner => $assigned_to,
                          initial_cc   => \@initial_cc );
        $product{components} = [\%component];
        push(@products, \%product);
    }
    close($categories_fh);
    return \@products;
}

################
# Reading Bugs #
################

sub _read_bugs {
    my $self = shift;
    my $path = $self->config('gnats_path');
    my @directories = glob("$path/*");
    my @bugs;
    foreach my $directory (@directories) {
        next if !-d $directory;
        my $name = basename($directory);
        next if grep($_ eq $name, SKIP_DIRECTORIES);
        push(@bugs, @{ $self->_parse_project($directory) });
    }
    @bugs = sort { $a->{Number} <=> $b->{Number} } @bugs;
    return \@bugs;
}

sub _parse_project {
    my ($self, $directory) = @_;
    my @files = glob("$directory/*");

    $self->debug("Reading Project: $directory");
    # Sometimes other files get into gnats directories.
    @files = grep { basename($_) =~ /^\d+$/ } @files;
    my @bugs;
    my $count = 1;
    my $total = scalar @files;
    print basename($directory) . ":\n";
    foreach my $file (@files) {
        push(@bugs, $self->_parse_bug_file($file));
        if (!$self->verbose) {
            indicate_progress({ current => $count++, every => 5,
                                total => $total });
        }
    }
    return \@bugs;
}

sub _parse_bug_file {
    my ($self, $file) = @_;
    $self->debug("Reading $file");
    open(my $fh, "<", $file) || die "$file: $!";
    my $email = Email::Simple::FromHandle->new($fh);
    my $fields = $self->_get_gnats_field_data($email);
    # We parse attachments here instead of during translate_bug,
    # because otherwise we'd be taking up huge amounts of memory storing
    # all the raw attachment data in memory.
    $fields->{attachments} = $self->_parse_attachments($fields);
    close($fh);
    return $fields;
}

sub _get_gnats_field_data {
    my ($self, $email) = @_;
    my ($current_field, @value_lines, %fields);
    $email->reset_handle();
    my $handle = $email->handle;
    foreach my $line (<$handle>) {
        # If this line starts a field name
        if ($line =~ FIELD_REGEX) {
            my ($new_field, $rest_of_line) = ($1, $2);

            # If this is one of the last few PR fields, then make sure
            # that we're getting our fields in the right order.
            my $new_field_valid = 1;
            my $current_field_pos =
                lsearch(END_FIELD_ORDER, $current_field || '');
            if ($current_field_pos > -1) {
                my $new_field_pos = lsearch(END_FIELD_ORDER, $new_field);
                # We accept any field, as long as it's later than this one.
                $new_field_valid = $new_field_pos > $current_field_pos ? 1 : 0;
            }

            if ($new_field_valid) {
                if ($current_field) {
                    $fields{$current_field} = _handle_lines(\@value_lines);
                    @value_lines = ();
                }
                $current_field = $new_field;
                $line = $rest_of_line;
            }
        }
        push(@value_lines, $line) if defined $line;
    }
    $fields{$current_field} = _handle_lines(\@value_lines);
    $fields{cc} = [$email->header('Cc')] if $email->header('Cc');

    # If the Originator is invalid and we don't have a translation for it,
    # use the From header instead.
    my $originator = $self->translate_value('reporter', $fields{Originator},
                                            { check_only => 1 });
    if ($originator !~ Bugzilla->params->{emailregexp}) {
        # We use the raw header sometimes, because it looks like "From: user"
        # which Email::Address won't parse but we can still use.
        my $address = $email->header('From');
        my ($parsed) = Email::Address->parse($address);
        if ($parsed) {
            $address = $parsed->address;
        }
        if ($address) {
            $self->debug(
                "PR $fields{Number} had an Originator that was not a valid"
                . " user ($fields{Originator}). Using From ($address)"
                . " instead.\n");
            my $address_email = $self->translate_value('reporter', $address,
                                                       { check_only => 1 });
            if ($address_email !~ Bugzilla->params->{emailregexp}) {
                $self->debug(" From was also invalid, using default_originator.\n");
                $address = $self->config('default_originator');
            }
            $fields{Originator} = $address;
        }
    }

    $self->debug(\%fields, 3);
    return \%fields;
}

sub _handle_lines {
    my ($lines) = @_;
    my $value = join('', @$lines);
    $value =~ s/\s+$//;
    return $value;
}

####################
# Translating Bugs #
####################

sub translate_bug {
    my ($self, $fields) = @_;

    my ($bug, $other_fields) = $self->SUPER::translate_bug($fields);

    $bug->{attachments} = delete $other_fields->{attachments};

    if (defined $other_fields->{_add_to_comment}) {
        $bug->{comment} .= delete $other_fields->{_add_to_comment};
    }

    my ($changes, $extra_comment) =
        $self->_parse_audit_trail($bug, $other_fields->{'Audit-Trail'});

    my @comments;
    foreach my $change (@$changes) {
        if (exists $change->{comment}) {
            push(@comments, {
                thetext  => $change->{comment},
                who      => $change->{who},
                bug_when => $change->{bug_when} });
            delete $change->{comment};
        }
    }
    $bug->{history}  = $changes;

    if (trim($extra_comment)) {
        push(@comments, { thetext => $extra_comment, who => $bug->{reporter},
                          bug_when => $bug->{delta_ts} || $bug->{creation_ts} });
    }
    $bug->{comments} = \@comments;

    $bug->{component} = $self->config('component_name');
    if (!$bug->{short_desc}) {
        $bug->{short_desc} = NO_SUBJECT;
    }

    foreach my $attachment (@{ $bug->{attachments} || [] }) {
        $attachment->{submitter} = $bug->{reporter};
        $attachment->{creation_ts} = $bug->{creation_ts};
    }

    $self->debug($bug, 3);
    return $bug;
}

sub _parse_audit_trail {
    my ($self, $bug, $audit_trail) = @_;
    return [] if !trim($audit_trail);
    $self->debug(" Parsing audit trail...", 2);

    if ($audit_trail !~ /^\S+-Changed-\S+:/ms) {
        # This is just a comment from the bug's creator.
        $self->debug("  Audit trail is just a comment.", 2);
        return ([], $audit_trail);
    }

    my (@changes, %current_data, $current_column, $on_why);
    my $extra_comment = '';
    my $current_field;
    my @all_lines = split("\n", $audit_trail);
    foreach my $line (@all_lines) {
        # GNATS history looks like:
        # Status-Changed-From-To: open->closed
        # Status-Changed-By: jack
        # Status-Changed-When: Mon May 12 14:46:59 2003
        # Status-Changed-Why:
        #     This is some comment here about the change.
        if ($line =~ /^(\S+)-Changed-(\S+):(.*)/) {
            my ($field, $column, $value) = ($1, $2, $3);
            my $bz_field = $self->translate_field($field);
            # If it's not a field we're importing, we don't care about
            # its history.
            next if !$bz_field;
            # GNATS doesn't track values for description changes,
            # unfortunately, and that's the only information we'd be able to
            # use in Bugzilla for the audit trail on that field.
            next if $bz_field eq 'comment';
            $current_field = $bz_field if !$current_field;
            if ($bz_field ne $current_field) {
                $self->_store_audit_change(
                    \@changes, $current_field, \%current_data);
                %current_data = ();
                $current_field = $bz_field;
            }
            $value = trim($value);
            $self->debug("  $bz_field $column: $value", 3);
            if ($column eq 'From-To') {
                my ($from, $to) = split('->', $value, 2);
                # Sometimes there's just a - instead of a -> between the values.
                if (!defined($to)) {
                    ($from, $to) = split('-', $value, 2);
                }
                $current_data{added} = $to;
                $current_data{removed} = $from;
            }
            elsif ($column eq 'By') {
                my $email = $self->translate_value('user', $value);
                # Sometimes we hit users in the audit trail that we haven't
                # seen anywhere else.
                $current_data{who} = $email;
            }
            elsif ($column eq 'When') {
                $current_data{bug_when} = $self->parse_date($value);
            }
            if ($column eq 'Why') {
                $value = '' if !defined $value;
                $current_data{comment} = $value;
                $on_why = 1;
            }
            else {
                $on_why = 0;
            }
        }
        elsif ($on_why) {
            # "Why" lines are indented four characters.
            $line =~ s/^\s{4}//;
            $current_data{comment} .= "$line\n";
        }
        else {
            $self->debug(
                "Extra Audit-Trail line on $bug->{product} $bug->{bug_id}:"
                 . " $line\n", 2);
            $extra_comment .= "$line\n";
        }
    }
    $self->_store_audit_change(\@changes, $current_field, \%current_data);
    return (\@changes, $extra_comment);
}

sub _store_audit_change {
    my ($self, $changes, $old_field, $current_data) = @_;

    $current_data->{field} = $old_field;
    $current_data->{removed} =
        $self->translate_value($old_field, $current_data->{removed});
    $current_data->{added} =
        $self->translate_value($old_field, $current_data->{added});
    push(@$changes, { %$current_data });
}

sub _parse_attachments {
    my ($self, $fields) = @_;
    my $unformatted = delete $fields->{'Unformatted'};
    my $gnats_boundary = GNATS_BOUNDARY;
    # A sanity checker to make sure that we're parsing attachments right.
    my $num_attachments = 0;
    $num_attachments++ while ($unformatted =~ /\Q$gnats_boundary\E/g);
    # Sometimes there's a GNATS_BOUNDARY that is on the same line as other data.
    $unformatted =~ s/(\S\s*)\Q$gnats_boundary\E$/$1\n$gnats_boundary/mg;
    # Often the "Unformatted" section starts with stuff before
    # ----gnatsweb-attachment---- that isn't necessary.
    $unformatted =~ s/^\s*From:.+?Reply-to:[^\n]+//s;
    $unformatted = trim($unformatted);
    return [] if !$unformatted;
    $self->debug('Reading attachments...', 2);
    my $boundary = generate_random_password(48);
    $unformatted =~ s/\Q$gnats_boundary\E/--$boundary/g;
    # Sometimes the whole Unformatted section is indented by exactly
    # one space, and needs to be fixed.
    if ($unformatted =~ /--\Q$boundary\E\n /) {
        $unformatted =~ s/^ //mg;
    }
    $unformatted = <<END;
From: nobody
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="$boundary"

This is a multi-part message in MIME format.
--$boundary
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 7bit

$unformatted
--$boundary--
END
    my $email = new Email::MIME(\$unformatted);
    my @parts = $email->parts;
    # Remove the fake body.
    my $part1 = shift @parts;
    if ($part1->body) {
        $self->debug(" Additional Unformatted data found on "
                     . $fields->{Category} . " bug " . $fields->{Number});
        $self->debug($part1->body, 3);
        $fields->{_add_comment} .= "\n\nUnformatted:\n" . $part1->body;
    }

    my @attachments;
    foreach my $part (@parts) {
        $self->debug('  Parsing attachment: ' . $part->filename);
        my $temp_fh = IO::File->new_tmpfile or die ("Can't create tempfile: $!");
        $temp_fh->binmode;
        print $temp_fh $part->body;
        my $content_type = $part->content_type;
        $content_type =~ s/; name=.+$//;
        my $attachment = { filename    => $part->filename,
                           description => $part->filename,
                           mimetype    => $content_type,
                           data        => $temp_fh };
        $self->debug($attachment, 3);
        push(@attachments, $attachment);
    }

    if (scalar(@attachments) ne $num_attachments) {
        warn "WARNING: Expected $num_attachments attachments but got "
             . scalar(@attachments) . "\n" ;
        $self->debug($unformatted, 3);
    }
    return \@attachments;
}

sub translate_value {
    my $self = shift;
    my ($field, $value, $options) = @_;
    my $original_value = $value;
    $options ||= {};

    if (!ref($value) and grep($_ eq $field, $self->USER_FIELDS)) {
        if ($value =~ /(\S+\@\S+)/) {
            $value = $1;
            $value =~ s/^<//;
            $value =~ s/>$//;
        }
        else {
            # Sometimes names have extra stuff on the end like "(Somebody's Name)"
            $value =~ s/\s+\(.+\)$//;
            # Sometimes user fields look like "(user)" instead of just "user".
            $value =~ s/^\((.+)\)$/$1/;
            $value = trim($value);
        }
    }

    if ($field eq 'version' and $value ne '') {
        my $version_re = $self->config('version_regex');
        if ($version_re and $value =~ $version_re) {
            $value = $1;
        }
        # In the GNATS that I tested this with, there were many extremely long
        # values for "version" that caused some import problems (they were
        # longer than the max allowed version value). So if the version value
        # is longer than 32 characters, pull out the first thing that looks
        # like a version number.
        elsif (length($value) > LONG_VERSION_LENGTH) {
            $value =~ s/^.+?\b(\d[\w\.]+)\b.+$/$1/;
        }
    }

    my @args = @_;
    $args[1] = $value;

    $value = $self->SUPER::translate_value(@args);
    return $value if ref $value;

    if (grep($_ eq $field, $self->USER_FIELDS)) {
        my $from_value = $value;
        $value = $self->user_to_email($value);
        $args[1] = $value;
        # If we got something new from user_to_email, do any necessary
        # translation of it.
        $value = $self->SUPER::translate_value(@args);
        if (!$options->{check_only}) {
            $self->add_user($from_value, $value);
        }
    }

    return $value;
}

1;