use emailjs MailParser instead of Imap.parseHeader, simplify DB structure

master
Vitaliy Filippov 2016-10-05 01:46:35 +03:00
parent 8e7aa3d83d
commit 3d218e0aa5
4 changed files with 90 additions and 109 deletions

View File

@ -2,6 +2,7 @@ const gen = require('gen-thread');
const Imap = require('imap'); const Imap = require('imap');
const ImapManager = require('./ImapManager.js'); const ImapManager = require('./ImapManager.js');
const EventEmitter = require('events').EventEmitter; const EventEmitter = require('events').EventEmitter;
const MailParser = require('mailparser').MailParser;
module.exports = Syncer; module.exports = Syncer;
@ -196,8 +197,6 @@ Syncer.prototype.syncBox = function*(srv, accountId, boxName, boxKind, doFull)
account_id: accountId, account_id: accountId,
highestmodseq: 0, highestmodseq: 0,
kind: boxKind||'' kind: boxKind||''
//unread_count: boxStatus.messages.new,
//total_count: boxStatus.messages.total,
}).returning('id').row(gen.ef()); }).returning('id').row(gen.ef());
} }
@ -205,7 +204,7 @@ Syncer.prototype.syncBox = function*(srv, accountId, boxName, boxKind, doFull)
var missing = []; var missing = [];
var [ maxUid ] = yield this.pg.select('MAX(uid)').from('messages') var [ maxUid ] = yield this.pg.select('MAX(uid)').from('messages')
.where({ folder_id: boxRow.id }).val(gen.ef()); .where({ folder_id: boxRow.id }).val(gen.ef());
if (boxStatus.highestmodseq) if (boxRow.highestmodseq)
{ {
this.events.emit('sync', { state: 'start', quick: true, email: this.accounts[accountId].email, folder: boxRow.name }); this.events.emit('sync', { state: 'start', quick: true, email: this.accounts[accountId].email, folder: boxRow.name });
process.stderr.write(this.accounts[accountId].email+'/'+boxRow.name+': quick resync\n'); process.stderr.write(this.accounts[accountId].email+'/'+boxRow.name+': quick resync\n');
@ -227,8 +226,8 @@ Syncer.prototype.syncBox = function*(srv, accountId, boxName, boxKind, doFull)
}, (messages, state) => this.saveMessages(messages, boxRow.id, state)); }, (messages, state) => this.saveMessages(messages, boxRow.id, state));
yield this.pg.update('folders', { yield this.pg.update('folders', {
uidvalidity: boxRow.uidvalidity, uidvalidity: boxStatus.uidvalidity,
highestmodseq: boxRow.highestmodseq||0 highestmodseq: boxStatus.highestmodseq||0
}).where({ id: boxRow.id }).run(gen.ef()); }).where({ id: boxRow.id }).run(gen.ef());
} }
@ -375,6 +374,15 @@ Syncer.prototype.saveMessages = function*(messages, boxId)
yield* this.addMessage(boxId, messages[i][0], messages[i][1]); yield* this.addMessage(boxId, messages[i][0], messages[i][1]);
} }
/**
 * Run a raw message (or just its header section) through MailParser.
 *
 * Generator meant to be driven with `yield*` from another gen-thread
 * generator.
 *
 * @param {string|Buffer} msg  raw message source handed to the parser
 *                             (exact type is whatever MailParser.write accepts)
 * @returns the first element of the value this generator is resumed with —
 *          presumably the parsed mail object passed to the 'end' listener
 *          (gen-thread callback convention; confirm against gen-thread docs)
 */
Syncer.prototype.parseMsg = function*(msg)
{
    const mailParser = new MailParser({ streamAttachments: false, defaultCharset: 'utf-8' });
    // The one-shot 'end' listener is registered before any data is written
    mailParser.once('end', gen.cb());
    mailParser.write(msg);
    const [ parsedMail ] = yield mailParser.end();
    return parsedMail;
}
Syncer.prototype.addMessage = function*(boxId, msgrow, attrs) Syncer.prototype.addMessage = function*(boxId, msgrow, attrs)
{ {
var self = this; var self = this;
@ -383,49 +391,42 @@ Syncer.prototype.addMessage = function*(boxId, msgrow, attrs)
{ {
[ pgtx, end_transaction ] = yield this.pg.transaction(gen.cb(), function(e) { if (e) throw e; }); [ pgtx, end_transaction ] = yield this.pg.transaction(gen.cb(), function(e) { if (e) throw e; });
var header = Imap.parseHeader(msgrow.headers); let header = yield* this.parseMsg(msgrow.headers);
for (var i in header) header.references = header.references || [];
for (var k = 0; k < header[i].length; k++)
header[i][k] = header[i][k].replace(/\x00/g, '');
header.from = header.from && splitEmails(header.from[0])[0];
header.replyto = header['reply-to'] && splitEmails(header['reply-to'][0])[0];
var re = /(<[^>]*>)/;
header.references = (header.references && header.references[0] || '').split(re).filter(a => a.match(re));
if (header.references.length) if (header.references.length)
{ {
if (header.references.length > 10) if (!header.inReplyTo || !header.inReplyTo[0])
header.references = [ header.references[0] ].concat(header.references.slice(header.references.length-9)); header.inReplyTo = [ header.references[header.references.length-1] ];
if (!header['in-reply-to'] || !header['in-reply-to'][0]) else if (header.references[header.references.length-1] != header.inReplyTo[0])
header['in-reply-to'] = [ header.references[header.references.length-1] ]; header.references.push(header.inReplyTo[0]);
else if (header.references[header.references.length-1] != header['in-reply-to'][0])
header.references.push(header['in-reply-to'][0]);
}
if (header.date)
{
var t = Date.parse(header.date[0]);
if (!isNaN(t))
header.date = new Date(t);
else
header.date = null;
} }
if (!header.date) if (!header.date)
header.date = new Date(attrs.date); header.date = new Date(attrs.date);
if (!header.from)
{
console.log(msgrow.headers);
console.log(header);
}
delete msgrow.headers;
msgrow.folder_id = boxId; msgrow.folder_id = boxId;
msgrow.from_email = header.from && header.from.email || ''; msgrow.subject = header.subject || '';
msgrow.from_name = header.from && header.from.name || ''; msgrow.props = JSON.stringify({
msgrow.replyto_email = header.replyto && header.replyto.email || ''; from: ((header.from||[]).map((a) => [ a.name, a.address ]))[0],
msgrow.replyto_name = header.replyto && header.replyto.name || ''; to: (header.to||[]).map((a) => [ a.name, a.address ]),
msgrow.to_list = header.to && header.to[0] || ''; cc: (header.cc||[]).map((a) => [ a.name, a.address ]),
msgrow.cc_list = header.cc && header.cc[0] || ''; bcc: (header.bcc||[]).map((a) => [ a.name, a.address ]),
msgrow.bcc_list = header.bcc && header.bcc[0] || ''; replyto: (header.replyTo||[]).map((a) => [ a.name, a.address ]),
msgrow.subject = header.subject && header.subject[0] || ''; });
msgrow.messageid = header['message-id'] && header['message-id'][0] || ''; msgrow.messageid = header.messageId || '';
msgrow.inreplyto = header['in-reply-to'] && header['in-reply-to'][0] || ''; msgrow.inreplyto = header.inReplyTo && header.inReplyTo[0] || '';
msgrow.inreplyto = msgrow.inreplyto.replace(/^[\s\S]*(<[^>]*>)[\s\S]*$/, '$1');
msgrow.time = header.date; msgrow.time = header.date;
msgrow.size = attrs.size;
msgrow.flags = toPgArray(msgrow.flags); msgrow.flags = toPgArray(msgrow.flags);
msgrow.refs = toPgArray(header.references); msgrow.refs = toPgArray(header.references);
for (let i in msgrow)
if (typeof msgrow[i] == 'string')
msgrow[i] = msgrow[i].replace(/\x00/g, '');
var thisIsFirst = false; var thisIsFirst = false;
if (header.references.length) if (header.references.length)
@ -441,7 +442,7 @@ Syncer.prototype.addMessage = function*(boxId, msgrow, attrs)
} }
msgrow.thread_id = threadId; msgrow.thread_id = threadId;
} }
console.log(msgrow.time+' '+msgrow.from_email+' '+msgrow.subject); console.log(msgrow.time+' '+(header.from && header.from[0] && header.from[0].address || '?')+' '+msgrow.subject);
[ msgrow.id ] = yield pgtx.insert('messages', msgrow).returning('id').val(gen.ef()); [ msgrow.id ] = yield pgtx.insert('messages', msgrow).returning('id').val(gen.ef());
if (!msgrow.thread_id) if (!msgrow.thread_id)
{ {
@ -469,18 +470,6 @@ Syncer.prototype.addMessage = function*(boxId, msgrow, attrs)
} }
} }
/**
 * Split an address-list header value into individual mailboxes.
 *
 * Recognised forms, separated by commas and/or whitespace:
 *   "Quoted Name" <user@host>
 *   Plain Name <user@host>
 *   <user@host>
 *   user@host
 *
 * @param {string} s  header value; null/undefined yields an empty list,
 *                    any other value is converted to a string
 * @returns {Array<{name: string, email: string}>} mailboxes in input order,
 *          with `name` set to '' when the mailbox has no display name
 *
 * Known limitation: an unquoted display name may itself contain commas, so a
 * bare address followed by a `Name <addr>` entry is still read as one mailbox
 * whose name contains the bare address.
 */
function splitEmails(s)
{
    if (s == null)
        return [];
    let rest = String(s);
    // Group 1: quoted name, group 2: unquoted name, group 3: address in <>,
    // group 4: bare address. The bare form stops at a comma so that
    // "a@x, b@y" is split into two mailboxes instead of one.
    const mailboxRe = /^[\s,]*(?:(?:["'](.*?)["']|([^<]+))\s*<([^>]+)>|<?([^<>,]+)>?)/;
    const result = [];
    let m;
    while ((m = mailboxRe.exec(rest)) !== null)
    {
        // Every successful match consumes at least one character, so this terminates
        rest = rest.slice(m[0].length);
        const name = (m[1]||m[2]||'').trim();
        const email = (m[3]||m[4]||'').trim();
        // Trailing separators/whitespace can match as an all-blank entry; drop it
        if (name || email)
            result.push({ name: name, email: email });
    }
    return result;
}
function toPgArray(a) function toPgArray(a)
{ {
a = JSON.stringify(a); a = JSON.stringify(a);

View File

@ -175,6 +175,31 @@ function rewriteCss(ast)
} }
} }
/**
 * Turn the HTML part of a message into a sanitized fragment.
 *
 * Steps:
 *  1. cut every <style> element out of the markup and collect its CSS text;
 *  2. keep only what is inside <body>…</body>, then inside <html>…</html>,
 *     when such wrappers are present;
 *  3. put the collected CSS back as a single leading <style> element;
 *  4. pass the result through htmlawed.sanitize();
 *  5. parse the CSS of each remaining <style> element, run rewriteCss() on
 *     the AST and write the stringified result back.
 *
 * @param {string} html  raw HTML; a falsy value is treated as ''
 * @returns {string} sanitized HTML
 */
function sanitizeHtml(html)
{
    let styles = '';
    html = (html||'').replace(/<style[^<>]*>([\s\S]*?)<\/style\s*>/ig, function(m, m1)
    {
        styles += m1+'\n';
        return '';
    });
    html = html.replace(/^[\s\S]*?<body[^<>]*>([\s\S]*)<\/body>[\s\S]*$/i, '$1');
    html = html.replace(/^[\s\S]*?<html[^<>]*>([\s\S]*)<\/html>[\s\S]*$/i, '$1');
    if (styles)
        html = '<style>\n'+styles+'</style>\n'+html;
    html = htmlawed.sanitize(html||'', { safe: 1, elements: '* +style', keep_bad: 0, comment: 1 });
    // Lazy match: handle each <style> element on its own. A greedy match would
    // span from the first <style> to the last </style> and hand any markup in
    // between to the CSS parser whenever more than one element is present.
    html = html.replace(/<style[^>]*>([\s\S]*?)<\/style\s*>/ig, function(m, m1)
    {
        // silent: true — parse errors are not thrown by css.parse
        const ast = css.parse(m1, { silent: true });
        rewriteCss(ast);
        return '<style>'+css.stringify(ast)+'</style>';
    });
    return html;
}
function* getBody(pg, messages, boxId) function* getBody(pg, messages, boxId)
{ {
var p = new MailParser({ streamAttachments: false, defaultCharset: 'windows-1251' }); var p = new MailParser({ streamAttachments: false, defaultCharset: 'windows-1251' });
@ -184,26 +209,7 @@ function* getBody(pg, messages, boxId)
p.on('end', gen.cb()); p.on('end', gen.cb());
p.write(msg[0].headers); p.write(msg[0].headers);
let [ obj ] = yield p.end(); let [ obj ] = yield p.end();
let styles = ''; obj.html = sanitizeHtml(obj.html);
obj.html = (obj.html||'').replace(/<style[^<>]*>([\s\S]*?)<\/style\s*>/ig, function(m, m1)
{
styles += m1+'\n';
return '';
});
obj.html = obj.html.replace(/^[\s\S]*?<body[^<>]*>([\s\S]*)<\/body>[\s\S]*$/i, '$1');
obj.html = obj.html.replace(/^[\s\S]*?<html[^<>]*>([\s\S]*)<\/html>[\s\S]*$/i, '$1');
if (styles)
{
obj.html = '<style>\n'+styles+'</style>\n'+obj.html;
styles = '';
}
obj.html = htmlawed.sanitize(obj.html||'', { safe: 1, elements: '* +style', keep_bad: 0, comment: 1 });
obj.html = obj.html.replace(/<style[^>]*>([\s\S]*)<\/style\s*>/ig, function(m, m1)
{
var ast = css.parse(m1, { silent: true });
rewriteCss(ast);
return '<style>'+css.stringify(ast)+'</style>';
});
let upd = { body_text: obj.text||'', body_html: obj.html }; let upd = { body_text: obj.text||'', body_html: obj.html };
upd.body_html_text = obj.html.replace(/<style[^>]*>.*<\/style\s*>|<\/?[^>]*>/g, ''); upd.body_html_text = obj.html.replace(/<style[^>]*>.*<\/style\s*>|<\/?[^>]*>/g, '');
yield pg.update('messages m', upd).where({ folder_id: boxId, uid: msg[0].uid }).run(gen.ef()); yield pg.update('messages m', upd).where({ folder_id: boxId, uid: msg[0].uid }).run(gen.ef());

60
db.sql
View File

@ -18,7 +18,6 @@ create table folders (
uidvalidity int not null, uidvalidity int not null,
account_id int not null, account_id int not null,
name varchar(255) not null, name varchar(255) not null,
unread_count int not null,
highestmodseq int not null default 0, highestmodseq int not null default 0,
kind varchar(255) not null, kind varchar(255) not null,
foreign key (account_id) references accounts (id) on delete cascade on update cascade foreign key (account_id) references accounts (id) on delete cascade on update cascade
@ -34,20 +33,12 @@ create table messages (
inreplyto varchar(1000) not null, inreplyto varchar(1000) not null,
refs varchar(1000)[] not null, refs varchar(1000)[] not null,
subject text not null, subject text not null,
from_email varchar(255) not null, props jsonb not null,
from_name varchar(255) not null, body_html text not null default '',
replyto_email varchar(255) not null, body_text text not null default '',
replyto_name varchar(255) not null, body_html_text text not null default '',
to_list text not null,
cc_list text not null,
bcc_list text not null,
headers text not null,
body_html text not null,
body_text text not null,
body_html_text text not null,
text_index tsvector not null,
time timestamptz not null, time timestamptz not null,
size unsigned not null, size int not null,
flags varchar(255)[] not null, flags varchar(255)[] not null,
foreign key (folder_id) references folders (id) on delete cascade on update cascade foreign key (folder_id) references folders (id) on delete cascade on update cascade
); );
@ -56,29 +47,23 @@ create index messages_flags on messages using gin (folder_id, flags);
create index messages_messageid on messages (messageid); create index messages_messageid on messages (messageid);
create index messages_refs on messages using gin (refs); create index messages_refs on messages using gin (refs);
create index messages_time on messages (folder_id, time); create index messages_time on messages (folder_id, time);
create index messages_text on messages using gin (text_index); create or replace function messages_fulltext(msg messages) returns tsvector
create or replace function fn_messages_text_index() returns trigger language plpgsql immutable as $$
security definer language plpgsql as $$
begin begin
NEW.text_index = ( return setweight(to_tsvector('russian', regexp_replace(
setweight(to_tsvector('russian', regexp_replace(NEW.from_name || ' ' || NEW.from_email || ' ' || coalesce(msg.props->>'from', '') || ' ' ||
NEW.replyto_name || ' ' || NEW.replyto_email || ' ' || coalesce(msg.props->>'replyto', '') || ' ' ||
NEW.to_list || ' ' || NEW.cc_list || ' ' || NEW.bcc_list || ' ' || NEW.subject, '\W+', ' ', 'g')), 'A') || coalesce(msg.props->>'to', '') || ' ' ||
setweight(to_tsvector('russian', NEW.body_html_text || ' ' || NEW.body_text), 'B') coalesce(msg.props->>'cc', '') || ' ' ||
); coalesce(msg.props->>'bcc', '') || ' ' ||
return NEW; coalesce(msg.props->>'attachments', '') || ' ' ||
msg.subject,
'\W+', ' ', 'g'
)), 'A')
|| setweight(to_tsvector('russian', msg.body_html_text || ' ' || msg.body_text), 'B');
end end
$$; $$;
create trigger messages_text_index before insert or update on messages create index messages_text on messages using gin (messages_fulltext(messages));
for each row execute procedure fn_messages_text_index();
-- One row per message attachment: its content type and size.
-- Rows are removed/updated together with the owning message (cascade on msg_id).
create table attachments (
id serial not null primary key,
msg_id int not null,
ctype varchar(255) not null,
-- NOTE(review): "unsigned" is not a built-in PostgreSQL type; presumably a
-- domain declared elsewhere in this schema — confirm, or use int.
size unsigned not null,
foreign key (msg_id) references messages (id) on delete cascade on update cascade
);
create table threads ( create table threads (
id serial not null primary key, id serial not null primary key,
@ -90,6 +75,7 @@ create index threads_first_msg on threads (first_msg);
alter table messages add foreign key (thread_id) references threads (id) on delete restrict on update cascade; alter table messages add foreign key (thread_id) references threads (id) on delete restrict on update cascade;
--create table tt as with recursive t (id, messageid, upperid, uppermsg) as (select (array_agg(m1.id))[0], m1.messageid, (array_agg(m1.id))[1], m1.messageid from messages m1 left join messages m2 on m1.messageid!='' and m1.inreplyto!='' and m2.messageid=m1.inreplyto where m2.id is null group by m1.messageid union select m1.id, m1.messageid, t.upperid, t.uppermsg from messages m1 inner join t on m1.inreplyto!='' and m1.inreplyto=t.messageid where m1.messageid!='') select * from t; alter table accounts owner to operetta;
alter table folders owner to operetta;
--alter table messages alter flags type varchar(255)[] using (case when flags&1=1 then array['recent'] else array[]::varchar(255)[] end) || (case when flags&2=2 then array['flagged'] else array[]::varchar(255)[] end) || (case when flags&4=4 then array['answered'] else array[]::varchar(255)[] end) || (case when flags&8=8 then array['unread'] else array[]::varchar(255)[] end); alter table messages owner to operetta;
alter table threads owner to operetta;

View File

@ -42,7 +42,7 @@ var syncerweb = new SyncerWeb(syncer, pg, cfg);
gen.run(function*() gen.run(function*()
{ {
yield* syncer.init(cfg); yield* syncer.init(cfg);
//yield* syncer.syncAll(); yield* syncer.syncAll();
}); });
syncerweb.listen(8057); syncerweb.listen(8057);