use emailjs MailParser instead of Imap.parseHeader, simplify DB structure

master
Vitaliy Filippov 2016-10-05 01:46:35 +03:00
parent 8e7aa3d83d
commit 3d218e0aa5
4 changed files with 90 additions and 109 deletions

View File

@ -2,6 +2,7 @@ const gen = require('gen-thread');
const Imap = require('imap');
const ImapManager = require('./ImapManager.js');
const EventEmitter = require('events').EventEmitter;
const MailParser = require('mailparser').MailParser;
module.exports = Syncer;
@ -196,8 +197,6 @@ Syncer.prototype.syncBox = function*(srv, accountId, boxName, boxKind, doFull)
account_id: accountId,
highestmodseq: 0,
kind: boxKind||''
//unread_count: boxStatus.messages.new,
//total_count: boxStatus.messages.total,
}).returning('id').row(gen.ef());
}
@ -205,7 +204,7 @@ Syncer.prototype.syncBox = function*(srv, accountId, boxName, boxKind, doFull)
var missing = [];
var [ maxUid ] = yield this.pg.select('MAX(uid)').from('messages')
.where({ folder_id: boxRow.id }).val(gen.ef());
if (boxStatus.highestmodseq)
if (boxRow.highestmodseq)
{
this.events.emit('sync', { state: 'start', quick: true, email: this.accounts[accountId].email, folder: boxRow.name });
process.stderr.write(this.accounts[accountId].email+'/'+boxRow.name+': quick resync\n');
@ -227,8 +226,8 @@ Syncer.prototype.syncBox = function*(srv, accountId, boxName, boxKind, doFull)
}, (messages, state) => this.saveMessages(messages, boxRow.id, state));
yield this.pg.update('folders', {
uidvalidity: boxRow.uidvalidity,
highestmodseq: boxRow.highestmodseq||0
uidvalidity: boxStatus.uidvalidity,
highestmodseq: boxStatus.highestmodseq||0
}).where({ id: boxRow.id }).run(gen.ef());
}
@ -375,6 +374,15 @@ Syncer.prototype.saveMessages = function*(messages, boxId)
yield* this.addMessage(boxId, messages[i][0], messages[i][1]);
}
Syncer.prototype.parseMsg = function*(msg)
{
var parser = new MailParser({ streamAttachments: false, defaultCharset: 'utf-8' });
parser.once('end', gen.cb());
parser.write(msg);
var [ obj ] = yield parser.end();
return obj;
}
Syncer.prototype.addMessage = function*(boxId, msgrow, attrs)
{
var self = this;
@ -383,49 +391,42 @@ Syncer.prototype.addMessage = function*(boxId, msgrow, attrs)
{
[ pgtx, end_transaction ] = yield this.pg.transaction(gen.cb(), function(e) { if (e) throw e; });
var header = Imap.parseHeader(msgrow.headers);
for (var i in header)
for (var k = 0; k < header[i].length; k++)
header[i][k] = header[i][k].replace(/\x00/g, '');
header.from = header.from && splitEmails(header.from[0])[0];
header.replyto = header['reply-to'] && splitEmails(header['reply-to'][0])[0];
var re = /(<[^>]*>)/;
header.references = (header.references && header.references[0] || '').split(re).filter(a => a.match(re));
let header = yield* this.parseMsg(msgrow.headers);
header.references = header.references || [];
if (header.references.length)
{
if (header.references.length > 10)
header.references = [ header.references[0] ].concat(header.references.slice(header.references.length-9));
if (!header['in-reply-to'] || !header['in-reply-to'][0])
header['in-reply-to'] = [ header.references[header.references.length-1] ];
else if (header.references[header.references.length-1] != header['in-reply-to'][0])
header.references.push(header['in-reply-to'][0]);
}
if (header.date)
{
var t = Date.parse(header.date[0]);
if (!isNaN(t))
header.date = new Date(t);
else
header.date = null;
if (!header.inReplyTo || !header.inReplyTo[0])
header.inReplyTo = [ header.references[header.references.length-1] ];
else if (header.references[header.references.length-1] != header.inReplyTo[0])
header.references.push(header.inReplyTo[0]);
}
if (!header.date)
header.date = new Date(attrs.date);
if (!header.from)
{
console.log(msgrow.headers);
console.log(header);
}
delete msgrow.headers;
msgrow.folder_id = boxId;
msgrow.from_email = header.from && header.from.email || '';
msgrow.from_name = header.from && header.from.name || '';
msgrow.replyto_email = header.replyto && header.replyto.email || '';
msgrow.replyto_name = header.replyto && header.replyto.name || '';
msgrow.to_list = header.to && header.to[0] || '';
msgrow.cc_list = header.cc && header.cc[0] || '';
msgrow.bcc_list = header.bcc && header.bcc[0] || '';
msgrow.subject = header.subject && header.subject[0] || '';
msgrow.messageid = header['message-id'] && header['message-id'][0] || '';
msgrow.inreplyto = header['in-reply-to'] && header['in-reply-to'][0] || '';
msgrow.inreplyto = msgrow.inreplyto.replace(/^[\s\S]*(<[^>]*>)[\s\S]*$/, '$1');
msgrow.subject = header.subject || '';
msgrow.props = JSON.stringify({
from: ((header.from||[]).map((a) => [ a.name, a.address ]))[0],
to: (header.to||[]).map((a) => [ a.name, a.address ]),
cc: (header.cc||[]).map((a) => [ a.name, a.address ]),
bcc: (header.bcc||[]).map((a) => [ a.name, a.address ]),
replyto: (header.replyTo||[]).map((a) => [ a.name, a.address ]),
});
msgrow.messageid = header.messageId || '';
msgrow.inreplyto = header.inReplyTo && header.inReplyTo[0] || '';
msgrow.time = header.date;
msgrow.size = attrs.size;
msgrow.flags = toPgArray(msgrow.flags);
msgrow.refs = toPgArray(header.references);
for (let i in msgrow)
if (typeof msgrow[i] == 'string')
msgrow[i] = msgrow[i].replace(/\x00/g, '');
var thisIsFirst = false;
if (header.references.length)
@ -441,7 +442,7 @@ Syncer.prototype.addMessage = function*(boxId, msgrow, attrs)
}
msgrow.thread_id = threadId;
}
console.log(msgrow.time+' '+msgrow.from_email+' '+msgrow.subject);
console.log(msgrow.time+' '+(header.from && header.from[0] && header.from[0].address || '?')+' '+msgrow.subject);
[ msgrow.id ] = yield pgtx.insert('messages', msgrow).returning('id').val(gen.ef());
if (!msgrow.thread_id)
{
@ -469,18 +470,6 @@ Syncer.prototype.addMessage = function*(boxId, msgrow, attrs)
}
}
function splitEmails(s)
{
var re = /^[\s,]*(?:(?:["'](.*?)["']|([^<]+))\s*<([^>]+)>|<?([^<>]+)>?)/; // '
var m, r = [];
while (m = re.exec(s))
{
s = s.substr(m[0].length);
r.push({ name: (m[1]||m[2]||'').trim(), email: (m[3]||m[4]||'').trim() });
}
return r;
}
function toPgArray(a)
{
a = JSON.stringify(a);

View File

@ -175,6 +175,31 @@ function rewriteCss(ast)
}
}
function sanitizeHtml(html)
{
let styles = '';
html = (html||'').replace(/<style[^<>]*>([\s\S]*?)<\/style\s*>/ig, function(m, m1)
{
styles += m1+'\n';
return '';
});
html = html.replace(/^[\s\S]*?<body[^<>]*>([\s\S]*)<\/body>[\s\S]*$/i, '$1');
html = html.replace(/^[\s\S]*?<html[^<>]*>([\s\S]*)<\/html>[\s\S]*$/i, '$1');
if (styles)
{
html = '<style>\n'+styles+'</style>\n'+html;
styles = '';
}
html = htmlawed.sanitize(html||'', { safe: 1, elements: '* +style', keep_bad: 0, comment: 1 });
html = html.replace(/<style[^>]*>([\s\S]*)<\/style\s*>/ig, function(m, m1)
{
var ast = css.parse(m1, { silent: true });
rewriteCss(ast);
return '<style>'+css.stringify(ast)+'</style>';
});
return html;
}
function* getBody(pg, messages, boxId)
{
var p = new MailParser({ streamAttachments: false, defaultCharset: 'windows-1251' });
@ -184,26 +209,7 @@ function* getBody(pg, messages, boxId)
p.on('end', gen.cb());
p.write(msg[0].headers);
let [ obj ] = yield p.end();
let styles = '';
obj.html = (obj.html||'').replace(/<style[^<>]*>([\s\S]*?)<\/style\s*>/ig, function(m, m1)
{
styles += m1+'\n';
return '';
});
obj.html = obj.html.replace(/^[\s\S]*?<body[^<>]*>([\s\S]*)<\/body>[\s\S]*$/i, '$1');
obj.html = obj.html.replace(/^[\s\S]*?<html[^<>]*>([\s\S]*)<\/html>[\s\S]*$/i, '$1');
if (styles)
{
obj.html = '<style>\n'+styles+'</style>\n'+obj.html;
styles = '';
}
obj.html = htmlawed.sanitize(obj.html||'', { safe: 1, elements: '* +style', keep_bad: 0, comment: 1 });
obj.html = obj.html.replace(/<style[^>]*>([\s\S]*)<\/style\s*>/ig, function(m, m1)
{
var ast = css.parse(m1, { silent: true });
rewriteCss(ast);
return '<style>'+css.stringify(ast)+'</style>';
});
obj.html = sanitizeHtml(obj.html);
let upd = { body_text: obj.text||'', body_html: obj.html };
upd.body_html_text = obj.html.replace(/<style[^>]*>.*<\/style\s*>|<\/?[^>]*>/g, '');
yield pg.update('messages m', upd).where({ folder_id: boxId, uid: msg[0].uid }).run(gen.ef());

60
db.sql
View File

@ -18,7 +18,6 @@ create table folders (
uidvalidity int not null,
account_id int not null,
name varchar(255) not null,
unread_count int not null,
highestmodseq int not null default 0,
kind varchar(255) not null,
foreign key (account_id) references accounts (id) on delete cascade on update cascade
@ -34,20 +33,12 @@ create table messages (
inreplyto varchar(1000) not null,
refs varchar(1000)[] not null,
subject text not null,
from_email varchar(255) not null,
from_name varchar(255) not null,
replyto_email varchar(255) not null,
replyto_name varchar(255) not null,
to_list text not null,
cc_list text not null,
bcc_list text not null,
headers text not null,
body_html text not null,
body_text text not null,
body_html_text text not null,
text_index tsvector not null,
props jsonb not null,
body_html text not null default '',
body_text text not null default '',
body_html_text text not null default '',
time timestamptz not null,
size unsigned not null,
size int not null,
flags varchar(255)[] not null,
foreign key (folder_id) references folders (id) on delete cascade on update cascade
);
@ -56,29 +47,23 @@ create index messages_flags on messages using gin (folder_id, flags);
create index messages_messageid on messages (messageid);
create index messages_refs on messages using gin (refs);
create index messages_time on messages (folder_id, time);
create index messages_text on messages using gin (text_index);
create or replace function fn_messages_text_index() returns trigger
security definer language plpgsql as $$
create or replace function messages_fulltext(msg messages) returns tsvector
language plpgsql immutable as $$
begin
NEW.text_index = (
setweight(to_tsvector('russian', regexp_replace(NEW.from_name || ' ' || NEW.from_email || ' ' ||
NEW.replyto_name || ' ' || NEW.replyto_email || ' ' ||
NEW.to_list || ' ' || NEW.cc_list || ' ' || NEW.bcc_list || ' ' || NEW.subject, '\W+', ' ', 'g')), 'A') ||
setweight(to_tsvector('russian', NEW.body_html_text || ' ' || NEW.body_text), 'B')
);
return NEW;
return setweight(to_tsvector('russian', regexp_replace(
coalesce(msg.props->>'from', '') || ' ' ||
coalesce(msg.props->>'replyto', '') || ' ' ||
coalesce(msg.props->>'to', '') || ' ' ||
coalesce(msg.props->>'cc', '') || ' ' ||
coalesce(msg.props->>'bcc', '') || ' ' ||
coalesce(msg.props->>'attachments', '') || ' ' ||
msg.subject,
'\W+', ' ', 'g'
)), 'A')
|| setweight(to_tsvector('russian', msg.body_html_text || ' ' || msg.body_text), 'B');
end
$$;
create trigger messages_text_index before insert or update on messages
for each row execute procedure fn_messages_text_index();
create table attachments (
id serial not null primary key,
msg_id int not null,
ctype varchar(255) not null,
size unsigned not null,
foreign key (msg_id) references messages (id) on delete cascade on update cascade
);
create index messages_text on messages using gin (messages_fulltext(messages));
create table threads (
id serial not null primary key,
@ -90,6 +75,7 @@ create index threads_first_msg on threads (first_msg);
alter table messages add foreign key (thread_id) references threads (id) on delete restrict on update cascade;
--create table tt as with recursive t (id, messageid, upperid, uppermsg) as (select (array_agg(m1.id))[0], m1.messageid, (array_agg(m1.id))[1], m1.messageid from messages m1 left join messages m2 on m1.messageid!='' and m1.inreplyto!='' and m2.messageid=m1.inreplyto where m2.id is null group by m1.messageid union select m1.id, m1.messageid, t.upperid, t.uppermsg from messages m1 inner join t on m1.inreplyto!='' and m1.inreplyto=t.messageid where m1.messageid!='') select * from t;
--alter table messages alter flags type varchar(255)[] using (case when flags&1=1 then array['recent'] else array[]::varchar(255)[] end) || (case when flags&2=2 then array['flagged'] else array[]::varchar(255)[] end) || (case when flags&4=4 then array['answered'] else array[]::varchar(255)[] end) || (case when flags&8=8 then array['unread'] else array[]::varchar(255)[] end);
alter table accounts owner to operetta;
alter table folders owner to operetta;
alter table messages owner to operetta;
alter table threads owner to operetta;

View File

@ -42,7 +42,7 @@ var syncerweb = new SyncerWeb(syncer, pg, cfg);
gen.run(function*()
{
yield* syncer.init(cfg);
//yield* syncer.syncAll();
yield* syncer.syncAll();
});
syncerweb.listen(8057);