From 3d218e0aa5a3e7d0f42e28d6d35b50b5a001e167 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Wed, 5 Oct 2016 01:46:35 +0300 Subject: [PATCH] use emailjs MailParser instead of Imap.parseHeader, simplify DB structure --- Syncer.js | 91 +++++++++++++++++++++++----------------------------- SyncerWeb.js | 46 ++++++++++++++------------ db.sql | 60 +++++++++++++--------------------- operetta.js | 2 +- 4 files changed, 90 insertions(+), 109 deletions(-) diff --git a/Syncer.js b/Syncer.js index 19f6d6b..3437aa2 100644 --- a/Syncer.js +++ b/Syncer.js @@ -2,6 +2,7 @@ const gen = require('gen-thread'); const Imap = require('imap'); const ImapManager = require('./ImapManager.js'); const EventEmitter = require('events').EventEmitter; +const MailParser = require('mailparser').MailParser; module.exports = Syncer; @@ -196,8 +197,6 @@ Syncer.prototype.syncBox = function*(srv, accountId, boxName, boxKind, doFull) account_id: accountId, highestmodseq: 0, kind: boxKind||'' - //unread_count: boxStatus.messages.new, - //total_count: boxStatus.messages.total, }).returning('id').row(gen.ef()); } @@ -205,7 +204,7 @@ Syncer.prototype.syncBox = function*(srv, accountId, boxName, boxKind, doFull) var missing = []; var [ maxUid ] = yield this.pg.select('MAX(uid)').from('messages') .where({ folder_id: boxRow.id }).val(gen.ef()); - if (boxStatus.highestmodseq) + if (boxRow.highestmodseq) { this.events.emit('sync', { state: 'start', quick: true, email: this.accounts[accountId].email, folder: boxRow.name }); process.stderr.write(this.accounts[accountId].email+'/'+boxRow.name+': quick resync\n'); @@ -227,8 +226,8 @@ Syncer.prototype.syncBox = function*(srv, accountId, boxName, boxKind, doFull) }, (messages, state) => this.saveMessages(messages, boxRow.id, state)); yield this.pg.update('folders', { - uidvalidity: boxRow.uidvalidity, - highestmodseq: boxRow.highestmodseq||0 + uidvalidity: boxStatus.uidvalidity, + highestmodseq: boxStatus.highestmodseq||0 }).where({ id: boxRow.id }).run(gen.ef()); } @@ -375,6 +374,15 @@ Syncer.prototype.saveMessages = function*(messages, boxId) yield* this.addMessage(boxId, messages[i][0], messages[i][1]); } +Syncer.prototype.parseMsg = function*(msg) +{ + var parser = new MailParser({ streamAttachments: false, defaultCharset: 'utf-8' }); + parser.once('end', gen.cb()); + parser.write(msg); + var [ obj ] = yield parser.end(); + return obj; +} + Syncer.prototype.addMessage = function*(boxId, msgrow, attrs) { var self = this; @@ -383,49 +391,42 @@ Syncer.prototype.addMessage = function*(boxId, msgrow, attrs) { [ pgtx, end_transaction ] = yield this.pg.transaction(gen.cb(), function(e) { if (e) throw e; }); - var header = Imap.parseHeader(msgrow.headers); - for (var i in header) - for (var k = 0; k < header[i].length; k++) - header[i][k] = header[i][k].replace(/\x00/g, ''); - header.from = header.from && splitEmails(header.from[0])[0]; - header.replyto = header['reply-to'] && splitEmails(header['reply-to'][0])[0]; - var re = /(<[^>]*>)/; - header.references = (header.references && header.references[0] || '').split(re).filter(a => a.match(re)); + let header = yield* this.parseMsg(msgrow.headers); + header.references = header.references || []; if (header.references.length) { - if (header.references.length > 10) - header.references = [ header.references[0] ].concat(header.references.slice(header.references.length-9)); - if (!header['in-reply-to'] || !header['in-reply-to'][0]) - header['in-reply-to'] = [ header.references[header.references.length-1] ]; - else if (header.references[header.references.length-1] != header['in-reply-to'][0]) - header.references.push(header['in-reply-to'][0]); - } - if (header.date) - { - var t = Date.parse(header.date[0]); - if (!isNaN(t)) - header.date = new Date(t); - else - header.date = null; + if (!header.inReplyTo || !header.inReplyTo[0]) + header.inReplyTo = [ header.references[header.references.length-1] ]; + else if (header.references[header.references.length-1] != header.inReplyTo[0]) + header.references.push(header.inReplyTo[0]); } if (!header.date) header.date = new Date(attrs.date); + if (!header.from) + { + console.log(msgrow.headers); + console.log(header); + } + delete msgrow.headers; msgrow.folder_id = boxId; - msgrow.from_email = header.from && header.from.email || ''; - msgrow.from_name = header.from && header.from.name || ''; - msgrow.replyto_email = header.replyto && header.replyto.email || ''; - msgrow.replyto_name = header.replyto && header.replyto.name || ''; - msgrow.to_list = header.to && header.to[0] || ''; - msgrow.cc_list = header.cc && header.cc[0] || ''; - msgrow.bcc_list = header.bcc && header.bcc[0] || ''; - msgrow.subject = header.subject && header.subject[0] || ''; - msgrow.messageid = header['message-id'] && header['message-id'][0] || ''; - msgrow.inreplyto = header['in-reply-to'] && header['in-reply-to'][0] || ''; - msgrow.inreplyto = msgrow.inreplyto.replace(/^[\s\S]*(<[^>]*>)[\s\S]*$/, '$1'); + msgrow.subject = header.subject || ''; + msgrow.props = JSON.stringify({ + from: ((header.from||[]).map((a) => [ a.name, a.address ]))[0], + to: (header.to||[]).map((a) => [ a.name, a.address ]), + cc: (header.cc||[]).map((a) => [ a.name, a.address ]), + bcc: (header.bcc||[]).map((a) => [ a.name, a.address ]), + replyto: (header.replyTo||[]).map((a) => [ a.name, a.address ]), + }); + msgrow.messageid = header.messageId || ''; + msgrow.inreplyto = header.inReplyTo && header.inReplyTo[0] || ''; msgrow.time = header.date; + msgrow.size = attrs.size; msgrow.flags = toPgArray(msgrow.flags); msgrow.refs = toPgArray(header.references); + for (let i in msgrow) + if (typeof msgrow[i] == 'string') + msgrow[i] = msgrow[i].replace(/\x00/g, ''); var thisIsFirst = false; if (header.references.length) @@ -441,7 +442,7 @@ Syncer.prototype.addMessage = function*(boxId, msgrow, attrs) } msgrow.thread_id = threadId; } - console.log(msgrow.time+' '+msgrow.from_email+' '+msgrow.subject); + console.log(msgrow.time+' '+(header.from && header.from[0] && header.from[0].address || '?')+' '+msgrow.subject); [ msgrow.id ] = yield pgtx.insert('messages', msgrow).returning('id').val(gen.ef()); if (!msgrow.thread_id) { @@ -469,18 +470,6 @@ Syncer.prototype.addMessage = function*(boxId, msgrow, attrs) } } -function splitEmails(s) -{ - var re = /^[\s,]*(?:(?:["'](.*?)["']|([^<]+))\s*<([^>]+)>|]+)>?)/; // ' - var m, r = []; - while (m = re.exec(s)) - { - s = s.substr(m[0].length); - r.push({ name: (m[1]||m[2]||'').trim(), email: (m[3]||m[4]||'').trim() }); - } - return r; -} - function toPgArray(a) { a = JSON.stringify(a); diff --git a/SyncerWeb.js b/SyncerWeb.js index 6c261f8..84483ed 100644 --- a/SyncerWeb.js +++ b/SyncerWeb.js @@ -175,6 +175,31 @@ function rewriteCss(ast) } } +function sanitizeHtml(html) +{ + let styles = ''; + html = (html||'').replace(/]*>([\s\S]*?)<\/style\s*>/ig, function(m, m1) + { + styles += m1+'\n'; + return ''; + }); + html = html.replace(/^[\s\S]*?]*>([\s\S]*)<\/body>[\s\S]*$/i, '$1'); + html = html.replace(/^[\s\S]*?]*>([\s\S]*)<\/html>[\s\S]*$/i, '$1'); + if (styles) + { + html = '\n'+html; + styles = ''; + } + html = htmlawed.sanitize(html||'', { safe: 1, elements: '* +style', keep_bad: 0, comment: 1 }); + html = html.replace(/]*>([\s\S]*)<\/style\s*>/ig, function(m, m1) + { + var ast = css.parse(m1, { silent: true }); + rewriteCss(ast); + return ''; + }); + return html; +} + function* getBody(pg, messages, boxId) { var p = new MailParser({ streamAttachments: false, defaultCharset: 'windows-1251' }); @@ -184,26 +209,7 @@ function* getBody(pg, messages, boxId) p.on('end', gen.cb()); p.write(msg[0].headers); let [ obj ] = yield p.end(); - let styles = ''; - obj.html = (obj.html||'').replace(/]*>([\s\S]*?)<\/style\s*>/ig, function(m, m1) - { - styles += m1+'\n'; - return ''; - }); - obj.html = obj.html.replace(/^[\s\S]*?]*>([\s\S]*)<\/body>[\s\S]*$/i, '$1'); - obj.html = obj.html.replace(/^[\s\S]*?]*>([\s\S]*)<\/html>[\s\S]*$/i, '$1'); - if (styles) - { - obj.html = '\n'+obj.html; - styles = ''; - } - obj.html = htmlawed.sanitize(obj.html||'', { safe: 1, elements: '* +style', keep_bad: 0, comment: 1 }); - obj.html = obj.html.replace(/]*>([\s\S]*)<\/style\s*>/ig, function(m, m1) - { - var ast = css.parse(m1, { silent: true }); - rewriteCss(ast); - return ''; - }); + obj.html = sanitizeHtml(obj.html); let upd = { body_text: obj.text||'', body_html: obj.html }; upd.body_html_text = obj.html.replace(/]*>.*<\/style\s*>|<\/?[^>]*>/g, ''); yield pg.update('messages m', upd).where({ folder_id: boxId, uid: msg[0].uid }).run(gen.ef()); diff --git a/db.sql b/db.sql index b8a1da5..3e57fcc 100644 --- a/db.sql +++ b/db.sql @@ -18,7 +18,6 @@ create table folders ( uidvalidity int not null, account_id int not null, name varchar(255) not null, - unread_count int not null, highestmodseq int not null default 0, kind varchar(255) not null, foreign key (account_id) references accounts (id) on delete cascade on update cascade @@ -34,20 +33,12 @@ create table messages ( inreplyto varchar(1000) not null, refs varchar(1000)[] not null, subject text not null, - from_email varchar(255) not null, - from_name varchar(255) not null, - replyto_email varchar(255) not null, - replyto_name varchar(255) not null, - to_list text not null, - cc_list text not null, - bcc_list text not null, - headers text not null, - body_html text not null, - body_text text not null, - body_html_text text not null, - text_index tsvector not null, + props jsonb not null, + body_html text not null default '', + body_text text not null default '', + body_html_text text not null default '', time timestamptz not null, - size unsigned not null, + size int not null, flags varchar(255)[] not null, foreign key (folder_id) references folders (id) on delete cascade on update cascade ); @@ -56,29 +47,23 @@ create index messages_flags on messages using gin (folder_id, flags); create index messages_messageid on messages (messageid); create index messages_refs on messages using gin (refs); create index messages_time on messages (folder_id, time); -create index messages_text on messages using gin (text_index); -create or replace function fn_messages_text_index() returns trigger -security definer language plpgsql as $$ +create or replace function messages_fulltext(msg messages) returns tsvector +language plpgsql immutable as $$ begin - NEW.text_index = ( - setweight(to_tsvector('russian', regexp_replace(NEW.from_name || ' ' || NEW.from_email || ' ' || - NEW.replyto_name || ' ' || NEW.replyto_email || ' ' || - NEW.to_list || ' ' || NEW.cc_list || ' ' || NEW.bcc_list || ' ' || NEW.subject, '\W+', ' ', 'g')), 'A') || - setweight(to_tsvector('russian', NEW.body_html_text || ' ' || NEW.body_text), 'B') - ); - return NEW; + return setweight(to_tsvector('russian', regexp_replace( + coalesce(msg.props->>'from', '') || ' ' || + coalesce(msg.props->>'replyto', '') || ' ' || + coalesce(msg.props->>'to', '') || ' ' || + coalesce(msg.props->>'cc', '') || ' ' || + coalesce(msg.props->>'bcc', '') || ' ' || + coalesce(msg.props->>'attachments', '') || ' ' || + msg.subject, + '\W+', ' ', 'g' + )), 'A') + || setweight(to_tsvector('russian', msg.body_html_text || ' ' || msg.body_text), 'B'); end $$; -create trigger messages_text_index before insert or update on messages -for each row execute procedure fn_messages_text_index(); - -create table attachments ( - id serial not null primary key, - msg_id int not null, - ctype varchar(255) not null, - size unsigned not null, - foreign key (msg_id) references messages (id) on delete cascade on update cascade -); +create index messages_text on messages using gin (messages_fulltext(messages)); create table threads ( id serial not null primary key, @@ -90,6 +75,7 @@ create index threads_first_msg on threads (first_msg); alter table messages add foreign key (thread_id) references threads (id) on delete restrict on update cascade; ---create table tt as with recursive t (id, messageid, upperid, uppermsg) as (select (array_agg(m1.id))[0], m1.messageid, (array_agg(m1.id))[1], m1.messageid from messages m1 left join messages m2 on m1.messageid!='' and m1.inreplyto!='' and m2.messageid=m1.inreplyto where m2.id is null group by m1.messageid union select m1.id, m1.messageid, t.upperid, t.uppermsg from messages m1 inner join t on m1.inreplyto!='' and m1.inreplyto=t.messageid where m1.messageid!='') select * from t; - ---alter table messages alter flags type varchar(255)[] using (case when flags&1=1 then array['recent'] else array[]::varchar(255)[] end) || (case when flags&2=2 then array['flagged'] else array[]::varchar(255)[] end) || (case when flags&4=4 then array['answered'] else array[]::varchar(255)[] end) || (case when flags&8=8 then array['unread'] else array[]::varchar(255)[] end); +alter table accounts owner to operetta; +alter table folders owner to operetta; +alter table messages owner to operetta; +alter table threads owner to operetta; diff --git a/operetta.js b/operetta.js index 5a946e8..954a3df 100644 --- a/operetta.js +++ b/operetta.js @@ -42,7 +42,7 @@ var syncerweb = new SyncerWeb(syncer, pg, cfg); gen.run(function*() { yield* syncer.init(cfg); - //yield* syncer.syncAll(); + yield* syncer.syncAll(); }); syncerweb.listen(8057);