diff options
Diffstat (limited to 'lib/storage.rb')
-rw-r--r-- | lib/storage.rb | 208 |
1 files changed, 208 insertions, 0 deletions
# Source: lib/storage.rb (new file, blob 9045a0b)
require 'elasticsearch'
require 'date'

# Elasticsearch-backed storage for mailing-list messages.
#
# Each list is kept in its own index named "ml-<list>", and each message is a
# document of type "message". Every method talks to the cluster through the
# global client `$es`, which is not set up here and must be assigned by the
# caller before any of these functions is used.
module Ag::Storage
  # Default upper bound on the number of hits fix_threading requests per call.
  THREADING_BATCH_SIZE = 100000

  module_function

  # Builds the name of the index that holds the given list.
  #
  # @param list [String] list name
  # @return [String] "ml-" followed by the list name
  def index_name(list)
    'ml-' + list
  end
  private_class_method :index_name

  # Drops the list's index (if there is one) and creates it again with the
  # mapping for "message" documents. Any data in an existing index is lost.
  #
  # @param list [String] list name
  # @return the response of the index-creation call
  def create_index(list)
    begin
      $es.indices.delete(index: index_name(list))
    rescue Elasticsearch::Transport::Transport::Errors::NotFound
      # A missing index is the expected state on first run; just report it.
      $stderr.puts "Index did not exist yet. Creating."
    end

    $es.indices.create(
      index: index_name(list),
      body: {
        mappings: {
          message: {
            properties: {
              attachments: {
                properties: {
                  filename: { type: 'string', index: 'not_analyzed' },
                  mime: { type: 'string', index: 'not_analyzed' }
                }
              },
              cc: { type: 'string' },
              content: { type: 'string' },
              date: { type: 'date', format: 'dateOptionalTime' },
              from: { type: 'string' },
              from_realname: { type: 'string' },
              month: { type: 'integer' },
              parent: { type: 'string', index: 'not_analyzed' },
              raw_message_id: { type: 'string', index: 'not_analyzed' },
              raw_parent: { type: 'string' },
              subject: { type: 'string' },
              to: { type: 'string' }
            }
          }
        }
      }
    )
  end

  # Renders a message body through Ag::Rendering::HTMLizer and
  # Ag::Utils.fix_encoding, then strips surrounding whitespace.
  #
  # Never raises for rendering problems: any StandardError is reported on
  # stderr and the placeholder text is returned instead. An empty result is
  # reported on stderr but still returned.
  #
  # @param message the message to render; must respond to #message_id
  # @return [String] the rendered content, or "Cannot parse MIME/contents."
  def get_content(message)
    content = "Cannot parse MIME/contents."

    begin
      raw_content = Ag::Rendering::HTMLizer.HTMLize(message)
      content = Ag::Utils.fix_encoding(raw_content || '').strip

      $stderr.puts "#{message.message_id}: Content empty" if content.empty?
    rescue => e
      # Every failure is reported under the same label, so include the
      # exception class to tell real encoding errors from other problems.
      $stderr.puts "#{message.message_id}: Invalid encoding (#{e.class})"
    end

    content
  end

  # Looks up the document id of the message whose raw_message_id equals the
  # given Message-ID.
  #
  # @param list [String] list name
  # @param parent_message_id [String, nil] raw Message-ID to search for
  # @return [String, nil] the `_id` of the first hit, or nil when no id was
  #   given or nothing matched
  def get_parent_message(list, parent_message_id = nil)
    return nil if parent_message_id.nil?

    result = $es.search(
      index: index_name(list),
      body: {
        query: {
          filtered: {
            filter: {
              term: { raw_message_id: parent_message_id }
            }
          }
        },
        fields: ['_id']
      }
    )

    # Guard on the hit itself rather than on the reported total, so an empty
    # hit list can never be dereferenced.
    first_hit = result['hits']['hits'].first
    first_hit && first_hit['_id']
  end

  # Joins the formatted addresses of one address header into a single string.
  #
  # @param message the message to read from
  # @param header [Symbol] header name, e.g. :to or :cc
  # @return [String] comma-separated addresses, or '' when the header is absent
  def joined_addresses(message, header)
    field = message[header]
    return '' unless field

    Ag::Utils.fix_encoding(field.formatted.join(','))
  end
  private_class_method :joined_addresses

  # Describes each attachment of a message by file name and MIME type.
  #
  # @param message the message to inspect
  # @return [Array<Hash>] one { filename:, mime: } hash per attachment
  def attachment_summaries(message)
    return [] unless message.has_attachments?

    message.attachments.map do |attachment|
      { filename: attachment.filename, mime: attachment.mime_type }
    end
  end
  private_class_method :attachment_summaries

  # Indexes one message into the list's index. The value of the
  # X-Archives-Hash header is used as the document id, so storing a message
  # with the same hash again addresses the same document.
  #
  # The `parent` field is not written here; only `raw_parent` is (see
  # fix_threading, which fills in `parent` afterwards).
  #
  # NOTE(review): message[:from], message['X-Archives-Hash'] and message.date
  # are dereferenced without a nil check, so a message lacking any of them
  # raises NoMethodError — confirm callers expect that.
  #
  # @param list [String] list name
  # @param message the parsed message to store
  # @return the response of the index call
  def store(list, message)
    content = get_content(message)

    identifier = message['X-Archives-Hash'].value
    raw_parent = Ag::Threading.get_parent_message_id(message)

    from = Ag::Utils.fix_encoding(message[:from].formatted.first)
    # Drop the "<address>" part to keep only the display name.
    from_realname = from.gsub(/<(.*)>/, '').strip

    to = joined_addresses(message, :to)
    cc = joined_addresses(message, :cc)
    subject = Ag::Utils.fix_encoding(message.subject)
    attachments = attachment_summaries(message)
    date = message.date

    $es.index(
      index: index_name(list),
      type: 'message',
      id: identifier,
      body: {
        raw_message_id: message.message_id,
        subject: subject,
        to: to,
        cc: cc,
        from: from,
        from_realname: from_realname,
        date: date,
        # Year followed by zero-padded month (e.g. 201405): a sortable number.
        month: format('%i%02i', date.year, date.month).to_i,
        content: content,
        attachments: attachments,
        raw_parent: raw_parent
      }
    )
  end

  # Resolves thread parents for messages that have a `raw_parent` but no
  # `parent` yet: for each such message the parent is looked up by its raw
  # Message-ID and, when found, its document id is written to `parent`.
  # Messages whose parent is not (yet) indexed are left untouched.
  #
  # @param list [String] list name
  # @param batch_size [Integer] maximum number of candidate messages to fetch
  # @return the total number of candidate messages reported by the search,
  #   as counted before any update was made
  def fix_threading(list, batch_size = THREADING_BATCH_SIZE)
    result = $es.search(
      index: index_name(list),
      size: batch_size,
      body: {
        size: batch_size,
        query: {
          filtered: {
            filter: {
              and: [
                { missing: { field: 'parent' } },
                { exists: { field: 'raw_parent' } }
              ]
            }
          }
        }
      }
    )

    result['hits']['hits'].each do |hit|
      parent_id = get_parent_message(list, hit['_source']['raw_parent'])
      next if parent_id.nil?

      $es.update(
        index: index_name(list),
        type: 'message',
        id: hit['_id'],
        body: {
          doc: {
            parent: parent_id
          }
        }
      )
    end

    result['hits']['total']
  end
end
\ No newline at end of file |