aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'lib/storage.rb')
-rw-r--r--lib/storage.rb208
1 files changed, 208 insertions, 0 deletions
diff --git a/lib/storage.rb b/lib/storage.rb
new file mode 100644
index 0000000..9045a0b
--- /dev/null
+++ b/lib/storage.rb
@@ -0,0 +1,208 @@
+require 'elasticsearch'
+require 'date'
+
+module Ag::Storage
+ module_function
+ def create_index(list)
+ begin
+ $es.indices.delete index: 'ml-' + list
+ rescue Elasticsearch::Transport::Transport::Errors::NotFound => e
+ $stderr.puts "Index did not exist yet. Creating."
+ end
+
+ $es.indices.create(
+ index: 'ml-' + list,
+ body: {
+ mappings: {
+ message: {
+ properties: {
+ attachments: {
+ properties: {
+ filename: {
+ type: 'string',
+ index: 'not_analyzed'
+ },
+ mime: {
+ type: 'string',
+ index: 'not_analyzed'
+ }
+ }
+ },
+ cc: {
+ type: 'string'
+ },
+ content: {
+ type: 'string'
+ },
+ date: {
+ type: 'date',
+ format: 'dateOptionalTime'
+ },
+ from: {
+ type: 'string'
+ },
+ from_realname: {
+ type: 'string'
+ },
+ month: {
+ type: 'integer'
+ },
+ parent: {
+ type: 'string',
+ index: 'not_analyzed'
+ },
+ raw_message_id: {
+ type: 'string',
+ index: 'not_analyzed'
+ },
+ raw_parent: {
+ type: 'string'
+ },
+ subject: {
+ type: 'string'
+ },
+ to: {
+ type: 'string'
+ }
+ }
+ }
+ }
+ })
+ end
+
+ def get_content(message)
+ content = "Cannot parse MIME/contents."
+ begin
+ raw_content = Ag::Rendering::HTMLizer.HTMLize(message)
+ content = Ag::Utils.fix_encoding(raw_content || '').strip
+
+ if content == ''
+ $stderr.puts "#{message.message_id}: Content empty"
+ end
+ rescue
+ $stderr.puts "#{message.message_id}: Invalid encoding"
+ end
+
+ content
+ end
+
+ def get_parent_message(list, parent_message_id = nil)
+ return nil if parent_message_id == nil
+
+ result = $es.search(
+ index: 'ml-' + list,
+ body: {
+ query: {
+ filtered: {
+ filter: {
+ term: { raw_message_id: parent_message_id }
+ }
+ }
+ },
+ fields: ['_id']
+ }
+ )
+
+ return nil if result['hits']['total'] == 0
+
+ result['hits']['hits'].first['_id']
+ end
+
+ def store(list, message)
+ content = get_content(message)
+
+ identifier = message['X-Archives-Hash'].value
+ raw_parent = Ag::Threading.get_parent_message_id(message)
+
+ from = Ag::Utils.fix_encoding(message[:from].formatted.first)
+ from_realname = from.gsub(/<(.*)>/, '').strip
+
+ to = ''
+ if message[:to]
+ to = Ag::Utils.fix_encoding(message[:to].formatted.join(','))
+ end
+
+ cc = ''
+ if message[:cc]
+ cc = Ag::Utils.fix_encoding(message[:cc].formatted.join(','))
+ end
+
+ subject = Ag::Utils.fix_encoding(message.subject)
+
+ attachments = []
+ if message.has_attachments?
+ message.attachments.each do |attachment|
+ attachments << {
+ filename: attachment.filename,
+ mime: attachment.mime_type
+ }
+ end
+ end
+
+ $es.index(
+ index: 'ml-' + list,
+ type: 'message',
+ id: identifier,
+ body: {
+ raw_message_id: message.message_id,
+ subject: subject,
+ to: to,
+ cc: cc,
+ from: from,
+ from_realname: from_realname,
+ date: message.date,
+ month: ("%i%02i" % [message.date.year, message.date.month]).to_i, # this is a sortable number!
+ content: content,
+ attachments: attachments,
+ raw_parent: raw_parent
+ }
+ )
+ end
+
+ def fix_threading(list)
+ result = $es.search(
+ index: 'ml-' + list,
+ size: 100000,
+ body: {
+ size: 100000,
+ query: {
+ filtered: {
+ filter: {
+ and: [
+ {
+ missing: {
+ field: 'parent'
+ }
+ },
+ {
+ exists: {
+ field: 'raw_parent'
+ }
+ }
+ ]
+ }
+ }
+ }
+ }
+ )
+
+ result['hits']['hits'].each do |hit|
+ msg = get_parent_message(list, hit['_source']['raw_parent'])
+
+ unless msg == nil
+ $es.update(
+ index: 'ml-' + list,
+ type: 'message',
+ id: hit['_id'],
+ body: {
+ doc: {
+ parent: msg
+ }
+ }
+ )
+ end
+ end
+
+ result['hits']['total']
+ end
+end \ No newline at end of file