aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlex Legler <alex@a3li.li>2015-02-19 20:02:20 +0100
committerAlex Legler <alex@a3li.li>2015-02-19 20:02:20 +0100
commit5407f1f169e932063fb145bbb2a971a2188b9cd4 (patch)
tree3b1d38bcf4b14cffbd54899614c1cbda156e6861
downloadbackend-5407f1f169e932063fb145bbb2a971a2188b9cd4.tar.gz
backend-5407f1f169e932063fb145bbb2a971a2188b9cd4.tar.bz2
backend-5407f1f169e932063fb145bbb2a971a2188b9cd4.zip
Initial version
-rw-r--r--Gemfile7
-rw-r--r--Gemfile.lock40
-rwxr-xr-xag161
-rw-r--r--lib/rendering.rb58
-rw-r--r--lib/storage.rb208
-rw-r--r--lib/threading.rb68
-rw-r--r--lib/utils.rb14
7 files changed, 556 insertions, 0 deletions
diff --git a/Gemfile b/Gemfile
new file mode 100644
index 0000000..6689dd1
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,7 @@
+source 'https://rubygems.org'
+
+gem 'mail'
+gem 'maildir'
+gem 'elasticsearch'
+gem 'sanitize'
+gem 'charlock_holmes' \ No newline at end of file
diff --git a/Gemfile.lock b/Gemfile.lock
new file mode 100644
index 0000000..ca40918
--- /dev/null
+++ b/Gemfile.lock
@@ -0,0 +1,40 @@
+GEM
+ remote: https://rubygems.org/
+ specs:
+ charlock_holmes (0.7.3)
+ crass (1.0.1)
+ elasticsearch (1.0.6)
+ elasticsearch-api (= 1.0.6)
+ elasticsearch-transport (= 1.0.6)
+ elasticsearch-api (1.0.6)
+ multi_json
+ elasticsearch-transport (1.0.6)
+ faraday
+ multi_json
+ faraday (0.9.1)
+ multipart-post (>= 1.2, < 3)
+ mail (2.6.3)
+ mime-types (>= 1.16, < 3)
+ maildir (2.2.0)
+ mime-types (2.4.3)
+ mini_portile (0.6.2)
+ multi_json (1.10.1)
+ multipart-post (2.0.0)
+ nokogiri (1.6.6.2)
+ mini_portile (~> 0.6.0)
+ nokogumbo (1.2.0)
+ nokogiri
+ sanitize (3.1.1)
+ crass (~> 1.0.1)
+ nokogiri (>= 1.4.4)
+ nokogumbo (= 1.2.0)
+
+PLATFORMS
+ ruby
+
+DEPENDENCIES
+ charlock_holmes
+ elasticsearch
+ mail
+ maildir
+ sanitize
diff --git a/ag b/ag
new file mode 100755
index 0000000..3e7bf55
--- /dev/null
+++ b/ag
@@ -0,0 +1,161 @@
+#!/usr/bin/env ruby
+# Ag -- archiving all the 'golden' flamewars on -dev
+# Alex Legler <a3li@gentoo.org>
+
+require 'bundler/setup'
+require 'mail'
+require 'maildir'
+require 'elasticsearch'
+require 'optparse'
+require_relative 'lib/utils'
+require_relative 'lib/threading'
+require_relative 'lib/rendering'
+require_relative 'lib/storage'
+
+# OpenStruct lives in the 'ostruct' stdlib library and must be loaded explicitly.
+require 'ostruct'
+
+# Global run configuration, filled in by the option parser below.
+$options = OpenStruct.new
+$options.action = nil          # Symbol naming the do_* method to run
+$options.name = nil            # mailing list name (--list)
+$options.argmode = nil         # how to read the positional argument: :file, :msgid or :hash
+$options.index_only = false
+$options.no_threading = false
+$options.debug = false
+
+# Records the requested action; aborts if an action was already chosen,
+# since exactly one action may be given per run.
+select_action = lambda do |action|
+  abort 'Can only select one action' if $options.action != nil
+
+  $options.action = action
+end
+
+op = OptionParser.new do |opts|
+  opts.banner = "Usage: ag <<--index-full|--index-new|--delete|--reindex|--info> <--list listname>|<--fix>> <maildir/file/hash/messageid> [options]"
+
+  opts.on('--index-full', 'Read the full past archive from the .cur Maildir') do
+    select_action.call(:do_full)
+  end
+
+  opts.on('--index-new', 'Read new messages from .new and move them to .cur') do
+    select_action.call(:do_incremental)
+  end
+
+  opts.on('--fix', 'Fix up weird info sections') do
+    select_action.call(:do_fix)
+  end
+
+  opts.on('--delete', 'Delete message. Needs --file, --msgid, or --hash') do
+    select_action.call(:do_delete)
+  end
+
+  opts.on('--reindex', 'Reindex message. Needs --file') do
+    select_action.call(:do_reindex)
+  end
+
+  # Advertised in the banner and backed by do_info, but previously not registered.
+  opts.on('--info', 'Show information (not implemented yet)') do
+    select_action.call(:do_info)
+  end
+
+  opts.on('--list NAME', 'Name of the mailing list to work with') do |name|
+    $options.name = name
+  end
+
+  opts.on('--file', 'The argument is a file') do
+    $options.argmode = :file
+  end
+
+  opts.on('--msgid', 'The argument is a Message-Id') do
+    $options.argmode = :msgid
+  end
+
+  opts.on('--hash', 'The argument is a X-Archives-Hash') do
+    $options.argmode = :hash
+  end
+
+  opts.on('--index-only', 'Only delete the message from the index, not from disk') do
+    $options.index_only = true
+  end
+
+  opts.on('--no-threading', 'Only index, don\'t update threading') do
+    $options.no_threading = true
+  end
+
+  opts.on('--debug', 'Print debug messages') do
+    $options.debug = true
+  end
+end
+op.parse!
+
+abort op.help unless $options.action
+# --fix only renames files on disk, so it is the one action that works
+# without a list name (the usage banner lists it as the alternative to --list).
+abort 'List name required' unless $options.name || $options.action == :do_fix
+$options.dir = ARGV[0] or abort 'Need a Maildir/File/Hash/Message-Id to work with'
+
+# The list name comes from --list. This global used to be filled from ARGV[0],
+# which is the Maildir/File/Hash/Message-Id argument, not the list name.
+$listname = $options.name
+
+# Open maildir and set serializer
+$maildir = Maildir.new(File.join($options.dir), false)
+$maildir.serializer = Maildir::Serializer::Mail.new
+
+# Connect to Elasticsearch
+$es = Elasticsearch::Client.new(log: false)
+$es.transport.reload_connections!
+
+###############################################################################
+
+# Repairs file names in $options.dir whose Maildir info section was doubled
+# (":2,:2,S") by renaming them so that only a single ":2,S" remains.
+# Prints one "Fixing ..." line per renamed file.
+def do_fix
+  Dir.chdir($options.dir) do
+    broken_names = Dir.glob('*:2,:2,S')
+
+    broken_names.each do |old_name|
+      new_name = old_name.gsub(':2,:2,S', ':2,S')
+
+      puts "Fixing #{old_name}"
+      File.rename(old_name, new_name)
+    end
+  end
+end
+
+# Full import: recreates the index for $options.name, stores every message
+# listed in the Maildir's :cur folder, then recalculates threading unless
+# --no-threading was given.
+def do_full
+  Ag::Storage.create_index($options.name)
+
+  $maildir.list(:cur).each do |entry|
+    message = entry.data
+
+    begin
+      Ag::Storage.store($options.name, message)
+    rescue => error
+      # One unstorable message must not stop the import; report it and go on.
+      $stderr.puts "Cannot save message #{message.message_id}: #{error.message}"
+    end
+  end
+
+  return if $options.no_threading
+
+  Ag::Threading.calc($options.name)
+end
+
+# Incremental import: stores every message listed in the Maildir's :new
+# folder and calls #process on each one that was stored successfully, then
+# recalculates threading unless --no-threading was given.
+def do_incremental
+  $maildir.list(:new).each do |entry|
+    message = entry.data
+
+    begin
+      Ag::Storage.store($options.name, message)
+      # Only reached when storing worked, so failed messages stay in :new.
+      entry.process
+    rescue => error
+      $stderr.puts "Cannot save message #{message.message_id}: #{error.message}"
+    end
+  end
+
+  return if $options.no_threading
+
+  Ag::Threading.calc($options.name)
+end
+
+# Placeholder for the --delete action: not implemented, aborts the program.
+def do_delete
+  abort('Come back later.')
+end
+
+# Placeholder for the --reindex action: not implemented, aborts the program.
+def do_reindex
+  abort('Come back later.')
+end
+
+# Placeholder for an info action: not implemented, aborts the program.
+def do_info
+  abort('Come back later.')
+end
+
+###############################################################################
+
+# Run the selected action. The method is looked up before calling it (instead
+# of rescuing NoMethodError around the call) so that a NoMethodError raised
+# inside an action is not misreported as an unknown action.
+# The second argument makes respond_to? see the private top-level do_* methods.
+if respond_to?($options.action, true)
+  send $options.action
+else
+  abort 'Internal Error: Unknown action'
+end \ No newline at end of file
diff --git a/lib/rendering.rb b/lib/rendering.rb
new file mode 100644
index 0000000..3e77414
--- /dev/null
+++ b/lib/rendering.rb
@@ -0,0 +1,58 @@
+require 'sanitize'
+require 'cgi'
+
+module Ag::Rendering
+ class HTMLizer
+ # Renders the body of a mail as an HTML fragment by picking a text part and
+ # passing it to to_content. For multipart mails only the first part is
+ # inspected (and, if that part is itself multipart, its first part).
+ #
+ # Returns the string built by to_content, or nil when the first part is
+ # neither text nor multipart and the mail is not multipart/signed.
+ # Raises RuntimeError when a nested multipart does not start with a text part.
+ def self.HTMLize(mail)
+   unless mail.multipart?
+     # No Content-Type, assume plain text (git-send-email)
+     body_type = mail.content_type == nil ? 'text/plain' : mime_split(mail.content_type)
+     return to_content(body_type, mail.body.decoded)
+   end
+
+   textual = ['text/plain', 'text/html']
+   first_part = mail.parts.first
+   first_type = mime_split(first_part.content_type)
+   return to_content(first_type, first_part.decoded) if textual.include?(first_type)
+
+   if first_part.multipart?
+     # Nested multipart?
+     nested_part = first_part.parts.first
+     nested_type = mime_split(nested_part.content_type)
+     raise "Cannot find body: #{mail.message_id}" unless textual.include?(nested_type)
+
+     to_content(nested_type, nested_part.decoded)
+   elsif mime_split(mail.content_type) == 'multipart/signed'
+     # Specialty: Gnus/Emacs signed emails with no explicit multipart type
+     to_content('text/plain', first_part.decoded)
+   end
+ end
+
+ # Converts decoded body text of the given MIME type into an HTML fragment.
+ #
+ # text/plain: the text is HTML-escaped and lines starting with ">" are
+ # wrapped in <div class="ag-quote">, with directly adjacent quote lines
+ # merged into one div. text/html: the markup is run through Sanitize with
+ # the BASIC config. Any other type yields a fixed "Unsupported Content-Type"
+ # alert.
+ def self.to_content(content_type, content)
+   case content_type
+   when 'text/plain'
+     rendered_lines = CGI::escapeHTML(content).lines.map do |line|
+       # After escaping, a quoted line begins with the entity for ">".
+       next line unless line.start_with? '&gt;'
+
+       "<div class=\"ag-quote\">#{line.rstrip}</div>\n"
+     end
+
+     # Join neighbouring quote divs so a quoted run becomes a single block.
+     rendered_lines.join.gsub("</div>\n<div class=\"ag-quote\">", "\n")
+   when 'text/html'
+     sanitized = Sanitize.clean(content, Sanitize::Config::BASIC)
+     '<div class="ag-html-content">' + sanitized + '</div>'
+   else
+     '<div class="alert alert-danger" role="alert"><strong>Unsupported Content-Type</strong></div>'
+   end
+ end
+
+ # Returns the part of a Content-Type value before the first ";", i.e. the
+ # MIME type without its parameters. A nil or empty value gives nil.
+ def self.mime_split(content_type)
+   header_value = content_type || ''
+   mime_type, = header_value.split(';')
+   mime_type
+ end
+ end
+end \ No newline at end of file
diff --git a/lib/storage.rb b/lib/storage.rb
new file mode 100644
index 0000000..9045a0b
--- /dev/null
+++ b/lib/storage.rb
@@ -0,0 +1,208 @@
+require 'elasticsearch'
+require 'date'
+
+module Ag::Storage
+ module_function
+ # Deletes the Elasticsearch index "ml-<list>" (a missing index is only
+ # reported on stderr) and creates it again with the mapping of the
+ # "message" type.
+ def create_index(list)
+   index_name = 'ml-' + list
+
+   begin
+     $es.indices.delete index: index_name
+   rescue Elasticsearch::Transport::Transport::Errors::NotFound
+     $stderr.puts "Index did not exist yet. Creating."
+   end
+
+   # Two field flavours are used: plain strings, and strings that are
+   # declared 'not_analyzed'.
+   analyzed = { type: 'string' }
+   exact = { type: 'string', index: 'not_analyzed' }
+
+   message_properties = {
+     attachments: {
+       properties: {
+         filename: exact,
+         mime: exact
+       }
+     },
+     cc: analyzed,
+     content: analyzed,
+     date: { type: 'date', format: 'dateOptionalTime' },
+     from: analyzed,
+     from_realname: analyzed,
+     month: { type: 'integer' },
+     parent: exact,
+     raw_message_id: exact,
+     raw_parent: analyzed,
+     subject: analyzed,
+     to: analyzed
+   }
+
+   $es.indices.create(
+     index: index_name,
+     body: {
+       mappings: {
+         message: { properties: message_properties }
+       }
+     }
+   )
+ end
+
+ # Renders the body of message to HTML via Ag::Rendering::HTMLizer and runs
+ # the result through Ag::Utils.fix_encoding.
+ #
+ # Returns the stripped content string. When rendering or conversion raises,
+ # the placeholder "Cannot parse MIME/contents." is returned instead, so the
+ # caller can still store the message.
+ def get_content(message)
+   content = "Cannot parse MIME/contents."
+   begin
+     # HTMLize may return nil when it finds no usable part; treat that as empty.
+     raw_content = Ag::Rendering::HTMLizer.HTMLize(message)
+     content = Ag::Utils.fix_encoding(raw_content || '').strip
+
+     $stderr.puts "#{message.message_id}: Content empty" if content == ''
+   rescue => e
+     # Not every failure here is an encoding problem (HTMLize raises when it
+     # cannot find a body, for instance), so report what actually went wrong.
+     $stderr.puts "#{message.message_id}: Cannot render content: #{e.message}"
+   end
+
+   content
+ end
+
+ # Looks up, in the index "ml-<list>", the message whose raw_message_id
+ # equals parent_message_id.
+ #
+ # Returns the _id of the first hit, or nil when no id was given or the
+ # search reports zero hits.
+ def get_parent_message(list, parent_message_id = nil)
+   return nil if parent_message_id == nil
+
+   by_message_id = { term: { raw_message_id: parent_message_id } }
+
+   response = $es.search(
+     index: 'ml-' + list,
+     body: {
+       query: { filtered: { filter: by_message_id } },
+       fields: ['_id']
+     }
+   )
+
+   hits = response['hits']
+   hits['total'] == 0 ? nil : hits['hits'].first['_id']
+ end
+
+ # Indexes one mail as a "message" document in "ml-<list>".
+ #
+ # The document id is the value of the mail's X-Archives-Hash header. Header
+ # texts go through Ag::Utils.fix_encoding; the body is rendered with
+ # get_content; raw_parent holds the Message-Id reported by
+ # Ag::Threading.get_parent_message_id.
+ def store(list, message)
+   content = get_content(message)
+
+   identifier = message['X-Archives-Hash'].value
+   raw_parent = Ag::Threading.get_parent_message_id(message)
+
+   from = Ag::Utils.fix_encoding(message[:from].formatted.first)
+   # Whatever is left once the <...> part is removed is used as the real name.
+   from_realname = from.gsub(/<(.*)>/, '').strip
+
+   # To and Cc are optional headers; absent ones are stored as empty strings.
+   to = message[:to] ? Ag::Utils.fix_encoding(message[:to].formatted.join(',')) : ''
+   cc = message[:cc] ? Ag::Utils.fix_encoding(message[:cc].formatted.join(',')) : ''
+
+   subject = Ag::Utils.fix_encoding(message.subject)
+
+   attachments = []
+   if message.has_attachments?
+     attachments = message.attachments.map do |attachment|
+       { filename: attachment.filename, mime: attachment.mime_type }
+     end
+   end
+
+   $es.index(
+     index: 'ml-' + list,
+     type: 'message',
+     id: identifier,
+     body: {
+       raw_message_id: message.message_id,
+       subject: subject,
+       to: to,
+       cc: cc,
+       from: from,
+       from_realname: from_realname,
+       date: message.date,
+       # Year followed by two-digit month (e.g. 201502): this is a sortable number!
+       month: format('%i%02i', message.date.year, message.date.month).to_i,
+       content: content,
+       attachments: attachments,
+       raw_parent: raw_parent
+     }
+   )
+ end
+
+ # One threading pass over "ml-<list>": searches for messages that have a
+ # raw_parent but no parent field yet, resolves each raw_parent to a document
+ # id with get_parent_message and, when one is found, writes it to the
+ # message's parent field.
+ #
+ # Returns the total hit count reported by the search, i.e. how many such
+ # messages existed before this pass made its updates.
+ def fix_threading(list)
+   index_name = 'ml-' + list
+
+   unlinked_filter = {
+     and: [
+       { missing: { field: 'parent' } },
+       { exists: { field: 'raw_parent' } }
+     ]
+   }
+
+   result = $es.search(
+     index: index_name,
+     size: 100000,
+     body: {
+       size: 100000,
+       query: { filtered: { filter: unlinked_filter } }
+     }
+   )
+
+   result['hits']['hits'].each do |hit|
+     parent_id = get_parent_message(list, hit['_source']['raw_parent'])
+     # The parent is not in the index (yet); leave this message alone.
+     next if parent_id == nil
+
+     $es.update(
+       index: index_name,
+       type: 'message',
+       id: hit['_id'],
+       body: { doc: { parent: parent_id } }
+     )
+   end
+
+   result['hits']['total']
+ end
+end \ No newline at end of file
diff --git a/lib/threading.rb b/lib/threading.rb
new file mode 100644
index 0000000..8988f23
--- /dev/null
+++ b/lib/threading.rb
@@ -0,0 +1,68 @@
+module Ag
+ module Threading
+ module_function
+ # Figures out the Message-Id of the parent message,
+ # or returns nil if we assume this message is not a reply.
+ #
+ # Both In-Reply-To and References are consulted (the last entry of each when
+ # there are several). If the two disagree, In-Reply-To wins. Nothing on the
+ # mail object is modified.
+ def get_parent_message_id(mail)
+   in_reply_to = mail.in_reply_to
+   references = mail.references
+
+   # No headers -> no parent message
+   return nil if in_reply_to.nil? && references.nil?
+
+   irt_value = last_header_id(in_reply_to, 'In-Reply-To', mail)
+   irt_value = strip_gnus_annotation(irt_value) if in_reply_to.is_a? String
+   ref_value = last_header_id(references, 'References', mail)
+
+   parent = irt_value || ref_value
+   if parent.nil?
+     # Headers were present but unusable: report "no parent" as nil rather
+     # than as an empty string.
+     $stderr.puts "Couldn't find a parent id for Message-Id: #{mail.message_id}" if $options.debug
+     return nil
+   end
+
+   if irt_value && ref_value && irt_value != ref_value
+     # If in doubt, let In-Reply-To win
+     $stderr.puts "In-Reply-To and References disagree: #{mail.message_id}" if $options.debug
+   end
+
+   parent.to_s
+ end
+
+ # Reduces a header value to one candidate id: the last element of an Array,
+ # a String as it is, nil for nil. Any other type is reported on stderr (in
+ # debug mode) and treated as nil.
+ def last_header_id(value, header_name, mail)
+   case value
+   when Array then value.last
+   when String then value
+   when nil then nil
+   else
+     $stderr.puts "#{header_name} is a weird type: #{mail.message_id}" if $options.debug
+     nil
+   end
+ end
+
+ # Gnus/Emacs specialty du jour: In-Reply-To may carry trailing text, e.g.
+ # => "<1075186049.4264.1.camel@TesterTop.tester.ca> (Olivier Crête's message of \"Tue, 27 Jan 2004 07:47:29 +0100\")"
+ # Cuts everything after the last ">" and removes the enclosing angle
+ # brackets. Values not starting with "<" are returned unchanged. Always
+ # returns a new string for changed values instead of editing the argument.
+ def strip_gnus_annotation(value)
+   return value unless value.start_with? '<'
+
+   # rindex is nil when there is no closing bracket at all; keep the text then.
+   closing = value.rindex('>')
+   value = value[0..closing] if closing
+   value.gsub(/\A<|>\z/, '')
+ end
+
+ # Runs Ag::Storage.fix_threading for list again and again until two
+ # consecutive passes report the same number, i.e. until another pass no
+ # longer changes the count of messages still lacking a parent.
+ def calc(list)
+   previous_count = -1
+
+   until (current_count = Ag::Storage.fix_threading(list)) == previous_count
+     previous_count = current_count
+   end
+ end
+ end
+end \ No newline at end of file
diff --git a/lib/utils.rb b/lib/utils.rb
new file mode 100644
index 0000000..109a6a5
--- /dev/null
+++ b/lib/utils.rb
@@ -0,0 +1,14 @@
+require 'charlock_holmes'
+
+module Ag
+ module Utils
+ module_function
+ # Detects the encoding of str with CharlockHolmes and converts str to UTF-8.
+ #
+ # Returns the converted string. If detection or conversion raises, a fixed
+ # placeholder sentence is returned instead; the error message is printed to
+ # stderr only in debug mode.
+ def fix_encoding(str)
+   begin
+     guess = CharlockHolmes::EncodingDetector.detect(str)
+     source_encoding = guess[:encoding]
+
+     CharlockHolmes::Converter.convert(str, source_encoding, 'UTF-8')
+   rescue => error
+     $stderr.puts error.message if $options.debug
+     'Encoding could not be reliably detected. Message contents not available.'
+   end
+ end
+ end
+end \ No newline at end of file