diff options
author | Alex Legler <alex@a3li.li> | 2015-02-19 20:02:20 +0100 |
---|---|---|
committer | Alex Legler <alex@a3li.li> | 2015-02-19 20:02:20 +0100 |
commit | 5407f1f169e932063fb145bbb2a971a2188b9cd4 (patch) | |
tree | 3b1d38bcf4b14cffbd54899614c1cbda156e6861 | |
download | backend-5407f1f169e932063fb145bbb2a971a2188b9cd4.tar.gz backend-5407f1f169e932063fb145bbb2a971a2188b9cd4.tar.bz2 backend-5407f1f169e932063fb145bbb2a971a2188b9cd4.zip |
Initial version
-rw-r--r-- | Gemfile | 7 | ||||
-rw-r--r-- | Gemfile.lock | 40 | ||||
-rwxr-xr-x | ag | 161 | ||||
-rw-r--r-- | lib/rendering.rb | 58 | ||||
-rw-r--r-- | lib/storage.rb | 208 | ||||
-rw-r--r-- | lib/threading.rb | 68 | ||||
-rw-r--r-- | lib/utils.rb | 14 |
7 files changed, 556 insertions, 0 deletions
@@ -0,0 +1,7 @@ +source 'https://rubygems.org' + +gem 'mail' +gem 'maildir' +gem 'elasticsearch' +gem 'sanitize' +gem 'charlock_holmes'
\ No newline at end of file diff --git a/Gemfile.lock b/Gemfile.lock new file mode 100644 index 0000000..ca40918 --- /dev/null +++ b/Gemfile.lock @@ -0,0 +1,40 @@ +GEM + remote: https://rubygems.org/ + specs: + charlock_holmes (0.7.3) + crass (1.0.1) + elasticsearch (1.0.6) + elasticsearch-api (= 1.0.6) + elasticsearch-transport (= 1.0.6) + elasticsearch-api (1.0.6) + multi_json + elasticsearch-transport (1.0.6) + faraday + multi_json + faraday (0.9.1) + multipart-post (>= 1.2, < 3) + mail (2.6.3) + mime-types (>= 1.16, < 3) + maildir (2.2.0) + mime-types (2.4.3) + mini_portile (0.6.2) + multi_json (1.10.1) + multipart-post (2.0.0) + nokogiri (1.6.6.2) + mini_portile (~> 0.6.0) + nokogumbo (1.2.0) + nokogiri + sanitize (3.1.1) + crass (~> 1.0.1) + nokogiri (>= 1.4.4) + nokogumbo (= 1.2.0) + +PLATFORMS + ruby + +DEPENDENCIES + charlock_holmes + elasticsearch + mail + maildir + sanitize @@ -0,0 +1,161 @@ +#!/usr/bin/env ruby +# Ag -- archiving all the 'golden' flamewars on -dev +# Alex Legler <a3li@gentoo.org> + +require 'bundler/setup' +require 'mail' +require 'maildir' +require 'elasticsearch' +require 'optparse' +require_relative 'lib/utils' +require_relative 'lib/threading' +require_relative 'lib/rendering' +require_relative 'lib/storage' + +$options = OpenStruct.new +$options.action = nil +$options.name = nil +$options.index_only = false +$options.no_threading = false +$options.debug = false + +op = OptionParser.new do |opts| + opts.banner = "Usage: ag <<--index-full|--index-new|--delete|--reindex|--info> <--list listname>|<--fix>> <maildir/file/hash/messageid> [options]" + + opts.on('--index-full', 'Read the full past archive from the .cur Maildir') do + abort 'Can only select one action' if $options.action != nil + + $options.action = :do_full + end + + opts.on('--index-new', 'Read new messages from .new and move them to .cur') do + abort 'Can only select one action' if $options.action != nil + + $options.action = :do_incremental + end + + opts.on('--fix', 
'Fix up weird info sections') do + abort 'Can only select one action' if $options.action != nil + + $options.action = :do_fix + end + + opts.on('--delete', 'Delete message. Needs --file, --msgid, or --hash') do + abort 'Can only select one action' if $options.action != nil + + $options.action = :do_delete + end + + opts.on('--reindex', 'Reindex message. Needs --file') do + abort 'Can only select one action' if $options.action != nil + + $options.action = :do_reindex + end + + opts.on('--list NAME', 'Name of the mailing list to work with') do |name| + $options.name = name + end + + opts.on('--file', 'The argument is a file') do + $options.argmode = :file + end + + opts.on('--msgid', 'The argument is a Message-Id') do + $options.argmode = :msgid + end + + opts.on('--hash', 'The argument is a X-Archives-Hash') do + $options.argmode = :hash + end + + opts.on('--index-only', 'Only delete the message from the index, not from disk') do + $options.index_only = true + end + + opts.on('--no-threading', 'Only index, don\'t update threading') do + $options.no_threading = true + end + + opts.on('--debug', 'Print debug messages') do + $options.debug = true + end +end +op.parse! + +abort op.help unless $options.action +abort 'List name required' unless $options.name +$options.dir = ARGV[0] or abort 'Need a Maildir/File/Hash/Message-Id to work with' + +$listname = ARGV[0] or abort 'List name required' + +# Open maildir and set serializer +$maildir = Maildir.new(File.join($options.dir), false) +$maildir.serializer = Maildir::Serializer::Mail.new + +# Connect to Elasticsearch +$es = Elasticsearch::Client.new(log: false) +$es.transport.reload_connections! 
+ +############################################################################### + +def do_fix + Dir.chdir($options.dir) do + Dir.glob('*:2,:2,S').each do |f| + puts "Fixing #{f}" + File.rename(f, f.gsub(':2,:2,S', ':2,S')) + end + end +end + +def do_full + Ag::Storage.create_index($options.name) + + $maildir.list(:cur).each do |maildir_message| + mail = maildir_message.data + + begin + Ag::Storage.store($options.name, mail) + rescue => e + $stderr.puts "Cannot save message #{mail.message_id}: #{e.message}" + next + end + end + + Ag::Threading.calc($options.name) unless $options.no_threading +end + +def do_incremental + $maildir.list(:new).each do |maildir_message| + mail = maildir_message.data + + begin + Ag::Storage.store($options.name, mail) + maildir_message.process + rescue => e + $stderr.puts "Cannot save message #{mail.message_id}: #{e.message}" + next + end + end + + Ag::Threading.calc($options.name) unless $options.no_threading +end + +def do_delete + abort 'Come back later.' +end + +def do_reindex + abort 'Come back later.' +end + +def do_info + abort 'Come back later.' +end + +############################################################################### + +begin + send $options.action +rescue NoMethodError + abort 'Internal Error: Unknown action' +end
\ No newline at end of file diff --git a/lib/rendering.rb b/lib/rendering.rb new file mode 100644 index 0000000..3e77414 --- /dev/null +++ b/lib/rendering.rb @@ -0,0 +1,58 @@ +require 'sanitize' +require 'cgi' + +module Ag::Rendering + class HTMLizer + def self.HTMLize(mail) + if mail.multipart? + content_type = mime_split(mail.parts.first.content_type) + + if content_type == 'text/plain' or content_type == 'text/html' + to_content(content_type, mail.parts.first.decoded) + else + # Nested multipart? + if mail.parts.first.multipart? + content_type = mime_split(mail.parts.first.parts.first.content_type) + + if content_type == 'text/plain' or content_type == 'text/html' + to_content(content_type, mail.parts.first.parts.first.decoded) + else + raise "Cannot find body: #{mail.message_id}" + end + # Specialty: Gnus/Emacs signed emails with no explicit multipart type + elsif mime_split(mail.content_type) == 'multipart/signed' + to_content('text/plain', mail.parts.first.decoded) + end + end + else + # No Content-Type, assume plain text (git-send-email) + if mail.content_type == nil + to_content('text/plain', mail.body.decoded) + else + to_content(mime_split(mail.content_type), mail.body.decoded) + end + end + end + + def self.to_content(content_type, content) + if content_type == 'text/plain' + escaped_content = CGI::escapeHTML(content) + escaped_content.lines.map do |line| + if line.start_with? '>' + "<div class=\"ag-quote\">#{line.rstrip}</div>\n" + else + line + end + end.join.gsub("</div>\n<div class=\"ag-quote\">", "\n") + elsif content_type == 'text/html' + '<div class="ag-html-content">' + Sanitize.clean(content, Sanitize::Config::BASIC) + '</div>' + else + '<div class="alert alert-danger" role="alert"><strong>Unsupported Content-Type</strong></div>' + end + end + + def self.mime_split(content_type) + (content_type || '').split(';').first + end + end +end
\ No newline at end of file diff --git a/lib/storage.rb b/lib/storage.rb new file mode 100644 index 0000000..9045a0b --- /dev/null +++ b/lib/storage.rb @@ -0,0 +1,208 @@ +require 'elasticsearch' +require 'date' + +module Ag::Storage + module_function + def create_index(list) + begin + $es.indices.delete index: 'ml-' + list + rescue Elasticsearch::Transport::Transport::Errors::NotFound => e + $stderr.puts "Index did not exist yet. Creating." + end + + $es.indices.create( + index: 'ml-' + list, + body: { + mappings: { + message: { + properties: { + attachments: { + properties: { + filename: { + type: 'string', + index: 'not_analyzed' + }, + mime: { + type: 'string', + index: 'not_analyzed' + } + } + }, + cc: { + type: 'string' + }, + content: { + type: 'string' + }, + date: { + type: 'date', + format: 'dateOptionalTime' + }, + from: { + type: 'string' + }, + from_realname: { + type: 'string' + }, + month: { + type: 'integer' + }, + parent: { + type: 'string', + index: 'not_analyzed' + }, + raw_message_id: { + type: 'string', + index: 'not_analyzed' + }, + raw_parent: { + type: 'string' + }, + subject: { + type: 'string' + }, + to: { + type: 'string' + } + } + } + } + }) + end + + def get_content(message) + content = "Cannot parse MIME/contents." 
+ begin + raw_content = Ag::Rendering::HTMLizer.HTMLize(message) + content = Ag::Utils.fix_encoding(raw_content || '').strip + + if content == '' + $stderr.puts "#{message.message_id}: Content empty" + end + rescue + $stderr.puts "#{message.message_id}: Invalid encoding" + end + + content + end + + def get_parent_message(list, parent_message_id = nil) + return nil if parent_message_id == nil + + result = $es.search( + index: 'ml-' + list, + body: { + query: { + filtered: { + filter: { + term: { raw_message_id: parent_message_id } + } + } + }, + fields: ['_id'] + } + ) + + return nil if result['hits']['total'] == 0 + + result['hits']['hits'].first['_id'] + end + + def store(list, message) + content = get_content(message) + + identifier = message['X-Archives-Hash'].value + raw_parent = Ag::Threading.get_parent_message_id(message) + + from = Ag::Utils.fix_encoding(message[:from].formatted.first) + from_realname = from.gsub(/<(.*)>/, '').strip + + to = '' + if message[:to] + to = Ag::Utils.fix_encoding(message[:to].formatted.join(',')) + end + + cc = '' + if message[:cc] + cc = Ag::Utils.fix_encoding(message[:cc].formatted.join(',')) + end + + subject = Ag::Utils.fix_encoding(message.subject) + + attachments = [] + if message.has_attachments? + message.attachments.each do |attachment| + attachments << { + filename: attachment.filename, + mime: attachment.mime_type + } + end + end + + $es.index( + index: 'ml-' + list, + type: 'message', + id: identifier, + body: { + raw_message_id: message.message_id, + subject: subject, + to: to, + cc: cc, + from: from, + from_realname: from_realname, + date: message.date, + month: ("%i%02i" % [message.date.year, message.date.month]).to_i, # this is a sortable number! 
+ content: content, + attachments: attachments, + raw_parent: raw_parent + } + ) + end + + def fix_threading(list) + result = $es.search( + index: 'ml-' + list, + size: 100000, + body: { + size: 100000, + query: { + filtered: { + filter: { + and: [ + { + missing: { + field: 'parent' + } + }, + { + exists: { + field: 'raw_parent' + } + } + ] + } + } + } + } + ) + + result['hits']['hits'].each do |hit| + msg = get_parent_message(list, hit['_source']['raw_parent']) + + unless msg == nil + $es.update( + index: 'ml-' + list, + type: 'message', + id: hit['_id'], + body: { + doc: { + parent: msg + } + } + ) + end + end + + result['hits']['total'] + end +end
\ No newline at end of file diff --git a/lib/threading.rb b/lib/threading.rb new file mode 100644 index 0000000..8988f23 --- /dev/null +++ b/lib/threading.rb @@ -0,0 +1,68 @@ +module Ag + module Threading + module_function + # Figures out the Message-Id of the parent message, + # or returns nil if we assume this message is not a reply + def get_parent_message_id(mail) + # No headers -> no parent message + if mail.in_reply_to == nil and mail.references == nil + return nil + else + irt_value = nil + + if mail.in_reply_to.is_a? Array + irt_value = mail.in_reply_to.last + elsif mail.in_reply_to.is_a? String + irt_value = mail.in_reply_to + + # Gnus/Emacs specialty du jour + # => "<1075186049.4264.1.camel@TesterTop.tester.ca> (Olivier Crête's message of \"Tue, 27 Jan 2004 07:47:29 +0100\")" + if irt_value.start_with? '<' + irt_value = irt_value[0..irt_value.rindex('>')] unless irt_value.end_with? '>' + irt_value.gsub!(/(^<|>$)/, '') + end + elsif mail.in_reply_to == nil + # nothing to do + else + $stderr.puts "In-Reply-To is a weird type: #{mail.message_id}" if $options.debug + end + + ref_value = nil + if mail.references.is_a? Array + ref_value = mail.references.last + elsif mail.references.is_a? 
String + ref_value = mail.references + elsif mail.references == nil + # nothing to do + else + $stderr.puts "References is a weird type: #{mail.message_id}" if $options.debug + end + + if irt_value == ref_value + return irt_value.to_s + elsif irt_value == nil + return ref_value.to_s + elsif ref_value == nil + return irt_value.to_s + else + $stderr.puts "In-Reply-To and References disagree: #{mail.message_id}" if $options.debug + # If in doubt, let In-Reply-To win + return irt_value.to_s + end + end + + $stderr.puts "Couldn't find a parent id for Message-Id: #{mail.message_id}" if $options.debug + nil + end + + def calc(list) + number_of_root_threads = -1 + loop do + new_num = Ag::Storage.fix_threading(list) + + break if new_num == number_of_root_threads + number_of_root_threads = new_num + end + end + end +end
\ No newline at end of file diff --git a/lib/utils.rb b/lib/utils.rb new file mode 100644 index 0000000..109a6a5 --- /dev/null +++ b/lib/utils.rb @@ -0,0 +1,14 @@ +require 'charlock_holmes' + +module Ag + module Utils + module_function + def fix_encoding(str) + detection = CharlockHolmes::EncodingDetector.detect(str) + CharlockHolmes::Converter.convert(str, detection[:encoding], 'UTF-8') + rescue => e + $stderr.puts e.message if $options.debug + 'Encoding could not be reliably detected. Message contents not available.' + end + end +end
\ No newline at end of file |