chore: Improve email parsing using email trimmer gem (#3611)

Email parsing using the email_reply_trimmer gem

Fixes: #3539, #2954, #3572
This commit is contained in:
Tejaswini Chile 2021-12-22 18:16:40 +05:30 committed by GitHub
parent 009abc1948
commit 44486fc8e1
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 1232 additions and 32 deletions

View file

@ -121,6 +121,10 @@ gem 'hairtrigger'
gem 'procore-sift'
# parse email
gem 'email_reply_trimmer'
gem 'html2text'
group :production, :staging do
# we don't want requests timing out in development while using byebug
gem 'rack-timeout'

View file

@ -179,6 +179,7 @@ GEM
addressable (~> 2.8)
ecma-re-validator (0.3.0)
regexp_parser (~> 2.0)
email_reply_trimmer (0.1.13)
erubi (1.10.0)
erubis (2.7.0)
et-orbi (1.2.5)
@ -290,6 +291,8 @@ GEM
hashdiff (1.0.1)
hashie (4.1.0)
hkdf (0.3.0)
html2text (0.2.1)
nokogiri (~> 1.6)
http-accept (1.7.0)
http-cookie (1.0.4)
domain_name (~> 0.5)
@ -668,6 +671,7 @@ DEPENDENCIES
devise_token_auth
dotenv-rails
down (~> 5.0)
email_reply_trimmer
facebook-messenger
factory_bot_rails
faker
@ -682,6 +686,7 @@ DEPENDENCIES
haikunator
hairtrigger
hashie
html2text
image_processing
jbuilder
json_refs

View file

@ -0,0 +1,31 @@
# Turns the HTML body of an email into plain text after collapsing quoted
# material: every <blockquote> is replaced by a "> " marker and every <table>
# is dropped before the markup is handed to Html2Text.
class HtmlParser
  attr_reader :raw_body

  # One-shot helper: builds a parser for +raw_body+ and returns its filtered text.
  def self.parse_reply(raw_body)
    new(raw_body).filtered_text
  end

  # @param raw_body the HTML markup to parse
  def initialize(raw_body)
    @raw_body = raw_body
  end

  # Parsed Nokogiri document for the raw body, built once and then reused.
  def document
    @document ||= Nokogiri::HTML(raw_body)
  end

  # Mutates the document in place: blockquotes become "> " and tables are removed.
  def filter_replies!
    quoted_blocks = document.xpath('//blockquote')
    quoted_blocks.each { |quote| quote.replace('> ') }

    tables = document.xpath('//table')
    tables.each(&:remove)
  end

  # Markup of the document after filtering; the filtering runs only on the first call.
  def filtered_html
    return @filtered_html if @filtered_html

    filter_replies!
    @filtered_html = document.inner_html
  end

  # Plain-text rendering of the filtered markup, memoized.
  def filtered_text
    @filtered_text ||= Html2Text.convert(filtered_html)
  end
end

View file

@ -8,30 +8,48 @@ class MailPresenter < SimpleDelegator
end
def subject
encode_to_unicode(@mail.subject || '')
encode_to_unicode(@mail.subject)
end
def text_content
@decoded_text_content ||= encode_to_unicode(text_part&.decoded || decoded_message || '')
@decoded_text_content = select_body || ''
encoding = @decoded_text_content.encoding
body = EmailReplyTrimmer.trim(@decoded_text_content)
return {} if @decoded_text_content.blank?
@text_content ||= {
full: @decoded_text_content,
reply: extract_reply(@decoded_text_content)[:reply],
quoted: extract_reply(@decoded_text_content)[:quoted_text]
full: select_body,
reply: @decoded_text_content,
quoted: body.force_encoding(encoding).encode('UTF-8')
}
end
# Chooses the body to work with and returns it as text.
#
# The text part is preferred, then the HTML part, then the mail itself. The
# decoded body is passed through encode_to_unicode; when the chosen part is
# HTML it is additionally reduced to text by HtmlParser.parse_reply.
#
# @return [String] '' when a trigger phrase is found, otherwise the decoded
#   (and, for HTML, parsed) body
def select_body
  message = mail.text_part || mail.html_part || mail
  decoded = encode_to_unicode(message.decoded)
  # Certain trigger phrases mean the message was not parsed correctly.
  # NOTE(review): the alternation also matches a bare "text/plain" anywhere in
  # the body - confirm that is intended.
  return '' if %r{(Content-Type: multipart/alternative|text/plain)}.match?(decoded)

  # Look at the content type of the part that was actually decoded. A multipart
  # mail reports "multipart/..." at the top level, so checking the mail itself
  # would let an HTML-only multipart body through as raw markup.
  return decoded unless (message.content_type || '').include?('text/html')

  ::HtmlParser.parse_reply(decoded)
end
def html_content
@decoded_html_content ||= encode_to_unicode(html_part&.decoded)
@decoded_html_content = select_body || ''
return {} if @decoded_html_content.blank?
body = EmailReplyTrimmer.trim(@decoded_html_content)
@html_content ||= {
full: @decoded_html_content,
reply: extract_reply(@decoded_html_content)[:reply],
quoted: extract_reply(@decoded_html_content)[:quoted_text]
full: select_body,
reply: @decoded_html_content,
quoted: body
}
end
@ -47,14 +65,6 @@ class MailPresenter < SimpleDelegator
end
end
# Decoded body of the mail. For a multipart mail this is the decoded text
# part, or nil when there is no text part; otherwise the mail itself is decoded.
def decoded_message
  return mail.decoded unless mail.multipart?

  text_part = mail.text_part
  text_part ? text_part.decoded : nil
end
# Counts the attachments on the wrapped mail.
def number_of_attachments
  attached_files = mail.attachments
  attached_files.count
end
@ -114,21 +124,8 @@ class MailPresenter < SimpleDelegator
return str if current_encoding == 'UTF-8'
str.encode(current_encoding, 'UTF-8', invalid: :replace, undef: :replace, replace: '?')
end
# Splits +content+ into the newly written reply and the quoted text below it.
#
# The split point is the earliest position at which any of the
# quoted_text_regexes patterns matches; with no match everything is the reply.
#
# @param content the text to split
# @return [Hash] { reply: String, quoted_text: String }, both stripped. If the
#   split raises, the whole content is returned as the reply so callers that
#   index the result with [:reply] / [:quoted_text] keep working.
def extract_reply(content)
  @regex_arr ||= quoted_text_regexes
  content_length = content.length

  # Offset of the quote marker closest to the top; content_length when none match.
  index = @regex_arr.map { |regex| content.index(regex) }.compact.min || content_length

  {
    # Exclusive range: a marker at offset 0 must give an empty reply, whereas
    # 0..(index - 1) would turn into 0..-1 and return the whole content.
    reply: content[0...index].strip,
    quoted_text: content[index..].strip
  }
rescue StandardError
  # Keep the hash shape on failure; a bare String would break hash-style access.
  { reply: content.to_s, quoted_text: '' }
end
def quoted_text_regexes

47
spec/fixtures/files/mail_with_quote.eml vendored Normal file
View file

@ -0,0 +1,47 @@
MIME-Version: 1.0
Date: Thu, 19 Aug 2021 14:14:31 +0530
References: <CAFkiBVxGoURoqdkY-O_25F-8b41kb-GWBc6hh4Djd5ynwOikXA@mail.gmail.com> <0100017b5d8efc70-c7f18809-aa55-48f6-91fd-b626092ed8b3-000000@email.amazonses.com>
In-Reply-To: <0100017b5d8efc70-c7f18809-aa55-48f6-91fd-b626092ed8b3-000000@email.amazonses.com>
Message-ID: <CAFkiBVwJjO_k_e-LpiKi7MAQAKbHX5nkEPcf0y1R=bjcEHogMg@mail.gmail.com>
Subject: Re: Checking mail forwarding to cw inbox
From: Sony Mathew <sony@chatwoot.com>
To: Tejaswini <reply+6bdc3f4d-0bec-4515-a284-5d916fdde489@example.com>
Content-Type: multipart/alternative; boundary="0000000000004af64505c9e58f03"
--0000000000004af64505c9e58f03
Content-Type: text/plain; charset="UTF-8"
Yes, I am providing you step how to reproduce this issue
On Thu, Aug 19, 2021 at 2:07 PM Tejaswini from Email sender test <
tejaswini@chatwoot.com> wrote:
> Any update on this?
>
>
--
* Sony Mathew*
Software developer
*Mob:9999999999
--0000000000004af64505c9e58f03
Content-Type: text/html; charset="UTF-8"
Content-Transfer-Encoding: quoted-printable
<div dir=3D"ltr">Yes, I am providing you step how to reproduce this issue</=
div><br><div class=3D"gmail_quote"><div dir=3D"ltr" class=3D"gmail_attr">On=
Thu, Aug 19, 2021 at 2:07 PM Tejaswini from Email sender test &l=
t;<a href=3D"mailto:tejaswini@chatwoot.com">tejaswini@chatwoot.com</a>&gt; wrot=
e:<br></div><blockquote class=3D"gmail_quote" style=3D"margin:0px 0px 0px 0=
.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex"> <p>
</p><p>Any update on this?</p>
<p></p>
</blockquote></div><br clear=3D"all"><div><br></div>-- <br><div dir=3D"ltr"=
class=3D"gmail_signature"><div dir=3D"ltr"><div><div dir=3D"ltr"><div><div=
><b>Sony Mathew.</b><br></div><span style=3D"font-family:&quot;times ne=
w roman&quot;,serif"><span></span><span></span>Software developer</span><br=
></div><b>Mob:9999999999</b></div></div></div></div>
--0000000000004af64505c9e58f03--

1061
spec/fixtures/files/welcome_html.eml vendored Normal file

File diff suppressed because it is too large Load diff

View file

@ -7,6 +7,7 @@ RSpec.describe ReplyMailbox, type: :mailbox do
let(:account) { create(:account) }
let(:agent) { create(:user, email: 'agent1@example.com', account: account) }
let(:reply_mail) { create_inbound_email_from_fixture('reply.eml') }
let(:mail_with_quote) { create_inbound_email_from_fixture('mail_with_quote.eml') }
let(:conversation) { create(:conversation, assignee: agent, inbox: create(:inbox, account: account, greeting_enabled: false), account: account) }
let(:described_subject) { described_class.receive reply_mail }
let(:serialized_attributes) do
@ -95,5 +96,35 @@ RSpec.describe ReplyMailbox, type: :mailbox do
expect(conversation_1.messages.last.content).to eq("Let's talk about these images:")
end
end
context 'with quotes in email' do
let(:described_subject) { described_class.receive mail_with_quote }
before do
# This UUID is hardcoded in the To address of mail_with_quote.eml (the fixture
# received in this context), so the conversation is updated to carry the same UUID.
conversation.uuid = '6bdc3f4d-0bec-4515-a284-5d916fdde489'
conversation.save
end
it 'add the mail content as new message on the conversation' do
described_subject
expect(conversation.messages.last.content).to eq(
<<-BODY.strip_heredoc.chomp
Yes, I am providing you step how to reproduce this issue
On Thu, Aug 19, 2021 at 2:07 PM Tejaswini from Email sender test < tejaswini@chatwoot.com> wrote:
> Any update on this?
>
>
--
* Sony Mathew*
Software developer
*Mob:9999999999
BODY
)
end
end
end
end

View file

@ -0,0 +1,15 @@
require 'rails_helper'
# Exercises HtmlParser.parse_reply against the decoded text part of the
# welcome_html.eml fixture.
RSpec.describe HtmlParser do
  include ActionMailbox::TestHelper

  describe 'parsed mail decorator' do
    let(:html_mail) { create_inbound_email_from_fixture('welcome_html.eml').mail }

    it 'parse html content in the mail' do
      expected_opening = "I'm learning English as a first language for the past 13 years, but to "
      parsed_reply = described_class.parse_reply(html_mail.text_part.decoded)

      # Only the opening of the parsed reply is compared.
      expect(parsed_reply[0..70]).to eq(expected_opening)
    end
  end
end

View file

@ -4,6 +4,7 @@ RSpec.describe MailPresenter do
describe 'parsed mail decorator' do
let(:mail) { create_inbound_email_from_fixture('welcome.eml').mail }
let(:html_mail) { create_inbound_email_from_fixture('welcome_html.eml').mail }
let(:decorated_mail) { described_class.new(mail) }
let(:mail_with_no_subject) { create_inbound_email_from_fixture('mail_with_no_subject.eml').mail }
@ -56,5 +57,13 @@ RSpec.describe MailPresenter do
it 'give email from in downcased format' do
expect(decorated_mail.from.first.eql?(mail.from.first.downcase)).to eq true
end
# Checks the subject and the opening of the reply text for the HTML fixture mail.
it 'parse html content in the mail' do
  presenter = described_class.new(html_mail)
  expected_opening = "I'm learning English as a first language for the past 13 years, but to "

  expect(presenter.subject).to eq('Fwd: How good are you in English? How did you improve your English?')
  expect(presenter.text_content[:reply][0..70]).to eq(expected_opening)
end
end
end