chore: Improve email parsing using email trimmer gem (#3611)
Email parsing using the email_reply_trimmer gem. Fixes: #3539, #2954, #3572
This commit is contained in:
parent
009abc1948
commit
44486fc8e1
9 changed files with 1232 additions and 32 deletions
4
Gemfile
4
Gemfile
|
@ -121,6 +121,10 @@ gem 'hairtrigger'
|
|||
|
||||
gem 'procore-sift'
|
||||
|
||||
# parse email
|
||||
gem 'email_reply_trimmer'
|
||||
gem 'html2text'
|
||||
|
||||
group :production, :staging do
|
||||
# we don't want requests timing out in development while using byebug
|
||||
gem 'rack-timeout'
|
||||
|
|
|
@ -179,6 +179,7 @@ GEM
|
|||
addressable (~> 2.8)
|
||||
ecma-re-validator (0.3.0)
|
||||
regexp_parser (~> 2.0)
|
||||
email_reply_trimmer (0.1.13)
|
||||
erubi (1.10.0)
|
||||
erubis (2.7.0)
|
||||
et-orbi (1.2.5)
|
||||
|
@ -290,6 +291,8 @@ GEM
|
|||
hashdiff (1.0.1)
|
||||
hashie (4.1.0)
|
||||
hkdf (0.3.0)
|
||||
html2text (0.2.1)
|
||||
nokogiri (~> 1.6)
|
||||
http-accept (1.7.0)
|
||||
http-cookie (1.0.4)
|
||||
domain_name (~> 0.5)
|
||||
|
@ -668,6 +671,7 @@ DEPENDENCIES
|
|||
devise_token_auth
|
||||
dotenv-rails
|
||||
down (~> 5.0)
|
||||
email_reply_trimmer
|
||||
facebook-messenger
|
||||
factory_bot_rails
|
||||
faker
|
||||
|
@ -682,6 +686,7 @@ DEPENDENCIES
|
|||
haikunator
|
||||
hairtrigger
|
||||
hashie
|
||||
html2text
|
||||
image_processing
|
||||
jbuilder
|
||||
json_refs
|
||||
|
|
31
app/presenters/html_parser.rb
Normal file
31
app/presenters/html_parser.rb
Normal file
|
@ -0,0 +1,31 @@
|
|||
# Reduces the HTML body of an email to plain text, with quoted replies and
# tables stripped out before conversion.
class HtmlParser
  attr_reader :raw_body

  class << self
    # Convenience entry point: builds a parser for +raw_body+ and returns its
    # filtered plain text.
    def parse_reply(raw_body)
      new(raw_body).filtered_text
    end
  end

  # @param raw_body the HTML markup that will be handed to Nokogiri::HTML
  def initialize(raw_body)
    @raw_body = raw_body
  end

  # Nokogiri document built from +raw_body+; parsed once and then reused.
  def document
    @document ||= Nokogiri::HTML(raw_body)
  end

  # Mutates the parsed document in place: each <blockquote> is swapped for a
  # '> ' marker and each <table> is removed.
  def filter_replies!
    document.xpath('//blockquote').each do |quote|
      quote.replace('> ')
    end
    document.xpath('//table').each(&:remove)
  end

  # Inner HTML of the document after filtering; computed on first call.
  def filtered_html
    return @filtered_html if @filtered_html

    filter_replies!
    @filtered_html = document.inner_html
  end

  # Result of passing the filtered HTML through Html2Text.convert; memoized.
  def filtered_text
    return @filtered_text if @filtered_text

    @filtered_text = Html2Text.convert(filtered_html)
  end
end
|
|
@ -8,30 +8,48 @@ class MailPresenter < SimpleDelegator
|
|||
end
|
||||
|
||||
def subject
|
||||
encode_to_unicode(@mail.subject || '')
|
||||
encode_to_unicode(@mail.subject)
|
||||
end
|
||||
|
||||
def text_content
|
||||
@decoded_text_content ||= encode_to_unicode(text_part&.decoded || decoded_message || '')
|
||||
@decoded_text_content = select_body || ''
|
||||
encoding = @decoded_text_content.encoding
|
||||
|
||||
body = EmailReplyTrimmer.trim(@decoded_text_content)
|
||||
|
||||
return {} if @decoded_text_content.blank?
|
||||
|
||||
@text_content ||= {
|
||||
full: @decoded_text_content,
|
||||
reply: extract_reply(@decoded_text_content)[:reply],
|
||||
quoted: extract_reply(@decoded_text_content)[:quoted_text]
|
||||
full: select_body,
|
||||
reply: @decoded_text_content,
|
||||
quoted: body.force_encoding(encoding).encode('UTF-8')
|
||||
}
|
||||
end
|
||||
|
||||
# Picks the part of the mail to use as the message body and returns its text.
#
# Preference order is the text part, then the HTML part, then the mail itself.
# When the selected part is HTML it is reduced to plain text via HtmlParser.
#
# Returns '' when the decoded content still contains one of the MIME header
# phrases below, which is taken as a sign that the mail was not parsed properly.
def select_body
  message = mail.text_part || mail.html_part || mail
  decoded = encode_to_unicode(message.decoded)
  # Certain trigger phrases that mean we didn't parse correctly
  return '' if %r{(Content-Type: multipart/alternative|text/plain)}.match?(decoded)

  # Check the content type of the part that was actually selected. The
  # top-level mail of a multipart message reports "multipart/...", so looking
  # at it would hand back raw markup for a multipart mail with only an HTML part.
  return decoded unless (message.content_type || '').include?('text/html')

  ::HtmlParser.parse_reply(decoded)
end
|
||||
|
||||
def html_content
|
||||
@decoded_html_content ||= encode_to_unicode(html_part&.decoded)
|
||||
@decoded_html_content = select_body || ''
|
||||
|
||||
return {} if @decoded_html_content.blank?
|
||||
|
||||
body = EmailReplyTrimmer.trim(@decoded_html_content)
|
||||
|
||||
@html_content ||= {
|
||||
full: @decoded_html_content,
|
||||
reply: extract_reply(@decoded_html_content)[:reply],
|
||||
quoted: extract_reply(@decoded_html_content)[:quoted_text]
|
||||
full: select_body,
|
||||
reply: @decoded_html_content,
|
||||
quoted: body
|
||||
}
|
||||
end
|
||||
|
||||
|
@ -47,14 +65,6 @@ class MailPresenter < SimpleDelegator
|
|||
end
|
||||
end
|
||||
|
||||
# Decoded content of the mail: for multipart mails the decoded text part
# (nil when there is no text part), otherwise the decoded mail itself.
def decoded_message
  return mail.decoded unless mail.multipart?

  mail.text_part&.decoded
end
|
||||
|
||||
# Counts the attachments reported by the wrapped mail.
def number_of_attachments
  attachments = mail.attachments
  attachments.count
end
|
||||
|
@ -114,21 +124,8 @@ class MailPresenter < SimpleDelegator
|
|||
return str if current_encoding == 'UTF-8'
|
||||
|
||||
str.encode(current_encoding, 'UTF-8', invalid: :replace, undef: :replace, replace: '?')
|
||||
end
|
||||
|
||||
# Splits +content+ into the fresh reply and the quoted text that follows it.
#
# The split point is the earliest position at which any of the patterns from
# +quoted_text_regexes+ matches; when none match, everything is the reply.
#
# @param content [String] text to split
# @return [Hash] +{ reply: String, quoted_text: String }+, both stripped.
#   If anything raises (e.g. +content+ is nil) both values are empty strings,
#   so callers can always index the result with +:reply+ / +:quoted_text+.
def extract_reply(content)
  @regex_arr ||= quoted_text_regexes

  # Position of the quote marker closest to the top; the full length when none match.
  index = @regex_arr.map { |regex| content.index(regex) }.compact.min || content.length

  {
    # Exclusive range: a marker at position 0 must yield an empty reply,
    # whereas 0..(index - 1) would turn into 0..-1 and return the whole text.
    reply: content[0...index].strip,
    quoted_text: content[index..].strip
  }
rescue StandardError
  # Keep the same shape as the success path instead of a bare String.
  { reply: '', quoted_text: '' }
end
|
||||
|
||||
def quoted_text_regexes
|
||||
|
|
47
spec/fixtures/files/mail_with_quote.eml
vendored
Normal file
47
spec/fixtures/files/mail_with_quote.eml
vendored
Normal file
|
@ -0,0 +1,47 @@
|
|||
MIME-Version: 1.0
|
||||
Date: Thu, 19 Aug 2021 14:14:31 +0530
|
||||
References: <CAFkiBVxGoURoqdkY-O_25F-8b41kb-GWBc6hh4Djd5ynwOikXA@mail.gmail.com> <0100017b5d8efc70-c7f18809-aa55-48f6-91fd-b626092ed8b3-000000@email.amazonses.com>
|
||||
In-Reply-To: <0100017b5d8efc70-c7f18809-aa55-48f6-91fd-b626092ed8b3-000000@email.amazonses.com>
|
||||
Message-ID: <CAFkiBVwJjO_k_e-LpiKi7MAQAKbHX5nkEPcf0y1R=bjcEHogMg@mail.gmail.com>
|
||||
Subject: Re: Checking mail forwarding to cw inbox
|
||||
From: Sony Mathew <sony@chatwoot.com>
|
||||
To: Tejaswini <reply+6bdc3f4d-0bec-4515-a284-5d916fdde489@example.com>
|
||||
Content-Type: multipart/alternative; boundary="0000000000004af64505c9e58f03"
|
||||
|
||||
--0000000000004af64505c9e58f03
|
||||
Content-Type: text/plain; charset="UTF-8"
|
||||
|
||||
Yes, I am providing you step how to reproduce this issue
|
||||
|
||||
On Thu, Aug 19, 2021 at 2:07 PM Tejaswini from Email sender test <
|
||||
tejaswini@chatwoot.com> wrote:
|
||||
|
||||
> Any update on this?
|
||||
>
|
||||
>
|
||||
|
||||
--
|
||||
* Sony Mathew*
|
||||
Software developer
|
||||
*Mob:9999999999
|
||||
|
||||
--0000000000004af64505c9e58f03
|
||||
Content-Type: text/html; charset="UTF-8"
|
||||
Content-Transfer-Encoding: quoted-printable
|
||||
|
||||
<div dir=3D"ltr">Yes, I am providing you step how to reproduce this issue</=
|
||||
div><br><div class=3D"gmail_quote"><div dir=3D"ltr" class=3D"gmail_attr">On=
|
||||
Thu, Aug 19, 2021 at 2:07 PM Tejaswini from Email sender test &l=
|
||||
t;<a href=3D"mailto:tejaswini@chatwoot.com">tejaswini@chatwoot.com</a>> wrot=
|
||||
e:<br></div><blockquote class=3D"gmail_quote" style=3D"margin:0px 0px 0px 0=
|
||||
.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex"> <p>
|
||||
</p><p>Any update on this?</p>
|
||||
|
||||
<p></p>
|
||||
</blockquote></div><br clear=3D"all"><div><br></div>-- <br><div dir=3D"ltr"=
|
||||
class=3D"gmail_signature"><div dir=3D"ltr"><div><div dir=3D"ltr"><div><div=
|
||||
><b>Sony Mathew.</b><br></div><span style=3D"font-family:"times ne=
|
||||
w roman",serif"><span></span><span></span>Software developer</span><br=
|
||||
></div><b>Mob:9999999999</b></div></div></div></div>
|
||||
|
||||
--0000000000004af64505c9e58f03--
|
1061
spec/fixtures/files/welcome_html.eml
vendored
Normal file
1061
spec/fixtures/files/welcome_html.eml
vendored
Normal file
File diff suppressed because it is too large
Load diff
|
@ -7,6 +7,7 @@ RSpec.describe ReplyMailbox, type: :mailbox do
|
|||
let(:account) { create(:account) }
|
||||
let(:agent) { create(:user, email: 'agent1@example.com', account: account) }
|
||||
let(:reply_mail) { create_inbound_email_from_fixture('reply.eml') }
|
||||
let(:mail_with_quote) { create_inbound_email_from_fixture('mail_with_quote.eml') }
|
||||
let(:conversation) { create(:conversation, assignee: agent, inbox: create(:inbox, account: account, greeting_enabled: false), account: account) }
|
||||
let(:described_subject) { described_class.receive reply_mail }
|
||||
let(:serialized_attributes) do
|
||||
|
@ -95,5 +96,35 @@ RSpec.describe ReplyMailbox, type: :mailbox do
|
|||
expect(conversation_1.messages.last.content).to eq("Let's talk about these images:")
|
||||
end
|
||||
end
|
||||
|
||||
context 'with quotes in email' do
|
||||
let(:described_subject) { described_class.receive mail_with_quote }
|
||||
|
||||
before do
|
||||
# this UUID is hardcoded in mail_with_quote.eml, that's why we are updating this
|
||||
conversation.uuid = '6bdc3f4d-0bec-4515-a284-5d916fdde489'
|
||||
conversation.save
|
||||
end
|
||||
|
||||
it 'add the mail content as new message on the conversation' do
|
||||
described_subject
|
||||
expect(conversation.messages.last.content).to eq(
|
||||
<<-BODY.strip_heredoc.chomp
|
||||
Yes, I am providing you step how to reproduce this issue
|
||||
|
||||
On Thu, Aug 19, 2021 at 2:07 PM Tejaswini from Email sender test < tejaswini@chatwoot.com> wrote:
|
||||
|
||||
> Any update on this?
|
||||
>
|
||||
>
|
||||
|
||||
--
|
||||
* Sony Mathew*
|
||||
Software developer
|
||||
*Mob:9999999999
|
||||
BODY
|
||||
)
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
15
spec/presenters/html_parser_spec.rb
Normal file
15
spec/presenters/html_parser_spec.rb
Normal file
|
@ -0,0 +1,15 @@
|
|||
require 'rails_helper'
|
||||
# Specs for HtmlParser.parse_reply, which returns the filtered plain text of a mail body.
RSpec.describe HtmlParser do
  include ActionMailbox::TestHelper

  describe '.parse_reply' do
    let(:html_mail) { create_inbound_email_from_fixture('welcome_html.eml').mail }

    it 'returns the mail body as plain text' do
      # NOTE(review): the input here is the fixture's text part, not its HTML
      # part — confirm this is the intended input for the HTML parser.
      parsed_text = described_class.parse_reply(html_mail.text_part.decoded)

      expect(parsed_text[0..70]).to eq(
        "I'm learning English as a first language for the past 13 years, but to "
      )
    end
  end
end
|
|
@ -4,6 +4,7 @@ RSpec.describe MailPresenter do
|
|||
|
||||
describe 'parsed mail decorator' do
|
||||
let(:mail) { create_inbound_email_from_fixture('welcome.eml').mail }
|
||||
let(:html_mail) { create_inbound_email_from_fixture('welcome_html.eml').mail }
|
||||
let(:decorated_mail) { described_class.new(mail) }
|
||||
|
||||
let(:mail_with_no_subject) { create_inbound_email_from_fixture('mail_with_no_subject.eml').mail }
|
||||
|
@ -56,5 +57,13 @@ RSpec.describe MailPresenter do
|
|||
it 'give email from in downcased format' do
|
||||
expect(decorated_mail.from.first.eql?(mail.from.first.downcase)).to eq true
|
||||
end
|
||||
|
||||
it 'parse html content in the mail' do
|
||||
decorated_html_mail = described_class.new(html_mail)
|
||||
expect(decorated_html_mail.subject).to eq('Fwd: How good are you in English? How did you improve your English?')
|
||||
expect(decorated_html_mail.text_content[:reply][0..70]).to eq(
|
||||
"I'm learning English as a first language for the past 13 years, but to "
|
||||
)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
Loading…
Reference in a new issue