Optimize text sanitation

This commit is contained in:
Klaus-Uwe Mitterer 2017-02-21 16:10:39 +01:00
parent 4d76ee0116
commit 46a42222c8

View file

@ -1,7 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import dbtools, setuptools, twitools import dbtools, setuptools, twitools
import argparse, html, markovify, nltk, operator, random, re, sys import argparse, html, markovify, nltk, operator, random, re, string, sys
class Possy(markovify.NewlineText): class Possy(markovify.NewlineText):
def word_split(self, sentence): def word_split(self, sentence):
@ -14,18 +14,22 @@ class Possy(markovify.NewlineText):
return sentence return sentence
def sanitizeText(text): def sanitizeText(text):
split = text.split()
try: try:
if text[0] == "@": if "@" in (text[0], text[1]):
return sanitizeText(text.partition(" ")[2]) if split[1][0] not in string.ascii_lowercase:
if text.split()[-1][0] == "@": return sanitizeText(text.partition(" ")[2])
return sanitizeText(" ".join(text.split()[:-1])) if split[-1][0] == "@":
return sanitizeText(" ".join(split[:-1]))
if text[:4] == "RT @":
return sanitizeText(text.partition(":")[2])
except: except:
return "" return ""
return text return text
def getText(db = dbtools.dbHelper()): def getText(db = dbtools.dbHelper()):
text = "" text = ""
for string in db.executeQuery('SELECT text FROM tweets WHERE text NOT LIKE "RT %";'): for string in db.executeQuery('SELECT text FROM tweets;'):
text += sanitizeText(string[0]) + "\n" text += sanitizeText(string[0]) + "\n"
return html.unescape("".join([s for s in text.strip().splitlines(True) if s.strip()])) return html.unescape("".join([s for s in text.strip().splitlines(True) if s.strip()]))