Optimize text sanitation

This commit is contained in:
Klaus-Uwe Mitterer 2017-02-21 16:10:39 +01:00
parent a8494ebb42
commit ec37a8a88c

View file

@ -1,7 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import dbtools, setuptools, twitools import dbtools, setuptools, twitools
import argparse, html, markovify, nltk, operator, random, re, sys import argparse, html, markovify, nltk, operator, random, re, string, sys
class Possy(markovify.NewlineText): class Possy(markovify.NewlineText):
def word_split(self, sentence): def word_split(self, sentence):
@ -14,18 +14,22 @@ class Possy(markovify.NewlineText):
return sentence return sentence
def sanitizeText(text): def sanitizeText(text):
split = text.split()
try: try:
if text[0] == "@": if "@" in (text[0], text[1]):
return sanitizeText(text.partition(" ")[2]) if split[1][0] not in string.ascii_lowercase:
if text.split()[-1][0] == "@": return sanitizeText(text.partition(" ")[2])
return sanitizeText(" ".join(text.split()[:-1])) if split[-1][0] == "@":
return sanitizeText(" ".join(split[:-1]))
if text[:4] == "RT @":
return sanitizeText(text.partition(":")[2])
except: except:
return "" return ""
return text return text
def getText(db = dbtools.dbHelper()): def getText(db = dbtools.dbHelper()):
text = "" text = ""
for string in db.executeQuery('SELECT text FROM tweets WHERE text NOT LIKE "RT %";'): for string in db.executeQuery('SELECT text FROM tweets;'):
text += sanitizeText(string[0]) + "\n" text += sanitizeText(string[0]) + "\n"
return html.unescape("".join([s for s in text.strip().splitlines(True) if s.strip()])) return html.unescape("".join([s for s in text.strip().splitlines(True) if s.strip()]))