From ec37a8a88c9c0787c98fc5d5b0baae25aee50536 Mon Sep 17 00:00:00 2001 From: Klaus-Uwe Mitterer Date: Tue, 21 Feb 2017 16:10:39 +0100 Subject: [PATCH] Optimize text sanitation --- markov.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/markov.py b/markov.py index 0eeb2ff..d220e3b 100755 --- a/markov.py +++ b/markov.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 import dbtools, setuptools, twitools -import argparse, html, markovify, nltk, operator, random, re, sys +import argparse, html, markovify, nltk, operator, random, re, string, sys class Possy(markovify.NewlineText): def word_split(self, sentence): @@ -14,18 +14,22 @@ class Possy(markovify.NewlineText): return sentence def sanitizeText(text): + split = text.split() try: - if text[0] == "@": - return sanitizeText(text.partition(" ")[2]) - if text.split()[-1][0] == "@": - return sanitizeText(" ".join(text.split()[:-1])) + if "@" in (text[0], text[1]): + if split[1][0] not in string.ascii_lowercase: + return sanitizeText(text.partition(" ")[2]) + if split[-1][0] == "@": + return sanitizeText(" ".join(split[:-1])) + if text[:4] == "RT @": + return sanitizeText(text.partition(":")[2]) except: return "" return text def getText(db = dbtools.dbHelper()): text = "" - for string in db.executeQuery('SELECT text FROM tweets WHERE text NOT LIKE "RT %";'): + for string in db.executeQuery('SELECT text FROM tweets;'): text += sanitizeText(string[0]) + "\n" return html.unescape("".join([s for s in text.strip().splitlines(True) if s.strip()]))