From ec37a8a88c9c0787c98fc5d5b0baae25aee50536 Mon Sep 17 00:00:00 2001
From: Klaus-Uwe Mitterer <git@klaus-uwe.me>
Date: Tue, 21 Feb 2017 16:10:39 +0100
Subject: [PATCH] Optimize text sanitization

---
 markov.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/markov.py b/markov.py
index 0eeb2ff..d220e3b 100755
--- a/markov.py
+++ b/markov.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 
 import dbtools, setuptools, twitools
-import argparse, html, markovify, nltk, operator, random, re, sys
+import argparse, html, markovify, nltk, operator, random, re, string, sys
 
 class Possy(markovify.NewlineText):
  def word_split(self, sentence):
@@ -14,18 +14,22 @@ class Possy(markovify.NewlineText):
   return sentence
 
 def sanitizeText(text):
+ split = text.split()
  try:
-  if text[0] == "@":
-   return sanitizeText(text.partition(" ")[2])
-  if text.split()[-1][0] == "@":
-   return sanitizeText(" ".join(text.split()[:-1]))
+  if "@" in (text[0], text[1]):
+   if split[1][0] not in string.ascii_lowercase:
+    return sanitizeText(text.partition(" ")[2])
+  if split[-1][0] == "@":
+   return sanitizeText(" ".join(split[:-1]))
+  if text[:4] == "RT @":
+   return sanitizeText(text.partition(":")[2])
  except:
   return ""
  return text
 
 def getText(db = dbtools.dbHelper()):
  text = ""
- for string in db.executeQuery('SELECT text FROM tweets WHERE text NOT LIKE "RT %";'):
+ for string in db.executeQuery('SELECT text FROM tweets;'):
   text += sanitizeText(string[0]) + "\n"
  return html.unescape("".join([s for s in text.strip().splitlines(True) if s.strip()]))