Optimize text sanitation
This commit is contained in:
parent
a8494ebb42
commit
ec37a8a88c
1 changed files with 10 additions and 6 deletions
16
markov.py
16
markov.py
|
@ -1,7 +1,7 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import dbtools, setuptools, twitools
|
||||
import argparse, html, markovify, nltk, operator, random, re, sys
|
||||
import argparse, html, markovify, nltk, operator, random, re, string, sys
|
||||
|
||||
class Possy(markovify.NewlineText):
|
||||
def word_split(self, sentence):
|
||||
|
@ -14,18 +14,22 @@ class Possy(markovify.NewlineText):
|
|||
return sentence
|
||||
|
||||
def sanitizeText(text):
    """Strip mention and retweet decoration from a tweet's text.

    The rules are applied one at a time; after a rule fires, the shortened
    text is sanitized again from the top:

    * If one of the first two characters is "@" and the second word does
      not start with a lowercase ASCII letter, everything up to and
      including the first space is dropped.
    * If the last word starts with "@", that word is dropped (the remaining
      words are re-joined with single spaces).
    * If the text starts with "RT @", everything up to and including the
      first ":" is dropped.

    Returns the sanitized text, or "" when the text runs out of words while
    a rule is being checked (e.g. empty input, or a lone "@mention").
    """
    words = text.split()
    try:
        # Slice instead of indexing text[0]/text[1]: a one-character text
        # must not be discarded just because it has no second character.
        if "@" in text[:2]:
            # words[1] raises IndexError for a single-word text, which is
            # how a mention-only tweet ends up as "".
            if words[1][0] not in string.ascii_lowercase:
                return sanitizeText(text.partition(" ")[2])
        # words[-1] raises IndexError for empty / whitespace-only text.
        if words[-1][0] == "@":
            return sanitizeText(" ".join(words[:-1]))
        if text[:4] == "RT @":
            return sanitizeText(text.partition(":")[2])
    except IndexError:
        # Only the "ran out of words" case is expected here; anything else
        # (including KeyboardInterrupt) should propagate.
        return ""
    return text
|
||||
|
||||
def getText(db=dbtools.dbHelper()):
    """Return the sanitized text of all stored tweets, one entry per line.

    Runs ``SELECT text FROM tweets;`` through ``db.executeQuery``, passes the
    first column of every row through ``sanitizeText``, drops lines that are
    empty or whitespace-only, and HTML-unescapes the result.

    db -- object exposing ``executeQuery(sql)``; defaults to a
          ``dbtools.dbHelper()`` instance.
    """
    # NOTE(review): the default helper is created once, when this ``def`` is
    # evaluated, not per call -- confirm that is intended before changing it.
    rows = db.executeQuery('SELECT text FROM tweets;')
    # Build the text in one pass; the loop variable no longer shadows the
    # imported ``string`` module.
    text = "".join(sanitizeText(row[0]) + "\n" for row in rows)
    # splitlines(True) keeps the line endings, so surviving lines can be
    # concatenated back unchanged.
    return html.unescape("".join([s for s in text.strip().splitlines(True) if s.strip()]))
|
||||
|
||||
|
|
Loading…
Reference in a new issue