Optimize text sanitization
This commit is contained in:
parent
a8494ebb42
commit
ec37a8a88c
1 changed file with 10 additions and 6 deletions
14
markov.py
14
markov.py
|
@ -1,7 +1,7 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
import dbtools, setuptools, twitools
|
import dbtools, setuptools, twitools
|
||||||
import argparse, html, markovify, nltk, operator, random, re, sys
|
import argparse, html, markovify, nltk, operator, random, re, string, sys
|
||||||
|
|
||||||
class Possy(markovify.NewlineText):
|
class Possy(markovify.NewlineText):
|
||||||
def word_split(self, sentence):
|
def word_split(self, sentence):
|
||||||
|
@ -14,18 +14,22 @@ class Possy(markovify.NewlineText):
|
||||||
return sentence
|
return sentence
|
||||||
|
|
||||||
def sanitizeText(text):
    """Strip Twitter artifacts from *text*, recursing until clean.

    Removes, in order: a leading @mention (when the word after it is
    not lowercase, i.e. the mention is an addressing prefix rather than
    part of the sentence), a trailing @mention, and an "RT @user:"
    retweet prefix (everything up to and including the first colon).

    Returns "" when the text is too short to inspect (empty, a single
    character, or a lone mention); otherwise the cleaned text.
    """
    split = text.split()
    try:
        # Leading mention: "@user ..." or ".@user ..." — drop the first
        # word only when the second word does not start lowercase.
        if "@" in (text[0], text[1]):
            if split[1][0] not in string.ascii_lowercase:
                return sanitizeText(text.partition(" ")[2])
        # Trailing mention: drop the last word.
        if split[-1][0] == "@":
            return sanitizeText(" ".join(split[:-1]))
        # Retweet prefix "RT @user:": keep only what follows the colon.
        if text[:4] == "RT @":
            return sanitizeText(text.partition(":")[2])
    except IndexError:
        # Was a bare `except:`, which also swallowed SystemExit,
        # KeyboardInterrupt and genuine bugs (e.g. text=None). Only the
        # index probes above can raise here on short input.
        return ""
    return text
|
||||||
|
|
||||||
def getText(db=None):
    """Concatenate the sanitized text of every stored tweet.

    Args:
        db: object exposing executeQuery() yielding (text,) rows; a
            fresh dbtools.dbHelper() is created when omitted.

    Returns:
        str: one sanitized tweet per line, HTML entities unescaped and
        blank lines removed.
    """
    # The old default `db=dbtools.dbHelper()` was evaluated once at
    # definition time — a shared connection created on import and reused
    # by every call. Resolve it lazily instead (backward-compatible:
    # calling getText() with no argument still works).
    if db is None:
        db = dbtools.dbHelper()
    # Renamed the loop variable from `string`, which shadowed the stdlib
    # module this file imports; joined once instead of quadratic `+=`.
    lines = [sanitizeText(row[0])
             for row in db.executeQuery('SELECT text FROM tweets;')]
    text = "\n".join(lines)
    return html.unescape(
        "".join(s for s in text.strip().splitlines(True) if s.strip()))
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue