Loads of changes. Filling the database seems to work now.

This commit is contained in:
Klaus-Uwe Mitterer 2016-08-07 01:46:20 +02:00
parent 59fe1a23f4
commit be09282609
2 changed files with 80 additions and 36 deletions

View file

@@ -63,13 +63,24 @@ class dbObject:
except:
return False
def getLatestMessage(db, mode = 0, user = None):
    """Return the highest message id stored for ``user``.

    Args:
        db: object providing ``executeQuery(sql)`` and ``getNext()``.
        mode: 0 filters on ``recipient_id``; any other value filters on
            ``sender_id``.
        user: value compared against the chosen column. When omitted,
            ``setuptools.user()`` is called at call time.

    Returns:
        ``int`` of the first column of the row returned by ``db.getNext()``,
        or 0 when that value cannot be read or converted.
    """
    if user is None:
        # Resolved per call; a default in the signature would be evaluated
        # once, when the function is defined.
        user = setuptools.user()
    # NOTE(review): user is formatted straight into the SQL text; use bound
    # parameters if executeQuery supports them.
    if mode == 0:
        db.executeQuery("SELECT max(id) FROM messages WHERE recipient_id='%s'" % user)
    else:
        db.executeQuery("SELECT max(id) FROM messages WHERE sender_id='%s'" % user)
    try:
        return int(db.getNext()[0])
    except Exception:
        # No row / NULL max(id) / unreadable result: report "nothing stored".
        # Unlike a bare except, KeyboardInterrupt and SystemExit still propagate.
        return 0
def checkID(db, mid):
    """Report whether a row with id ``mid`` exists in the ``messages`` table.

    Args:
        db: object providing ``executeQuery(sql)`` and ``getNext()``.
        mid: message id; formatted unquoted into the query text.

    Returns:
        True when ``db.getNext()`` yields a row whose first column can be
        read, False otherwise.
    """
    # NOTE(review): mid is formatted straight into the SQL text; use bound
    # parameters if executeQuery supports them.
    db.executeQuery("SELECT * FROM messages WHERE id=%s" % mid)
    try:
        db.getNext()[0]
    except Exception:
        # Any failure to read a first column means "not stored yet".
        # Unlike a bare except, KeyboardInterrupt and SystemExit still propagate.
        return False
    return True
def dbHelper():
if setuptools.dbtype() == SQLITE:
return dbObject(dbtype=SQLITE, path=setuptools.dbpath())

View file

@@ -1,11 +1,13 @@
#!/usr/bin/env python3
from BeautifulSoup import BeautifulSoup
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
import urllib.request, urllib.error, urllib.parse, time, os
import setuptools
import dbtools, setuptools
# Module-level list of message ids; messageHandler adds each id to it after
# the message has been written to the database.
currentRun = []
def status(driver):
if "/main/login.php" not in driver.page_source:
@@ -13,14 +15,15 @@ def status(driver):
else:
return False
def loadPage(url, driver, period=5,init=False):
    """Navigate ``driver`` to ``url`` and then wait ``period`` seconds.

    Args:
        url: address passed to ``driver.get``.
        driver: Selenium WebDriver used for navigation.
        period: seconds to sleep after the page request.
        init: when true, skip the login check (used while logging in).

    Raises:
        LoginError: propagated from ``login`` when the session check fails
            and logging in does not succeed.
    """
    # During the initial login flow there is no session yet, so the status
    # check is skipped; otherwise make sure we are logged in first.
    if not (init or status(driver)):
        login(driver)
    driver.get(url)
    time.sleep(period)
def loginHandler(user = setuptools.user, password = setuptools.password, driver = driver):
loadPage("http://www.planetromeo.com/",10,True,driver)
def loginHandler(driver, user = setuptools.user(), password = setuptools.password()):
loadPage("https://www.planetromeo.com/",driver,3,True)
loadPage("https://www.planetromeo.com/main/login.php",driver,3,True)
curfield = driver.find_element_by_name("username")
curfield.send_keys(user)
@@ -29,59 +32,89 @@ def loginHandler(user = setuptools.user, password = setuptools.password, driver
curfield.send_keys(password)
curfield.send_keys(Keys.RETURN)
time.sleep(10)
time.sleep(3)
return status()
return status(driver)
class LoginError(Exception):
    """Raised by ``login`` when the login attempt does not succeed."""
    pass
def login(driver):
    """Make sure ``driver`` holds a logged-in session.

    Args:
        driver: Selenium WebDriver passed on to ``status`` and
            ``loginHandler``.

    Returns:
        True when ``status(driver)`` is already truthy or ``loginHandler``
        reports success.

    Raises:
        LoginError: when neither check succeeds.
    """
    # Already logged in: nothing to do, and loginHandler is not invoked.
    if status(driver):
        return True
    if not loginHandler(driver):
        raise LoginError("Login failed.")
    return True
def messageID(url):
    """Return the text after the last ``=`` in ``url``.

    Taking the last segment (rather than the second) keeps the result
    correct for URLs with several query parameters, such as
    ``?type=sent&id=123``. A URL without ``=`` is returned unchanged.
    """
    return url.split("=")[-1]
def messageHandler(sender, recipient, mid, date, driver, mode = 0, db = None):
    """Load one message page, store the message and process its photo links.

    Args:
        sender: value written to the ``sender_id`` column; also passed to
            ``phototools.processURL`` for each photo link.
        recipient: value written to the ``recipient_id`` column.
        mid: message id string; appended to the message URL and written to
            the ``id`` column.
        date: value written to the ``created_at`` column.
        driver: Selenium WebDriver used to load the message page.
        mode: 0 loads ``/msg/?id=...``; any other value loads
            ``/msg/?type=sent&id=...``.
        db: object providing ``executeQuery(sql)`` and ``commit()``. When
            omitted, ``dbtools.dbHelper()`` is called at call time.

    Raises:
        IndexError: when the loaded page has no ``div.msg div`` element.
    """
    if db is None:
        # Resolved per call; a default of dbtools.dbHelper() in the signature
        # would run once at import time and be shared by every call.
        db = dbtools.dbHelper()
    if mode == 0:
        loadPage("https://www.planetromeo.com/msg/?id=" + mid, driver)
    else:
        loadPage("https://www.planetromeo.com/msg/?type=sent&id=" + mid, driver)
    juha = BeautifulSoup(driver.page_source, "html5lib")
    text = juha.select("div.msg div")[0]
    # NOTE(review): page-derived values are formatted straight into the SQL
    # text, so a quote character in the message presumably breaks (or alters)
    # the statement; use bound parameters if executeQuery supports them.
    db.executeQuery("INSERT INTO messages(id, text, sender_id, recipient_id, created_at) VALUES('%s', '%s', '%s', '%s', '%s');" % (mid, setuptools.unescapeText(text.string or "").strip(), sender, recipient, date))
    db.commit()
    # Only anchors that carry an href are considered, so an <a> without one
    # no longer aborts the remaining links.
    for link in juha.findAll("a", href=True):
        href = link["href"]
        if "/pix/popup.php/" not in href:
            continue
        try:
            # NOTE(review): phototools is not among the imports visible in
            # this view; confirm it is imported elsewhere in the module.
            phototools.processURL(href, sender)
        except Exception as err:
            # Photo handling stays best-effort, but failures are reported
            # per link instead of being silently discarded.
            print("Photo processing failed for message %s: %r" % (mid, err))
    currentRun.append(mid)
# pageHandler(driver, db): scan the message overview page currently loaded in
# `driver`. The page source is parsed with BeautifulSoup; every row of
# `table.messageCenter` after the first is read, and for each row whose message
# id is not reported by db.checkID, messageHandler is called.
# Returns False when `count` is still 0 at the end, or when the outer
# `except IndexError` fires; returns True otherwise.
#
# NOTE(review): indentation and diff markers are not visible in this view, so
# the nesting of the statements below cannot be confirmed from here.
# NOTE(review): the default `db = dbtools.dbHelper()` is evaluated once, when
# the function is defined, not on each call.
def pageHandler(driver, db = dbtools.dbHelper()):
global currentRun
count = 0
juha = BeautifulSoup(driver.page_source, "html5lib")
# NOTE(review): the loop from `for l in links:` through
# `messageHandler(mid, driver)` looks like the superseded pre-change
# implementation: `links` is never assigned in this function, the `if mid <=`
# line has no trailing colon, and the call does not match messageHandler's
# current parameter list. TODO confirm and remove.
for l in links:
url = l['href']
if "/msg/?id=" in url:
count += 1
mid = messageID(url)
if mid <= dbtools.getLatestMessage()
return False
messageHandler(mid, driver)
# mode 1 is selected when the current URL contains "sent", otherwise mode 0.
mode = 0
if "sent" in driver.current_url:
mode = 1
try:
# [1:] skips the first table row (presumably a header row - verify).
for msg in juha.select("table.messageCenter tr")[1:]:
try:
# Cells 1, 2 and 3 supply the user, the message link and the date.
data = msg.findAll('td')
user = data[1].string
mid = messageID(data[2].find("a")["href"])
date = data[3].string
if not db.checkID(mid):
# NOTE(review): messageHandler's parameters are (sender, recipient, ...).
# In mode 1 ("sent") the row's user is passed as sender and
# setuptools.user() as recipient, and the reverse in mode 0; this looks
# swapped - confirm against the intended schema.
if mode == 1:
messageHandler(user, setuptools.user(), mid, date, driver, mode, db)
else:
messageHandler(setuptools.user(), user, mid, date, driver, mode, db)
# NOTE(review): without indentation it is unclear whether this increment
# runs for every row or only for rows that were newly stored - confirm.
count += 1
# Rows with fewer cells than expected are skipped.
except IndexError:
pass
except IndexError:
return False
# A page that contributed nothing reports False to the caller.
if count == 0:
return False
return True
def siteHandler(driver, mode = 0, p = 0, db = None):
    """Walk the message overview pages, starting at page ``p``.

    Each page is loaded and handed to ``pageHandler``; the walk stops as soon
    as ``pageHandler`` returns a falsy value.

    Args:
        driver: Selenium WebDriver used to load the pages.
        mode: 0 walks the ``view=all`` overview; any other value walks
            ``view=sent``.
        p: number of the first page to load (the ``seite`` query parameter).
        db: database helper passed on to ``pageHandler``. When omitted,
            ``dbtools.dbHelper()`` is called at call time.
    """
    if db is None:
        # Resolved per call; a default of dbtools.dbHelper() in the signature
        # would run once at import time.
        db = dbtools.dbHelper()
    # Iterating instead of recursing once per page keeps long mailboxes from
    # hitting the interpreter's recursion limit.
    while True:
        if mode == 0:
            loadPage("https://www.planetromeo.com/mitglieder/messages/uebersicht.php?view=all&seite=" + str(p), driver)
        else:
            loadPage("https://www.planetromeo.com/mitglieder/messages/uebersicht.php?view=sent&seite=" + str(p), driver)
        if not pageHandler(driver, db):
            break
        p += 1
if __name__ == "__main__":
    # Script entry point: open the database and a Firefox session, log in,
    # then walk the overview in mode 0 followed by mode 1.
    db = dbtools.dbHelper()
    driver = webdriver.Firefox()
    try:
        if loginHandler(driver):
            siteHandler(driver, db=db)
            siteHandler(driver, 1, db=db)
    finally:
        # Always shut the browser down, even when scraping raised.
        driver.quit()
    print("KTHXBAI")