full-feed-financial-times/main.py

378 lines
11 KiB
Python
Executable File

#!/usr/bin/python
#
import math
import os
import sys
import time
from io import StringIO
from threading import Timer, Lock

import feedparser
import lxml.etree as etree
import mysql.connector
import requests as req
import selenium
from cssselect import HTMLTranslator, SelectorError
from selenium import webdriver
#import web
# Database connection settings and the account used to log in to the site.
# Each value can be overridden through an environment variable of the same
# name so real credentials never have to be committed to the source file;
# the defaults are the values that were previously hard-coded here.
DB_HOST = os.environ.get("DB_HOST", "127.0.0.1")
DB_USER = os.environ.get("DB_USER", "insecure")
DB_PASSWD = os.environ.get("DB_PASSWD", "insecure")
DB_DATABASE = os.environ.get("DB_DATABASE", "insecure")
ACCOUNT_USERNAME = os.environ.get("ACCOUNT_USERNAME", "foooo@google.com")
ACCOUNT_PASSWORD = os.environ.get("ACCOUNT_PASSWORD", "foorg53")
def getDb():
    """Open a new MySQL connection from the module-level DB_* settings.

    Returns:
        The connection object returned by ``mysql.connector.connect``.
    """
    settings = {
        "host": DB_HOST,
        "user": DB_USER,
        "passwd": DB_PASSWD,
        "database": DB_DATABASE,
    }
    return mysql.connector.connect(**settings)
def tableExist(db, name):
    """Tell whether a table called *name* exists in the current database.

    Args:
        db: Open database connection exposing ``cursor()``.
        name: Table name to look up.

    Returns:
        bool: True when the table exists, False otherwise.
    """
    cursor = db.cursor()
    try:
        # The name is bound as a parameter rather than formatted into the
        # statement, and the lookup is limited to the connection's current
        # schema so a same-named table in another database is not counted.
        cursor.execute(
            """
            SELECT COUNT(*)
            FROM information_schema.tables
            WHERE table_schema = DATABASE()
            AND table_name = %s
            """,
            (name,),
        )
        row = cursor.fetchone()
    finally:
        # Close the cursor on every path, including "not found" and errors.
        cursor.close()
    return row is not None and row[0] > 0
def createTable(db, name):
    """Create the article table *name* (id, title, date, author, content).

    The ``date`` column is included because insertFeed() writes it and
    queryFeed() filters on it.

    Args:
        db: Open database connection exposing ``cursor()``.
        name: Unqualified table name; it is backtick-quoted before use.
    """
    # Identifiers cannot be bound as statement parameters, so quote the name
    # instead (embedded backticks are doubled).
    quoted = "`{}`".format(name.replace("`", "``"))
    cursor = db.cursor()
    try:
        cursor.execute("""
            CREATE TABLE {0}
            (id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            date VARCHAR(255),
            author VARCHAR(255),
            content TEXT(65535))
            """.format(quoted))
    finally:
        cursor.close()
def createTableMa(db, name):
cursor = db.cursor()
cursor.execute("""
CREATE TABLE IF NOT EXISTS {}
(id INT AUTO_INCREMENT PRIMARY KEY,
name VARCHAR(255),
url VARCHAR(255),
pass VARCHAR(255),
user VARCHAR(255),
selector VARCHAR(255),
selector_login_user VARCHAR(255),
selector_login_pass VARCHAR(255),
selector_login_verify VARCHAR(255),
selector_login_url VARCHAR(255),
selector_login_verify_url VARCHAR(255))
""".format(name))
cursor.close()
def createTableIm(db, name):
    """Make sure the article table *name* exists, creating it when missing.

    Args:
        db: Open database connection.
        name: Table name.

    Returns:
        bool: True when the table exists once the call completes.
    """
    if tableExist(db, name):
        return True
    createTable(db, name)
    # Check exactly once more instead of recursing: the recursive call's
    # result was never returned (so the creation path yielded None, which
    # main() treats as failure) and it could recurse without bound if the
    # table never became visible.
    return bool(tableExist(db, name))
def die(status=1):
    """Terminate the process by raising SystemExit.

    Args:
        status: Exit status handed to ``sys.exit``. Defaults to 1 because
            the callers in this file use die() on failure paths, which a
            status of 0 would report to the shell as success.
    """
    sys.exit(status)
def insertFeed(db, table, title, date, author, content):
    """Insert one article row into *table* and commit it.

    Args:
        db: Open database connection exposing ``cursor()`` and ``commit()``.
        table: Unqualified table name; it is backtick-quoted before use.
        title: Article title.
        date: Article date as provided by the feed.
        author: Article author.
        content: Article body.
    """
    # Identifiers cannot be bound as statement parameters, so quote the name
    # instead (embedded backticks are doubled).
    quoted = "`{}`".format(table.replace("`", "``"))
    cursor = db.cursor()
    try:
        # Values are bound as parameters: formatting them into the statement
        # produced unquoted (invalid) SQL and allowed injection.
        cursor.execute(
            "INSERT INTO {0} (title, date, author, content) "
            "VALUES (%s, %s, %s, %s)".format(quoted),
            (title, date, author, content),
        )
        # Commit explicitly so the row is persisted even when the connection
        # is not in autocommit mode.
        db.commit()
    finally:
        cursor.close()
def queryFeed(db, table, title, date, author):
    """Look up the id of the stored article matching *title* and *date*.

    Args:
        db: Open database connection exposing ``cursor()``.
        table: Unqualified table name; it is backtick-quoted before use.
        title: Article title to match.
        date: Article date to match.
        author: Accepted for signature compatibility; not part of the lookup.

    Returns:
        int: The row id, or -1 when no matching row exists.
    """
    # Identifiers cannot be bound as statement parameters, so quote the name
    # instead (embedded backticks are doubled).
    quoted = "`{}`".format(table.replace("`", "``"))
    cursor = db.cursor()
    try:
        # LIMIT 1 guarantees that the single fetchone() below consumes the
        # whole result set.
        cursor.execute(
            "SELECT id FROM {} WHERE title = %s AND date = %s LIMIT 1".format(quoted),
            (title, date),
        )
        row = cursor.fetchone()
    finally:
        cursor.close()
    if row is None:
        return -1
    # fetchone() yields a row tuple; the id is its only column.
    return int(row[0])
def queryRss(url, rss):
    """Download and parse the feed at *url*.

    Args:
        url: Feed URL.
        rss: Truthy when *url* should be parsed as a feed.

    Returns:
        The object returned by ``feedparser.parse`` when *rss* is truthy,
        otherwise None (nothing is fetched).
    """
    # Make the "not a feed" outcome explicit instead of relying on an
    # implicit fall-through.
    if not rss:
        return None
    return feedparser.parse(url)
def getContent(entryUrl, session_cookies, selector):
    """Fetch *entryUrl* and return the HTML of every node matching *selector*.

    Args:
        entryUrl: URL of the article page.
        session_cookies: Cookies sent with the request.
        selector: CSS selector (may be a comma-separated group) choosing the
            nodes to extract.

    Returns:
        str: The serialized HTML of all matching nodes, concatenated in the
        order XPath returns them; empty when nothing matches.

    Exits the process through die() when the response status is not 200.
    """
    r = req.get(entryUrl, cookies=session_cookies)
    print(r.status_code)
    if r.status_code != 200:
        print("Err fetching: " + entryUrl)
        die()
    xpath = HTMLTranslator().css_to_xpath(selector)
    tree = etree.parse(StringIO(r.text), etree.HTMLParser())
    # xpath() returns a list of elements, so each one is serialized on its
    # own; with_tail=False leaves out text that merely follows an element.
    return "".join(
        etree.tostring(node, encoding="unicode", method="html", with_tail=False)
        for node in tree.xpath(xpath)
    )
#p1 = (23.,23.)
def move_mouse(p1, p2, driver, time=1000, n=400, click=False):
    """Move the browser's mouse pointer from *p1* to *p2* along a curve.

    The path comes from ``bspline(gcv(p1, p2, 5), degree=3, n=n)``; neither
    helper is defined in the visible part of this file, so they are presumably
    provided elsewhere and return a sequence of (x, y) points.

    Args:
        p1: (x, y) start point. The pointer is assumed to already be there,
            because the movement is sent as relative offsets.
        p2: (x, y) end point.
        driver: Selenium WebDriver that receives the actions.
        time: Total duration of the movement in milliseconds.
        n: Number of points requested from the curve.
        click: When true, click once the movement has finished.
    """
    # Function-scope imports: the ``time`` parameter shadows the ``time``
    # module in this function, and ActionChains is not imported at module
    # level.
    from time import sleep
    from selenium.webdriver.common.action_chains import ActionChains

    points = bspline(gcv(p1, p2, 5), degree=3, n=n)
    pause = time / (1000 * n)
    # Track the position in whole pixels so rounding errors do not add up
    # over many small relative moves.
    cur_x, cur_y = int(round(p1[0])), int(round(p1[1]))
    for point in points:
        x, y = int(round(point[0])), int(round(point[1]))
        if (x, y) != (cur_x, cur_y):
            # move_to_element() expects an element, not coordinates, so the
            # step is sent as an offset from the current position. A fresh
            # chain per step ensures earlier moves are never replayed.
            ActionChains(driver).move_by_offset(x - cur_x, y - cur_y).perform()
            cur_x, cur_y = x, y
        sleep(pause)
    if click:
        # Queued actions only run on perform().
        ActionChains(driver).click().perform()
def get_loc(e):
    """Return the (x, y) point one third of the way into element *e*.

    The point is the element's ``location`` plus one third of its ``size``
    along each axis.
    """
    x = e.location['x'] + (e.size['width'] / 3)
    y = e.location['y'] + (e.size['height'] / 3)
    return (x, y)
def getSession(user, password, url="https://accounts.ft.com/login"):
    """Log in through a Chrome browser and return the session cookies.

    The login form is filled in step by step, with simulated mouse movement
    and pauses between the steps.

    Args:
        user: Account e-mail address.
        password: Account password.
        url: Login page URL.

    Returns:
        dict: Cookie name -> cookie value, the mapping form accepted by the
        ``cookies=`` argument of ``requests``.
    """
    from selenium.webdriver.common.by import By

    driver = selenium.webdriver.Chrome()
    try:
        driver.get(url)

        email = driver.find_element(By.ID, "enter-email")
        pos1 = get_loc(email)
        # (128.3, 345.2) is an arbitrary start point for the first path.
        move_mouse((128.3, 345.2), pos1, driver, n=200)
        email.send_keys(user)
        time.sleep(2)

        email_submit = driver.find_element(By.ID, "enter-email-next")
        pos2 = get_loc(email_submit)
        move_mouse(pos1, pos2, driver, n=50)
        email_submit.submit()
        time.sleep(3)

        # Keep the element itself: send_keys() returns None, so chaining it
        # onto the lookup left nothing to position the mouse on.
        password_el = driver.find_element(By.ID, "enter-password")
        pos3 = get_loc(password_el)
        move_mouse(pos2, pos3, driver, n=50)
        password_el.send_keys(password)
        time.sleep(5)

        button = driver.find_element(
            By.CSS_SELECTOR, ".o-buttons--primary.o-buttons--big.main-button")
        pos4 = get_loc(button)
        move_mouse(pos3, pos4, driver, n=50, click=True)
        time.sleep(20)

        # The cookies are deliberately not printed: they are credentials.
        return {cookie['name']: cookie['value'] for cookie in driver.get_cookies()}
    finally:
        # Always shut the browser down, also when a step above fails.
        driver.quit()
#url: https://www.ft.com/myaccount
def _cookiePairs(session):
    """Return (name, value) pairs for any supported cookie container."""
    if isinstance(session, dict):
        if "name" in session and "value" in session:
            # A single Selenium-style cookie dict.
            return [(session["name"], session["value"])]
        # A plain name -> value mapping.
        return list(session.items())
    # An iterable of Selenium-style cookie dicts (driver.get_cookies()).
    return [(cookie["name"], cookie["value"]) for cookie in session]


def verifySession(session, user, url="https://www.ft.com/myaccount"):
    """Check that the cookies in *session* belong to a logged-in *user*.

    Opens *url* in a Chrome browser with the cookies applied and looks for
    *user* in the text of the elements with id ``rightRailEmailAddress``.

    Args:
        session: Session cookies: a name -> value mapping, a single cookie
            dict with ``name``/``value`` keys, or an iterable of such dicts.
        user: Text expected on the account page (the account e-mail address).
        url: Account page URL.

    Returns:
        bool: True when *user* was found on the page, False otherwise.
    """
    from selenium.webdriver.common.by import By

    driver = selenium.webdriver.Chrome()
    try:
        # A cookie can only be added for the domain of the page that is
        # currently loaded, so open the page first, add the cookies, and
        # then load it again with them in place.
        driver.get(url)
        for name, value in _cookiePairs(session):
            driver.add_cookie({"name": name, "value": value})
        driver.get(url)
        fields = driver.find_elements(By.ID, "rightRailEmailAddress")
        # str.find() returns -1 (never None) on a miss, so test membership.
        return any(user in field.text for field in fields)
    finally:
        driver.quit()
def _storeNewEntries(db, table, entries, cookies, selector):
    """Fetch and store every entry of *entries* that is not in *table* yet."""
    for post in entries:
        # An entry only carries the fields its feed provides, so fall back
        # instead of failing on a missing date or author.
        date = post.get("created") or post.get("published", "")
        author = post.get("author", "")
        if queryFeed(db, table, post.title, date, author) < 0:
            content = getContent(post.link, cookies, selector)
            insertFeed(db, table, post.title, date, author, content)


def updateFeed(db, table):
    """Log in with the module-level account and store new FT world articles.

    Args:
        db: Open database connection.
        table: Name of the article table to fill.
    """
    cookies = getSession(ACCOUNT_USERNAME, ACCOUNT_PASSWORD, "https://accounts.ft.com/login")
    if not verifySession(cookies, ACCOUNT_USERNAME, "https://www.ft.com/myaccount"):
        print("Session couldnt be verified")
        return
    feed = queryRss("https://www.ft.com/world?format=rss", True)
    # The last class of the selector previously read "js-article__content-bod",
    # which looks like a truncation of the class name used here.
    _storeNewEntries(
        db, table, feed.entries, cookies,
        "img.n-image,div.article__content-body.n-content-body.js-article__content-body")


def updateFeedFromDb(db, table, feed):
    """Log in with the settings of *feed* and store its new articles.

    Args:
        db: Open database connection.
        table: Name of the article table to fill.
        feed: Feed configuration object providing ``username``, ``password``,
            ``url``, ``selector``, ``selector_login_url`` and
            ``selector_login_verify_url``.
    """
    cookies = getSession(feed.username, feed.password, feed.selector_login_url)
    if not verifySession(cookies, feed.username, feed.selector_login_verify_url):
        print("Session couldnt be verified")
        return
    # Keep the parsed feed in its own variable: rebinding ``feed`` hid the
    # configuration object whose selector is still needed below.
    parsed = queryRss(feed.url, True)
    _storeNewEntries(db, table, parsed.entries, cookies, feed.selector)
def main():
    """Prepare the tables, run one feed update, then idle forever.

    Exits through die() when the article table is not reported as existing.
    """
    connection = getDb()
    createTableMa(connection, "feeds")
    articles_ready = createTableIm(connection, "feed01")
    if not articles_ready:
        die()
    updateFeed(connection, "feed01")
    # The hourly Periodic(60*60*1, updateFeed, ...) scheduling is currently
    # disabled; the loop below only keeps the process alive.
    while True:
        time.sleep(5)
# Script entry point: run main() only when this file is executed directly.
# NOTE(review): this guard sits above the `lf` and `Periodic` class
# definitions and main() ends in an endless sleep loop, so when the file is
# run as a script those classes are presumably never defined -- consider
# moving the guard to the end of the file.
if __name__ == "__main__":
    main()
class lf:
    """Configuration of one feed, backed by a row of the ``feeds`` table.

    The row is identified by ``name``; every other column is mirrored as an
    instance attribute of the same name.
    """

    # Columns of ``feeds`` that are mirrored as instance attributes.
    _COLUMNS = (
        "url",
        "password",
        "username",
        "selector",
        "selector_login_user",
        "selector_login_pass",
        "selector_login_verify",
        "selector_login_url",
        "selector_login_verify_url",
    )
    # Columns that query_db() may read.
    _READABLE = ("id", "name") + _COLUMNS
    # Marks "no value passed" so that None stays a storable value.
    _UNSET = object()

    def __init__(self, db, name):
        """Bind to *db* and load the settings stored for feed *name*.

        Args:
            db: Open database connection exposing ``cursor()``/``commit()``.
            name: Value of the ``name`` column identifying the feed.
        """
        self.name = name
        self.db = db
        for column in self._COLUMNS:
            setattr(self, column, "")
        self.init_from_db()

    def init_from_db(self):
        """Reload every mirrored attribute from the database."""
        for column in self._COLUMNS:
            setattr(self, column, self.query_db(column))

    def init_from_scratch(
            self,
            name,
            url,
            password,
            username,
            selector,
            selector_login_user,
            selector_login_pass,
            selector_login_verify,
            selector_login_verify_url,
            selector_login_url):
        """Set every attribute from the arguments without touching the database.

        Note that ``selector_login_verify_url`` comes before
        ``selector_login_url`` in the parameter list.
        """
        self.name = name
        self.url = url
        self.password = password
        self.username = username
        self.selector = selector
        self.selector_login_user = selector_login_user
        self.selector_login_pass = selector_login_pass
        self.selector_login_verify = selector_login_verify
        self.selector_login_url = selector_login_url
        self.selector_login_verify_url = selector_login_verify_url

    @staticmethod
    def _check_column(column, allowed):
        """Raise ValueError unless *column* is one of the *allowed* names."""
        # Column names cannot be bound as statement parameters, so only
        # known names are ever formatted into a statement.
        if column not in allowed:
            raise ValueError("unknown feeds column: {!r}".format(column))

    def query_db(self, string, value=_UNSET):
        """Read column *string* of this feed's row.

        Args:
            string: Column name.
            value: When given, the column is updated to this value instead
                (see update_db). This keeps the two-argument form working:
                the class used to define ``query_db`` twice, and the writing
                definition replaced the reading one.

        Returns:
            The column value, or "" when the row does not exist or the
            column is NULL. None when *value* is given.

        Raises:
            ValueError: If *string* is not a known column.
        """
        if value is not self._UNSET:
            return self.update_db(string, value)
        self._check_column(string, self._READABLE)
        cursor = self.db.cursor()
        try:
            cursor.execute(
                "SELECT {} FROM feeds WHERE name = %s LIMIT 1".format(string),
                (self.name,),
            )
            row = cursor.fetchone()
        finally:
            cursor.close()
        # fetchone() yields a row tuple; hand back the single column value.
        if row is None or row[0] is None:
            return ""
        return row[0]

    def update_db(self, string, value):
        """Set column *string* of this feed's row to *value* and commit.

        Raises:
            ValueError: If *string* is not a mirrored column.
        """
        self._check_column(string, self._COLUMNS)
        cursor = self.db.cursor()
        try:
            cursor.execute(
                "UPDATE feeds SET {} = %s WHERE name = %s".format(string),
                (value, self.name),
            )
            self.db.commit()
        finally:
            cursor.close()

    def write_db(self):
        """Insert this feed's row, or update it when it already exists."""
        values = tuple(getattr(self, column) for column in self._COLUMNS)
        cursor = self.db.cursor()
        try:
            cursor.execute("SELECT id FROM feeds WHERE name = %s LIMIT 1", (self.name,))
            exists = cursor.fetchone() is not None
            if exists:
                assignments = ", ".join(
                    "{} = %s".format(column) for column in self._COLUMNS)
                cursor.execute(
                    "UPDATE feeds SET {} WHERE name = %s".format(assignments),
                    values + (self.name,),
                )
            else:
                columns = ", ".join(("name",) + self._COLUMNS)
                placeholders = ", ".join(["%s"] * (len(self._COLUMNS) + 1))
                cursor.execute(
                    "INSERT INTO feeds ({}) VALUES ({})".format(columns, placeholders),
                    (self.name,) + values,
                )
            self.db.commit()
        finally:
            cursor.close()
class Periodic(object):
    """
    A periodic task running in threading.Timers

    Every ``interval`` seconds ``function(*args, **kwargs)`` is called on a
    timer thread. The next run is scheduled before the function is called.
    """

    def __init__(self, interval, function, *args, **kwargs):
        """Create the task.

        Args:
            interval: Seconds between runs.
            function: Callable to run.
            *args: Positional arguments passed to *function*.
            **kwargs: Keyword arguments passed to *function*. The keyword
                ``autostart`` (default True) is consumed here and decides
                whether the task starts immediately.
        """
        self._lock = Lock()
        self._timer = None
        self.function = function
        self.interval = interval
        self.args = args
        autostart = kwargs.pop('autostart', True)
        self.kwargs = kwargs
        self._stopped = True
        if autostart:
            self.start()

    def start(self, from_run=False):
        """Schedule the next run.

        Args:
            from_run: True when called by a run that is rescheduling itself.
                In that case nothing is scheduled if the task has been
                stopped in the meantime; an outside call only schedules when
                the task is currently stopped.
        """
        with self._lock:
            if from_run:
                should_schedule = not self._stopped
            else:
                should_schedule = self._stopped
            if should_schedule:
                self._stopped = False
                self._timer = Timer(self.interval, self._run)
                self._timer.start()

    def _run(self):
        """Timer callback: reschedule, then call the function."""
        self.start(from_run=True)
        with self._lock:
            # A run whose timer fired just as stop() was called must neither
            # revive the task nor call the function.
            if self._stopped:
                return
        self.function(*self.args, **self.kwargs)

    def stop(self):
        """Stop the task and cancel the pending timer, if there is one."""
        with self._lock:
            self._stopped = True
            # No timer exists yet when the task was never started.
            if self._timer is not None:
                self._timer.cancel()