full-feed-financial-times/main.py

386 lines
12 KiB
Python
Executable File

#!/usr/bin/python
#
import mysql.connector
import sys
import feedparser
import time
from threading import Timer, Lock
import selenium
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
import lxml.etree as etree
from cssselect import HTMLTranslator, SelectorError
import requests as req
from io import StringIO
from spline import gcv,bspline
#import web
# MySQL connection settings; getDb() passes these to mysql.connector.connect().
DB_HOST = "127.0.0.1"
DB_USER = "insecure"
DB_PASSWD = "insecure"
DB_DATABASE = "insecure"
# Account credentials that updateFeed() hands to getSession()/verifySession().
# NOTE(review): credentials are hard-coded in source; consider loading them
# from the environment or a config file that is not committed.
ACCOUNT_USERNAME = "foooo@google.com"
ACCOUNT_PASSWORD = "foorg53"
def getDb():
    """Open and return a new MySQL connection built from the module-level DB_* settings."""
    connection_settings = {
        "host": DB_HOST,
        "user": DB_USER,
        "passwd": DB_PASSWD,
        "database": DB_DATABASE,
    }
    return mysql.connector.connect(**connection_settings)
def tableExist(db, name):
    """Return True if a table called ``name`` exists in the connection's current database.

    Args:
        db: an open DB-API connection (as returned by getDb()).
        name: table name to look for.

    Returns:
        bool: True when the table exists, False otherwise (the previous
        version fell through and returned None in the negative case).
    """
    cursor = db.cursor()
    try:
        # The name is bound as a parameter rather than formatted into the SQL.
        # Restricting to DATABASE() keeps a same-named table in another schema
        # from inflating the count.
        cursor.execute(
            """
            SELECT COUNT(*)
            FROM information_schema.tables
            WHERE table_schema = DATABASE() AND table_name = %s
            """,
            (name,),
        )
        row = cursor.fetchone()
    finally:
        # Close on every path; the old code leaked the cursor when the table was missing.
        cursor.close()
    return row is not None and row[0] > 0
def createTable(db, name):
    """Create the article table ``name``.

    The schema includes a ``date`` column because insertFeed() writes it and
    queryFeed() filters on it; without it every insert and lookup fails.

    Args:
        db: an open DB-API connection.
        name: table name. It is formatted into the statement (identifiers
            cannot be bound as parameters), so it must come from trusted code.
    """
    cursor = db.cursor()
    try:
        cursor.execute("""
            CREATE TABLE {0}
            (id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            date VARCHAR(255),
            author VARCHAR(255),
            content TEXT(65535))
            """.format(name))
    finally:
        cursor.close()
def createTableMa(db, name):
    """Create the feed-definition table ``name`` if it does not exist yet.

    The credential columns are named ``password`` and ``username`` so that
    they match the column names used by every query in class ``lf``
    (previously they were ``pass`` and ``user``, which no query referenced).
    Because of ``IF NOT EXISTS`` a table created with the old column names is
    left untouched and has to be migrated by hand.

    Args:
        db: an open DB-API connection.
        name: table name. It is formatted into the statement (identifiers
            cannot be bound as parameters), so it must come from trusted code.
    """
    cursor = db.cursor()
    try:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS {}
            (id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(255),
            url VARCHAR(255),
            password VARCHAR(255),
            username VARCHAR(255),
            selector VARCHAR(255),
            selector_login_user VARCHAR(255),
            selector_login_pass VARCHAR(255),
            selector_login_verify VARCHAR(255),
            selector_login_url VARCHAR(255),
            selector_login_verify_url VARCHAR(255))
            """.format(name))
    finally:
        cursor.close()
def createTableIm(db, name):
    """Make sure the article table ``name`` exists, creating it when missing.

    Returns:
        True when the table exists on return; otherwise whatever falsy value
        tableExist() reports after the creation attempt.

    The previous version recursed after creating the table but dropped the
    recursive call's result, so a freshly created table yielded None and the
    caller treated it as failure. It could also recurse without bound if the
    table never became visible.
    """
    if tableExist(db, name):
        return True
    createTable(db, name)
    # Re-check once instead of recursing.
    return tableExist(db, name)
def die():
    """Stop the program by raising SystemExit with no exit status."""
    raise SystemExit
def insertFeed(db, table, title, date, author, content):
    """Insert one article row into ``table`` and commit it.

    Args:
        db: an open DB-API connection.
        table: target table name (formatted into the SQL; must be trusted).
        title, date, author, content: column values, bound as query
            parameters so quotes and markup in scraped text cannot break or
            inject into the statement.
    """
    statement = "INSERT INTO {0} (title, date, author, content) VALUES (%s, %s, %s, %s)".format(table)
    cursor = db.cursor()
    try:
        cursor.execute(statement, (title, date, author, content))
        # Commit explicitly so the row is persisted even when the connection
        # is not in autocommit mode.
        db.commit()
    finally:
        cursor.close()
def queryFeed(db, table, title, date, author):
    """Look up the id of the article with the given title and date.

    Args:
        db: an open DB-API connection.
        table: table to search (formatted into the SQL; must be trusted).
        title, date: values matched against the ``title`` and ``date`` columns.
        author: accepted for call compatibility; not used in the lookup.

    Returns:
        int: the row id, or -1 when no matching row exists.
    """
    # LIMIT 1: only one id is needed, and it avoids leaving rows unread on the cursor.
    statement = "SELECT id FROM {} WHERE title = %s AND date = %s LIMIT 1".format(table)
    cursor = db.cursor()
    try:
        cursor.execute(statement, (title, date))
        row = cursor.fetchone()
    finally:
        cursor.close()
    if row is None:
        return -1
    # fetchone() yields a row tuple; the id is its only column.
    return int(row[0])
def queryRss(url, rss):
    """Parse the feed at ``url`` with feedparser when ``rss`` is truthy.

    Returns:
        The object returned by ``feedparser.parse(url)``, or None when ``rss``
        is falsy (previously that path raised UnboundLocalError).
    """
    if not rss:
        return None
    return feedparser.parse(url)
def getContent(entryUrl, session_cookies, selector):
    """Download an article page and return the markup matched by a CSS selector.

    Args:
        entryUrl: URL of the page to fetch.
        session_cookies: cookies for the request. Either a mapping accepted by
            ``requests`` or a list of ``{"name": ..., "value": ...}`` dicts,
            the shape Selenium's ``get_cookies()`` produces.
        selector: CSS selector (may be a comma-separated group).

    Returns:
        str: the HTML of every matched element, concatenated in document
        order; an empty string when nothing matches.

    A non-200 response is reported and terminates the program via die().
    """
    if isinstance(session_cookies, (list, tuple)):
        # requests needs name -> value, not Selenium's list of cookie records.
        session_cookies = {c["name"]: c["value"] for c in session_cookies}
    # Timeout so a stalled server cannot hang the updater forever.
    r = req.get(entryUrl, cookies=session_cookies, timeout=30)
    print(r.status_code)
    if r.status_code != 200:
        print("Err fetching: " + entryUrl)
        die()
    xpath = HTMLTranslator().css_to_xpath(selector)
    tree = etree.parse(StringIO(r.text), etree.HTMLParser())
    # xpath() returns a list of elements, so each one is serialised separately
    # (the old code called a non-existent .tostring() on the list).
    return "".join(
        etree.tostring(node, encoding="unicode", method="html", with_tail=False)
        for node in tree.xpath(xpath)
    )
#p1 = (23.,23.)
def move_mouse(p1, p2, driver, time_=1000, n=400, click=False):
    """Move the browser pointer along a spline between two points.

    Args:
        p1, p2: (x, y) start and end points handed to ``gcv``.
        driver: Selenium WebDriver whose pointer is moved.
        time_: total duration of the movement in milliseconds.
        n: number of spline samples requested from ``bspline``.
        click: when True, click after the last move.

    NOTE(review): ``move_by_offset`` moves relative to the current pointer
    position; confirm that ``bspline`` yields offsets and not absolute
    coordinates.
    """
    points = bspline(gcv(p1, p2, 12), degree=5, n=n)
    print(points)
    # Guard against n == 0, which would otherwise divide by zero.
    step_delay = time_ / (1000 * n) if n else 0
    for point in points:
        if point[0] < 0 or point[1] < 0:
            continue
        print(point[0], point[1])
        # A fresh chain per step: reusing one chain can replay every earlier
        # move on each perform() in Selenium versions that keep the queue.
        ActionChains(driver).move_by_offset(int(point[0]), int(point[1])).perform()
        time.sleep(step_delay)
    if click:
        # The click has to be performed; queuing it alone does nothing.
        ActionChains(driver).click().perform()
def get_loc(e):
    """Return the (x, y) point one third of the way into ``e`` from its top-left corner.

    ``e`` must expose ``location`` (keys 'x', 'y') and ``size`` (keys
    'width', 'height') mappings.
    """
    x = e.location['x'] + (e.size['width'] / 3)
    y = e.location['y'] + (e.size['height'] / 3)
    return (x, y)
def getSession(user, password, url="https://accounts.ft.com/login"):
    """Log in through a Chrome window and return the resulting browser cookies.

    The login form is driven step by step (e-mail, "next", password, submit),
    moving the pointer between elements with move_mouse() and pausing between
    steps.

    Args:
        user: account e-mail typed into the ``enter-email`` field.
        password: password typed into the ``enter-password`` field.
        url: login page to open.

    Returns:
        list: the cookie dicts reported by ``driver.get_cookies()``.

    Fixes relative to the previous version:
      * ``get_loc(email,submit)`` referenced an undefined name; the submit
        button element is used instead.
      * ``password_el`` was bound to the None returned by ``send_keys()``;
        it now holds the element and the password is typed exactly once.
      * a leftover ``die()`` made the return unreachable, and the session
        cookies were printed to stdout; both are removed.
      * the browser is quit on every path.
    """
    #selenium
    driver = selenium.webdriver.Chrome()
    try:
        driver.set_window_position(0, 0)
        driver.set_window_size(3840, 1920)
        driver.get(url)

        email = driver.find_element_by_id("enter-email")
        pos1 = get_loc(email)
        print(pos1)
        move_mouse((123.3, 334.2), pos1, driver, n=200)
        email.send_keys(user)
        time.sleep(2)

        email_submit = driver.find_element_by_id("enter-email-next")
        pos2 = get_loc(email_submit)
        move_mouse(pos1, pos2, driver, n=50)
        email_submit.submit()
        time.sleep(3)

        password_el = driver.find_element_by_id("enter-password")
        pos3 = get_loc(password_el)
        move_mouse(pos2, pos3, driver, n=50)
        password_el.send_keys(password)
        time.sleep(5)

        button = driver.find_element_by_css_selector(".o-buttons--primary.o-buttons--big.main-button")
        pos4 = get_loc(button)
        move_mouse(pos3, pos4, driver, n=50, click=True)
        # Give the post-login redirects time to set the session cookies.
        time.sleep(20)
        return driver.get_cookies()
    finally:
        driver.quit()
#url: https://www.ft.com/myaccount
def verifySession(session, user, url="https://www.ft.com/myaccount"):
    """Check that the given cookies produce a logged-in account page for ``user``.

    Args:
        session: a single Selenium cookie dict, or an iterable of them (the
            shape ``driver.get_cookies()`` returns).
        user: text expected inside the ``rightRailEmailAddress`` element.
        url: account page to load.

    Returns:
        bool: True if any ``rightRailEmailAddress`` element contains ``user``.

    Fixes relative to the previous version:
      * ``str.find()`` returns -1 on a miss, never None, so the old test was
        always true; a substring check is used instead.
      * the page is opened before cookies are added, since WebDriver only
        accepts cookies for the current document's domain, and is reloaded
        afterwards so the request carries them.
      * False (not None) is returned on failure and the browser is quit.
    """
    # Local import: only this function needs the exception class.
    from selenium.common.exceptions import InvalidCookieDomainException

    cookies = [session] if isinstance(session, dict) else list(session)
    driver = selenium.webdriver.Chrome()
    try:
        driver.get(url)
        for cookie in cookies:
            try:
                driver.add_cookie(cookie)
            except InvalidCookieDomainException:
                # Cookies scoped to another host (e.g. the login host) cannot
                # be set while on this page; skip them.
                continue
        driver.get(url)
        matches = driver.find_elements_by_id("rightRailEmailAddress")
        return any(user in element.text for element in matches)
    finally:
        driver.quit()
def updateFeed(db, table):
    """Log in with the module-level account, then store every not-yet-seen entry of the FT world feed.

    Prints a message and returns early when verifySession() does not confirm
    the session. For each RSS entry whose (title, created) pair queryFeed()
    reports as missing, the article body is fetched and inserted into ``table``.

    NOTE(review): the last class in the content selector ends in
    "content-bod"; it may be a truncated "content-body" -- confirm.
    """
    session = getSession(ACCOUNT_USERNAME, ACCOUNT_PASSWORD, "https://accounts.ft.com/login")
    verified = verifySession(session, ACCOUNT_USERNAME, "https://www.ft.com/myaccount")
    if not verified:
        print("Session couldnt be verified")
        return
    rss = queryRss("https://www.ft.com/world?format=rss", True)
    for entry in rss.entries:
        already_stored = queryFeed(db, table, entry.title, entry.created, entry.author) >= 0
        if already_stored:
            continue
        body = getContent(
            entry.link,
            session,
            "img.n-image,div.article__content-body.n-content-body.js-article__content-bod",
        )
        insertFeed(db, table, entry.title, entry.created, entry.author, body)
def updateFeedFromDb(db, table, feed):
    """Same flow as updateFeed(), driven by a feed definition object.

    Args:
        db: an open DB-API connection.
        table: article table to insert into.
        feed: object exposing ``username``, ``password``, ``url``,
            ``selector``, ``selector_login_url`` and
            ``selector_login_verify_url`` (the attributes class ``lf`` defines).

    Fixes relative to the previous version:
      * the verify URL was read from ``feed.selector_verify_url``, an
        attribute ``lf`` never defines; ``selector_login_verify_url`` is used.
      * the parsed RSS result was assigned to ``feed``, shadowing the
        definition object before ``feed.selector`` was read inside the loop.
    """
    cookies = getSession(feed.username, feed.password, feed.selector_login_url)
    if not verifySession(cookies, feed.username, feed.selector_login_verify_url):
        print("Session couldnt be verified")
        return
    rss = queryRss(feed.url, True)
    for post in rss.entries:
        if queryFeed(db, table, post.title, post.created, post.author) < 0:
            content = getContent(post.link, cookies, feed.selector)
            insertFeed(db, table, post.title, post.created, post.author, content)
def main():
    """Prepare the tables, run one feed update, then idle forever.

    Exits through die() when createTableIm() reports a falsy result for the
    article table.
    """
    connection = getDb()
    createTableMa(connection, "feeds")
    table_ready = createTableIm(connection, "feed01")
    if not table_ready:
        die()
    updateFeed(connection, "feed01")
    # Periodic refresh is currently disabled:
    #rt = Periodic(60*60*1, updateFeed, [db, "feed01"])
    # Keep the process alive.
    while True:
        time.sleep(5)
# Script entry point. This guard sits above the ``lf`` and ``Periodic`` class
# definitions, and main() ends in an endless loop, so when the file is run as
# a script those classes are never defined.
if __name__ == '__main__':
    main()
class lf:
    """A feed definition mirrored from one row of the ``feeds`` table.

    The row is identified by ``name``; every other column listed in
    ``_COLUMNS`` is exposed as an instance attribute of the same name.

    NOTE(review): the queries use columns called ``password`` and
    ``username``; confirm the ``feeds`` table schema uses these names.

    Fixes relative to the previous version:
      * two methods were both named ``query_db``; the second (an update)
        replaced the first (a read), so ``__init__`` always raised TypeError.
        The read is ``query_db(column)``, the write is ``update_db(column,
        value)``, and ``query_db(column, value)`` still performs the write.
      * the UPDATE statements were syntactically invalid, ``write_db`` had
        more placeholders than arguments and no WHERE clause.
      * values are bound as parameters, column names are checked against a
        whitelist, and writes are committed.
      * ``query_db`` returns the column value, not the row tuple.
    """

    # Columns (other than id/name) that are mirrored as attributes.
    _COLUMNS = (
        "url",
        "password",
        "username",
        "selector",
        "selector_login_user",
        "selector_login_pass",
        "selector_login_verify",
        "selector_login_url",
        "selector_login_verify_url",
    )
    # Sentinel distinguishing "no value given" from an explicit None.
    _UNSET = object()

    def __init__(self, db, name):
        """Bind to connection ``db`` and load the row called ``name`` (missing values become "")."""
        self.name = name
        self.db = db
        for column in self._COLUMNS:
            setattr(self, column, "")
        self.init_from_db()

    def _check_column(self, column):
        """Raise ValueError unless ``column`` is a known column of the feeds table."""
        # Column names are formatted into SQL, so only known names are allowed.
        if column not in self._COLUMNS and column not in ("id", "name"):
            raise ValueError("unknown feeds column: {!r}".format(column))

    def init_from_db(self):
        """Refresh every mirrored attribute from the database."""
        for column in self._COLUMNS:
            setattr(self, column, self.query_db(column))

    def init_from_scratch(
            self,
            name,
            url,
            password,
            username,
            selector,
            selector_login_user,
            selector_login_pass,
            selector_login_verify,
            selector_login_verify_url,
            selector_login_url):
        """Set every attribute from the arguments without touching the database.

        Note the parameter order: ``selector_login_verify_url`` comes before
        ``selector_login_url``.
        """
        self.name = name
        self.url = url
        self.password = password
        self.username = username
        self.selector = selector
        self.selector_login_user = selector_login_user
        self.selector_login_pass = selector_login_pass
        self.selector_login_verify = selector_login_verify
        self.selector_login_url = selector_login_url
        self.selector_login_verify_url = selector_login_verify_url

    def query_db(self, string, value=_UNSET):
        """Read column ``string`` of this feed's row.

        Returns:
            The stored value, or "" when the row is missing or the value is NULL.

        When ``value`` is supplied the call is forwarded to update_db(), which
        keeps the two-argument form of the old method working.
        """
        if value is not self._UNSET:
            return self.update_db(string, value)
        self._check_column(string)
        cursor = self.db.cursor()
        try:
            # LIMIT 1: one row is enough and nothing is left unread on the cursor.
            cursor.execute(
                "SELECT {} FROM feeds WHERE name = %s LIMIT 1".format(string),
                (self.name,),
            )
            row = cursor.fetchone()
        finally:
            cursor.close()
        if row is None or row[0] is None:
            return ""
        return row[0]

    def update_db(self, string, value):
        """Set column ``string`` of this feed's row to ``value`` and commit."""
        self._check_column(string)
        cursor = self.db.cursor()
        try:
            cursor.execute(
                "UPDATE feeds SET {} = %s WHERE name = %s".format(string),
                (value, self.name),
            )
            self.db.commit()
        finally:
            cursor.close()

    def write_db(self):
        """Persist all attributes: update the row named ``self.name`` or insert it if absent."""
        values = tuple(getattr(self, column) for column in self._COLUMNS)
        cursor = self.db.cursor()
        try:
            cursor.execute("SELECT id FROM feeds WHERE name = %s LIMIT 1", (self.name,))
            exists = cursor.fetchone() is not None
            if exists:
                assignments = ", ".join("{} = %s".format(column) for column in self._COLUMNS)
                cursor.execute(
                    "UPDATE feeds SET {} WHERE name = %s".format(assignments),
                    values + (self.name,),
                )
            else:
                columns = ("name",) + self._COLUMNS
                placeholders = ", ".join(["%s"] * len(columns))
                cursor.execute(
                    "INSERT INTO feeds ({}) VALUES ({})".format(", ".join(columns), placeholders),
                    (self.name,) + values,
                )
            self.db.commit()
        finally:
            cursor.close()
class Periodic(object):
    """
    A periodic task running in threading.Timers

    ``function(*args, **kwargs)`` is called every ``interval`` seconds on a
    Timer thread until stop() is called. The keyword ``autostart`` (default
    True) is consumed by the constructor and not forwarded to ``function``.
    """
    def __init__(self, interval, function, *args, **kwargs):
        # Consume the control keyword before storing what gets forwarded.
        autostart = kwargs.pop('autostart', True)
        self._lock = Lock()
        self._timer = None
        self.function = function
        self.interval = interval
        self.args = args
        self.kwargs = kwargs
        self._stopped = True
        if autostart:
            self.start()

    def start(self, from_run=False):
        """Schedule the next run.

        Called by users to start a stopped task (no effect if it is already
        running) and by _run() with ``from_run=True`` to re-arm the timer.
        """
        with self._lock:
            if from_run:
                # stop() may have been called while the timer was firing;
                # do not re-arm in that case.
                if self._stopped:
                    return
            elif not self._stopped:
                return
            self._stopped = False
            self._timer = Timer(self.interval, self._run)
            self._timer.start()

    def _run(self):
        """Timer callback: re-arm first, then invoke the task."""
        self.start(from_run=True)
        self.function(*self.args, **self.kwargs)

    def stop(self):
        """Cancel the pending timer, if any; safe to call before start()."""
        with self._lock:
            self._stopped = True
            if self._timer is not None:
                self._timer.cancel()