#!/usr/bin/python
#
# Reconstructed from the git patch
#   From 889ea50657e1e583628c974dd0d8120336be70e2
#   From: printfuck
#   Date: Thu, 16 Apr 2020 04:11:14 +0200
#   Subject: [PATCH] Fetching and storing RSS full content feeds from ft.com
# which creates main.py (mode 100755).
"""Fetch full-content articles for ft.com RSS feeds and store them in MySQL.

The script logs in to ft.com through a Selenium-driven Chrome, reuses the
resulting cookies to download every article linked from an RSS feed, cuts
the article body out with a CSS selector and stores it in a MySQL table.
A ``Periodic`` timer repeats the update once per hour.
"""

import re
import sys
import time
from io import StringIO
from threading import Lock, Timer

import feedparser
import lxml.etree as etree
import mysql.connector
import requests as req
import selenium
from cssselect import HTMLTranslator, SelectorError
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

DB_HOST = "127.0.0.1"
DB_USER = "dbuser"
DB_PASSWD = "dbpasswd"
DB_DATABASE = "base"
ACCOUNT_USERNAME = "foo"
ACCOUNT_PASSWORD = "foo"

# Seconds Selenium keeps polling for an element before giving up.
LOGIN_WAIT_SECONDS = 10
# Seconds before an article download is abandoned.
REQUEST_TIMEOUT_SECONDS = 30

# Table names cannot be bound as query parameters, so they are validated
# against this pattern before being interpolated into SQL.
_IDENTIFIER_RE = re.compile(r"^[A-Za-z_][A-Za-z0-9_]*$")


def _checkIdentifier(name):
    """Return *name* unchanged if it is a plain SQL identifier.

    Raises:
        ValueError: if *name* is not a string of letters, digits and
            underscores that starts with a letter or underscore.
    """
    if not isinstance(name, str) or not _IDENTIFIER_RE.match(name):
        raise ValueError("invalid SQL identifier: {!r}".format(name))
    return name


def _cookiesForRequests(session_cookies):
    """Convert Selenium cookies into the mapping ``requests`` expects.

    ``driver.get_cookies()`` yields a list of dicts carrying ``name`` and
    ``value`` keys; ``requests`` wants ``{name: value}``. A mapping (or
    ``None``) is passed through untouched.
    """
    if session_cookies is None or isinstance(session_cookies, dict):
        return session_cookies
    return {cookie["name"]: cookie["value"] for cookie in session_cookies}


def getDb():
    """Open and return a MySQL connection using the ``DB_*`` constants."""
    db = mysql.connector.connect(
        host=DB_HOST,
        user=DB_USER,
        passwd=DB_PASSWD,
        database=DB_DATABASE)
    return db


def tableExist(db, name):
    """Return True if table *name* exists in the connection's database."""
    cursor = db.cursor()
    try:
        # Restrict the lookup to the current schema: a table with the same
        # name in another database must not count as "exists".
        cursor.execute(
            """
            SELECT COUNT(*)
            FROM information_schema.tables
            WHERE table_schema = DATABASE() AND table_name = %s
            """,
            (name,))
        return cursor.fetchone()[0] > 0
    finally:
        cursor.close()


def createTable(db, name):
    """Create the article table *name*.

    The table holds the columns that ``insertFeed`` writes and
    ``queryFeed`` reads: title, date, author and content.

    Raises:
        ValueError: if *name* is not a plain SQL identifier.
    """
    cursor = db.cursor()
    try:
        # MEDIUMTEXT instead of VARCHAR(65535): a VARCHAR that large
        # exceeds MySQL's maximum row size next to the other columns.
        cursor.execute("""
            CREATE TABLE `{0}`
            (id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            `date` VARCHAR(255),
            author VARCHAR(255),
            content MEDIUMTEXT)
            """.format(_checkIdentifier(name)))
    finally:
        cursor.close()


def createTableMa(db, name):
    """Create the feed-definition table *name* if it does not exist yet.

    Raises:
        ValueError: if *name* is not a plain SQL identifier.
    """
    cursor = db.cursor()
    try:
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS `{0}`
            (id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(255),
            url VARCHAR(255),
            `pass` VARCHAR(255),
            `user` VARCHAR(255),
            selector VARCHAR(255),
            selector_login_user VARCHAR(255),
            selector_login_pass VARCHAR(255),
            selector_login_verify VARCHAR(255),
            selector_login_url VARCHAR(255),
            selector_login_verify_url VARCHAR(255))
            """.format(_checkIdentifier(name)))
    finally:
        cursor.close()


def createTableIm(db, name):
    """Make sure the article table *name* exists.

    Returns:
        True if the table exists (already, or after creating it),
        False if it is still missing after the creation attempt.
    """
    if tableExist(db, name):
        return True
    createTable(db, name)
    # Re-check once instead of recursing: a creation that silently fails
    # must not loop forever, and the caller needs a real boolean.
    return tableExist(db, name)


def die():
    """Terminate by raising ``SystemExit`` via ``sys.exit()``."""
    sys.exit()


def insertFeed(db, table, title, date, author, content):
    """Insert one article row into *table* and commit it.

    Raises:
        ValueError: if *table* is not a plain SQL identifier.
    """
    cursor = db.cursor()
    try:
        cursor.execute(
            "INSERT INTO `{0}` (title, `date`, author, content) "
            "VALUES (%s, %s, %s, %s)".format(_checkIdentifier(table)),
            (title, date, author, content))
        # Without an explicit commit the row is lost when the connection
        # closes, unless autocommit has been enabled on *db*.
        db.commit()
    finally:
        cursor.close()


def queryFeed(db, table, title, date, author):
    """Look up the id of the article with the given *title* and *date*.

    *author* is accepted for symmetry with ``insertFeed`` but is not part
    of the lookup.

    Returns:
        The row id as an int, or -1 if no such article is stored.

    Raises:
        ValueError: if *table* is not a plain SQL identifier.
    """
    cursor = db.cursor()
    try:
        # LIMIT 1 guarantees no unread rows are left on the cursor when it
        # is closed, even if duplicates exist.
        cursor.execute(
            "SELECT id FROM `{0}` WHERE title = %s AND `date` = %s "
            "LIMIT 1".format(_checkIdentifier(table)),
            (title, date))
        row = cursor.fetchone()
    finally:
        cursor.close()
    if row is None:
        return -1
    return int(row[0])


def queryRss(url, rss):
    """Parse the feed at *url* with feedparser.

    Returns:
        The parsed feed, or None when *rss* is falsy.
    """
    if rss:
        feed = feedparser.parse(url)
        return feed


def getContent(entryUrl, session_cookies, selector):
    """Download *entryUrl* and return the HTML matched by *selector*.

    Args:
        entryUrl: URL of the article page.
        session_cookies: cookies to send; either a ``{name: value}``
            mapping or the list of dicts returned by Selenium.
        selector: CSS selector choosing the elements to keep.

    Returns:
        The serialized HTML of all matching elements, concatenated in
        document order (an empty string if nothing matches).

    A response status other than 200 is reported and ends in ``die()``.
    """
    r = req.get(
        entryUrl,
        cookies=_cookiesForRequests(session_cookies),
        timeout=REQUEST_TIMEOUT_SECONDS)
    print(r.status_code)
    if r.status_code != 200:
        print("Err fetching: " + entryUrl)
        die()

    xpath = HTMLTranslator().css_to_xpath(selector)

    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(r.text), parser)
    # xpath() returns a list of elements, so each one is serialized on its
    # own; with_tail=False keeps text that follows an element out of it.
    return "".join(
        etree.tostring(node, encoding="unicode", method="html",
                       with_tail=False)
        for node in tree.xpath(xpath))


#url: https://accounts.ft.com/login

def getSession(user, password, url="https://accounts.ft.com/login"):
    """Log in through Chrome and return the browser's cookies.

    Args:
        user: account e-mail typed into the ``enter-email`` field.
        password: password typed into the ``enter-password`` field.
        url: address of the login page.

    Returns:
        The list of cookie dicts reported by the driver after the
        password form has been submitted.
    """
    driver = webdriver.Chrome()
    try:
        # The password field only shows up after the e-mail step, so let
        # element lookups poll instead of failing immediately.
        driver.implicitly_wait(LOGIN_WAIT_SECONDS)
        driver.get(url)
        driver.find_element(By.ID, "enter-email").send_keys(user)
        driver.find_element(By.ID, "enter-email-next").submit()
        driver.find_element(By.ID, "enter-password").send_keys(password)
        driver.find_element(By.ID, "enter-password").submit()
        # NOTE(review): the cookies are read right after submit; confirm
        # the login redirect has completed by then.
        return driver.get_cookies()
    finally:
        # Always shut the browser down; otherwise every run leaves a
        # Chrome process behind.
        driver.quit()


#url: https://www.ft.com/myaccount

def verifySession(session, user, url="https://www.ft.com/myaccount"):
    """Check that *session* cookies belong to a logged-in *user*.

    Args:
        session: a cookie dict or a list of cookie dicts as returned by
            ``getSession``.
        user: text expected inside the ``rightRailEmailAddress`` element.
        url: account page to load with the cookies applied.

    Returns:
        True if an element with id ``rightRailEmailAddress`` contains
        *user*, False otherwise.
    """
    cookies = [session] if isinstance(session, dict) else list(session or [])
    driver = webdriver.Chrome()
    try:
        driver.implicitly_wait(LOGIN_WAIT_SECONDS)
        # A cookie can only be added while the browser is on a page of the
        # cookie's domain, so load the page once before applying them.
        driver.get(url)
        for cookie in cookies:
            try:
                driver.add_cookie(cookie)
            except WebDriverException:
                # Cookies scoped to another host (e.g. the login host)
                # cannot be set here; skip them and rely on the rest.
                continue
        driver.get(url)
        return any(
            user in element.text
            for element in driver.find_elements(By.ID, "rightRailEmailAddress"))
    finally:
        driver.quit()


def _storeNewPosts(db, table, entries, cookies, selector):
    """Download and insert every entry that is not yet stored in *table*."""
    for post in entries:
        title = post.get("title", "")
        # Not every feed provides "created"; fall back to "published".
        date = post.get("created") or post.get("published", "")
        author = post.get("author", "")
        if queryFeed(db, table, title, date, author) < 0:
            content = getContent(post.link, cookies, selector)
            insertFeed(db, table, title, date, author, content)


def updateFeed(db, table):
    """Store new articles of the ft.com world feed in *table*.

    Logs in with ``ACCOUNT_USERNAME`` / ``ACCOUNT_PASSWORD``; if the
    session cannot be verified a message is printed and nothing is stored.
    """
    cookies = getSession(ACCOUNT_USERNAME, ACCOUNT_PASSWORD, "https://accounts.ft.com/login")
    if not verifySession(cookies, ACCOUNT_USERNAME, "https://www.ft.com/myaccount"):
        print("Session couldnt be verified")
        return
    feed = queryRss("https://www.ft.com/world?format=rss", True)
    # NOTE(review): the last class name looks truncated ("...content-bod");
    # confirm against the page markup.
    _storeNewPosts(
        db, table, feed.entries, cookies,
        "img.n-image,div.article__content-body.n-content-body.js-article__content-bod")


def updateFeedFromDb(db, table, feed):
    """Store new articles of the feed described by *feed* in *table*.

    Args:
        db: open database connection.
        table: name of the article table.
        feed: an ``lf`` instance providing url, credentials, selector and
            the login / verification URLs.
    """
    cookies = getSession(feed.username, feed.password, feed.selector_login_url)
    if not verifySession(cookies, feed.username, feed.selector_login_verify_url):
        print("Session couldnt be verified")
        return
    # Keep the feed definition and the parsed RSS in separate names: the
    # selector lives on the definition, the entries on the parsed feed.
    parsed = queryRss(feed.url, True)
    _storeNewPosts(db, table, parsed.entries, cookies, feed.selector)


def main():
    """Prepare the tables and run ``updateFeed`` once per hour."""
    db = getDb()
    createTableMa(db, "feeds")
    if not createTableIm(db, "feed01"):
        die()
    # The arguments are passed individually: Periodic forwards *args to
    # the function, so a single list would arrive as one argument.
    rt = Periodic(60*60*1, updateFeed, db, "feed01")
    while True:
        time.sleep(5)


class lf:
    """Definition of one feed, loaded from the ``feeds`` table by name."""

    # Columns of the feeds table that query_db may read.
    _COLUMNS = frozenset([
        "name",
        "url",
        "pass",
        "user",
        "selector",
        "selector_login_user",
        "selector_login_pass",
        "selector_login_verify",
        "selector_login_url",
        "selector_login_verify_url",
    ])
    # The table stores credentials as pass/user while the attributes are
    # called password/username; accept both spellings.
    _COLUMN_ALIASES = {"password": "pass", "username": "user"}

    def __init__(self, db, name):
        """Remember *db* and *name*, then load all fields from the table."""
        self.name = name
        self.db = db
        self.url = ""
        self.password = ""
        self.username = ""
        self.selector = ""
        self.selector_login_user = ""
        self.selector_login_pass = ""
        self.selector_login_verify = ""
        self.selector_login_url = ""
        self.selector_login_verify_url = ""
        self.init_from_db()

    def init_from_db(self):
        """Fill every attribute from the row whose name is ``self.name``.

        Fields without a stored value become the empty string.
        """
        self.url = self.query_db(self.db, "url")
        self.password = self.query_db(self.db, "password")
        self.username = self.query_db(self.db, "username")
        self.selector = self.query_db(self.db, "selector")
        self.selector_login_user = self.query_db(self.db, "selector_login_user")
        self.selector_login_pass = self.query_db(self.db, "selector_login_pass")
        self.selector_login_verify = self.query_db(self.db, "selector_login_verify")
        self.selector_login_url = self.query_db(self.db, "selector_login_url")
        self.selector_login_verify_url = self.query_db(self.db, "selector_login_verify_url")

    def init_from_scratch(
            self,
            name,
            url,
            password,
            username,
            selector,
            selector_login_user,
            selector_login_pass,
            selector_login_verify,
            selector_login_verify_url,
            selector_login_url):
        """Overwrite every attribute with the given values.

        Note the positional order: ``selector_login_verify_url`` comes
        before ``selector_login_url``.
        """
        self.name = name
        self.url = url
        self.password = password
        self.username = username
        self.selector = selector
        self.selector_login_user = selector_login_user
        self.selector_login_pass = selector_login_pass
        self.selector_login_verify = selector_login_verify
        self.selector_login_url = selector_login_url
        self.selector_login_verify_url = selector_login_verify_url

    def query_db(self, db, string):
        """Return column *string* of this feed's row in ``feeds``.

        Args:
            db: open database connection.
            string: column name; ``password`` and ``username`` are
                accepted as aliases for ``pass`` and ``user``.

        Returns:
            The stored value, or "" if there is no row or the value is
            NULL.

        Raises:
            ValueError: if *string* is not a known column.
        """
        column = self._COLUMN_ALIASES.get(string, string)
        if column not in self._COLUMNS:
            raise ValueError("unknown feeds column: {!r}".format(string))
        cursor = db.cursor()
        try:
            # The column name comes from the whitelist above; the feed
            # name is bound as a parameter.
            cursor.execute(
                "SELECT `{0}` FROM feeds WHERE name = %s LIMIT 1".format(column),
                (self.name,))
            row = cursor.fetchone()
        finally:
            cursor.close()
        if row is None or row[0] is None:
            return ""
        return row[0]


class Periodic(object):
    """
    A periodic task running in threading.Timers

    ``function(*args, **kwargs)`` is called every *interval* seconds until
    ``stop()`` is called. Pass ``autostart=False`` to create the task
    without starting it; that keyword is not forwarded to *function*.
    """

    def __init__(self, interval, function, *args, **kwargs):
        self._lock = Lock()
        self._timer = None
        self.function = function
        self.interval = interval
        # Remove the control keyword before storing what is forwarded.
        autostart = kwargs.pop('autostart', True)
        self.args = args
        self.kwargs = kwargs
        self._stopped = True
        if autostart:
            self.start()

    def start(self, from_run=False):
        """Schedule the next run.

        Called by users to start a stopped task and by ``_run`` (with
        ``from_run=True``) to schedule the following run.
        """
        with self._lock:
            if from_run:
                # stop() was called while a run was about to reschedule:
                # honour it instead of restarting the task.
                if self._stopped:
                    return
            elif not self._stopped:
                # Already running; starting twice would create a second
                # timer chain.
                return
            self._stopped = False
            self._timer = Timer(self.interval, self._run)
            self._timer.start()

    def _run(self):
        """Schedule the next run, then call the function."""
        # Rescheduling first keeps the task alive even if the function
        # raises.
        self.start(from_run=True)
        self.function(*self.args, **self.kwargs)

    def stop(self):
        """Cancel the pending run; safe to call on a never-started task."""
        with self._lock:
            self._stopped = True
            if self._timer is not None:
                self._timer.cancel()


if __name__ == "__main__":
    main()