Fetching and storing RSS full content feeds from ft.com

This commit is contained in:
printfuck 2020-04-16 04:11:14 +02:00
commit 889ea50657

288
main.py Executable file
View file

@ -0,0 +1,288 @@
#!/usr/bin/python
#
import os
import sys
import time
from io import StringIO
from threading import Timer, Lock

import feedparser
import lxml.etree as etree
import mysql.connector
import requests as req
import selenium
from cssselect import HTMLTranslator, SelectorError
# Connection and account settings.
# Each value can be overridden through an environment variable of the same
# name, so real credentials do not have to be committed to the source file.
# The fallbacks are the original placeholder values.
DB_HOST = os.environ.get("DB_HOST", "127.0.0.1")
DB_USER = os.environ.get("DB_USER", "dbuser")
DB_PASSWD = os.environ.get("DB_PASSWD", "dbpasswd")
DB_DATABASE = os.environ.get("DB_DATABASE", "base")
# Login used for the feed's website (see getSession / updateFeed).
ACCOUNT_USERNAME = os.environ.get("ACCOUNT_USERNAME", "foo")
ACCOUNT_PASSWORD = os.environ.get("ACCOUNT_PASSWORD", "foo")
def getDb():
    """Open and return a new MySQL connection built from the DB_* settings."""
    settings = {
        "host": DB_HOST,
        "user": DB_USER,
        "passwd": DB_PASSWD,
        "database": DB_DATABASE,
    }
    return mysql.connector.connect(**settings)
def tableExist(db, name):
    """Return True if a table called *name* exists in the current database.

    Args:
        db: open database connection (anything providing ``cursor()``).
        name: table name to look up.

    Returns:
        bool: True when the table exists, False otherwise.
    """
    cursor = db.cursor()
    try:
        # Bind the name as a parameter instead of formatting it into the SQL,
        # and restrict the lookup to the connection's default schema so that
        # a same-named table in another database does not skew the count.
        cursor.execute(
            """
            SELECT COUNT(*)
            FROM information_schema.tables
            WHERE table_schema = DATABASE()
              AND table_name = %s
            """,
            (name,),
        )
        row = cursor.fetchone()
        return row is not None and row[0] >= 1
    finally:
        # Close on every path; previously the cursor leaked when the table
        # was not found.
        cursor.close()
def createTable(db, name):
    """Create the per-feed article table *name*.

    The table has the columns written by insertFeed and read by queryFeed:
    ``title``, ``date``, ``author`` and ``content``.

    Args:
        db: open database connection.
        name: table name; only letters, digits and underscores are accepted
            because identifiers cannot be bound as query parameters.

    Raises:
        ValueError: if *name* contains characters outside that set.
    """
    if not name.replace("_", "").isalnum():
        raise ValueError("invalid table name: {!r}".format(name))
    cursor = db.cursor()
    try:
        # `date` is stored as text: the feed's date string is what queryFeed
        # compares against. `content` is a TEXT type because a
        # VARCHAR(65535) column exceeds MySQL's per-row size limit.
        cursor.execute("""
            CREATE TABLE `{0}`
            (id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            date VARCHAR(255),
            author VARCHAR(255),
            content MEDIUMTEXT)
            """.format(name))
    finally:
        cursor.close()
def createTableMa(db, name):
    """Create the feed-definition table *name* if it does not exist yet.

    The columns match the attributes the ``lf`` class loads from the
    ``feeds`` table (``url``, ``password``, ``username``, ``selector`` and
    the ``selector_login_*`` fields), keyed by ``name``.

    Args:
        db: open database connection.
        name: table name; only letters, digits and underscores are accepted
            because identifiers cannot be bound as query parameters.

    Raises:
        ValueError: if *name* contains characters outside that set.
    """
    if not name.replace("_", "").isalnum():
        raise ValueError("invalid table name: {!r}".format(name))
    cursor = db.cursor()
    try:
        # The column list is closed only once, at the very end; the previous
        # statement closed it after `user` and was rejected as invalid SQL.
        # `password` / `username` are the column names lf.init_from_db asks
        # for.
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS `{}`
            (id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(255),
            url VARCHAR(255),
            password VARCHAR(255),
            username VARCHAR(255),
            selector VARCHAR(255),
            selector_login_user VARCHAR(255),
            selector_login_pass VARCHAR(255),
            selector_login_verify VARCHAR(255),
            selector_login_url VARCHAR(255),
            selector_login_verify_url VARCHAR(255))
            """.format(name))
    finally:
        cursor.close()
def createTableIm(db, name):
    """Make sure the article table *name* exists, creating it if needed.

    Args:
        db: open database connection.
        name: table name.

    Returns:
        bool: True if the table exists after the call, False otherwise.
    """
    if tableExist(db, name):
        return True
    createTable(db, name)
    # Re-check once and report the result. The previous version recursed
    # without returning the recursive result, so a freshly created table was
    # reported as None (falsy), and it could recurse without bound if the
    # table never became visible.
    return bool(tableExist(db, name))
def die():
    """Terminate by raising SystemExit with no exit status."""
    raise SystemExit(None)
def insertFeed(db, table, title, date, author, content):
    """Insert one article row into *table* and commit it.

    Args:
        db: open database connection.
        table: target table name (letters, digits and underscores only).
        title: article title.
        date: article date string as delivered by the feed.
        author: article author.
        content: extracted article markup.

    Raises:
        ValueError: if *table* is not a plain identifier.
    """
    if not table.replace("_", "").isalnum():
        raise ValueError("invalid table name: {!r}".format(table))
    cursor = db.cursor()
    try:
        # Values are bound as parameters: formatting them into the statement
        # left them unquoted (invalid SQL) and open to injection from feed
        # content. Only the validated identifier is formatted in.
        cursor.execute(
            "INSERT INTO `{0}` (title, date, author, content) "
            "VALUES (%s, %s, %s, %s)".format(table),
            (title, date, author, content),
        )
        # Commit explicitly so the row is persisted even when the connection
        # is not in autocommit mode.
        db.commit()
    finally:
        cursor.close()
def queryFeed(db, table, title, date, author):
    """Look up an already stored article by title and date.

    Args:
        db: open database connection.
        table: table to search (letters, digits and underscores only).
        title: article title to match.
        date: article date string to match.
        author: accepted for signature compatibility; not part of the lookup.

    Returns:
        int: the row id of the first match, or -1 when nothing matches.

    Raises:
        ValueError: if *table* is not a plain identifier.
    """
    if not table.replace("_", "").isalnum():
        raise ValueError("invalid table name: {!r}".format(table))
    cursor = db.cursor()
    try:
        # LIMIT 1 keeps the result set to the single row fetchone() consumes.
        cursor.execute(
            "SELECT id FROM `{}` WHERE title = %s AND date = %s "
            "LIMIT 1".format(table),
            (title, date),
        )
        row = cursor.fetchone()
    finally:
        cursor.close()
    if row is None:
        return -1
    # fetchone() yields a row tuple; the id is its first column.
    return int(row[0])
def queryRss(url, rss):
    """Parse the feed at *url* with feedparser when *rss* is truthy.

    Returns the parsed feed, or None when *rss* is falsy.
    """
    if not rss:
        return None
    return feedparser.parse(url)
def getContent(entryUrl, session_cookies, selector):
    """Download an article page and return the markup matched by *selector*.

    Args:
        entryUrl: URL of the article page.
        session_cookies: cookies to send with the request; either a mapping
            accepted by requests or a list of cookie dicts with ``name`` and
            ``value`` keys (the shape getSession returns from Selenium).
        selector: CSS selector for the elements to extract.

    Returns:
        str: serialized HTML of every matching element, concatenated in
        document order; an empty string when nothing matches.

    Calls die() when the response status is not 200.
    """
    # requests expects name -> value, not Selenium's list of cookie dicts.
    if isinstance(session_cookies, (list, tuple)):
        session_cookies = {c["name"]: c["value"] for c in session_cookies}
    # A timeout keeps a stalled server from blocking the caller forever.
    r = req.get(entryUrl, cookies=session_cookies, timeout=30)
    print(r.status_code)
    if r.status_code != 200:
        print("Err fetching: " + entryUrl)
        die()
    xpath = HTMLTranslator().css_to_xpath(selector)
    tree = etree.parse(StringIO(r.text), etree.HTMLParser())
    # xpath() returns a list of elements, so each one is serialized with
    # etree.tostring; the list itself has no tostring() method.
    nodes = tree.xpath(xpath)
    return "".join(
        etree.tostring(node, encoding="unicode", method="html", with_tail=False)
        for node in nodes
    )
#url: https://accounts.ft.com/login
def getSession(user, password, url="https://accounts.ft.com/login"):
    """Log in through a Chrome browser and return the session cookies.

    Args:
        user: account e-mail typed into the ``enter-email`` field.
        password: account password typed into the ``enter-password`` field.
        url: login page to open.

    Returns:
        list: the cookie dicts reported by ``driver.get_cookies()``.
    """
    # `import selenium` alone does not load the webdriver submodule.
    from selenium import webdriver

    driver = webdriver.Chrome()
    try:
        # The password field only appears after the e-mail step is
        # submitted; an implicit wait lets the lookups below poll for it
        # instead of failing at once.
        driver.implicitly_wait(10)
        driver.get(url)
        # find_element("id", ...) is used because it is available in both
        # Selenium 3 and 4.
        driver.find_element("id", "enter-email").send_keys(user)
        driver.find_element("id", "enter-email-next").submit()
        driver.find_element("id", "enter-password").send_keys(password)
        driver.find_element("id", "enter-password").submit()
        # NOTE(review): cookies are read right after submitting; if the site
        # finishes the login asynchronously an explicit wait may be needed
        # here - confirm against the live page.
        return driver.get_cookies()
    finally:
        # Always shut the browser down so no Chrome process is left behind.
        driver.quit()
#url: https://www.ft.com/myaccount
def verifySession(session, user, url="https://www.ft.com/myaccount"):
    """Check that *session* cookies belong to a logged-in *user*.

    Opens *url* with the cookies applied and looks for *user* in the text of
    the elements with id ``rightRailEmailAddress``.

    Args:
        session: one cookie dict or an iterable of cookie dicts (the shape
            getSession returns).
        user: account e-mail expected on the page.
        url: account page to open.

    Returns:
        bool: True if an element's text contains *user*, False otherwise.
    """
    # `import selenium` alone does not load the webdriver submodule.
    from selenium import webdriver
    from selenium.common.exceptions import InvalidCookieDomainException

    cookies = [session] if isinstance(session, dict) else list(session)
    driver = webdriver.Chrome()
    try:
        # A page on the target site is loaded before add_cookie, since
        # cookies can only be set for the domain currently open.
        driver.get(url)
        for cookie in cookies:
            try:
                driver.add_cookie(cookie)
            except InvalidCookieDomainException:
                # Cookies scoped to a different host than *url* (e.g. the
                # login host) cannot be set here; skip them.
                continue
        # Reload so the request is made with the cookies attached.
        driver.get(url)
        fields = driver.find_elements("id", "rightRailEmailAddress")
        # str.find() returns -1 (never None) on a miss, so the previous
        # `is not None` test was always true; use a containment check.
        return any(user in field.text for field in fields)
    finally:
        driver.quit()
def updateFeed(db, table):
    """Fetch the FT world feed and store every article not yet in *table*.

    Logs in with ACCOUNT_USERNAME / ACCOUNT_PASSWORD, verifies the session,
    then for each feed entry that queryFeed does not find, downloads the
    article content and inserts it.

    Args:
        db: open database connection.
        table: article table to read from and write to.
    """
    cookies = getSession(ACCOUNT_USERNAME, ACCOUNT_PASSWORD, "https://accounts.ft.com/login")
    if not verifySession(cookies, ACCOUNT_USERNAME, "https://www.ft.com/myaccount"):
        print("Session couldnt be verified")
        return
    feed = queryRss("https://www.ft.com/world?format=rss", True)
    # The last class name was truncated ("...content-bod"); since all three
    # classes must match, the selector could never hit the article body.
    selector = "img.n-image,div.article__content-body.n-content-body.js-article__content-body"
    for post in feed.entries:
        # Entry fields are optional in a feed; fall back instead of raising
        # when `created` or `author` is absent.
        date = getattr(post, "created", None) or getattr(post, "published", "")
        author = getattr(post, "author", "")
        if queryFeed(db, table, post.title, date, author) < 0:
            content = getContent(post.link, cookies, selector)
            insertFeed(db, table, post.title, date, author, content)
def updateFeedFromDb(db, table, feed):
    """Fetch the feed described by *feed* and store its new articles.

    Args:
        db: open database connection.
        table: article table to read from and write to.
        feed: feed definition exposing ``username``, ``password``, ``url``,
            ``selector``, ``selector_login_url`` and
            ``selector_login_verify_url`` (as the ``lf`` class does).
    """
    cookies = getSession(feed.username, feed.password, feed.selector_login_url)
    # The attribute is `selector_login_verify_url`; `selector_verify_url`
    # does not exist on the feed definition.
    if not verifySession(cookies, feed.username, feed.selector_login_verify_url):
        print("Session couldnt be verified")
        return
    # Keep the parsed result under its own name: rebinding `feed` hid the
    # definition object, so `feed.selector` was read from the parsed RSS.
    parsed = queryRss(feed.url, True)
    for post in parsed.entries:
        # Entry fields are optional in a feed; fall back instead of raising
        # when `created` or `author` is absent.
        date = getattr(post, "created", None) or getattr(post, "published", "")
        author = getattr(post, "author", "")
        if queryFeed(db, table, post.title, date, author) < 0:
            content = getContent(post.link, cookies, feed.selector)
            insertFeed(db, table, post.title, date, author, content)
def main():
    """Set up the tables and refresh the ``feed01`` table once per hour.

    Connects to the database, ensures the ``feeds`` and ``feed01`` tables
    exist (exiting via die() if ``feed01`` cannot be ensured), schedules
    updateFeed every hour and then idles forever.

    NOTE(review): no call to main() is visible in this part of the file -
    confirm the script has an entry point.
    """
    db = getDb()
    createTableMa(db, "feeds")
    if not createTableIm(db, "feed01"):
        die()
    # Periodic forwards its extra positional arguments to the callback, so
    # they are passed one by one; a single list would arrive as one argument
    # and updateFeed(db, table) would fail.
    Periodic(60*60*1, updateFeed, db, "feed01")
    while True:
        time.sleep(5)
class lf:
    """Definition of one feed, loaded from the ``feeds`` table by name.

    Attributes mirror the columns read in init_from_db: ``url``,
    ``password``, ``username``, ``selector`` and the ``selector_login_*``
    fields. A column with no row or a NULL value is stored as ``""``.
    """

    def __init__(self, db, name):
        """Remember *db* and *name*, then load every field from the database."""
        self.name = name
        self.db = db
        self.url = ""
        self.password = ""
        self.username = ""
        self.selector = ""
        self.selector_login_user = ""
        self.selector_login_pass = ""
        self.selector_login_verify = ""
        self.selector_login_url = ""
        self.selector_login_verify_url = ""
        self.init_from_db()

    def init_from_db(self):
        """Fill every attribute from the ``feeds`` row whose name is self.name."""
        # Assumes it works
        self.url = self.query_db(self.db, "url")
        self.password = self.query_db(self.db, "password")
        self.username = self.query_db(self.db, "username")
        self.selector = self.query_db(self.db, "selector")
        self.selector_login_user = self.query_db(self.db, "selector_login_user")
        self.selector_login_pass = self.query_db(self.db, "selector_login_pass")
        self.selector_login_verify = self.query_db(self.db, "selector_login_verify")
        self.selector_login_url = self.query_db(self.db, "selector_login_url")
        self.selector_login_verify_url = self.query_db(self.db, "selector_login_verify_url")

    def init_from_scratch(
            self,
            name,
            url,
            password,
            username,
            selector,
            selector_login_user,
            selector_login_pass,
            selector_login_verify,
            selector_login_verify_url,
            selector_login_url):
        """Set every attribute directly from the given values.

        Note the parameter order: ``selector_login_verify_url`` comes before
        ``selector_login_url``.
        """
        self.name = name
        self.url = url
        self.password = password
        self.username = username
        self.selector = selector
        self.selector_login_user = selector_login_user
        self.selector_login_pass = selector_login_pass
        self.selector_login_verify = selector_login_verify
        self.selector_login_url = selector_login_url
        self.selector_login_verify_url = selector_login_verify_url

    def query_db(self, db, string):
        """Return column *string* of this feed's row in ``feeds``.

        Args:
            db: open database connection.
            string: column name (letters, digits and underscores only).

        Returns:
            The column value of the first row whose ``name`` equals
            self.name, or ``""`` when there is no such row or the value is
            NULL.

        Raises:
            ValueError: if *string* is not a plain identifier.
        """
        # Column names cannot be bound as parameters, so validate before
        # formatting; the feed name itself is bound.
        if not string.replace("_", "").isalnum():
            raise ValueError("invalid column name: {!r}".format(string))
        cursor = db.cursor()
        try:
            # LIMIT 1 keeps the result set to the single row fetchone() reads.
            cursor.execute(
                "SELECT `{}` FROM feeds WHERE name = %s LIMIT 1".format(string),
                (self.name,),
            )
            result = cursor.fetchone()
        finally:
            cursor.close()
        # fetchone() yields a row tuple; hand back the value, not the tuple.
        if result is None or result[0] is None:
            return ""
        return result[0]
class Periodic(object):
    """
    A periodic task running in threading.Timers

    Calls ``function(*args, **kwargs)`` every ``interval`` seconds; the first
    call happens one interval after start(). The keyword ``autostart``
    (default True) is consumed by the constructor and is not forwarded to
    ``function``.
    """

    def __init__(self, interval, function, *args, **kwargs):
        self._lock = Lock()
        self._timer = None
        self.function = function
        self.interval = interval
        self.args = args
        # Remove the control flag before storing the callback's kwargs.
        autostart = kwargs.pop('autostart', True)
        self.kwargs = kwargs
        self._stopped = True
        if autostart:
            self.start()

    def start(self, from_run=False):
        """Schedule the next tick if stopped (or unconditionally if from_run)."""
        # `with` releases the lock even if creating or starting the timer
        # raises.
        with self._lock:
            if from_run or self._stopped:
                self._stopped = False
                self._timer = Timer(self.interval, self._run)
                self._timer.start()

    def _run(self):
        """Timer callback: schedule the next tick, then run the task."""
        with self._lock:
            # stop() may have been called after this timer already fired;
            # in that case neither reschedule nor run, so stop() is final.
            if self._stopped:
                return
            self._timer = Timer(self.interval, self._run)
            self._timer.start()
        # Run outside the lock so a slow task does not block start()/stop().
        self.function(*self.args, **self.kwargs)

    def stop(self):
        """Cancel the pending tick; safe to call before start()."""
        with self._lock:
            self._stopped = True
            # No timer exists yet when created with autostart=False.
            if self._timer is not None:
                self._timer.cancel()