Fetching and storing RSS full content feeds from ft.com
This commit is contained in:
commit
889ea50657
288
main.py
Executable file
288
main.py
Executable file
|
@ -0,0 +1,288 @@
|
||||||
|
#!/usr/bin/python
|
||||||
|
#
|
||||||
|
|
||||||
|
import mysql.connector
|
||||||
|
import sys
|
||||||
|
import feedparser
|
||||||
|
import time
|
||||||
|
from threading import Timer, Lock
|
||||||
|
|
||||||
|
import selenium
|
||||||
|
import lxml.etree as etree
|
||||||
|
from cssselect import HTMLTranslator, SelectorError
|
||||||
|
import requests as req
|
||||||
|
from io import StringIO
|
||||||
|
|
||||||
|
import os

# Database and account settings.
# Each value may be overridden through the environment so that real
# credentials do not have to be written into the source file; the fallbacks
# are the placeholder values that used to be hard-coded here.
DB_HOST = os.environ.get("FT_DB_HOST", "127.0.0.1")
DB_USER = os.environ.get("FT_DB_USER", "dbuser")
DB_PASSWD = os.environ.get("FT_DB_PASSWD", "dbpasswd")
DB_DATABASE = os.environ.get("FT_DB_DATABASE", "base")
ACCOUNT_USERNAME = os.environ.get("FT_ACCOUNT_USERNAME", "foo")
ACCOUNT_PASSWORD = os.environ.get("FT_ACCOUNT_PASSWORD", "foo")
|
||||||
|
|
||||||
|
|
||||||
|
def getDb():
    """Open a MySQL connection configured from the module-level DB_* values.

    Returns:
        The connection object returned by ``mysql.connector.connect``.
    """
    settings = {
        "host": DB_HOST,
        "user": DB_USER,
        "passwd": DB_PASSWD,
        "database": DB_DATABASE,
    }
    return mysql.connector.connect(**settings)
|
||||||
|
|
||||||
|
def tableExist(db, name):
    """Report whether a table called *name* exists in the current database.

    Args:
        db: Open database connection (anything exposing ``cursor()``).
        name: Table name to look up.

    Returns:
        True when ``information_schema.tables`` lists the table for the
        connection's default schema, False otherwise.
    """
    cursor = db.cursor()
    try:
        # The name is bound as a parameter instead of being formatted into
        # the statement, and the lookup is limited to the active schema so a
        # same-named table in another schema does not skew the count.
        cursor.execute(
            "SELECT COUNT(*) FROM information_schema.tables "
            "WHERE table_schema = DATABASE() AND table_name = %s",
            (name,),
        )
        row = cursor.fetchone()
    finally:
        # Close on every path, not only when the table was found.
        cursor.close()
    return row is not None and row[0] > 0
|
||||||
|
|
||||||
|
def createTable(db, name):
    """Create the per-feed article table *name*.

    The table holds one row per article with the columns that ``insertFeed``
    writes and ``queryFeed`` filters on: title, date, author and content.

    Args:
        db: Open database connection (anything exposing ``cursor()``).
        name: Name of the table to create.
    """
    # Identifiers cannot be bound as query parameters, so the name is
    # backtick-quoted with embedded backticks doubled.
    quoted = "`{0}`".format(name.replace("`", "``"))
    cursor = db.cursor()
    try:
        # `date` is stored as text because the callers pass the feed's date
        # string through unchanged.  `content` is MEDIUMTEXT: a
        # VARCHAR(65535) column cannot fit within MySQL's 65,535-byte row
        # limit next to the other columns.
        cursor.execute("""
            CREATE TABLE {0}
            (id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            date VARCHAR(255),
            author VARCHAR(255),
            content MEDIUMTEXT)
            """.format(quoted))
    finally:
        cursor.close()
|
||||||
|
|
||||||
|
def createTableMa(db, name):
    """Create the feed-configuration table *name* if it does not exist yet.

    Each row describes one feed: its name and URL, the account credentials,
    the content selector, and the selectors/URLs used to log in and to
    verify the login.

    Args:
        db: Open database connection (anything exposing ``cursor()``).
        name: Name of the configuration table.
    """
    # Identifiers cannot be bound as query parameters, so the name is
    # backtick-quoted with embedded backticks doubled.
    quoted = "`{0}`".format(name.replace("`", "``"))
    cursor = db.cursor()
    try:
        # The column list is closed exactly once, at the end.  The
        # credential columns are named `password` / `username` because
        # those are the column names the `lf` class reads from this table.
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS {0}
            (id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(255),
            url VARCHAR(255),
            password VARCHAR(255),
            username VARCHAR(255),
            selector VARCHAR(255),
            selector_login_user VARCHAR(255),
            selector_login_pass VARCHAR(255),
            selector_login_verify VARCHAR(255),
            selector_login_url VARCHAR(255),
            selector_login_verify_url VARCHAR(255))
            """.format(quoted))
    finally:
        cursor.close()
|
||||||
|
|
||||||
|
def createTableIm(db, name):
    """Make sure the article table *name* exists, creating it when missing.

    Args:
        db: Open database connection.
        name: Name of the article table.

    Returns:
        True when the table exists once the call finishes, False otherwise.
    """
    if tableExist(db, name):
        return True
    createTable(db, name)
    # Check once more and hand the answer back.  The earlier version recursed
    # without returning the result, so a freshly created table was reported
    # as a failure (None) and a failing create could recurse without bound.
    return bool(tableExist(db, name))
|
||||||
|
|
||||||
|
def die(code=1):
    """Abort by raising ``SystemExit``.

    Args:
        code: Exit status to report.  Defaults to 1 because every caller in
            this file uses ``die()`` on a failure path; a bare ``sys.exit()``
            would report success (status 0).
    """
    sys.exit(code)
|
||||||
|
|
||||||
|
def insertFeed(db, table, title, date, author, content):
    """Store one article in *table* and commit it.

    Args:
        db: Open database connection (``cursor()`` and ``commit()``).
        table: Name of the article table.
        title: Article title.
        date: Article date as provided by the feed.
        author: Article author.
        content: Extracted article body.
    """
    # Only the table identifier is formatted into the statement (quoted, with
    # backticks doubled); the values are bound as parameters so titles or
    # bodies containing quotes cannot break or alter the SQL.
    statement = (
        "INSERT INTO `{0}` (title, date, author, content) "
        "VALUES (%s, %s, %s, %s)"
    ).format(table.replace("`", "``"))
    cursor = db.cursor()
    try:
        cursor.execute(statement, (title, date, author, content))
        # MySQL Connector/Python does not autocommit by default; without an
        # explicit commit the row is discarded when the connection closes.
        db.commit()
    finally:
        cursor.close()
|
||||||
|
|
||||||
|
def queryFeed(db, table, title, date, author):
    """Look up a stored article by title and date.

    Args:
        db: Open database connection (anything exposing ``cursor()``).
        table: Name of the article table.
        title: Article title to match.
        date: Article date to match.
        author: Accepted for signature compatibility; not used in the lookup.

    Returns:
        The ``id`` of a matching row, or -1 when no row matches.
    """
    # Only the table identifier is formatted in (quoted, backticks doubled);
    # the values are bound as parameters.  LIMIT 1 keeps the result set to a
    # single row so nothing is left unread when the cursor is closed.
    statement = (
        "SELECT id FROM `{0}` WHERE title = %s AND date = %s LIMIT 1"
    ).format(table.replace("`", "``"))
    cursor = db.cursor()
    try:
        cursor.execute(statement, (title, date))
        row = cursor.fetchone()
    finally:
        cursor.close()
    if row is None:
        return -1
    # fetchone() yields a row tuple; the id is its only column.
    return int(row[0])
|
||||||
|
|
||||||
|
def queryRss(url, rss):
    """Parse the feed at *url* with feedparser when *rss* is truthy.

    Args:
        url: Feed address handed to ``feedparser.parse``.
        rss: Flag selecting the RSS path; nothing is fetched when it is falsy.

    Returns:
        The parsed feed, or None when *rss* is falsy.
    """
    if not rss:
        return None
    return feedparser.parse(url)
|
||||||
|
|
||||||
|
def getContent(entryUrl, session_cookies, selector):
    """Download an article page and return the HTML matched by *selector*.

    Args:
        entryUrl: URL of the article page.
        session_cookies: Either a name -> value mapping, or a list of cookie
            dicts with ``name`` and ``value`` keys as returned by Selenium's
            ``get_cookies()``.
        selector: CSS selector for the elements to keep.

    Returns:
        The serialized HTML of every matching element, concatenated in
        document order; an empty string when nothing matches.

    Calls ``die()`` when the response status is not 200.
    """
    # requests expects a mapping, while the login step hands over Selenium's
    # list-of-dicts form, so flatten it first.
    if isinstance(session_cookies, (list, tuple)):
        session_cookies = dict(
            (cookie["name"], cookie["value"]) for cookie in session_cookies
        )

    r = req.get(entryUrl, cookies=session_cookies)
    print(r.status_code)
    if r.status_code != 200:
        print("Err fetching: " + entryUrl)
        die()

    xpath = HTMLTranslator().css_to_xpath(selector)
    tree = etree.parse(StringIO(r.text), etree.HTMLParser())

    # xpath() returns a list of elements; serialize each one individually
    # (a list has no tostring() of its own) and join the pieces.
    return "".join(
        etree.tostring(node, method="html", encoding="unicode")
        for node in tree.xpath(xpath)
    )
|
||||||
|
|
||||||
|
#url: https://accounts.ft.com/login
|
||||||
|
|
||||||
|
def getSession(user, password, url="https://accounts.ft.com/login"):
    """Log in through a Chrome WebDriver session and return its cookies.

    Args:
        user: Account e-mail typed into the ``enter-email`` field.
        password: Password typed into the ``enter-password`` field.
        url: Address of the login page.

    Returns:
        The list of cookie dicts reported by ``driver.get_cookies()`` right
        after the password form is submitted.
    """
    # `import selenium` alone does not load the webdriver subpackage, so
    # import what is needed explicitly.
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    driver = webdriver.Chrome()
    try:
        # The form is filled in two steps; an implicit wait gives the next
        # field time to appear (the spots the old "#sleep" notes marked).
        driver.implicitly_wait(10)
        driver.get(url)
        driver.find_element(By.ID, "enter-email").send_keys(user)
        driver.find_element(By.ID, "enter-email-next").submit()
        password_field = driver.find_element(By.ID, "enter-password")
        password_field.send_keys(password)
        password_field.submit()
        # NOTE(review): cookies are read immediately after submitting; if the
        # site sets its session cookie after a redirect, an explicit wait for
        # the post-login page may be needed here.
        return driver.get_cookies()
    finally:
        # Always shut the browser down so repeated runs do not pile up
        # Chrome processes.
        driver.quit()
|
||||||
|
|
||||||
|
#url: https://www.ft.com/myaccount
|
||||||
|
|
||||||
|
def verifySession(session, user, url="https://www.ft.com/myaccount"):
    """Check that the given cookies belong to a session logged in as *user*.

    Args:
        session: A list of cookie dicts (as returned by ``getSession``) or a
            single cookie dict.
        user: Text expected inside the ``rightRailEmailAddress`` element.
        url: Account page to load with the cookies applied.

    Returns:
        True when an element with id ``rightRailEmailAddress`` contains
        *user*, False otherwise.
    """
    # `import selenium` alone does not load the webdriver subpackage.
    from selenium import webdriver
    from selenium.common.exceptions import InvalidCookieDomainException
    from selenium.webdriver.common.by import By

    # add_cookie() takes one cookie dict at a time.
    cookies = [session] if isinstance(session, dict) else list(session or [])

    driver = webdriver.Chrome()
    try:
        # WebDriver only accepts cookies for the domain of the page that is
        # currently loaded, so open the site once before adding them.
        driver.get(url)
        for cookie in cookies:
            try:
                driver.add_cookie(cookie)
            except InvalidCookieDomainException:
                # Cookie is scoped to a different host (for instance the
                # login host) and cannot be set while on this page.
                continue
        driver.get(url)

        # `str.find` returns an int (-1 when absent), never None, so it
        # cannot be compared against None; use a containment test instead.
        return any(
            user in element.text
            for element in driver.find_elements(By.ID, "rightRailEmailAddress")
        )
    finally:
        driver.quit()
|
||||||
|
|
||||||
|
def updateFeed(db, table,
               rss_url="https://www.ft.com/world?format=rss",
               selector="img.n-image,div.article__content-body.n-content-body.js-article__content-body"):
    """Fetch the RSS feed and store the full text of every new article.

    Logs in with the module-level account, verifies the session, then for
    each feed entry not yet present in *table* downloads the article page
    and inserts it.

    Args:
        db: Open database connection.
        table: Name of the article table.
        rss_url: Feed to read; defaults to the FT world feed.
        selector: CSS selector for the article content.
            NOTE(review): the last class previously read
            ``js-article__content-bod`` which looks truncated; confirm
            against the live page markup.
    """
    cookies = getSession(ACCOUNT_USERNAME, ACCOUNT_PASSWORD, "https://accounts.ft.com/login")
    if not verifySession(cookies, ACCOUNT_USERNAME, "https://www.ft.com/myaccount"):
        print("Session couldnt be verified")
        return

    feed = queryRss(rss_url, True)
    for post in feed.entries:
        # Feed entries are dict-like and only carry the fields the feed
        # provides: read them defensively instead of assuming `created` and
        # `author` are always present.
        title = post.get("title", "")
        date = post.get("published") or post.get("updated") or post.get("created") or ""
        author = post.get("author", "")
        link = post.get("link")
        if not link:
            continue
        if queryFeed(db, table, title, date, author) < 0:
            content = getContent(link, cookies, selector)
            insertFeed(db, table, title, date, author, content)
|
||||||
|
|
||||||
|
def updateFeedFromDb(db, table, feed):
    """Fetch one configured feed and store the full text of new articles.

    Args:
        db: Open database connection.
        table: Name of the article table.
        feed: Feed settings object providing ``username``, ``password``,
            ``url``, ``selector``, ``selector_login_url`` and
            ``selector_login_verify_url`` (the attributes of ``lf``).
    """
    cookies = getSession(feed.username, feed.password, feed.selector_login_url)
    # The settings attribute is `selector_login_verify_url`.
    if not verifySession(cookies, feed.username, feed.selector_login_verify_url):
        print("Session couldnt be verified")
        return

    # Keep the parsed feed under its own name: rebinding `feed` would make
    # `feed.selector` below read from the parsed RSS instead of the settings.
    parsed = queryRss(feed.url, True)
    for post in parsed.entries:
        # Feed entries are dict-like and only carry the fields the feed
        # provides: read them defensively instead of assuming `created` and
        # `author` are always present.
        title = post.get("title", "")
        date = post.get("published") or post.get("updated") or post.get("created") or ""
        author = post.get("author", "")
        link = post.get("link")
        if not link:
            continue
        if queryFeed(db, table, title, date, author) < 0:
            content = getContent(link, cookies, feed.selector)
            insertFeed(db, table, title, date, author, content)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Set up the tables and refresh the ``feed01`` table once per hour.

    NOTE(review): no ``if __name__ == "__main__": main()`` guard is visible
    in this file; confirm how this entry point is invoked.
    """
    db = getDb()
    createTableMa(db, "feeds")
    if not createTableIm(db, "feed01"):
        die()

    # Periodic forwards its extra positional arguments to the callback, so
    # the connection and table name are passed individually; wrapping them
    # in a list would call updateFeed with a single argument.
    rt = Periodic(60 * 60 * 1, updateFeed, db, "feed01")
    try:
        # Keep the main thread alive while the timer thread does the work.
        while True:
            time.sleep(5)
    finally:
        # Cancel the pending timer and release the connection on the way
        # out (e.g. Ctrl-C) so the process can actually terminate.
        rt.stop()
        db.close()
|
||||||
|
|
||||||
|
|
||||||
|
class lf:
    """Settings of one feed, looked up by name in the ``feeds`` table."""

    def __init__(self, db, name):
        """Remember *db* and *name*, then load every setting from the table.

        Args:
            db: Open database connection (anything exposing ``cursor()``).
            name: Value of the ``name`` column identifying the feed.
        """
        self.name = name
        self.db = db
        self.url = ""
        self.password = ""
        self.username = ""
        self.selector = ""
        self.selector_login_user = ""
        self.selector_login_pass = ""
        self.selector_login_verify = ""
        self.selector_login_url = ""
        self.selector_login_verify_url = ""
        self.init_from_db()

    def init_from_db(self):
        """Populate every setting from the row whose name is ``self.name``.

        A setting with no stored value is left as an empty string.
        """
        self.url = self.query_db(self.db, "url")
        self.password = self.query_db(self.db, "password")
        self.username = self.query_db(self.db, "username")
        self.selector = self.query_db(self.db, "selector")
        self.selector_login_user = self.query_db(self.db, "selector_login_user")
        self.selector_login_pass = self.query_db(self.db, "selector_login_pass")
        self.selector_login_verify = self.query_db(self.db, "selector_login_verify")
        self.selector_login_url = self.query_db(self.db, "selector_login_url")
        self.selector_login_verify_url = self.query_db(self.db, "selector_login_verify_url")

    def init_from_scratch(
            self,
            name,
            url,
            password,
            username,
            selector,
            selector_login_user,
            selector_login_pass,
            selector_login_verify,
            selector_login_verify_url,
            selector_login_url):
        """Set every setting directly from the arguments, without the database.

        Note the parameter order: ``selector_login_verify_url`` comes before
        ``selector_login_url``.
        """
        self.name = name
        self.url = url
        self.password = password
        self.username = username
        self.selector = selector
        self.selector_login_user = selector_login_user
        self.selector_login_pass = selector_login_pass
        self.selector_login_verify = selector_login_verify
        self.selector_login_url = selector_login_url
        self.selector_login_verify_url = selector_login_verify_url

    def query_db(self, db, string):
        """Read column *string* of this feed's row in the ``feeds`` table.

        Args:
            db: Open database connection (anything exposing ``cursor()``).
            string: Name of the column to read.

        Returns:
            The column value, or ``""`` when there is no row for
            ``self.name`` or the stored value is NULL.
        """
        # The column identifier cannot be bound as a parameter, so it is
        # backtick-quoted with embedded backticks doubled; the feed name is
        # bound normally.  LIMIT 1 keeps the result to a single row so
        # nothing is left unread when the cursor is closed.
        statement = (
            "SELECT `{0}` FROM feeds WHERE name = %s LIMIT 1"
        ).format(string.replace("`", "``"))
        cursor = db.cursor()
        try:
            cursor.execute(statement, (self.name,))
            row = cursor.fetchone()
        finally:
            cursor.close()
        # fetchone() yields a row tuple; hand back the single value in it
        # rather than the tuple itself.
        if row is None or row[0] is None:
            return ""
        return row[0]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class Periodic(object):
    """
    A periodic task running in threading.Timers

    ``function(*args, **kwargs)`` is called every ``interval`` seconds on a
    timer thread until ``stop()`` is called.  The next timer is armed before
    the function runs, so an exception raised by the function does not end
    the schedule.
    """

    def __init__(self, interval, function, *args, **kwargs):
        """Store the task and, unless ``autostart=False`` is given, start it.

        Args:
            interval: Seconds between calls.
            function: Callable to invoke.
            *args: Positional arguments forwarded to *function*.
            **kwargs: Keyword arguments forwarded to *function*; the
                ``autostart`` key (default True) is consumed here and not
                forwarded.
        """
        self._lock = Lock()
        self._timer = None
        self.function = function
        self.interval = interval
        self.args = args
        # Take `autostart` out before the remaining keywords are stored for
        # the callback.
        autostart = kwargs.pop('autostart', True)
        self.kwargs = kwargs
        self._stopped = True
        if autostart:
            self.start()

    def _arm(self):
        """Replace the current timer with a fresh one; the lock must be held."""
        if self._timer is not None:
            # Cancelling an already-fired timer is harmless, and it keeps a
            # still-pending one from being orphaned.
            self._timer.cancel()
        self._timer = Timer(self.interval, self._run)
        self._timer.start()

    def start(self, from_run=False):
        """Arm the timer if the task is stopped (or unconditionally when
        *from_run* is true)."""
        # `with` releases the lock even if creating the timer fails.
        with self._lock:
            if from_run or self._stopped:
                self._stopped = False
                self._arm()

    def _run(self):
        """Timer callback: schedule the next run, then invoke the function."""
        with self._lock:
            # stop() may have been called while this timer was firing; in
            # that case neither re-arm nor run.
            if self._stopped:
                return
            self._arm()
        self.function(*self.args, **self.kwargs)

    def stop(self):
        """Cancel the pending timer; safe to call before the first start."""
        with self._lock:
            self._stopped = True
            if self._timer is not None:
                self._timer.cancel()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue