Fetching and storing RSS full content feeds from ft.com
This commit is contained in:
commit
889ea50657
288
main.py
Executable file
288
main.py
Executable file
|
@ -0,0 +1,288 @@
|
|||
#!/usr/bin/python
|
||||
#
|
||||
|
||||
import os
import sys
import time
from io import StringIO
from threading import Timer, Lock

import feedparser
import lxml.etree as etree
import mysql.connector
import requests as req
import selenium
from cssselect import HTMLTranslator, SelectorError
|
||||
|
||||
# Database connection settings and the account used to log in to the feed
# site.  Each value may be overridden through an environment variable so real
# credentials do not have to be written into this file; the fallbacks are the
# placeholder values that were previously hard-coded here.
DB_HOST = os.environ.get("FEED_DB_HOST", "127.0.0.1")
DB_USER = os.environ.get("FEED_DB_USER", "dbuser")
DB_PASSWD = os.environ.get("FEED_DB_PASSWD", "dbpasswd")
DB_DATABASE = os.environ.get("FEED_DB_DATABASE", "base")
ACCOUNT_USERNAME = os.environ.get("FEED_ACCOUNT_USERNAME", "foo")
ACCOUNT_PASSWORD = os.environ.get("FEED_ACCOUNT_PASSWORD", "foo")
|
||||
|
||||
|
||||
def getDb():
    """Open and return a MySQL connection built from the module-level DB_* settings."""
    settings = {
        "host": DB_HOST,
        "user": DB_USER,
        "passwd": DB_PASSWD,
        "database": DB_DATABASE,
    }
    return mysql.connector.connect(**settings)
|
||||
|
||||
def tableExist(db, name):
    """Return True if a table called *name* exists in the current database.

    Args:
        db: an open DB-API connection (created by ``getDb`` in this file).
        name: the table name to look up.

    Returns:
        ``True`` when the table exists, ``False`` otherwise.
    """
    cursor = db.cursor()
    try:
        # The name is bound as a parameter instead of being formatted into
        # the statement, so quotes in *name* cannot alter the query.
        # Restricting to DATABASE() keeps a same-named table in another
        # schema from skewing the count.
        cursor.execute(
            """
            SELECT COUNT(*)
            FROM information_schema.tables
            WHERE table_schema = DATABASE() AND table_name = %s
            """,
            (name,),
        )
        row = cursor.fetchone()
    finally:
        # Close the cursor on every path, including the "not found" one.
        cursor.close()
    return row is not None and row[0] > 0
|
||||
|
||||
def createTable(db, name):
    """Create the table *name* that stores fetched feed entries.

    Columns: ``id`` (auto-increment primary key), ``title``, ``date``,
    ``author`` and ``content``.

    Args:
        db: an open DB-API connection.
        name: name of the table to create.
    """
    # Identifiers cannot be bound as parameters, so quote the name and
    # double any embedded backtick.
    quoted = "`{0}`".format(name.replace("`", "``"))
    cursor = db.cursor()
    try:
        # `date` is included because insertFeed() writes it and queryFeed()
        # filters on it.  `content` is a TEXT type because VARCHAR(65535)
        # does not fit within MySQL's maximum row size alongside the other
        # columns.
        cursor.execute(
            """
            CREATE TABLE {0}
            (id INT AUTO_INCREMENT PRIMARY KEY,
            title VARCHAR(255),
            date VARCHAR(255),
            author VARCHAR(255),
            content MEDIUMTEXT)
            """.format(quoted)
        )
    finally:
        cursor.close()
|
||||
|
||||
def createTableMa(db, name):
    """Create the feed-configuration table *name* if it does not exist yet.

    One row describes one feed: its name and URL, the login credentials
    (``pass`` / ``user``) and the selector / login-related settings.

    Args:
        db: an open DB-API connection.
        name: name of the table to create.
    """
    # Identifiers cannot be bound as parameters, so quote the name and
    # double any embedded backtick.
    quoted = "`{0}`".format(name.replace("`", "``"))
    cursor = db.cursor()
    try:
        # The column list is closed exactly once, after the last column; the
        # previous statement closed it after `user` and never closed it at
        # the end, which is a syntax error.
        cursor.execute(
            """
            CREATE TABLE IF NOT EXISTS {0}
            (id INT AUTO_INCREMENT PRIMARY KEY,
            name VARCHAR(255),
            url VARCHAR(255),
            pass VARCHAR(255),
            user VARCHAR(255),
            selector VARCHAR(255),
            selector_login_user VARCHAR(255),
            selector_login_pass VARCHAR(255),
            selector_login_verify VARCHAR(255),
            selector_login_url VARCHAR(255),
            selector_login_verify_url VARCHAR(255))
            """.format(quoted)
        )
    finally:
        cursor.close()
|
||||
|
||||
def createTableIm(db, name):
    """Make sure the feed-entry table *name* exists, creating it if needed.

    Args:
        db: an open DB-API connection.
        name: name of the table.

    Returns:
        ``True`` if the table exists on return, ``False`` if it still cannot
        be found after the creation attempt.
    """
    if tableExist(db, name):
        return True
    createTable(db, name)
    # Check once more and return the outcome.  The previous version recursed
    # here without returning the result, so callers always saw None after a
    # successful creation, and it could recurse without bound if the table
    # never became visible.
    return bool(tableExist(db, name))
|
||||
|
||||
def die():
    """Abort the program by raising SystemExit without an exit status."""
    raise SystemExit
|
||||
|
||||
def insertFeed(db, table, title, date, author, content):
    """Insert one feed entry into *table* and commit it.

    Args:
        db: an open DB-API connection.
        table: name of the destination table.
        title: entry title.
        date: entry date, stored as given.
        author: entry author.
        content: extracted article markup.
    """
    # Only the table name is formatted in (identifiers cannot be bound); it
    # is backtick-quoted with embedded backticks doubled.  The values are
    # bound as parameters: formatting them in unquoted produced invalid SQL
    # for any ordinary text and allowed injection.
    statement = (
        "INSERT INTO `{0}` (title, date, author, content) "
        "VALUES (%s, %s, %s, %s)"
    ).format(table.replace("`", "``"))
    cursor = db.cursor()
    try:
        cursor.execute(statement, (title, date, author, content))
    finally:
        cursor.close()
    # Without an explicit commit the row is discarded when the connection
    # closes, unless autocommit has been enabled on the connection.
    db.commit()
|
||||
|
||||
def queryFeed(db, table, title, date, author):
    """Look up a stored entry by title and date.

    Args:
        db: an open DB-API connection.
        table: name of the table to search.
        title: entry title to match.
        date: entry date to match.
        author: accepted for signature compatibility; not part of the lookup.

    Returns:
        The ``id`` of a matching row, or ``-1`` when there is none.
    """
    # The column is `title` (the old statement said `tistle`).  Values are
    # bound as parameters; LIMIT 1 ensures no unread rows remain when the
    # cursor is closed.
    statement = (
        "SELECT id FROM `{0}` WHERE title = %s AND date = %s LIMIT 1"
    ).format(table.replace("`", "``"))
    cursor = db.cursor()
    try:
        cursor.execute(statement, (title, date))
        row = cursor.fetchone()
    finally:
        cursor.close()
    if row is None:
        return -1
    # fetchone() yields a row tuple; the id is its only column.
    return int(row[0])
|
||||
|
||||
def queryRss(url, rss):
    """Fetch and parse the feed at *url*.

    Args:
        url: address of the feed.
        rss: when falsy, nothing is fetched.

    Returns:
        The object returned by ``feedparser.parse(url)``, or ``None`` when
        *rss* is falsy (this case used to raise ``UnboundLocalError``).
    """
    if not rss:
        return None
    return feedparser.parse(url)
|
||||
|
||||
def getContent(entryUrl, session_cookies, selector):
    """Download *entryUrl* and return the markup matched by *selector*.

    Args:
        entryUrl: URL of the article page.
        session_cookies: cookies to send with the request; either a mapping /
            cookie jar accepted by ``requests`` or a list of Selenium cookie
            dicts (each with ``name`` and ``value`` keys).
        selector: CSS selector choosing the elements to keep.

    Returns:
        The serialized HTML of every matching element, concatenated in
        document order (an empty string when nothing matches).

    The process is terminated through ``die()`` when the response status is
    not 200.
    """
    # getSession() returns Selenium's list-of-dicts cookie format, which
    # requests does not accept directly; reduce it to name -> value.
    if isinstance(session_cookies, (list, tuple)):
        session_cookies = {c["name"]: c["value"] for c in session_cookies}

    # A timeout keeps one stalled download from blocking the caller forever.
    r = req.get(entryUrl, cookies=session_cookies, timeout=30)
    print(r.status_code)
    if r.status_code != 200:
        print("Err fetching: " + entryUrl)
        die()

    xpath = HTMLTranslator().css_to_xpath(selector)

    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(r.text), parser)
    matches = tree.xpath(xpath)

    # xpath() returns a list of elements, and a list has no tostring();
    # serialize each element individually.  with_tail=False leaves out text
    # that follows the element's closing tag.
    return "".join(
        etree.tostring(node, encoding="unicode", method="html", with_tail=False)
        for node in matches
    )
|
||||
|
||||
#url: https://accounts.ft.com/login
|
||||
|
||||
def getSession(user, password, url="https://accounts.ft.com/login"):
    """Log in through a Chrome browser session and return its cookies.

    Args:
        user: account e-mail typed into the ``enter-email`` field.
        password: password typed into the ``enter-password`` field.
        url: address of the login page.

    Returns:
        Whatever ``driver.get_cookies()`` reports after the password form is
        submitted (Selenium documents this as a list of cookie dicts).
    """
    # `import selenium` alone does not load the webdriver subpackage, so
    # import what is needed explicitly.
    from selenium import webdriver
    from selenium.webdriver.common.by import By

    driver = webdriver.Chrome()
    try:
        # Poll for elements for up to 10 s; this stands in for the "#sleep"
        # placeholders between the steps of the login form.
        driver.implicitly_wait(10)
        driver.get(url)
        driver.find_element(By.ID, "enter-email").send_keys(user)
        driver.find_element(By.ID, "enter-email-next").submit()
        driver.find_element(By.ID, "enter-password").send_keys(password)
        driver.find_element(By.ID, "enter-password").submit()
        # NOTE(review): the cookies are read right after submitting; if the
        # login completes asynchronously an explicit wait may be needed here.
        return driver.get_cookies()
    finally:
        # Always shut the browser down so no Chrome process is left behind.
        driver.quit()
|
||||
|
||||
#url: https://www.ft.com/myaccount
|
||||
|
||||
def verifySession(session, user, url="https://www.ft.com/myaccount"):
    """Check that the cookies in *session* belong to a logged-in *user*.

    Opens *url* in a fresh Chrome session with the given cookies and looks
    for *user* in the text of the elements with id ``rightRailEmailAddress``.

    Args:
        session: a single cookie dict or an iterable of cookie dicts, as
            returned by ``getSession``.
        user: text expected to appear on the account page.
        url: page that shows the account's e-mail address.

    Returns:
        ``True`` if any matching element contains *user*, else ``False``.
    """
    # `import selenium` alone does not load the webdriver subpackage.
    from selenium import webdriver
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    cookies = [session] if isinstance(session, dict) else list(session or [])

    driver = webdriver.Chrome()
    try:
        # Cookies can only be set for the site the browser is currently on,
        # so visit the page first, add them, then reload with them applied.
        driver.get(url)
        for cookie in cookies:
            try:
                driver.add_cookie(cookie)
            except WebDriverException:
                # Skip cookies that cannot be applied to this page, for
                # example ones scoped to a different host.
                continue
        driver.get(url)
        elements = driver.find_elements(By.ID, "rightRailEmailAddress")
        # str.find() returns -1 (never None) when the text is absent, so the
        # old `is not None` test accepted every element; use `in` instead.
        return any(user in element.text for element in elements)
    finally:
        # Always shut the browser down so no Chrome process is left behind.
        driver.quit()
|
||||
|
||||
def updateFeed(db, table):
    """Fetch the FT world feed and store every entry not yet in *table*.

    Logs in with the module-level account, verifies the session, then for
    each feed entry that ``queryFeed`` does not find, downloads the article
    content and inserts it.

    Args:
        db: an open DB-API connection.
        table: name of the table holding the entries.
    """
    cookies = getSession(ACCOUNT_USERNAME, ACCOUNT_PASSWORD, "https://accounts.ft.com/login")
    if not verifySession(cookies, ACCOUNT_USERNAME, "https://www.ft.com/myaccount"):
        print("Session couldnt be verified")
        return

    # NOTE(review): the last class in this selector ends in "content-bod",
    # which looks like a truncated "content-body" - confirm against the page.
    selector = "img.n-image,div.article__content-body.n-content-body.js-article__content-bod"

    feed = queryRss("https://www.ft.com/world?format=rss", True)
    for post in feed.entries:
        # Entries are not guaranteed to carry `created` or `author`; fall
        # back to `published` / an empty string instead of raising
        # AttributeError part-way through the feed.
        created = getattr(post, "created", None) or getattr(post, "published", "")
        author = getattr(post, "author", "")
        if queryFeed(db, table, post.title, created, author) < 0:
            content = getContent(post.link, cookies, selector)
            insertFeed(db, table, post.title, created, author, content)
|
||||
|
||||
def updateFeedFromDb(db, table, feed):
    """Fetch the feed described by *feed* and store its new entries in *table*.

    Args:
        db: an open DB-API connection.
        table: name of the table holding the entries.
        feed: a feed configuration object (see class ``lf``) providing
            ``username``, ``password``, ``url``, ``selector``,
            ``selector_login_url`` and ``selector_login_verify_url``.
    """
    cookies = getSession(feed.username, feed.password, feed.selector_login_url)
    # The attribute is `selector_login_verify_url`; `selector_verify_url`
    # does not exist on the configuration object.
    if not verifySession(cookies, feed.username, feed.selector_login_verify_url):
        print("Session couldnt be verified")
        return

    # Keep the parsed feed in its own name: rebinding `feed` here made the
    # later `feed.selector` read from the parsed RSS, not the configuration.
    parsed = queryRss(feed.url, True)
    for post in parsed.entries:
        # Entries are not guaranteed to carry `created` or `author`.
        created = getattr(post, "created", None) or getattr(post, "published", "")
        author = getattr(post, "author", "")
        if queryFeed(db, table, post.title, created, author) < 0:
            content = getContent(post.link, cookies, feed.selector)
            insertFeed(db, table, post.title, created, author, content)
|
||||
|
||||
|
||||
|
||||
def main():
    """Set up the tables, then refresh the ``feed01`` table once an hour.

    Blocks forever; the refresh itself runs on timer threads started by
    ``Periodic``.  NOTE(review): nothing visible in this file calls main() -
    confirm how the script is meant to be started.
    """
    db = getDb()
    createTableMa(db, "feeds")
    if not createTableIm(db, "feed01"):
        die()

    # Periodic forwards *args to the callback, so the arguments are passed
    # individually; wrapping them in a list handed updateFeed one list
    # argument and it failed for lack of `table`.
    rt = Periodic(60 * 60 * 1, updateFeed, db, "feed01")
    try:
        while True:
            time.sleep(5)
    finally:
        # Cancel the pending timer so its thread does not keep the process
        # alive after the main loop is interrupted.
        rt.stop()
|
||||
|
||||
|
||||
class lf:
    """Configuration of one feed, loaded from its row in the ``feeds`` table.

    The row is selected by ``name``.  Every setting defaults to an empty
    string when the row or the value is missing.
    """

    # Attribute names used on this class that differ from the column names
    # of the `feeds` table (which stores the credentials as `pass` / `user`).
    _COLUMN_ALIASES = {"password": "pass", "username": "user"}

    def __init__(self, db, name):
        """Remember *db* and *name*, then load the settings from the database."""
        self.name = name
        self.db = db
        self.url = ""
        self.password = ""
        self.username = ""
        self.selector = ""
        self.selector_login_user = ""
        self.selector_login_pass = ""
        self.selector_login_verify = ""
        self.selector_login_url = ""
        self.selector_login_verify_url = ""
        self.init_from_db()

    def init_from_db(self):
        """Fill every setting from the ``feeds`` row whose name is ``self.name``."""
        # One query per setting; missing values come back as "".
        self.url = self.query_db(self.db, "url")
        self.password = self.query_db(self.db, "password")
        self.username = self.query_db(self.db, "username")
        self.selector = self.query_db(self.db, "selector")
        self.selector_login_user = self.query_db(self.db, "selector_login_user")
        self.selector_login_pass = self.query_db(self.db, "selector_login_pass")
        self.selector_login_verify = self.query_db(self.db, "selector_login_verify")
        self.selector_login_url = self.query_db(self.db, "selector_login_url")
        self.selector_login_verify_url = self.query_db(self.db, "selector_login_verify_url")

    def init_from_scratch(
            self,
            name,
            url,
            password,
            username,
            selector,
            selector_login_user,
            selector_login_pass,
            selector_login_verify,
            selector_login_verify_url,
            selector_login_url):
        """Set every setting directly from the arguments, without the database.

        Note the parameter order: ``selector_login_verify_url`` comes before
        ``selector_login_url``.
        """
        self.name = name
        self.url = url
        self.password = password
        self.username = username
        self.selector = selector
        self.selector_login_user = selector_login_user
        self.selector_login_pass = selector_login_pass
        self.selector_login_verify = selector_login_verify
        self.selector_login_url = selector_login_url
        self.selector_login_verify_url = selector_login_verify_url

    def query_db(self, db, string):
        """Return the value of column *string* for this feed's row.

        Args:
            db: an open DB-API connection.
            string: column name; ``"password"`` and ``"username"`` are
                accepted as aliases for the ``pass`` and ``user`` columns.

        Returns:
            The column value, or ``""`` when there is no row for
            ``self.name`` or the value is NULL.

        Raises:
            ValueError: if *string* is not a plain identifier.
        """
        column = self._COLUMN_ALIASES.get(string, string)
        # Column names cannot be bound as parameters, so refuse anything
        # that is not a simple identifier before formatting it in.
        if not column.isidentifier():
            raise ValueError("invalid column name: {!r}".format(string))

        cursor = db.cursor()
        try:
            # The feed name is bound as a parameter; LIMIT 1 ensures no
            # unread rows remain when the cursor is closed.
            cursor.execute(
                "SELECT `{}` FROM feeds WHERE name = %s LIMIT 1".format(column),
                (self.name,),
            )
            row = cursor.fetchone()
        finally:
            cursor.close()

        # fetchone() yields a row tuple; hand back the value itself.
        if row is None or row[0] is None:
            return ""
        return row[0]
|
||||
|
||||
|
||||
|
||||
|
||||
class Periodic(object):
    """
    A periodic task running in threading.Timers

    Calls ``function(*args, **kwargs)`` every ``interval`` seconds; the first
    call happens one interval after ``start()``.  Pass ``autostart=False`` to
    create the task without starting it.
    """

    def __init__(self, interval, function, *args, **kwargs):
        """Store the task and start it unless ``autostart=False`` is given."""
        # `autostart` is consumed here and is not forwarded to the callback.
        autostart = kwargs.pop('autostart', True)
        self._lock = Lock()
        self._timer = None
        self.function = function
        self.interval = interval
        self.args = args
        self.kwargs = kwargs
        self._stopped = True
        if autostart:
            self.start()

    def _schedule(self):
        """Arm a new timer for the next run; the caller must hold the lock."""
        self._timer = Timer(self.interval, self._run)
        self._timer.start()

    def start(self, from_run=False):
        """Start the task if it is stopped (always, when *from_run* is true)."""
        # `with` releases the lock even if creating the timer raises.
        with self._lock:
            if from_run or self._stopped:
                self._stopped = False
                self._schedule()

    def _run(self):
        """Timer callback: arm the next run, then call the function."""
        with self._lock:
            # stop() may have been called after this timer fired; in that
            # case neither reschedule nor run.
            if self._stopped:
                return
            self._schedule()
        # Run outside the lock so a slow callback does not block stop().
        self.function(*self.args, **self.kwargs)

    def stop(self):
        """Stop the task and cancel the pending timer, if there is one."""
        with self._lock:
            self._stopped = True
            # No timer exists yet when stop() precedes the first start().
            if self._timer is not None:
                self._timer.cancel()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
Loading…
Reference in a new issue