#!/usr/bin/python
#
"""Fetch articles from a login-protected RSS feed and store them in MySQL.

Workflow (see ``updateFeed``): log in with a Selenium-driven Chrome browser,
reuse the resulting cookies with ``requests`` to download each feed entry,
extract the parts matched by a CSS selector and insert them into a table.
"""
import re
import sys
import time
from io import StringIO
from threading import Timer, Lock

import feedparser
import lxml.etree as etree
import mysql.connector
import requests as req
import selenium
from cssselect import HTMLTranslator, SelectorError
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
#import web

DB_HOST = "127.0.0.1"
DB_USER = "dbuser"
DB_PASSWD = "dbpasswd"
DB_DATABASE = "base"

ACCOUNT_USERNAME = "foo"
ACCOUNT_PASSWORD = "foo"

# Seconds to wait for an article download before giving up.
REQUEST_TIMEOUT = 30
# Seconds Selenium keeps polling for an element before failing a lookup.
ELEMENT_WAIT = 10

# Table and column names cannot be bound as query parameters, so they are
# validated against this pattern before being formatted into SQL.
_IDENTIFIER_RE = re.compile(r"[A-Za-z_][A-Za-z0-9_]*")


def _checkedIdentifier(name):
    """Return *name* unchanged if it is a plain SQL identifier.

    Raises:
        ValueError: if *name* is not a string of letters, digits and
            underscores starting with a letter or underscore.
    """
    if not isinstance(name, str) or not _IDENTIFIER_RE.fullmatch(name):
        raise ValueError("Invalid SQL identifier: {!r}".format(name))
    return name


def getDb():
    """Open and return a MySQL connection using the module-level DB_* settings."""
    db = mysql.connector.connect(
        host=DB_HOST,
        user=DB_USER,
        passwd=DB_PASSWD,
        database=DB_DATABASE)
    return db


def tableExist(db, name):
    """Return True if a table called *name* exists in the current database."""
    cursor = db.cursor()
    try:
        cursor.execute(
            "SELECT COUNT(*) FROM information_schema.tables "
            "WHERE table_schema = DATABASE() AND table_name = %s",
            (name,))
        return cursor.fetchone()[0] >= 1
    finally:
        cursor.close()


def createTable(db, name):
    """Create the article table *name* (id, title, date, author, content).

    Raises:
        ValueError: if *name* is not a plain SQL identifier.
    """
    table = _checkedIdentifier(name)
    cursor = db.cursor()
    try:
        # `date` is required by insertFeed/queryFeed. The content column is a
        # text type because VARCHAR(65535) does not fit MySQL's row-size limit.
        cursor.execute(
            "CREATE TABLE {0} ("
            "id INT AUTO_INCREMENT PRIMARY KEY, "
            "title VARCHAR(255), "
            "`date` VARCHAR(255), "
            "author VARCHAR(255), "
            "content MEDIUMTEXT)".format(table))
    finally:
        cursor.close()


def createTableMa(db, name):
    """Create the feed-configuration table *name* if it does not exist.

    The column names match the attributes read and written by ``lf``.

    Raises:
        ValueError: if *name* is not a plain SQL identifier.
    """
    table = _checkedIdentifier(name)
    cursor = db.cursor()
    try:
        cursor.execute(
            "CREATE TABLE IF NOT EXISTS {0} ("
            "id INT AUTO_INCREMENT PRIMARY KEY, "
            "name VARCHAR(255), "
            "url VARCHAR(255), "
            "password VARCHAR(255), "
            "username VARCHAR(255), "
            "selector VARCHAR(255), "
            "selector_login_user VARCHAR(255), "
            "selector_login_pass VARCHAR(255), "
            "selector_login_verify VARCHAR(255), "
            "selector_login_url VARCHAR(255), "
            "selector_login_verify_url VARCHAR(255))".format(table))
    finally:
        cursor.close()


def createTableIm(db, name):
    """Ensure the article table *name* exists; return True when it does."""
    if tableExist(db, name):
        return True
    createTable(db, name)
    # Re-check once instead of recursing, so a failed creation cannot loop.
    return tableExist(db, name)


def die():
    """Terminate the process via ``sys.exit``."""
    sys.exit()


def insertFeed(db, table, title, date, author, content):
    """Insert one article row into *table* and commit it.

    Raises:
        ValueError: if *table* is not a plain SQL identifier.
    """
    table = _checkedIdentifier(table)
    cursor = db.cursor()
    try:
        cursor.execute(
            "INSERT INTO {0} (title, `date`, author, content) "
            "VALUES (%s, %s, %s, %s)".format(table),
            (title, date, author, content))
        # Commit explicitly so the row is persisted even when the connection
        # is not in autocommit mode.
        db.commit()
    finally:
        cursor.close()


def queryFeed(db, table, title, date, author):
    """Return the id of the article with this *title* and *date*, or -1.

    *author* is accepted for call compatibility but is not part of the lookup.

    Raises:
        ValueError: if *table* is not a plain SQL identifier.
    """
    table = _checkedIdentifier(table)
    cursor = db.cursor()
    try:
        # LIMIT 1 keeps the cursor free of unread rows after fetchone().
        cursor.execute(
            "SELECT id FROM {0} WHERE title = %s AND `date` = %s "
            "LIMIT 1".format(table),
            (title, date))
        row = cursor.fetchone()
    finally:
        cursor.close()
    if row is None:
        return -1
    return int(row[0])


def queryRss(url, rss):
    """Parse *url* with feedparser when *rss* is truthy; otherwise return None."""
    if not rss:
        return None
    return feedparser.parse(url)


def _cookiesForRequests(session_cookies):
    """Convert Selenium's list of cookie dicts to the name->value dict requests takes.

    Anything that is not a list/tuple (a dict, a cookie jar, None) is passed
    through unchanged.
    """
    if isinstance(session_cookies, (list, tuple)):
        return {c["name"]: c["value"] for c in session_cookies}
    return session_cookies


def getContent(entryUrl, session_cookies, selector):
    """Download *entryUrl* and return the HTML of every node matching *selector*.

    Args:
        entryUrl: address of the article page.
        session_cookies: cookies from ``getSession`` (list of dicts) or any
            cookie object ``requests`` accepts.
        selector: CSS selector; translated to XPath with cssselect.

    Returns:
        The serialized matching elements concatenated into one string
        (empty when nothing matches).

    The process exits through ``die`` when the response status is not 200.
    """
    r = req.get(entryUrl,
                cookies=_cookiesForRequests(session_cookies),
                timeout=REQUEST_TIMEOUT)
    print(r.status_code)
    if r.status_code != 200:
        print("Err fetching: " + entryUrl)
        die()
    xpath = HTMLTranslator().css_to_xpath(selector)
    parser = etree.HTMLParser()
    tree = etree.parse(StringIO(r.text), parser)
    # xpath() yields a list of elements; serialize each one individually.
    return "".join(
        etree.tostring(node, encoding="unicode", method="html", with_tail=False)
        for node in tree.xpath(xpath))


#url: https://accounts.ft.com/login
def getSession(user, password, url="https://accounts.ft.com/login"):
    """Log in through a Chrome browser and return the session cookies.

    Args:
        user: account e-mail typed into the ``enter-email`` field.
        password: password typed into the ``enter-password`` field.
        url: address of the login page.

    Returns:
        The list of cookie dicts reported by the browser after the password
        form has been submitted.
    """
    driver = webdriver.Chrome()
    try:
        # Let element lookups poll for a while; the form is filled in steps.
        driver.implicitly_wait(ELEMENT_WAIT)
        driver.get(url)
        driver.find_element(By.ID, "enter-email").send_keys(user)
        driver.find_element(By.ID, "enter-email-next").submit()
        driver.find_element(By.ID, "enter-password").send_keys(password)
        driver.find_element(By.ID, "enter-password").submit()
        # NOTE(review): cookies are read right after submit; confirm the login
        # redirect has completed by then.
        return driver.get_cookies()
    finally:
        # Always close the browser, also when a lookup fails.
        driver.quit()


#url: https://www.ft.com/myaccount
def verifySession(session, user, url="https://www.ft.com/myaccount"):
    """Return True if the account page at *url* shows *user* for these cookies.

    Args:
        session: one cookie dict or an iterable of cookie dicts, as returned
            by ``getSession``.
        user: text expected inside the ``rightRailEmailAddress`` element.
        url: address of the account page.
    """
    cookies = [session] if isinstance(session, dict) else list(session or [])
    driver = webdriver.Chrome()
    try:
        driver.implicitly_wait(ELEMENT_WAIT)
        # A page has to be loaded before cookies can be added for its domain.
        driver.get(url)
        for cookie in cookies:
            try:
                driver.add_cookie(cookie)
            except WebDriverException:
                # Best effort: skip cookies the browser refuses to accept for
                # the page that is currently loaded.
                continue
        # Reload so the page is rendered with the cookies applied.
        driver.get(url)
        matches = driver.find_elements(By.ID, "rightRailEmailAddress")
        return any(user in element.text for element in matches)
    finally:
        driver.quit()


def _entryField(post, *names):
    """Return the first non-empty attribute of *post* among *names*, else ""."""
    for name in names:
        value = getattr(post, name, None)
        if value:
            return value
    return ""


def _syncFeed(db, table, feedUrl, cookies, selector):
    """Store every entry of the feed at *feedUrl* that is not yet in *table*."""
    feed = queryRss(feedUrl, True)
    for post in feed.entries:
        title = _entryField(post, "title")
        # Fall back to the other date fields when an entry has no `created`.
        date = _entryField(post, "created", "published", "updated")
        author = _entryField(post, "author")
        if queryFeed(db, table, title, date, author) < 0:
            content = getContent(post.link, cookies, selector)
            insertFeed(db, table, title, date, author, content)


def updateFeed(db, table):
    """Log in with the module-level account and sync the FT world feed into *table*."""
    cookies = getSession(ACCOUNT_USERNAME, ACCOUNT_PASSWORD,
                         "https://accounts.ft.com/login")
    if not verifySession(cookies, ACCOUNT_USERNAME,
                         "https://www.ft.com/myaccount"):
        print("Session couldnt be verified")
        return
    # The last class name was previously cut off ("...content-bod").
    _syncFeed(
        db, table, "https://www.ft.com/world?format=rss", cookies,
        "img.n-image,"
        "div.article__content-body.n-content-body.js-article__content-body")


def updateFeedFromDb(db, table, feed):
    """Sync the feed described by *feed* (an ``lf`` instance) into *table*."""
    cookies = getSession(feed.username, feed.password, feed.selector_login_url)
    if not verifySession(cookies, feed.username,
                         feed.selector_login_verify_url):
        print("Session couldnt be verified")
        return
    _syncFeed(db, table, feed.url, cookies, feed.selector)


def main():
    """Create the tables if needed, run one feed update, then stay alive."""
    db = getDb()
    createTableMa(db, "feeds")
    if not createTableIm(db, "feed01"):
        die()
    updateFeed(db, "feed01")
    # TODO: schedule hourly updates, e.g.
    #   Periodic(60*60*1, updateFeed, db, "feed01")
    while True:
        time.sleep(5)


class lf:
    """Configuration of one feed, backed by a row of the ``feeds`` table.

    The row is identified by ``name``; every other column is mirrored as an
    instance attribute of the same name.
    """

    # Columns of `feeds` mirrored as attributes (besides `name`).
    _COLUMNS = (
        "url",
        "password",
        "username",
        "selector",
        "selector_login_user",
        "selector_login_pass",
        "selector_login_verify",
        "selector_login_url",
        "selector_login_verify_url",
    )

    def __init__(self, db, name):
        """Bind to connection *db* and load the row called *name*."""
        self.name = name
        self.db = db
        for column in self._COLUMNS:
            setattr(self, column, "")
        self.init_from_db()

    def init_from_db(self):
        """Load every column from the database; missing values become ""."""
        for column in self._COLUMNS:
            setattr(self, column, self.query_db(column))

    def init_from_scratch(
            self, name, url, password,
            username, selector,
            selector_login_user, selector_login_pass,
            selector_login_verify, selector_login_verify_url,
            selector_login_url):
        """Set all attributes from the arguments without touching the database."""
        self.name = name
        self.url = url
        self.password = password
        self.username = username
        self.selector = selector
        self.selector_login_user = selector_login_user
        self.selector_login_pass = selector_login_pass
        self.selector_login_verify = selector_login_verify
        self.selector_login_url = selector_login_url
        self.selector_login_verify_url = selector_login_verify_url

    def query_db(self, string):
        """Return the value of column *string* for this feed, or "" if absent.

        Raises:
            ValueError: if *string* is not a plain SQL identifier.
        """
        column = _checkedIdentifier(string)
        cursor = self.db.cursor()
        try:
            cursor.execute(
                "SELECT {0} FROM feeds WHERE name = %s LIMIT 1".format(column),
                (self.name,))
            row = cursor.fetchone()
        finally:
            cursor.close()
        if row is None or row[0] is None:
            return ""
        return row[0]

    def update_db(self, string, value):
        """Set column *string* to *value* for this feed and commit.

        Raises:
            ValueError: if *string* is not a plain SQL identifier.
        """
        column = _checkedIdentifier(string)
        cursor = self.db.cursor()
        try:
            cursor.execute(
                "UPDATE feeds SET {0} = %s WHERE name = %s".format(column),
                (value, self.name))
            self.db.commit()
        finally:
            cursor.close()

    def write_db(self):
        """Insert this feed's row, or update it when the name already exists."""
        values = tuple(getattr(self, column) for column in self._COLUMNS)
        cursor = self.db.cursor()
        try:
            cursor.execute(
                "SELECT id FROM feeds WHERE name = %s LIMIT 1", (self.name,))
            if cursor.fetchone() is None:
                cursor.execute(
                    "INSERT INTO feeds (name, {0}) VALUES ({1})".format(
                        ", ".join(self._COLUMNS),
                        ", ".join(["%s"] * (len(self._COLUMNS) + 1))),
                    (self.name,) + values)
            else:
                cursor.execute(
                    "UPDATE feeds SET {0} WHERE name = %s".format(
                        ", ".join(
                            "{0} = %s".format(c) for c in self._COLUMNS)),
                    values + (self.name,))
            self.db.commit()
        finally:
            cursor.close()


class Periodic(object):
    """
    A periodic task running in threading.Timers

    Calls ``function(*args, **kwargs)`` every ``interval`` seconds until
    ``stop`` is called. The keyword ``autostart`` (default True) is consumed
    by the constructor and not forwarded to ``function``.
    """

    def __init__(self, interval, function, *args, **kwargs):
        self._lock = Lock()
        self._timer = None
        self.function = function
        self.interval = interval
        self.args = args
        # Remove the control flag before keeping the kwargs for `function`.
        autostart = kwargs.pop('autostart', True)
        self.kwargs = kwargs
        self._stopped = True
        if autostart:
            self.start()

    def start(self, from_run=False):
        """Arm the timer; ``from_run`` marks the re-arm done by ``_run``."""
        with self._lock:
            if from_run:
                if self._stopped:
                    # stop() was called after the timer fired: do not re-arm.
                    return
            elif not self._stopped:
                # Already running; keep the existing timer.
                return
            self._stopped = False
            self._timer = Timer(self.interval, self._run)
            self._timer.start()

    def _run(self):
        """Timer callback: schedule the next run, then call the function."""
        self.start(from_run=True)
        self.function(*self.args, **self.kwargs)

    def stop(self):
        """Cancel the pending timer, if any, and prevent further runs."""
        with self._lock:
            self._stopped = True
            if self._timer is not None:
                self._timer.cancel()


if __name__ == "__main__":
    main()