# rss_scrape/server.py

#!/usr/bin/python
# coding: utf-8
from http.server import BaseHTTPRequestHandler,HTTPServer
from lxml import etree
import feedgenerator
import requests
import datetime
import os

def fetchThalia():
    """Scrape Thalia's new audiobook downloads and return them as an RSS feed.

    Downloads the listing page, turns every ``<li>`` of the
    ``suchergebnis-liste`` result list into one feed item and serialises the
    feed built with ``feedgenerator.Rss201rev2Feed``.

    Identifier, link, description and title fall back to the placeholder
    string ``'empty'`` when the corresponding node is missing from an item.

    Returns:
        bytes: The UTF-8 encoded RSS document.

    Raises:
        requests.RequestException: If the page cannot be fetched, including
            when the server does not answer within the timeout.
    """
    # NOTE(review): the cookie below embeds session tokens directly in the
    # source; consider loading it from configuration instead.
    headers = {
        'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36',
        'accept-encoding': 'gzip, deflate',
        'accept-language': 'en-US,en;q=0.5',
        'cookie': 'GESCHICHTENENTDECKERAB=false; SPIELWARENVISAB=false; SFORMAT_BUCH=SMO2cmJ1Y2gtRG93bmxvYWRfTVAz; gcor="SIDYN1yqhjMJDyVK45QHJCdvQAAAKQ"; ab_bucket=9; ab_container=3; FDCSESSION=7D7CF1AF51E48F70EF1EF81105D0C25C.shoptc6; KUNDE_LAYOUT=FLAT; WARENKORB-XSRF-TOKEN=97a702ea-5ea1-4431-902f-616cd350f417; abokaufen-XSRF-TOKEN=aec0b4e1-6fb9-4965-82f4-b5856a471c27; affiliate-XSRF-TOKEN=59ebc500-e4e0-486f-805e-0e375d810ca1; SUCHE_LAYOUT=FLAT; club=KEIN_MITGLIED',
    }

    # Without a timeout a stalled upstream would block the (single-threaded)
    # HTTP server forever.
    response = requests.get(
        'https://www.thalia.de/bz/hoerbuch-downloads-neuheiten/201567-201859-213891/?sort=sfed&filterIM_ABO_VERFUEGBAR_HBDL=IAV&ajax=false&asn=true&allayout=FLAT',
        headers=headers,
        timeout=30,
    )

    # One <li> per product in the search result list.
    tree = etree.HTML(response.text)
    items = tree.xpath("//ul[contains(concat(' ', @class, ' '), 'suchergebnis-liste')]/li")

    # NOTE(review): "Foo" looks like a placeholder feed title; kept unchanged
    # so existing subscriptions see the same feed metadata.
    feed = feedgenerator.Rss201rev2Feed(
        title="Foo",
        link="https://rss.eris.cc/thalia/new",
        description="Thalia Neuerscheinungen aus der Kategorie Hoerbuecher",
        language="de",
    )

    for item in items:
        # identifier (EAN of the product)
        ids = item.xpath('@data-ean')
        post_id = ids[0] if ids else 'empty'

        # link to the product page (the href is site-relative)
        links = item.xpath('a/@href')
        link = 'https://www.thalia.de' + links[0] if links else 'empty'

        # description
        descriptions = item.xpath('ul/li/p[3]/text()')
        description = descriptions[0] if descriptions else 'empty'

        # price, appended to the description when present
        prices = item.xpath('ul/li/a/@product-price')
        if prices:
            description += ", Preis: " + prices[0]

        # title
        titles = item.xpath('section/h3/text()')
        title = titles[0] if titles else 'empty'

        # authors: entries without a linked name are skipped instead of
        # raising IndexError
        names = []
        for entry in item.xpath('section/ul/li'):
            texts = entry.xpath('a/text()')
            if texts and texts[0].strip():
                names.append(texts[0].strip())
        author = " ".join(names)

        feed.add_item(
            # Only prefix the title with the author when there is one.
            title=(author + " - " + title) if author else title,
            link=link,
            description=description,
            unique_id=post_id,
            author_name=author or None,
        )

    return feed.writeString('utf-8').encode('utf-8')

def fetchBoerse():
    """Scrape the boerse.im audiobook forum and return it as an RSS feed.

    Downloads the forum index, turns every ``<tr>`` below the element with id
    ``threadbits_forum_29`` into one feed item and serialises the feed built
    with ``feedgenerator.Rss201rev2Feed``.

    Identifier, link, description and title fall back to the placeholder
    string ``'empty'`` when the corresponding node is missing from a row.
    The publication date is built from the ``HH:MM`` time found in the third
    cell; it is placed on today's date when the day label contains "Heute"
    and on yesterday's date otherwise. Rows without a parsable time get no
    publication date.

    Returns:
        bytes: The UTF-8 encoded RSS document.

    Raises:
        requests.RequestException: If the page cannot be fetched, including
            when the server does not answer within the timeout.
    """
    # NOTE(review): the cookie below embeds account credentials directly in
    # the source; consider loading it from configuration instead.
    headers = {
        'user-agent': 'Mozilla/5.0 (X11; Fedora; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36',
        'accept-encoding': 'gzip, deflate',
        'accept-language': 'en-US,en;q=0.5',
        'cookie': 'bbsessionhash=5189a2ed214f57e9bbe77ae72ee2f668; bblastvisit=1625125458; bblastactivity=0; bbuserid=7485156; bbpassword=8574e2c835d63e85d75aac1ea733c213',
    }

    # Without a timeout a stalled upstream would block the (single-threaded)
    # HTTP server forever.
    response = requests.get(
        'https://boerse.im/boerse/audioboerse/hoerbuecher-und-hoerspiele/',
        headers=headers,
        timeout=30,
    )

    # One <tr> per thread in the forum table.
    tree = etree.HTML(response.text)
    rows = tree.xpath('//*[@id="threadbits_forum_29"]/tr')

    feed = feedgenerator.Rss201rev2Feed(
        title="Boerse - Hoerbuecher",
        link="https://rss.eris.cc/boerse/audiobooks",
        description="Aktuelle Releases aus der Boerse",
        language="de",
    )

    # Start of the current day; the scraped clock time is added to it.
    midnight = datetime.datetime.today().replace(
        hour=0, minute=0, second=0, microsecond=0)

    for row in rows:
        # identifier
        ids = row.xpath('td[2]/@id')
        post_id = ids[0] if ids else 'empty'

        # link to the thread (the href is site-relative)
        links = row.xpath('td[2]//a/@href')
        link = 'https://boerse.im/' + links[0] if links else 'empty'

        # description
        descriptions = row.xpath('td[2]/@title')
        description = descriptions[0] if descriptions else 'empty'

        # title
        titles = row.xpath('td[2]//a/text()')
        title = titles[0] if titles else 'empty'

        # publication date: reset per row so a row without a time neither
        # crashes nor inherits the previous row's date
        pubdate = None
        times = row.xpath('td[3]//span[1]/text()')
        if times:
            try:
                clock = datetime.datetime.strptime(times[0].strip(), '%H:%M')
            except ValueError:
                clock = None
            if clock is not None:
                pubdate = midnight + datetime.timedelta(
                    hours=clock.hour, minutes=clock.minute)

                # Any day label other than "Heute" is treated as yesterday.
                # timedelta arithmetic (unlike replace(day=...)) is safe on
                # the first day of a month.
                # NOTE(review): rows carrying an absolute date would also be
                # dated yesterday - confirm against the live markup.
                day_labels = row.xpath('td[3]/div/text()')
                if day_labels and "Heute" not in day_labels[0]:
                    pubdate -= datetime.timedelta(days=1)

        feed.add_item(
            title=title,
            link=link,
            description=description,
            unique_id=post_id,
            pubdate=pubdate,
        )

    return feed.writeString('utf-8').encode('utf-8')

class Handler(BaseHTTPRequestHandler):
    """Serve the scraped RSS feeds over HTTP.

    Routes:
        ``/boerse/audiobooks``  feed produced by ``fetchBoerse()``
        ``/thalia/new``         feed produced by ``fetchThalia()``

    Every other path is answered with a small 404 HTML page.
    """

    def do_GET(self):
        """Answer a GET request with the matching feed, a 404 or a 502.

        The feed is generated *before* any status line is written, so a
        failing scraper results in a clean 502 error instead of a truncated
        response that already announced "200 OK".
        """
        # Resolve the scraper lazily so only the requested one is looked up.
        if self.path == '/boerse/audiobooks':
            fetch = fetchBoerse
        elif self.path == '/thalia/new':
            fetch = fetchThalia
        else:
            self._respond(404, 'text/html; charset=utf-8',
                          b"<h2>nothing to see here</h2>")
            return

        try:
            body = fetch()
        except Exception as exc:
            # Request boundary: log the failure and report it to the client
            # rather than letting the connection die half-way.
            self.log_error("feed generation for %s failed: %r", self.path, exc)
            self.send_error(502, "Upstream fetch failed")
            return

        self._respond(200, 'application/rss+xml; charset=utf-8', body)

    def _respond(self, status, content_type, body):
        """Write a complete response: status line, headers and ``body`` bytes."""
        self.send_response(status)
        self.send_header('Content-type', content_type)
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)

# Entry point: listen on every interface, port 3000, and keep handling
# requests until the process is terminated.
LISTEN_ADDRESS = ('0.0.0.0', 3000)
server = HTTPServer(LISTEN_ADDRESS, Handler)
server.serve_forever()