nyaa/import_to_es.py

#!/usr/bin/env python
"""
Bulk load torrents from mysql into the elasticsearch `nyaa` and `sukebei`
indices, which are assumed to already exist.
This is a one-shot deal, so you'd either need to complement it
with a cron job or some binlog-reading thing (TODO)
"""
import sys
import json
from nyaa import app, db, models

from elasticsearch import Elasticsearch
from elasticsearch.client import IndicesClient
from elasticsearch import helpers

# This should be progressbar33
import progressbar

# Shared elasticsearch client for the whole import; no hosts are given, so
# the library's default connection target is used. The explicit timeout=30
# overrides the library default -- presumably because bulk requests are slow.
es = Elasticsearch(timeout=30)
# Index-level API (settings / refresh) bound to the same client.
ic = IndicesClient(es)

# turn into thing that elasticsearch indexes. We flatten in
# the stats (seeders/leechers) so we can order by them in es naturally.
# we _don't_ dereference uploader_id to the user's display name however,
# instead doing that at query time. I _think_ this is right because
# we don't want to reindex all the user's torrents just because they
# changed their name, and we don't really want to FTS search on the user anyway.
# Maybe it's more convenient to dereference though.
def mk_es(t, index_name):
    """Build the bulk-index action dict for a single torrent.

    `t` is the torrent row to index and `index_name` the index it goes
    into. The result carries the action metadata (`_id`, `_type`, `_index`)
    plus the document body under `_source`, with the stats columns
    flattened into the document itself.
    """
    # XXX all the bitflags are numbers, so coerce each one to a real bool.
    # hidden/deleted are indexed too and filtered at query time.
    # TODO could delete those from es entirely instead.
    flag_names = ("anonymous", "trusted", "remake", "complete",
                  "hidden", "deleted")
    stats = t.stats

    source = {
        # the id is also indexed as a number so results can be ordered by
        # it (more or less equivalent to ordering by created_time)
        "id": t.id,
        "display_name": t.display_name,
        "created_time": t.created_time,
        # not analyzed; kept so magnet links can be rendered without
        # going back to sql
        "info_hash": t.info_hash.hex(),
        "filesize": t.filesize,
        "uploader_id": t.uploader_id,
        "main_category_id": t.main_category_id,
        "sub_category_id": t.sub_category_id,
        "comment_count": t.comment_count,
    }
    source.update((name, bool(getattr(t, name))) for name in flag_names)
    source["has_torrent"] = t.has_torrent
    # Stats
    source.update(
        download_count=stats.download_count,
        leech_count=stats.leech_count,
        seed_count=stats.seed_count,
    )

    return {
        "_id": t.id,
        "_type": "torrent",
        "_index": index_name,
        "_source": source,
    }

# page through an sqlalchemy query, like the per_fetch but
# doesn't break the eager joins its doing against the stats table.
# annoying that this isn't built in somehow.
def page_query(query, limit=sys.maxsize, batch_size=10000, progress_bar=None):
    """Yield the rows of `query` one at a time, fetched in slices.

    Rows are pulled with `query.slice(start, stop)` in windows of
    `batch_size`, and iteration stops after `limit` rows or as soon as a
    window comes back short (i.e. the result set is exhausted).

    `progress_bar`, when given, must expose `update(n)`. It is called after
    every *full* window with the number of rows yielded so far. The trailing
    short window is not reported; the caller is expected to finish the bar.

    NOTE(review): slicing only pages reliably if `query` has a stable
    ordering -- confirm the callers' queries are ordered.
    """
    start = 0
    while start < limit:
        stop = min(limit, start + batch_size)
        if stop <= start:
            # batch_size <= 0 would never advance; bail out instead of spinning
            break

        fetched = 0
        for thing in query.slice(start, stop):
            fetched += 1
            yield thing

        # a short (or empty) window means there is nothing left to fetch,
        # so don't bother issuing another query just to see it come back empty
        if fetched < stop - start:
            break

        start = stop
        if progress_bar is not None:
            # report rows actually done, not the offset the window began at
            progress_bar.update(start)

# (elasticsearch index name, torrent model class) pairs. Each flavor's
# torrents are bulk-loaded into the index of the same name by the loop below.
FLAVORS = [
    ('nyaa', models.NyaaTorrent),
    ('sukebei', models.SukebeiTorrent)
]

# Get binlog status from mysql. This is read before any torrents are
# imported -- presumably so the sync process can start from this position
# and replay whatever changed while the import was running.
master_status = db.engine.execute('SHOW MASTER STATUS;').fetchone()
if master_status is None:
    # fetchone() gives None when the statement returns no row; fail with a
    # readable message instead of a TypeError on the subscripts below.
    sys.exit('SHOW MASTER STATUS returned no rows; '
             'is binary logging enabled on this MySQL server?')

# first two columns of the status row: binlog file name and offset
position_json = {
    'log_file': master_status[0],
    'log_pos': master_status[1]
}

print('Save the following in the file configured in your ES sync config JSON:')
print(json.dumps(position_json))

# Bulk-load every flavor's torrents into the index named after the flavor.
for flavor, torrent_class in FLAVORS:
    print('Importing torrents for index', flavor, 'from', torrent_class)
    bar = progressbar.ProgressBar(
        # total row count up front so the bar can show progress and an ETA
        maxval=torrent_class.query.count(),
        widgets=[ progressbar.SimpleProgress(),
                  ' [', progressbar.Timer(), '] ',
                  progressbar.Bar(),
                  ' (', progressbar.ETA(), ') ',
            ])

    # turn off refreshes while bulk loading
    ic.put_settings(body={'index': {'refresh_interval': '-1'}}, index=flavor)
    try:
        bar.start()
        # stream one index action per torrent; rows are paged out of sql
        # lazily, so the whole table is never held in memory at once
        actions = (
            mk_es(t, flavor)
            for t in page_query(torrent_class.query, progress_bar=bar)
        )
        helpers.bulk(es, actions, chunk_size=10000)
        bar.finish()

        # Refresh the index immediately
        ic.refresh(index=flavor)
        print('Index refresh done.')
    finally:
        # restore to near-enough real time -- also when the bulk load
        # fails, so the index is never left with refreshes switched off
        ic.put_settings(body={'index': {'refresh_interval': '30s'}}, index=flavor)
WIP es stuff 2017-05-14 06:48:17 +00:00			`#!/usr/bin/env python`
			`"""`
			Bulk load torents from mysql into elasticsearch `nyaav2` index,
			`which is assumed to already exist.`
			`This is a one-shot deal, so you'd either need to complement it`
			`with a cron job or some binlog-reading thing (TODO)`
			`"""`
Update import_to_es.py to index both torrent flavors, rename sync_es config import_to_es.py will now also SHOW MASTER STATUS now Changed progressbar dependency 2017-05-27 23:12:48 +00:00			`import sys`
			`import json`
			`from nyaa import app, db, models`

WIP es stuff 2017-05-14 06:48:17 +00:00			`from elasticsearch import Elasticsearch`
import_to_es: munge refresh_interval for speed 2017-05-17 05:00:58 +00:00			`from elasticsearch.client import IndicesClient`
WIP es stuff 2017-05-14 06:48:17 +00:00			`from elasticsearch import helpers`

Update import_to_es.py to index both torrent flavors, rename sync_es config import_to_es.py will now also SHOW MASTER STATUS now Changed progressbar dependency 2017-05-27 23:12:48 +00:00			`# This should be progressbar33`
			`import progressbar`
WIP es stuff 2017-05-14 06:48:17 +00:00
added timeout to import and sync es 2017-05-17 06:15:48 +00:00			`es = Elasticsearch(timeout=30)`
import_to_es: munge refresh_interval for speed 2017-05-17 05:00:58 +00:00			`ic = IndicesClient(es)`
WIP es stuff 2017-05-14 06:48:17 +00:00
			`# turn into thing that elasticsearch indexes. We flatten in`
			`# the stats (seeders/leechers) so we can order by them in es naturally.`
			`# we _don't_ dereference uploader_id to the user's display name however,`
			`# instead doing that at query time. I _think_ this is right because`
			`# we don't want to reindex all the user's torrents just because they`
			`# changed their name, and we don't really want to FTS search on the user anyway.`
			`# Maybe it's more convenient to derefence though.`
Update import_to_es.py to index both torrent flavors, rename sync_es config import_to_es.py will now also SHOW MASTER STATUS now Changed progressbar dependency 2017-05-27 23:12:48 +00:00			`def mk_es(t, index_name):`
WIP es stuff 2017-05-14 06:48:17 +00:00			`return {`
			`"_id": t.id,`
			`"_type": "torrent",`
Update import_to_es.py to index both torrent flavors, rename sync_es config import_to_es.py will now also SHOW MASTER STATUS now Changed progressbar dependency 2017-05-27 23:12:48 +00:00			`"_index": index_name,`
WIP es stuff 2017-05-14 06:48:17 +00:00			`"_source": {`
WIP hack in es as the provider for search results real sketch. lots of stuff is still broken. But! you can make elasticsearch q= style queries and it shows up properly. only first page works; need to adapt pager to elasticsearch's "total-hits" thing. 2017-05-14 08:01:26 +00:00			`# we're also indexing the id as a number so you can`
			`# order by it. seems like this is just equivalent to`
			`# order by created_time, but oh well`
			`"id": t.id,`
WIP es stuff 2017-05-14 06:48:17 +00:00			`"display_name": t.display_name,`
			`"created_time": t.created_time,`
			`# not analyzed but included so we can render magnet links`
			`# without querying sql again.`
			`"info_hash": t.info_hash.hex(),`
			`"filesize": t.filesize,`
			`"uploader_id": t.uploader_id,`
			`"main_category_id": t.main_category_id,`
			`"sub_category_id": t.sub_category_id,`
Update ElasticSeach index and scripts for comment_count 2017-05-26 13:12:47 +00:00			`"comment_count": t.comment_count,`
WIP es stuff 2017-05-14 06:48:17 +00:00			`# XXX all the bitflags are numbers`
			`"anonymous": bool(t.anonymous),`
			`"trusted": bool(t.trusted),`
			`"remake": bool(t.remake),`
			`"complete": bool(t.complete),`
			`# TODO instead of indexing and filtering later`
			`# could delete from es entirely. Probably won't matter`
			`# for at least a few months.`
			`"hidden": bool(t.hidden),`
			`"deleted": bool(t.deleted),`
			`"has_torrent": t.has_torrent,`
some more elasticsearch work, including index mapping and analyzer 2017-05-15 18:14:01 +00:00			`# Stats`
WIP es stuff 2017-05-14 06:48:17 +00:00			`"download_count": t.stats.download_count,`
			`"leech_count": t.stats.leech_count,`
			`"seed_count": t.stats.seed_count,`
			`}`
			`}`

			`# page through an sqlalchemy query, like the per_fetch but`
			`# doesn't break the eager joins its doing against the stats table.`
			`# annoying that this isn't built in somehow.`
Update import_to_es.py to index both torrent flavors, rename sync_es config import_to_es.py will now also SHOW MASTER STATUS now Changed progressbar dependency 2017-05-27 23:12:48 +00:00			`def page_query(query, limit=sys.maxsize, batch_size=10000, progress_bar=None):`
WIP es stuff 2017-05-14 06:48:17 +00:00			`start = 0`
			`while True:`
			`# XXX very inelegant way to do this, i'm confus`
			`stop = min(limit, start + batch_size)`
			`if stop == start:`
			`break`
			`things = query.slice(start, stop)`
			`if not things:`
			`break`
			`had_things = False`
			`for thing in things:`
			`had_things = True`
			`yield(thing)`
			`if not had_things or stop == limit:`
			`break`
Update import_to_es.py to index both torrent flavors, rename sync_es config import_to_es.py will now also SHOW MASTER STATUS now Changed progressbar dependency 2017-05-27 23:12:48 +00:00			`if progress_bar:`
			`progress_bar.update(start)`
WIP es stuff 2017-05-14 06:48:17 +00:00			`start = min(limit, start + batch_size)`

Update import_to_es.py to index both torrent flavors, rename sync_es config import_to_es.py will now also SHOW MASTER STATUS now Changed progressbar dependency 2017-05-27 23:12:48 +00:00			`FLAVORS = [`
			`('nyaa', models.NyaaTorrent),`
			`('sukebei', models.SukebeiTorrent)`
			`]`

			`# Get binlog status from mysql`
			`master_status = db.engine.execute('SHOW MASTER STATUS;').fetchone()`

			`position_json = {`
			`'log_file': master_status[0],`
			`'log_pos': master_status[1]`
			`}`

			`print('Save the following in the file configured in your ES sync config JSON:')`
			`print(json.dumps(position_json))`

			`for flavor, torrent_class in FLAVORS:`
			`print('Importing torrents for index', flavor, 'from', torrent_class)`
			`bar = progressbar.ProgressBar(`
			`maxval=torrent_class.query.count(),`
			`widgets=[ progressbar.SimpleProgress(),`
			`' [', progressbar.Timer(), '] ',`
			`progressbar.Bar(),`
			`' (', progressbar.ETA(), ') ',`
			`])`

			`# turn off refreshes while bulk loading`
			`ic.put_settings(body={'index': {'refresh_interval': '-1'}}, index=flavor)`

			`bar.start()`
			`helpers.bulk(es, (mk_es(t, flavor) for t in page_query(torrent_class.query, progress_bar=bar)), chunk_size=10000)`
			`bar.finish()`
import_to_es: munge refresh_interval for speed 2017-05-17 05:00:58 +00:00
Update import_to_es.py to index both torrent flavors, rename sync_es config import_to_es.py will now also SHOW MASTER STATUS now Changed progressbar dependency 2017-05-27 23:12:48 +00:00			`# Refresh the index immideately`
			`ic.refresh(index=flavor)`
			`print('Index refresh done.')`
import_to_es: munge refresh_interval for speed 2017-05-17 05:00:58 +00:00
Update import_to_es.py to index both torrent flavors, rename sync_es config import_to_es.py will now also SHOW MASTER STATUS now Changed progressbar dependency 2017-05-27 23:12:48 +00:00			`# restore to near-enough real time`
			`ic.put_settings(body={'index': {'refresh_interval': '30s'}}, index=flavor)`