mirror of https://gitlab.com/SIGBUS/nyaa.git
synced 2024-12-21 13:20:00 +00:00

commit 00c768c722: merged elasticsearch, let's hope this doesn't break shit

38 README.md

@@ -44,5 +44,43 @@
- Start the dev server with `python run.py`
- Deactivate `source deactivate`

# Enabling ElasticSearch

## Basics
- Install the JDK: `sudo apt-get install openjdk-8-jdk`
- Install Elasticsearch: https://www.elastic.co/guide/en/elasticsearch/reference/current/deb.html
- `sudo systemctl enable elasticsearch.service`
- `sudo systemctl start elasticsearch.service`
- Run `curl -XGET 'localhost:9200'` and make sure ES is running (see the sketch after this list)
- Optional: install Kibana as a search frontend for ES
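Not part of the commit, but the same liveness check can be done from Python with the elasticsearch client this branch already depends on (assumes the default `localhost:9200`):

```python
from elasticsearch import Elasticsearch

es = Elasticsearch()  # default connection: localhost:9200
# info() performs the same GET / request as the curl check above
print(es.info()['version']['number'])
```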

## Enable MySQL Binlogging
- Add the `[mariadb]` bin-log section to my.cnf (the exact lines are in the my.cnf diff below) and reload the mysql server
- Connect to mysql
- `SHOW VARIABLES LIKE 'binlog_format';`
- Make sure it shows ROW
- Connect as the root user
- `GRANT REPLICATION SLAVE ON *.* TO 'test'@'localhost';` where `test` is the user you will be running `sync_es.py` as

## Setting up ES
- Run `./create_es.sh`; this creates two indices: `nyaa` and `sukebei`
- The output should show `acknowledged: true` twice (see the check after this list)
- The safest bet is to disable the webapp here to ensure there are no database writes
- Run `python import_to_es.py` with `SITE_FLAVOR` set to `nyaa`
- Run `python import_to_es.py` with `SITE_FLAVOR` set to `sukebei`
- These will take some time to run, as they index every torrent
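Not from the commit: to double-check that both indices landed, the client's `indices.exists` call mirrors the `acknowledged: true` output (the `nyaa`/`sukebei` names are hardcoded by `create_es.sh`):

```python
from elasticsearch import Elasticsearch

es = Elasticsearch()
for index in ('nyaa', 'sukebei'):
    print(index, es.indices.exists(index=index))  # expect True twice
```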

## Setting up sync_es.py
- `sync_es.py` keeps the Elasticsearch index updated by reading the MySQL binlog
- Configure the MySQL options with the user you granted the REPLICATION permissions to
- Connect to MySQL and run `SHOW MASTER STATUS;`
- Copy the output to `/var/lib/sync_es_position.json` with the contents `{"log_file": "FILE", "log_pos": POSITION}`, replacing FILE with the File value from the SQL output (something like master1-bin.000002) and POSITION with the Position value (something like 892528513); a sketch follows this list
- Set up `sync_es.py` as a service and run it, preferably as system/root
- Make sure `sync_es.py` runs within the venv with the right dependencies
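A minimal sketch of seeding that position file (the file/position values here are the README's examples, not real ones; take yours from `SHOW MASTER STATUS`):

```python
import json

# values copied from SHOW MASTER STATUS; these are the example values above
position = {"log_file": "master1-bin.000002", "log_pos": 892528513}

with open('/var/lib/sync_es_position.json', 'w') as f:
    json.dump(position, f)
```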

## Good to go!
- After that, enable the `USE_ELASTIC_SEARCH` flag, restart the webapp, and you're good to go


## Code Quality:
- Remember to follow PEP8 style guidelines and run `./lint.sh` before committing.
@@ -33,8 +33,6 @@ MAIL_FROM_ADDRESS = '***'
SMTP_USERNAME = '***'
SMTP_PASSWORD = '***'

RESULTS_PER_PAGE = 75

# What the site identifies itself as.
SITE_NAME = 'Nyaa'


@@ -49,3 +47,14 @@ ENFORCE_MAIN_ANNOUNCE_URL = False
MAIN_ANNOUNCE_URL = ''

BACKUP_TORRENT_FOLDER = 'torrents'

#
# Search Options
#
# Max ES search results, do not set over 10000
RESULTS_PER_PAGE = 75

USE_ELASTIC_SEARCH = False
ENABLE_ELASTIC_SEARCH_HIGHLIGHT = False
ES_MAX_SEARCH_RESULT = 1000
ES_INDEX_NAME = SITE_FLAVOR  # we create indices named nyaa or sukebei
5 create_es.sh (new executable file)

@@ -0,0 +1,5 @@
#!/usr/bin/env bash

# create indices named "nyaa" and "sukebei", these are hardcoded
curl -v -XPUT 'localhost:9200/nyaa?pretty' -H "Content-Type: application/yaml" --data-binary @es_mapping.yml
curl -v -XPUT 'localhost:9200/sukebei?pretty' -H "Content-Type: application/yaml" --data-binary @es_mapping.yml
91 es_mapping.yml (new file)

@@ -0,0 +1,91 @@
---
# CREATE DATABASE/TABLE equivalent for elasticsearch, in yaml
# for inline comments.
settings:
  analysis:
    analyzer:
      my_search_analyzer:
        type: custom
        tokenizer: standard
        char_filter:
          - my_char_filter
        filter:
          - standard
          - lowercase
      my_index_analyzer:
        type: custom
        tokenizer: standard
        char_filter:
          - my_char_filter
        filter:
          - lowercase
          - my_ngram
    filter:
      my_ngram:
        type: edgeNGram
        min_gram: 1
        max_gram: 15
    char_filter:
      my_char_filter:
        type: mapping
        mappings: ["-=>_", "!=>_"]
  index:
    # we're running a single es node, so no sharding necessary,
    # plus replicas don't really help either.
    number_of_shards: 1
    number_of_replicas: 0
    mapper:
      # disable elasticsearch's "helpful" autoschema
      dynamic: false
    # since we disabled the _all field, default query the
    # name of the torrent.
    query:
      default_field: display_name
mappings:
  torrent:
    # don't want everything concatenated
    _all:
      enabled: false
    properties:
      id:
        type: long
      display_name:
        # TODO could do a fancier tokenizer here to parse out
        # the scene convention of stuff in brackets, plus stuff like k-on
        type: text
        analyzer: my_index_analyzer
        fielddata: true
      created_time:
        type: date
      # Only in the ES index for generating magnet links
      info_hash:
        enabled: false
      filesize:
        type: long
      anonymous:
        type: boolean
      trusted:
        type: boolean
      remake:
        type: boolean
      complete:
        type: boolean
      hidden:
        type: boolean
      deleted:
        type: boolean
      has_torrent:
        type: boolean
      download_count:
        type: long
      leech_count:
        type: long
      seed_count:
        type: long
      # these ids are really only for filtering, thus keyword
      uploader_id:
        type: keyword
      main_category_id:
        type: keyword
      sub_category_id:
        type: keyword
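Not part of the commit: once the indices exist, the analyze API is a quick way to sanity-check the custom analyzers defined above. A minimal sketch with the elasticsearch-py 5.x client (the index name `nyaa` and analyzer `my_index_analyzer` come from this mapping; the sample title is made up):

```python
from elasticsearch import Elasticsearch

es = Elasticsearch()

# Ask the "nyaa" index to run its custom edge-ngram analyzer over a sample title.
result = es.indices.analyze(index='nyaa', body={
    'analyzer': 'my_index_analyzer',
    'text': '[Group] K-ON! 01',  # hypothetical torrent name
})
# Prints the tokens the index would actually store; "k-on!" is rewritten to
# "k_on_" by my_char_filter, then expanded into 1..15-char edge ngrams.
print([t['token'] for t in result['tokens']])
```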
92 import_to_es.py (new file)

@@ -0,0 +1,92 @@
#!/usr/bin/env python
"""
Bulk load torrents from mysql into the elasticsearch `nyaav2` index,
which is assumed to already exist.
This is a one-shot deal, so you'd either need to complement it
with a cron job or some binlog-reading thing (TODO)
"""
from nyaa import app
from nyaa.models import Torrent
from elasticsearch import Elasticsearch
from elasticsearch import helpers
import progressbar
import sys

bar = progressbar.ProgressBar(
    max_value=Torrent.query.count(),
    widgets=[
        progressbar.SimpleProgress(),
        ' [', progressbar.Timer(), '] ',
        progressbar.Bar(),
        ' (', progressbar.ETA(), ') ',
    ])

es = Elasticsearch()

# turn into thing that elasticsearch indexes. We flatten in
# the stats (seeders/leechers) so we can order by them in es naturally.
# we _don't_ dereference uploader_id to the user's display name however,
# instead doing that at query time. I _think_ this is right because
# we don't want to reindex all the user's torrents just because they
# changed their name, and we don't really want to FTS search on the user anyway.
# Maybe it's more convenient to dereference though.
def mk_es(t):
    return {
        "_id": t.id,
        "_type": "torrent",
        "_index": app.config['ES_INDEX_NAME'],
        "_source": {
            # we're also indexing the id as a number so you can
            # order by it. seems like this is just equivalent to
            # order by created_time, but oh well
            "id": t.id,
            "display_name": t.display_name,
            "created_time": t.created_time,
            # not analyzed but included so we can render magnet links
            # without querying sql again.
            "info_hash": t.info_hash.hex(),
            "filesize": t.filesize,
            "uploader_id": t.uploader_id,
            "main_category_id": t.main_category_id,
            "sub_category_id": t.sub_category_id,
            # XXX all the bitflags are numbers
            "anonymous": bool(t.anonymous),
            "trusted": bool(t.trusted),
            "remake": bool(t.remake),
            "complete": bool(t.complete),
            # TODO instead of indexing and filtering later
            # could delete from es entirely. Probably won't matter
            # for at least a few months.
            "hidden": bool(t.hidden),
            "deleted": bool(t.deleted),
            "has_torrent": t.has_torrent,
            # Stats
            "download_count": t.stats.download_count,
            "leech_count": t.stats.leech_count,
            "seed_count": t.stats.seed_count,
        }
    }

# page through an sqlalchemy query, like the per_fetch but
# doesn't break the eager joins it's doing against the stats table.
# annoying that this isn't built in somehow.
def page_query(query, limit=sys.maxsize, batch_size=10000):
    start = 0
    while True:
        # XXX very inelegant way to do this, i'm confus
        stop = min(limit, start + batch_size)
        if stop == start:
            break
        things = query.slice(start, stop)
        if not things:
            break
        had_things = False
        for thing in things:
            had_things = True
            yield thing
        if not had_things or stop == limit:
            break
        bar.update(start)
        start = min(limit, start + batch_size)

helpers.bulk(es, (mk_es(t) for t in page_query(Torrent.query)), chunk_size=10000)
6 my.cnf

@@ -4,3 +4,9 @@ ft_min_word_len=2
innodb_ft_cache_size = 80000000
innodb_ft_total_cache_size = 1600000000
max_allowed_packet = 100M

[mariadb]
log-bin
server_id=1
log-basename=master1
binlog-format = row
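Not part of the commit: a quick check that the binlog settings above took effect, using pymysql (already pulled in as a dependency of mysql-replication); the host and credentials match the README's example `test` user and are assumptions:

```python
import pymysql  # dependency of mysql-replication

conn = pymysql.connect(host='127.0.0.1', port=3306, user='test', password='test123')
try:
    with conn.cursor() as cur:
        cur.execute("SHOW VARIABLES LIKE 'binlog_format'")
        print(cur.fetchone())  # expect ('binlog_format', 'ROW')
        cur.execute("SHOW MASTER STATUS")
        print(cur.fetchone())  # (File, Position, ...) -> seeds sync_es_position.json
finally:
    conn.close()
```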
309 nyaa/routes.py

@@ -6,18 +6,16 @@ from nyaa import bencode, utils
from nyaa import torrents
from nyaa import backend
from nyaa import api_handler
from nyaa.search import search_elastic, search_db
import config

import json
import re
from datetime import datetime, timedelta
import ipaddress
import os.path
import base64
from urllib.parse import quote
import sqlalchemy_fulltext.modes as FullTextMode
from sqlalchemy_fulltext import FullTextSearch
import shlex
import math
from werkzeug import url_encode

from itsdangerous import URLSafeSerializer, BadSignature

@@ -27,7 +25,14 @@ from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.utils import formatdate

from flask_paginate import Pagination


DEBUG_API = False
DEFAULT_MAX_SEARCH_RESULT = 1000
DEFAULT_PER_PAGE = 75
SEARCH_PAGINATE_DISPLAY_MSG = '''Displaying results {start}-{end} out of {total} results.<br>
Please refine your search results if you can't find what you were looking for.'''


def redirect_url():

@@ -48,144 +53,13 @@ def modify_query(**new_values):

    return '{}?{}'.format(flask.request.path, url_encode(args))


@app.template_global()
def filter_truthy(input_list):
    ''' Jinja2 can't into list comprehension so this is for
        the search_results.html template '''
    return [item for item in input_list if item]


def search(term='', user=None, sort='id', order='desc', category='0_0', quality_filter='0', page=1, rss=False, admin=False):
    sort_keys = {
        'id': models.Torrent.id,
        'size': models.Torrent.filesize,
        'name': models.Torrent.display_name,
        'seeders': models.Statistic.seed_count,
        'leechers': models.Statistic.leech_count,
        'downloads': models.Statistic.download_count
    }

    sort_ = sort.lower()
    if sort_ not in sort_keys:
        flask.abort(400)
    sort = sort_keys[sort_]

    order_keys = {
        'desc': 'desc',
        'asc': 'asc'
    }

    order_ = order.lower()
    if order_ not in order_keys:
        flask.abort(400)

    filter_keys = {
        '0': None,
        '1': (models.TorrentFlags.REMAKE, False),
        '2': (models.TorrentFlags.TRUSTED, True),
        '3': (models.TorrentFlags.COMPLETE, True)
    }

    sentinel = object()
    filter_tuple = filter_keys.get(quality_filter.lower(), sentinel)
    if filter_tuple is sentinel:
        flask.abort(400)

    if user:
        user = models.User.by_id(user)
        if not user:
            flask.abort(404)
        user = user.id

    main_category = None
    sub_category = None
    main_cat_id = 0
    sub_cat_id = 0
    if category:
        cat_match = re.match(r'^(\d+)_(\d+)$', category)
        if not cat_match:
            flask.abort(400)

        main_cat_id = int(cat_match.group(1))
        sub_cat_id = int(cat_match.group(2))

        if main_cat_id > 0:
            if sub_cat_id > 0:
                sub_category = models.SubCategory.by_category_ids(main_cat_id, sub_cat_id)
            else:
                main_category = models.MainCategory.by_id(main_cat_id)

            if not category:
                flask.abort(400)

    # Force sort by id desc if rss
    if rss:
        sort = sort_keys['id']
        order = 'desc'

    same_user = False
    if flask.g.user:
        same_user = flask.g.user.id == user

    if term:
        query = db.session.query(models.TorrentNameSearch)
    else:
        query = models.Torrent.query

    # User view (/user/username)
    if user:
        query = query.filter(models.Torrent.uploader_id == user)

        if not admin:
            # Hide all DELETED torrents if regular user
            query = query.filter(models.Torrent.flags.op('&')(int(models.TorrentFlags.DELETED)).is_(False))
            # If logged in user is not the same as the user being viewed, show only torrents that aren't hidden or anonymous
            # If logged in user is the same as the user being viewed, show all torrents including hidden and anonymous ones
            # On RSS pages in user view, show only torrents that aren't hidden or anonymous no matter what
            if not same_user or rss:
                query = query.filter(models.Torrent.flags.op('&')(int(models.TorrentFlags.HIDDEN |
                                                                      models.TorrentFlags.ANONYMOUS)).is_(False))
    # General view (homepage, general search view)
    else:
        if not admin:
            # Hide all DELETED torrents if regular user
            query = query.filter(models.Torrent.flags.op('&')(int(models.TorrentFlags.DELETED)).is_(False))
            # If logged in, show all torrents that aren't hidden unless they belong to you
            # On RSS pages, show all public torrents and nothing more.
            if flask.g.user and not rss:
                query = query.filter((models.Torrent.flags.op('&')(int(models.TorrentFlags.HIDDEN)).is_(False)) |
                                     (models.Torrent.uploader_id == flask.g.user.id))
            # Otherwise, show all torrents that aren't hidden
            else:
                query = query.filter(models.Torrent.flags.op('&')(int(models.TorrentFlags.HIDDEN)).is_(False))

    if main_category:
        query = query.filter(models.Torrent.main_category_id == main_cat_id)
    elif sub_category:
        query = query.filter((models.Torrent.main_category_id == main_cat_id) &
                             (models.Torrent.sub_category_id == sub_cat_id))

    if filter_tuple:
        query = query.filter(models.Torrent.flags.op('&')(int(filter_tuple[0])).is_(filter_tuple[1]))

    if term:
        for item in shlex.split(term, posix=False):
            if len(item) >= 2:
                query = query.filter(FullTextSearch(
                    item, models.TorrentNameSearch, FullTextMode.NATURAL))

    # Sort and order
    if sort.class_ != models.Torrent:
        query = query.join(sort.class_)

    query = query.order_by(getattr(sort, order)())

    if rss:
        query = query.limit(app.config['RESULTS_PER_PAGE'])
    else:
        query = query.paginate_faste(page, per_page=app.config['RESULTS_PER_PAGE'], step=5)

    return query


@app.errorhandler(404)
def not_found(error):

@@ -203,7 +77,6 @@ def before_request():
        flask.g.user = user

        if not 'timeout' in flask.session or flask.session['timeout'] < datetime.now():
            print("hio")
            flask.session['timeout'] = datetime.now() + timedelta(days=7)
            flask.session.permanent = True
            flask.session.modified = True

@@ -225,6 +98,18 @@ def _generate_query_string(term, category, filter, user):
    return params


@app.template_filter('utc_time')
def get_utc_timestamp(datetime_str):
    ''' Returns a UTC POSIX timestamp, as seconds '''
    UTC_EPOCH = datetime.utcfromtimestamp(0)
    return int((datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M:%S') - UTC_EPOCH).total_seconds())


@app.template_filter('display_time')
def get_display_time(datetime_str):
    return datetime.strptime(datetime_str, '%Y-%m-%dT%H:%M:%S').strftime('%Y-%m-%d %H:%M')


@app.route('/rss', defaults={'rss': True})
@app.route('/', defaults={'rss': False})
def home(rss):

@@ -241,6 +126,10 @@ def home(rss):
    if page:
        page = int(page)

    per_page = app.config.get('RESULTS_PER_PAGE')
    if not per_page:
        per_page = DEFAULT_PER_PAGE

    user_id = None
    if user_name:
        user = models.User.by_username(user_name)

@@ -249,30 +138,72 @@ def home(rss):
        user_id = user.id

    query_args = {
        'term': term or '',
        'user': user_id,
        'sort': sort or 'id',
        'order': order or 'desc',
        'category': category or '0_0',
        'quality_filter': quality_filter or '0',
        'page': page or 1,
        'rss': rss
        'rss': rss,
        'per_page': per_page
    }

    # God mode
    if flask.g.user and flask.g.user.is_admin:
        query_args['admin'] = True
    if flask.g.user:
        query_args['logged_in_user'] = flask.g.user
        if flask.g.user.is_admin:  # God mode
            query_args['admin'] = True

    query = search(**query_args)
    # If searching, we get results from elastic search
    use_elastic = app.config.get('USE_ELASTIC_SEARCH')
    if use_elastic and term:
        query_args['term'] = term

    if rss:
        return render_rss('/', query)
        max_search_results = app.config.get('ES_MAX_SEARCH_RESULT')
        if not max_search_results:
            max_search_results = DEFAULT_MAX_SEARCH_RESULT

        max_page = min(query_args['page'], int(math.ceil(max_search_results / float(per_page))))  # Only allow up to (max_search_results / per_page) pages

        query_args['page'] = max_page
        query_args['max_search_results'] = max_search_results

        query_results = search_elastic(**query_args)

        if rss:
            return render_rss('/', query_results, use_elastic=True)
        else:
            rss_query_string = _generate_query_string(term, category, quality_filter, user_name)
            max_results = min(max_search_results, query_results['hits']['total'])
            # change p= argument to whatever you change page_parameter to or pagination breaks
            pagination = Pagination(p=query_args['page'], per_page=per_page,
                                    total=max_results, bs_version=3, page_parameter='p',
                                    display_msg=SEARCH_PAGINATE_DISPLAY_MSG)
            return flask.render_template('home.html',
                                         use_elastic=True,
                                         pagination=pagination,
                                         torrent_query=query_results,
                                         search=query_args,
                                         rss_filter=rss_query_string)
    else:
        rss_query_string = _generate_query_string(term, category, quality_filter, user_name)
        return flask.render_template('home.html',
                                     torrent_query=query,
                                     search=query_args,
                                     rss_filter=rss_query_string)
        # If ES is enabled, default to db search for browsing
        if use_elastic:
            query_args['term'] = ''
        else:  # Otherwise, use db search for everything
            query_args['term'] = term or ''

        query = search_db(**query_args)
        if rss:
            return render_rss('/', query, use_elastic=False)
        else:
            rss_query_string = _generate_query_string(term, category, quality_filter, user_name)
            # use_elastic is always False here because we only hit this section
            # if we're browsing without a search term (which means we default to DB)
            # or if ES is disabled
            return flask.render_template('home.html',
                                         use_elastic=False,
                                         torrent_query=query,
                                         search=query_args,
                                         rss_filter=rss_query_string)


@app.route('/user/<user_name>')

@@ -291,6 +222,10 @@ def view_user(user_name):
    if page:
        page = int(page)

    per_page = app.config.get('RESULTS_PER_PAGE')
    if not per_page:
        per_page = DEFAULT_PER_PAGE

    query_args = {
        'term': term or '',
        'user': user.id,

@@ -299,40 +234,82 @@ def view_user(user_name):
        'category': category or '0_0',
        'quality_filter': quality_filter or '0',
        'page': page or 1,
        'rss': False
        'rss': False,
        'per_page': per_page
    }

    # God mode
    if flask.g.user and flask.g.user.is_admin:
        query_args['admin'] = True

    query = search(**query_args)
    if flask.g.user:
        query_args['logged_in_user'] = flask.g.user
        if flask.g.user.is_admin:  # God mode
            query_args['admin'] = True

    # Use elastic search for term searching
    rss_query_string = _generate_query_string(term, category, quality_filter, user_name)
    return flask.render_template('user.html',
                                 torrent_query=query,
                                 search=query_args,
                                 user=user,
                                 user_page=True,
                                 rss_filter=rss_query_string)
    use_elastic = app.config.get('USE_ELASTIC_SEARCH')
    if use_elastic and term:
        query_args['term'] = term

        max_search_results = app.config.get('ES_MAX_SEARCH_RESULT')
        if not max_search_results:
            max_search_results = DEFAULT_MAX_SEARCH_RESULT

        max_page = min(query_args['page'], int(math.ceil(max_search_results / float(per_page))))  # Only allow up to (max_search_results / per_page) pages

        query_args['page'] = max_page
        query_args['max_search_results'] = max_search_results

        query_results = search_elastic(**query_args)

        max_results = min(max_search_results, query_results['hits']['total'])
        # change p= argument to whatever you change page_parameter to or pagination breaks
        pagination = Pagination(p=query_args['page'], per_page=per_page,
                                total=max_results, bs_version=3, page_parameter='p',
                                display_msg=SEARCH_PAGINATE_DISPLAY_MSG)
        return flask.render_template('user.html',
                                     use_elastic=True,
                                     pagination=pagination,
                                     torrent_query=query_results,
                                     search=query_args,
                                     user=user,
                                     user_page=True,
                                     rss_filter=rss_query_string)
    # Similar logic as home page
    else:
        if use_elastic:
            query_args['term'] = ''
        else:
            query_args['term'] = term or ''
        query = search_db(**query_args)
        return flask.render_template('user.html',
                                     use_elastic=False,
                                     torrent_query=query,
                                     search=query_args,
                                     user=user,
                                     user_page=True,
                                     rss_filter=rss_query_string)


@app.template_filter('rfc822')
def _jinja2_filter_rfc822(date, fmt=None):
    return formatdate(float(date.strftime('%s')))


@app.template_filter('rfc822_es')
def _jinja2_filter_rfc822_es(datestr, fmt=None):
    return formatdate(float(datetime.strptime(datestr, '%Y-%m-%dT%H:%M:%S').strftime('%s')))


def render_rss(label, query):

def render_rss(label, query, use_elastic):
    rss_xml = flask.render_template('rss.xml',
                                    use_elastic=use_elastic,
                                    term=label,
                                    site_url=flask.request.url_root,
                                    query=query)
                                    torrent_query=query)
    response = flask.make_response(rss_xml)
    response.headers['Content-Type'] = 'application/xml'
    return response


#@app.route('/about', methods=['GET'])
# @app.route('/about', methods=['GET'])
# def about():
#     return flask.render_template('about.html')

@@ -645,4 +622,4 @@ def site_help():
@app.route('/api/upload', methods = ['POST'])
def api_upload():
    api_response = api_handler.api_upload(flask.request)
    return api_response
    return api_response
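To make the page clamp in the elastic branches of `home()` and `view_user()` concrete: with the defaults above (`per_page=75`, `ES_MAX_SEARCH_RESULT=1000`), only 14 pages are ever served, and a deeper request is pulled back. A standalone check of the same arithmetic (the requested page is made up):

```python
import math

per_page = 75
max_search_results = 1000  # ES_MAX_SEARCH_RESULT
requested_page = 50        # hypothetical deep-page request

# Same expression as in home()/view_user(): never page past the ES result window.
max_page = min(requested_page, int(math.ceil(max_search_results / float(per_page))))
print(max_page)  # 14
```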
317 nyaa/search.py (new file)

@@ -0,0 +1,317 @@
import flask
import re
import math
import json
import shlex

from nyaa import app, db
from nyaa import models

import sqlalchemy_fulltext.modes as FullTextMode
from sqlalchemy_fulltext import FullTextSearch
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q


def search_elastic(term='', user=None, sort='id', order='desc',
                   category='0_0', quality_filter='0', page=1,
                   rss=False, admin=False, logged_in_user=None,
                   per_page=75, max_search_results=1000):
    # This function can easily be memcached now

    es_client = Elasticsearch()

    es_sort_keys = {
        'id': 'id',
        'size': 'filesize',
        # 'name': 'display_name',  # This is slow and buggy
        'seeders': 'seed_count',
        'leechers': 'leech_count',
        'downloads': 'download_count'
    }

    sort_ = sort.lower()
    if sort_ not in es_sort_keys:
        flask.abort(400)

    es_sort = es_sort_keys[sort_]

    order_keys = {
        'desc': 'desc',
        'asc': 'asc'
    }

    order_ = order.lower()
    if order_ not in order_keys:
        flask.abort(400)

    # Only allow ID, desc if RSS
    if rss:
        es_sort = es_sort_keys['id']
        order = 'desc'

    # funky, es sort is default asc, prefixed by '-' if desc
    if 'desc' == order:
        es_sort = '-' + es_sort

    # Quality filter
    quality_keys = [
        '0',  # Show all
        '1',  # No remakes
        '2',  # Only trusted
        '3'   # Only completed
    ]

    if quality_filter.lower() not in quality_keys:
        flask.abort(400)

    quality_filter = int(quality_filter)

    # Category filter
    main_category = None
    sub_category = None
    main_cat_id = 0
    sub_cat_id = 0
    if category:
        cat_match = re.match(r'^(\d+)_(\d+)$', category)
        if not cat_match:
            flask.abort(400)

        main_cat_id = int(cat_match.group(1))
        sub_cat_id = int(cat_match.group(2))

        if main_cat_id > 0:
            if sub_cat_id > 0:
                sub_category = models.SubCategory.by_category_ids(main_cat_id, sub_cat_id)
                if not sub_category:
                    flask.abort(400)
            else:
                main_category = models.MainCategory.by_id(main_cat_id)
                if not main_category:
                    flask.abort(400)

    # This might be useless since we validate users
    # before coming into this method, but just to be safe...
    if user:
        user = models.User.by_id(user)
        if not user:
            flask.abort(404)
        user = user.id

    same_user = False
    if logged_in_user:
        same_user = user == logged_in_user.id

    s = Search(using=es_client, index=app.config.get('ES_INDEX_NAME'))  # todo, sukebei prefix

    # Apply search term
    if term:
        s = s.query('simple_query_string',
                    analyzer='my_search_analyzer',
                    default_operator="AND",
                    query=term)

    # User view (/user/username)
    if user:
        s = s.filter('term', uploader_id=user)

        if not admin:
            # Hide all DELETED torrents if regular user
            s = s.filter('term', deleted=False)
            # If logged in user is not the same as the user being viewed,
            # show only torrents that aren't hidden or anonymous.
            #
            # If logged in user is the same as the user being viewed,
            # show all torrents including hidden and anonymous ones.
            #
            # On RSS pages in user view, show only torrents that
            # aren't hidden or anonymous no matter what
            if not same_user or rss:
                s = s.filter('term', hidden=False)
                s = s.filter('term', anonymous=False)
    # General view (homepage, general search view)
    else:
        if not admin:
            # Hide all DELETED torrents if regular user
            s = s.filter('term', deleted=False)
            # If logged in, show all torrents that aren't hidden unless they belong to you
            # On RSS pages, show all public torrents and nothing more.
            if logged_in_user and not rss:
                hiddenFilter = Q('term', hidden=False)
                userFilter = Q('term', uploader_id=logged_in_user.id)
                combinedFilter = hiddenFilter | userFilter
                s = s.filter('bool', filter=[combinedFilter])
            else:
                s = s.filter('term', hidden=False)

    if main_category:
        s = s.filter('term', main_category_id=main_cat_id)
    elif sub_category:
        s = s.filter('term', main_category_id=main_cat_id)
        s = s.filter('term', sub_category_id=sub_cat_id)

    if quality_filter == 0:
        pass
    elif quality_filter == 1:
        s = s.filter('term', remake=False)
    elif quality_filter == 2:
        s = s.filter('term', trusted=True)
    elif quality_filter == 3:
        s = s.filter('term', complete=True)

    # Apply sort
    s = s.sort(es_sort)

    # Only show first RESULTS_PER_PAGE items for RSS
    if rss:
        s = s[0:per_page]
    else:
        max_page = min(page, int(math.ceil(max_search_results / float(per_page))))
        from_idx = (max_page - 1) * per_page
        to_idx = min(max_search_results, max_page * per_page)
        s = s[from_idx:to_idx]

    highlight = app.config.get('ENABLE_ELASTIC_SEARCH_HIGHLIGHT')
    if highlight:
        s = s.highlight_options(tags_schema='styled')
        s = s.highlight("display_name")

    # Return query, uncomment print line to debug query
    # from pprint import pprint
    # print(json.dumps(s.to_dict()))
    return s.execute()


def search_db(term='', user=None, sort='id', order='desc', category='0_0',
              quality_filter='0', page=1, rss=False, admin=False,
              logged_in_user=None, per_page=75):
    sort_keys = {
        'id': models.Torrent.id,
        'size': models.Torrent.filesize,
        # 'name': models.Torrent.display_name,  # Disabled because it's also disabled in search_elastic, for the sake of consistency
        'seeders': models.Statistic.seed_count,
        'leechers': models.Statistic.leech_count,
        'downloads': models.Statistic.download_count
    }

    sort_ = sort.lower()
    if sort_ not in sort_keys:
        flask.abort(400)
    sort = sort_keys[sort_]

    order_keys = {
        'desc': 'desc',
        'asc': 'asc'
    }

    order_ = order.lower()
    if order_ not in order_keys:
        flask.abort(400)

    filter_keys = {
        '0': None,
        '1': (models.TorrentFlags.REMAKE, False),
        '2': (models.TorrentFlags.TRUSTED, True),
        '3': (models.TorrentFlags.COMPLETE, True)
    }

    sentinel = object()
    filter_tuple = filter_keys.get(quality_filter.lower(), sentinel)
    if filter_tuple is sentinel:
        flask.abort(400)

    if user:
        user = models.User.by_id(user)
        if not user:
            flask.abort(404)
        user = user.id

    main_category = None
    sub_category = None
    main_cat_id = 0
    sub_cat_id = 0
    if category:
        cat_match = re.match(r'^(\d+)_(\d+)$', category)
        if not cat_match:
            flask.abort(400)

        main_cat_id = int(cat_match.group(1))
        sub_cat_id = int(cat_match.group(2))

        if main_cat_id > 0:
            if sub_cat_id > 0:
                sub_category = models.SubCategory.by_category_ids(main_cat_id, sub_cat_id)
            else:
                main_category = models.MainCategory.by_id(main_cat_id)

            if not category:
                flask.abort(400)

    # Force sort by id desc if rss
    if rss:
        sort = sort_keys['id']
        order = 'desc'

    same_user = False
    if logged_in_user:
        same_user = logged_in_user.id == user

    if term:
        query = db.session.query(models.TorrentNameSearch)
    else:
        query = models.Torrent.query

    # User view (/user/username)
    if user:
        query = query.filter(models.Torrent.uploader_id == user)

        if not admin:
            # Hide all DELETED torrents if regular user
            query = query.filter(models.Torrent.flags.op('&')(int(models.TorrentFlags.DELETED)).is_(False))
            # If logged in user is not the same as the user being viewed, show only torrents that aren't hidden or anonymous
            # If logged in user is the same as the user being viewed, show all torrents including hidden and anonymous ones
            # On RSS pages in user view, show only torrents that aren't hidden or anonymous no matter what
            if not same_user or rss:
                query = query.filter(models.Torrent.flags.op('&')(int(models.TorrentFlags.HIDDEN |
                                                                      models.TorrentFlags.ANONYMOUS)).is_(False))
    # General view (homepage, general search view)
    else:
        if not admin:
            # Hide all DELETED torrents if regular user
            query = query.filter(models.Torrent.flags.op('&')(int(models.TorrentFlags.DELETED)).is_(False))
            # If logged in, show all torrents that aren't hidden unless they belong to you
            # On RSS pages, show all public torrents and nothing more.
            if logged_in_user and not rss:
                query = query.filter((models.Torrent.flags.op('&')(int(models.TorrentFlags.HIDDEN)).is_(False)) |
                                     (models.Torrent.uploader_id == logged_in_user.id))
            # Otherwise, show all torrents that aren't hidden
            else:
                query = query.filter(models.Torrent.flags.op('&')(int(models.TorrentFlags.HIDDEN)).is_(False))

    if main_category:
        query = query.filter(models.Torrent.main_category_id == main_cat_id)
    elif sub_category:
        query = query.filter((models.Torrent.main_category_id == main_cat_id) &
                             (models.Torrent.sub_category_id == sub_cat_id))

    if filter_tuple:
        query = query.filter(models.Torrent.flags.op('&')(int(filter_tuple[0])).is_(filter_tuple[1]))

    if term:
        for item in shlex.split(term, posix=False):
            if len(item) >= 2:
                query = query.filter(FullTextSearch(
                    item, models.TorrentNameSearch, FullTextMode.NATURAL))

    # Sort and order
    if sort.class_ != models.Torrent:
        query = query.join(sort.class_)

    query = query.order_by(getattr(sort, order)())

    if rss:
        query = query.limit(per_page)
    else:
        query = query.paginate_faste(page, per_page=per_page, step=5)

    return query
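The `hiddenFilter | userFilter` line in `search_elastic` composes into a single `bool` query. A standalone sketch (the user id 42 is hypothetical) showing what elasticsearch-dsl generates from that expression:

```python
from elasticsearch_dsl import Q

hidden_filter = Q('term', hidden=False)
user_filter = Q('term', uploader_id=42)  # hypothetical logged-in user id
combined = hidden_filter | user_filter

# ORing two queries yields a bool query with "should" clauses, i.e.
# "not hidden, OR uploaded by me" -- the non-RSS logged-in case above.
print(combined.to_dict())
# {'bool': {'should': [{'term': {'hidden': False}}, {'term': {'uploader_id': 42}}]}}
```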
@@ -97,4 +97,14 @@ table.torrent-list thead th.sorting_desc:after {
  margin-left: 20px;
  margin-bottom: 10px;
}
}
}

/* elasticsearch term highlight */
.hlt1 {
  font-style: normal;
  display: inline-block;
  padding: 0 3px;
  border-radius: 3px;
  border: 1px solid rgba(100, 56, 0, 0.8);
  background: rgba(200, 127, 0, 0.3);
}
@@ -4,20 +4,32 @@
    <description>RSS Feed for {{ term }}</description>
    <link>{{ url_for('home', _external=True) }}</link>
    <atom:link href="{{ url_for('home', page='rss', _external=True) }}" rel="self" type="application/rss+xml" />
    {% for torrent in query %}
    {% for torrent in torrent_query %}
    {% if torrent.has_torrent %}
    <item>
        <title>{{ torrent.display_name }}</title>
        {% if use_elastic %}
        <link>{{ url_for('download_torrent', torrent_id=torrent.meta.id, _external=True) }}</link>
        <guid isPermaLink="true">{{ url_for('view_torrent', torrent_id=torrent.meta.id, _external=True) }}</guid>
        <pubDate>{{ torrent.created_time|rfc822_es }}</pubDate>
        {% else %}
        <link>{{ url_for('download_torrent', torrent_id=torrent.id, _external=True) }}</link>
        <guid isPermaLink="true">{{ url_for('view_torrent', torrent_id=torrent.id, _external=True) }}</guid>
        <pubDate>{{ torrent.created_time|rfc822 }}</pubDate>
        {% endif %}
    </item>
    {% else %}
    <item>
        <title>{{ torrent.display_name }}</title>
        {% if use_elastic %}
        <link>{{ create_magnet_from_info(torrent.display_name, torrent.info_hash) }}</link>
        <guid isPermaLink="true">{{ url_for('view_torrent', torrent_id=torrent.meta.id, _external=True) }}</guid>
        <pubDate>{{ torrent.created_time|rfc822_es }}</pubDate>
        {% else %}
        <link>{{ torrent.magnet_uri }}</link>
        <guid isPermaLink="true">{{ url_for('view_torrent', torrent_id=torrent.id, _external=True) }}</guid>
        <pubDate>{{ torrent.created_time|rfc822 }}</pubDate>
        {% endif %}
    </item>
    {% endif %}
    {% endfor %}
@@ -8,7 +8,7 @@
        {{ caller() }}
    </th>
{% endmacro %}
{% if torrent_query.items %}
{% if (use_elastic and torrent_query.hits.total > 0) or (torrent_query.items) %}
<div class="table-responsive">
    <table class="table table-bordered table-hover table-striped torrent-list">
        <thead>

@@ -16,7 +16,7 @@
            {% call render_column_header("hdr-category", "width:80px;", center_text=True) %}
                <div>Category</div>
            {% endcall %}
            {% call render_column_header("hdr-name", "width:auto;", sort_key="name") %}
            {% call render_column_header("hdr-name", "width:auto;") %}
                <div>Name</div>
            {% endcall %}
            {% call render_column_header("hdr-link", "width:70px;", center_text=True) %}

@@ -45,27 +45,51 @@
            </tr>
        </thead>
        <tbody>
            {% for torrent in torrent_query.items %}
            {% set torrents = torrent_query if use_elastic else torrent_query.items %}
            {% for torrent in torrents %}
            <tr class="{% if torrent.deleted %}deleted{% elif torrent.hidden %}warning{% elif torrent.remake %}danger{% elif torrent.trusted %}success{% else %}default{% endif %}">
                {% set cat_id = (torrent.main_category.id|string) + '_' + (torrent.sub_category.id|string) %}
                {% set cat_id = (torrent.main_category_id|string) + '_' + (torrent.sub_category_id|string) if use_elastic else (torrent.main_category.id|string) + '_' + (torrent.sub_category.id|string) %}
                {% set icon_dir = config.SITE_FLAVOR %}
                <td style="padding:0 4px;">
                    {% if use_elastic %}
                    <a href="/?c={{ cat_id }}" title="{{ torrent.main_category_id }} - {{ torrent.sub_category_id }}">
                    {% else %}
                    <a href="/?c={{ cat_id }}" title="{{ torrent.main_category.name }} - {{ torrent.sub_category.name }}">
                    {% endif %}
                        <img src="/static/img/icons/{{ icon_dir }}/{{ cat_id }}.png">
                    </a>
                </td>
                {% if use_elastic %}
                <td><a href="{{ url_for('view_torrent', torrent_id=torrent.meta.id) }}">{% if "highlight" in torrent.meta %}{{ torrent.meta.highlight.display_name[0] | safe }}{% else %}{{ torrent.display_name }}{% endif %}</a></td>
                {% else %}
                <td><a href="{{ url_for('view_torrent', torrent_id=torrent.id) }}">{{ torrent.display_name | escape }}</a></td>
                {% endif %}
                <td style="white-space: nowrap;text-align: center;">
                    {% if torrent.has_torrent %}<a href="{{ url_for('download_torrent', torrent_id=torrent.id) }}"><i class="fa fa-fw fa-download"></i></a>{% endif %}
                    {% if use_elastic %}
                    <a href="{{ create_magnet_from_info(torrent.display_name, torrent.info_hash) }}"><i class="fa fa-fw fa-magnet"></i></a>
                    {% else %}
                    <a href="{{ torrent.magnet_uri }}"><i class="fa fa-fw fa-magnet"></i></a>
                    {% endif %}
                </td>
                <td class="text-center">{{ torrent.filesize | filesizeformat(True) }}</td>
                {% if use_elastic %}
                <td class="text-center" data-timestamp="{{ torrent.created_time | utc_time }}">{{ torrent.created_time | display_time }}</td>
                {% else %}
                <td class="text-center" data-timestamp="{{ torrent.created_utc_timestamp|int }}">{{ torrent.created_time.strftime('%Y-%m-%d %H:%M') }}</td>
                {% endif %}

                {% if config.ENABLE_SHOW_STATS %}
                {% if use_elastic %}
                <td class="text-center" style="color: green;">{{ torrent.seed_count }}</td>
                <td class="text-center" style="color: red;">{{ torrent.leech_count }}</td>
                <td class="text-center">{{ torrent.download_count }}</td>
                {% else %}
                <td class="text-center" style="color: green;">{{ torrent.stats.seed_count }}</td>
                <td class="text-center" style="color: red;">{{ torrent.stats.leech_count }}</td>
                <td class="text-center">{{ torrent.stats.download_count }}</td>
                {% endif %}
                {% endif %}
            </tr>
            {% endfor %}
        </tbody>

@@ -76,6 +100,11 @@
{% endif %}

<center>
    {% if use_elastic %}
    {{ pagination.info }}
    {{ pagination.links }}
    {% else %}
    {% from "bootstrap/pagination.html" import render_pagination %}
    {{ render_pagination(torrent_query) }}
    {% endif %}
</center>
@@ -3,6 +3,7 @@ import base64
import time
from urllib.parse import urlencode
from orderedset import OrderedSet
from nyaa import app

from nyaa import bencode
from nyaa import app

@@ -53,10 +54,23 @@ def get_trackers(torrent):

    return list(trackers)

def get_trackers_magnet():
    trackers = OrderedSet()

    # Our main one first
    main_announce_url = app.config.get('MAIN_ANNOUNCE_URL')
    if main_announce_url:
        trackers.add(main_announce_url)

    # and finally our tracker list
    trackers.update(default_trackers())

    return list(trackers)


def create_magnet(torrent, max_trackers=5, trackers=None):
    if trackers is None:
        trackers = get_trackers(torrent)
        trackers = get_trackers_magnet()

    magnet_parts = [
        ('dn', torrent.display_name)

@@ -68,6 +82,24 @@ def create_magnet(torrent, max_trackers=5, trackers=None):
    return 'magnet:?xt=urn:btih:' + b32_info_hash + '&' + urlencode(magnet_parts)


# For processing ES links
@app.context_processor
def create_magnet_from_info():
    def _create_magnet_from_info(display_name, info_hash, max_trackers=5, trackers=None):
        if trackers is None:
            trackers = get_trackers_magnet()

        magnet_parts = [
            ('dn', display_name)
        ]
        for tracker in trackers[:max_trackers]:
            magnet_parts.append(('tr', tracker))

        b32_info_hash = base64.b32encode(bytes.fromhex(info_hash)).decode('utf-8')
        return 'magnet:?xt=urn:btih:' + b32_info_hash + '&' + urlencode(magnet_parts)
    return dict(create_magnet_from_info=_create_magnet_from_info)


def create_default_metadata_base(torrent, trackers=None):
    if trackers is None:
        trackers = get_trackers(torrent)
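For reference, `_create_magnet_from_info` above produces links of the form `magnet:?xt=urn:btih:<base32 info hash>&dn=...&tr=...`. A standalone check of the hash encoding (the info hash and tracker URL are made up):

```python
import base64
from urllib.parse import urlencode

info_hash = 'c12fe1c06bba254a9dc9f519b335aa7c1367a88a'  # hypothetical hex digest
b32_info_hash = base64.b32encode(bytes.fromhex(info_hash)).decode('utf-8')

magnet_parts = [('dn', 'Example Torrent'),                # display name
                ('tr', 'udp://tracker.example:6969')]     # placeholder tracker
print('magnet:?xt=urn:btih:' + b32_info_hash + '&' + urlencode(magnet_parts))
```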
@@ -24,11 +24,17 @@ pycodestyle==2.3.1
pycparser==2.17
pyparsing==2.2.0
six==1.10.0
SQLAlchemy>=1.1.9
SQLAlchemy==1.1.9
SQLAlchemy-FullText-Search==0.2.3
SQLAlchemy-Utils>=0.32.14
SQLAlchemy-Utils==0.32.14
uWSGI==2.0.15
visitor==0.1.3
webassets==0.12.1
Werkzeug==0.12.1
WTForms==2.1
## elasticsearch dependencies
elasticsearch==5.3.0
elasticsearch-dsl==5.2.0
progressbar2==3.20.0
mysql-replication==0.13
flask-paginate==0.4.5
168 sync_es.py (new file)

@@ -0,0 +1,168 @@
#!/usr/bin/env python
"""
stream changes in mysql (on the torrents and statistics table) into
elasticsearch as they happen on the binlog. This keeps elasticsearch in sync
with whatever you do to the database, including stuff like admin queries. Also,
because mysql keeps the binlog around for N days before deleting old stuff, you
can survive a hiccup of elasticsearch or this script dying and pick up where
you left off.

For that "picking up" part, this script depends on one piece of external state:
its last known binlog filename and position. This is saved off as a JSON file
to a configurable location on the filesystem periodically. If the file is not
present then you can initialize it with the values from `SHOW MASTER STATUS`
from the mysql repl, which will start the sync from current state.

In the case of catastrophic elasticsearch meltdown where you need to
reconstruct the index, you'll want to be a bit careful with coordinating the
sync_es and import_to_es scripts. If you run import_to_es first and then run
sync_es from SHOW MASTER STATUS, anything that changed the database between
import_to_es and sync_es will be lost. Instead, you can run SHOW MASTER
STATUS _before_ you run import_to_es. That way you'll definitely pick up any
changes that happen while the import_to_es script is dumping stuff from the
database into es, at the expense of redoing a (small) amount of indexing.
"""
from elasticsearch import Elasticsearch
from pymysqlreplication import BinLogStreamReader
from pymysqlreplication.row_event import UpdateRowsEvent, DeleteRowsEvent, WriteRowsEvent
from datetime import datetime
from nyaa.models import TorrentFlags
import sys
import json
import time
import logging

logging.basicConfig()

log = logging.getLogger('sync_es')
log.setLevel(logging.INFO)

# logging.getLogger('elasticsearch').setLevel(logging.DEBUG)

# in prod want in /var/lib somewhere probably
SAVE_LOC = "/var/lib/sync_es_position.json"
MYSQL_HOST = '127.0.0.1'
MYSQL_PORT = 3306
MYSQL_USER = 'test'
MYSQL_PW = 'test123'
NT_DB = 'nyaav2'

with open(SAVE_LOC) as f:
    pos = json.load(f)

es = Elasticsearch()

stream = BinLogStreamReader(
    # TODO parse out from config.py or something
    connection_settings={
        'host': MYSQL_HOST,
        'port': MYSQL_PORT,
        'user': MYSQL_USER,
        'passwd': MYSQL_PW
    },
    server_id=10,  # arbitrary
    # only care about this database currently
    only_schemas=[NT_DB],
    # these tables in the database
    only_tables=["nyaa_torrents", "nyaa_statistics", "sukebei_torrents", "sukebei_statistics"],
    # from our save file
    resume_stream=True,
    log_file=pos['log_file'],
    log_pos=pos['log_pos'],
    # skip the other stuff like table mapping
    only_events=[UpdateRowsEvent, DeleteRowsEvent, WriteRowsEvent],
    # if we're at the head of the log, block until something happens
    # note it'd be nice to block async-style instead, but the mainline
    # binlogreader is synchronous. there is an (unmaintained?) fork
    # using aiomysql if anybody wants to revive that.
    blocking=True)

def reindex_torrent(t, index_name):
    # XXX annoyingly different from import_to_es, and
    # you need to keep them in sync manually.
    f = t['flags']
    doc = {
        "id": t['id'],
        "display_name": t['display_name'],
        "created_time": t['created_time'],
        "updated_time": t['updated_time'],
        "description": t['description'],
        # not analyzed but included so we can render magnet links
        # without querying sql again.
        "info_hash": t['info_hash'].hex(),
        "filesize": t['filesize'],
        "uploader_id": t['uploader_id'],
        "main_category_id": t['main_category_id'],
        "sub_category_id": t['sub_category_id'],
        # XXX all the bitflags are numbers
        "anonymous": bool(f & TorrentFlags.ANONYMOUS),
        "trusted": bool(f & TorrentFlags.TRUSTED),
        "remake": bool(f & TorrentFlags.REMAKE),
        "complete": bool(f & TorrentFlags.COMPLETE),
        # TODO instead of indexing and filtering later
        # could delete from es entirely. Probably won't matter
        # for at least a few months.
        "hidden": bool(f & TorrentFlags.HIDDEN),
        "deleted": bool(f & TorrentFlags.DELETED),
        "has_torrent": bool(t['has_torrent']),
    }
    # update, so we don't delete the stats if present
    es.update(
        index=index_name,
        doc_type='torrent',
        id=t['id'],
        body={"doc": doc, "doc_as_upsert": True})

def reindex_stats(s, index_name):
    es.update(
        index=index_name,
        doc_type='torrent',
        id=s['torrent_id'],
        body={
            "doc": {
                "stats_last_updated": s["last_updated"],
                "download_count": s["download_count"],
                "leech_count": s['leech_count'],
                "seed_count": s['seed_count'],
            }})

n = 0
last_save = time.time()

for event in stream:
    for row in event.rows:
        if event.table == "nyaa_torrents" or event.table == "sukebei_torrents":
            if event.table == "nyaa_torrents":
                index_name = "nyaa"
            else:
                index_name = "sukebei"
            if type(event) is WriteRowsEvent:
                reindex_torrent(row['values'], index_name)
            elif type(event) is UpdateRowsEvent:
                reindex_torrent(row['after_values'], index_name)
            elif type(event) is DeleteRowsEvent:
                # just delete it
                es.delete(index=index_name, doc_type='torrent', id=row['values']['id'])
            else:
                raise Exception(f"unknown event {type(event)}")
        elif event.table == "nyaa_statistics" or event.table == "sukebei_statistics":
            if event.table == "nyaa_statistics":
                index_name = "nyaa"
            else:
                index_name = "sukebei"
            if type(event) is WriteRowsEvent:
                reindex_stats(row['values'], index_name)
            elif type(event) is UpdateRowsEvent:
                reindex_stats(row['after_values'], index_name)
            elif type(event) is DeleteRowsEvent:
                # uh ok. assume that the torrent row will get deleted later.
                pass
            else:
                raise Exception(f"unknown event {type(event)}")
        else:
            raise Exception(f"unknown table {event.table}")
        n += 1
        if n % 100 == 0 or time.time() - last_save > 30:
            log.info(f"saving position {stream.log_file}/{stream.log_pos}")
            with open(SAVE_LOC, 'w') as f:
                json.dump({"log_file": stream.log_file, "log_pos": stream.log_pos}, f)
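The `doc_as_upsert` flag used by both reindex helpers above is what lets torrent rows and statistics rows arrive in either order: the update merges just the given fields into the existing ES document, or creates one if it's missing. A minimal sketch of the same call shape (the id and counts are made up):

```python
from elasticsearch import Elasticsearch

es = Elasticsearch()
# Merge-or-create: only the stats fields change; display_name, flags,
# etc. on the existing document are left untouched.
es.update(index='nyaa', doc_type='torrent', id=1234,
          body={'doc': {'seed_count': 10,
                        'leech_count': 2,
                        'download_count': 55},
                'doc_as_upsert': True})
```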