From e545f8ae193b3e124d0a20335f2711abd5d57449 Mon Sep 17 00:00:00 2001 From: Anna-Maria Meriniemi Date: Sun, 12 Jul 2020 10:14:06 +0300 Subject: [PATCH] Introduce baked queries (#592) SQA's baked queries prepares the queries in advance, caching yada yada. Makes thing a bit faster. Also bigger speedup included is a shoddy cache for the total torrent count (only applied to baked queries currently). Caching the value for a few dozen seconds shaves off some wasted time, as it's mostly just used for pagination. --- config.example.py | 7 + nyaa/search.py | 307 ++++++++++++++++++++++++++++++++++++++++++++ nyaa/views/main.py | 8 +- nyaa/views/users.py | 7 +- 4 files changed, 325 insertions(+), 4 deletions(-) diff --git a/config.example.py b/config.example.py index 2f40c6f..253cb06 100644 --- a/config.example.py +++ b/config.example.py @@ -156,6 +156,13 @@ RESULTS_PER_PAGE = 75 # How many pages we'll return at most MAX_PAGES = 100 +# How long and how many entries to cache for count queries +COUNT_CACHE_SIZE = 256 +COUNT_CACHE_DURATION = 30 + +# Use baked queries for database search +USE_BAKED_SEARCH = False + # Use better searching with ElasticSearch # See README.MD on setup! USE_ELASTIC_SEARCH = False diff --git a/nyaa/search.py b/nyaa/search.py index a1700c7..f6932af 100644 --- a/nyaa/search.py +++ b/nyaa/search.py @@ -1,13 +1,17 @@ import math import re import shlex +import threading +import time import flask +from flask_sqlalchemy import Pagination import sqlalchemy import sqlalchemy_fulltext.modes as FullTextMode from elasticsearch import Elasticsearch from elasticsearch_dsl import Q, Search +from sqlalchemy.ext import baked from sqlalchemy_fulltext import FullTextSearch from nyaa import models @@ -531,3 +535,306 @@ def search_db(term='', user=None, sort='id', order='desc', category='0_0', max_page=MAX_PAGES) return query + + +# Baked queries follow + +class BakedPair(object): + def __init__(self, *items): + self.items = list(items) + + def __iadd__(self, other): + for item in self.items: + item += other + + return self + + +bakery = baked.bakery() + + +BAKED_SORT_KEYS = { + 'id': models.Torrent.id, + 'size': models.Torrent.filesize, + 'comments': models.Torrent.comment_count, + 'seeders': models.Statistic.seed_count, + 'leechers': models.Statistic.leech_count, + 'downloads': models.Statistic.download_count +} + +BAKED_SORT_LAMBDAS = { + 'id-asc': lambda q: q.order_by(models.Torrent.id.asc()), + 'id-desc': lambda q: q.order_by(models.Torrent.id.desc()), + + 'size-asc': lambda q: q.order_by(models.Torrent.filesize.asc()), + 'size-desc': lambda q: q.order_by(models.Torrent.filesize.desc()), + + 'comments-asc': lambda q: q.order_by(models.Torrent.comment_count.asc()), + 'comments-desc': lambda q: q.order_by(models.Torrent.comment_count.desc()), + + # This is a bit stupid, but programmatically generating these mixed up the baked keys, so deal. + 'seeders-asc': lambda q: q.join(models.Statistic).with_hint( + models.Statistic, 'USE INDEX (idx_nyaa_statistics_seed_count)' + ).order_by(models.Statistic.seed_count.asc(), models.Torrent.id.asc()), + 'seeders-desc': lambda q: q.join(models.Statistic).with_hint( + models.Statistic, 'USE INDEX (idx_nyaa_statistics_seed_count)' + ).order_by(models.Statistic.seed_count.desc(), models.Torrent.id.desc()), + + 'leechers-asc': lambda q: q.join(models.Statistic).with_hint( + models.Statistic, 'USE INDEX (idx_nyaa_statistics_leech_count)' + ).order_by(models.Statistic.leech_count.asc(), models.Torrent.id.asc()), + 'leechers-desc': lambda q: q.join(models.Statistic).with_hint( + models.Statistic, 'USE INDEX (idx_nyaa_statistics_leech_count)' + ).order_by(models.Statistic.leech_count.desc(), models.Torrent.id.desc()), + + 'downloads-asc': lambda q: q.join(models.Statistic).with_hint( + models.Statistic, 'USE INDEX (idx_nyaa_statistics_download_count)' + ).order_by(models.Statistic.download_count.asc(), models.Torrent.id.asc()), + 'downloads-desc': lambda q: q.join(models.Statistic).with_hint( + models.Statistic, 'USE INDEX (idx_nyaa_statistics_download_count)' + ).order_by(models.Statistic.download_count.desc(), models.Torrent.id.desc()), +} + + +BAKED_FILTER_LAMBDAS = { + '0': None, + '1': lambda q: ( + q.filter(models.Torrent.flags.op('&')(models.TorrentFlags.REMAKE.value).is_(False)) + ), + '2': lambda q: ( + q.filter(models.Torrent.flags.op('&')(models.TorrentFlags.TRUSTED.value).is_(True)) + ), + '3': lambda q: ( + q.filter(models.Torrent.flags.op('&')(models.TorrentFlags.COMPLETE.value).is_(True)) + ), +} + + +def search_db_baked(term='', user=None, sort='id', order='desc', category='0_0', + quality_filter='0', page=1, rss=False, admin=False, + logged_in_user=None, per_page=75): + if page > 4294967295: + flask.abort(404) + + MAX_PAGES = app.config.get("MAX_PAGES", 0) + + if MAX_PAGES and page > MAX_PAGES: + flask.abort(flask.Response("You've exceeded the maximum number of pages. Please " + "make your search query less broad.", 403)) + + sort_lambda = BAKED_SORT_LAMBDAS.get('{}-{}'.format(sort, order).lower()) + if not sort_lambda: + flask.abort(400) + + sentinel = object() + filter_lambda = BAKED_FILTER_LAMBDAS.get(quality_filter.lower(), sentinel) + if filter_lambda is sentinel: + flask.abort(400) + + if user: + user = models.User.by_id(user) + if not user: + flask.abort(404) + user = user.id + + main_cat_id = 0 + sub_cat_id = 0 + + if category: + cat_match = re.match(r'^(\d+)_(\d+)$', category) + if not cat_match: + flask.abort(400) + + main_cat_id = int(cat_match.group(1)) + sub_cat_id = int(cat_match.group(2)) + + if main_cat_id > 0: + if sub_cat_id > 0: + sub_category = models.SubCategory.by_category_ids(main_cat_id, sub_cat_id) + if not sub_category: + flask.abort(400) + else: + main_category = models.MainCategory.by_id(main_cat_id) + if not main_category: + flask.abort(400) + + # Force sort by id desc if rss + if rss: + sort_lambda = BAKED_SORT_LAMBDAS['id-desc'] + + same_user = False + if logged_in_user: + same_user = logged_in_user.id == user + + if term: + query = bakery(lambda session: session.query(models.TorrentNameSearch)) + count_query = bakery(lambda session: session.query( + sqlalchemy.func.count(models.TorrentNameSearch.id))) + else: + query = bakery(lambda session: session.query(models.Torrent)) + # This is... eh. Optimize the COUNT() query since MySQL is bad at that. + # See http://docs.sqlalchemy.org/en/rel_1_1/orm/query.html#sqlalchemy.orm.query.Query.count + # Wrap the queries into the helper class to deduplicate code and + # apply filters to both in one go + count_query = bakery(lambda session: session.query( + sqlalchemy.func.count(models.Torrent.id))) + + qpc = BakedPair(query, count_query) + bp = sqlalchemy.bindparam + + baked_params = {} + + # User view (/user/username) + if user: + qpc += lambda q: q.filter(models.Torrent.uploader_id == bp('user')) + baked_params['user'] = user + + if not admin: + # Hide all DELETED torrents if regular user + qpc += lambda q: q.filter(models.Torrent.flags.op('&') + (int(models.TorrentFlags.DELETED)).is_(False)) + # If logged in user is not the same as the user being viewed, + # show only torrents that aren't hidden or anonymous + # + # If logged in user is the same as the user being viewed, + # show all torrents including hidden and anonymous ones + # + # On RSS pages in user view, + # show only torrents that aren't hidden or anonymous no matter what + if not same_user or rss: + qpc += lambda q: ( + q.filter( + models.Torrent.flags.op('&')( + int(models.TorrentFlags.HIDDEN | models.TorrentFlags.ANONYMOUS) + ).is_(False) + ) + ) + # General view (homepage, general search view) + else: + if not admin: + # Hide all DELETED torrents if regular user + qpc += lambda q: q.filter(models.Torrent.flags.op('&') + (int(models.TorrentFlags.DELETED)).is_(False)) + # If logged in, show all torrents that aren't hidden unless they belong to you + # On RSS pages, show all public torrents and nothing more. + if logged_in_user and not rss: + qpc += lambda q: q.filter( + (models.Torrent.flags.op('&')(int(models.TorrentFlags.HIDDEN)).is_(False)) | + (models.Torrent.uploader_id == bp('logged_in_user')) + ) + baked_params['logged_in_user'] = logged_in_user + # Otherwise, show all torrents that aren't hidden + else: + qpc += lambda q: q.filter(models.Torrent.flags.op('&') + (int(models.TorrentFlags.HIDDEN)).is_(False)) + + if sub_cat_id: + qpc += lambda q: q.filter( + (models.Torrent.main_category_id == bp('main_cat_id')), + (models.Torrent.sub_category_id == bp('sub_cat_id')) + ) + baked_params['main_cat_id'] = main_cat_id + baked_params['sub_cat_id'] = sub_cat_id + elif main_cat_id: + qpc += lambda q: q.filter(models.Torrent.main_category_id == bp('main_cat_id')) + baked_params['main_cat_id'] = main_cat_id + + if filter_lambda: + qpc += filter_lambda + + if term: + raise Exception('Baked search does not support search terms') + + # Sort and order + query += sort_lambda + + if rss: + query += lambda q: q.limit(bp('per_page')) + baked_params['per_page'] = per_page + + return query(db.session()).params(**baked_params).all() + + return baked_paginate(query, count_query, baked_params, + page, per_page=per_page, step=5, max_page=MAX_PAGES) + + +class ShoddyLRU(object): + def __init__(self, max_entries=128, expiry=60): + self.max_entries = max_entries + self.expiry = expiry + + # Contains [value, last_used, expires_at] + self.entries = {} + self._lock = threading.Lock() + + self._sentinel = object() + + def get(self, key, default=None): + entry = self.entries.get(key) + if entry is None: + return default + + now = time.time() + if now > entry[2]: + with self._lock: + del self.entries[key] + return default + + entry[1] = now + return entry[0] + + def put(self, key, value, expiry=None): + with self._lock: + overflow = len(self.entries) - self.max_entries + if overflow > 0: + # Pick the least recently used keys + removed_keys = [key for key, value in sorted( + self.entries.items(), key=lambda t:t[1][1])][:overflow] + for key in removed_keys: + del self.entries[key] + + now = time.time() + self.entries[key] = [value, now, now + (expiry or self.expiry)] + + +LRU_CACHE = ShoddyLRU(256, 60) + + +def baked_paginate(query, count_query, params, page=1, per_page=50, max_page=None, step=5): + if page < 1: + flask.abort(404) + + if max_page and page > max_page: + flask.abort(404) + bp = sqlalchemy.bindparam + + ses = db.session() + + # Count all items, use cache + if app.config['COUNT_CACHE_DURATION']: + query_key = (count_query._effective_key(ses), tuple(sorted(params.items()))) + total_query_count = LRU_CACHE.get(query_key) + if total_query_count is None: + total_query_count = count_query(ses).params(**params).scalar() + LRU_CACHE.put(query_key, total_query_count, expiry=app.config['COUNT_CACHE_DURATION']) + else: + total_query_count = count_query(ses).params(**params).scalar() + + # Grab items on current page + query += lambda q: q.limit(bp('limit')).offset(bp('offset')) + params['limit'] = per_page + params['offset'] = (page - 1) * per_page + + res = query(ses).params(**params) + items = res.all() + + if max_page: + total_query_count = min(total_query_count, max_page * per_page) + + # Handle case where we've had no results but then have some while in cache + total_query_count = max(total_query_count, len(items)) + + if not items and page != 1: + flask.abort(404) + + return Pagination(None, page, per_page, total_query_count, items) diff --git a/nyaa/views/main.py b/nyaa/views/main.py index a1dae5a..2243578 100644 --- a/nyaa/views/main.py +++ b/nyaa/views/main.py @@ -10,7 +10,7 @@ from flask_paginate import Pagination from nyaa import models from nyaa.extensions import db from nyaa.search import (DEFAULT_MAX_SEARCH_RESULT, DEFAULT_PER_PAGE, SERACH_PAGINATE_DISPLAY_MSG, - _generate_query_string, search_db, search_elastic) + _generate_query_string, search_db, search_db_baked, search_elastic) from nyaa.utils import chain_get from nyaa.views.account import logout @@ -186,7 +186,11 @@ def home(rss): else: # Otherwise, use db search for everything query_args['term'] = search_term or '' - query = search_db(**query_args) + if app.config['USE_BAKED_SEARCH']: + query = search_db_baked(**query_args) + else: + query = search_db(**query_args) + if render_as_rss: return render_rss('Home', query, use_elastic=False, magnet_links=use_magnet_links) else: diff --git a/nyaa/views/users.py b/nyaa/views/users.py index 87ca471..decf3f6 100644 --- a/nyaa/views/users.py +++ b/nyaa/views/users.py @@ -12,7 +12,7 @@ from itsdangerous import BadSignature, URLSafeSerializer from nyaa import forms, models from nyaa.extensions import db from nyaa.search import (DEFAULT_MAX_SEARCH_RESULT, DEFAULT_PER_PAGE, SERACH_PAGINATE_DISPLAY_MSG, - _generate_query_string, search_db, search_elastic) + _generate_query_string, search_db, search_db_baked, search_elastic) from nyaa.utils import admin_only, chain_get, sha1_hash app = flask.current_app @@ -185,7 +185,10 @@ def view_user(user_name): query_args['term'] = '' else: query_args['term'] = search_term or '' - query = search_db(**query_args) + if app.config['USE_BAKED_SEARCH']: + query = search_db_baked(**query_args) + else: + query = search_db(**query_args) return flask.render_template('user.html', use_elastic=False, torrent_query=query,