Introduce baked queries (#592)

SQA's baked queries prepares the queries in advance, caching yada yada.
Makes thing a bit faster.
Also bigger speedup included is a shoddy cache for the total torrent
count (only applied to baked queries currently). Caching the value for a
few dozen seconds shaves off some wasted time, as it's mostly just used
for pagination.
This commit is contained in:
Anna-Maria Meriniemi 2020-07-12 10:14:06 +03:00 committed by GitHub
parent 4fcef92b94
commit e545f8ae19
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 325 additions and 4 deletions

View File

@ -156,6 +156,13 @@ RESULTS_PER_PAGE = 75
# How many pages we'll return at most
MAX_PAGES = 100
# How long and how many entries to cache for count queries
COUNT_CACHE_SIZE = 256
COUNT_CACHE_DURATION = 30
# Use baked queries for database search
USE_BAKED_SEARCH = False
# Use better searching with ElasticSearch
# See README.MD on setup!
USE_ELASTIC_SEARCH = False

View File

@ -1,13 +1,17 @@
import math
import re
import shlex
import threading
import time
import flask
from flask_sqlalchemy import Pagination
import sqlalchemy
import sqlalchemy_fulltext.modes as FullTextMode
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Q, Search
from sqlalchemy.ext import baked
from sqlalchemy_fulltext import FullTextSearch
from nyaa import models
@ -531,3 +535,306 @@ def search_db(term='', user=None, sort='id', order='desc', category='0_0',
max_page=MAX_PAGES)
return query
# Baked queries follow
class BakedPair(object):
def __init__(self, *items):
self.items = list(items)
def __iadd__(self, other):
for item in self.items:
item += other
return self
bakery = baked.bakery()
BAKED_SORT_KEYS = {
'id': models.Torrent.id,
'size': models.Torrent.filesize,
'comments': models.Torrent.comment_count,
'seeders': models.Statistic.seed_count,
'leechers': models.Statistic.leech_count,
'downloads': models.Statistic.download_count
}
BAKED_SORT_LAMBDAS = {
'id-asc': lambda q: q.order_by(models.Torrent.id.asc()),
'id-desc': lambda q: q.order_by(models.Torrent.id.desc()),
'size-asc': lambda q: q.order_by(models.Torrent.filesize.asc()),
'size-desc': lambda q: q.order_by(models.Torrent.filesize.desc()),
'comments-asc': lambda q: q.order_by(models.Torrent.comment_count.asc()),
'comments-desc': lambda q: q.order_by(models.Torrent.comment_count.desc()),
# This is a bit stupid, but programmatically generating these mixed up the baked keys, so deal.
'seeders-asc': lambda q: q.join(models.Statistic).with_hint(
models.Statistic, 'USE INDEX (idx_nyaa_statistics_seed_count)'
).order_by(models.Statistic.seed_count.asc(), models.Torrent.id.asc()),
'seeders-desc': lambda q: q.join(models.Statistic).with_hint(
models.Statistic, 'USE INDEX (idx_nyaa_statistics_seed_count)'
).order_by(models.Statistic.seed_count.desc(), models.Torrent.id.desc()),
'leechers-asc': lambda q: q.join(models.Statistic).with_hint(
models.Statistic, 'USE INDEX (idx_nyaa_statistics_leech_count)'
).order_by(models.Statistic.leech_count.asc(), models.Torrent.id.asc()),
'leechers-desc': lambda q: q.join(models.Statistic).with_hint(
models.Statistic, 'USE INDEX (idx_nyaa_statistics_leech_count)'
).order_by(models.Statistic.leech_count.desc(), models.Torrent.id.desc()),
'downloads-asc': lambda q: q.join(models.Statistic).with_hint(
models.Statistic, 'USE INDEX (idx_nyaa_statistics_download_count)'
).order_by(models.Statistic.download_count.asc(), models.Torrent.id.asc()),
'downloads-desc': lambda q: q.join(models.Statistic).with_hint(
models.Statistic, 'USE INDEX (idx_nyaa_statistics_download_count)'
).order_by(models.Statistic.download_count.desc(), models.Torrent.id.desc()),
}
BAKED_FILTER_LAMBDAS = {
'0': None,
'1': lambda q: (
q.filter(models.Torrent.flags.op('&')(models.TorrentFlags.REMAKE.value).is_(False))
),
'2': lambda q: (
q.filter(models.Torrent.flags.op('&')(models.TorrentFlags.TRUSTED.value).is_(True))
),
'3': lambda q: (
q.filter(models.Torrent.flags.op('&')(models.TorrentFlags.COMPLETE.value).is_(True))
),
}
def search_db_baked(term='', user=None, sort='id', order='desc', category='0_0',
quality_filter='0', page=1, rss=False, admin=False,
logged_in_user=None, per_page=75):
if page > 4294967295:
flask.abort(404)
MAX_PAGES = app.config.get("MAX_PAGES", 0)
if MAX_PAGES and page > MAX_PAGES:
flask.abort(flask.Response("You've exceeded the maximum number of pages. Please "
"make your search query less broad.", 403))
sort_lambda = BAKED_SORT_LAMBDAS.get('{}-{}'.format(sort, order).lower())
if not sort_lambda:
flask.abort(400)
sentinel = object()
filter_lambda = BAKED_FILTER_LAMBDAS.get(quality_filter.lower(), sentinel)
if filter_lambda is sentinel:
flask.abort(400)
if user:
user = models.User.by_id(user)
if not user:
flask.abort(404)
user = user.id
main_cat_id = 0
sub_cat_id = 0
if category:
cat_match = re.match(r'^(\d+)_(\d+)$', category)
if not cat_match:
flask.abort(400)
main_cat_id = int(cat_match.group(1))
sub_cat_id = int(cat_match.group(2))
if main_cat_id > 0:
if sub_cat_id > 0:
sub_category = models.SubCategory.by_category_ids(main_cat_id, sub_cat_id)
if not sub_category:
flask.abort(400)
else:
main_category = models.MainCategory.by_id(main_cat_id)
if not main_category:
flask.abort(400)
# Force sort by id desc if rss
if rss:
sort_lambda = BAKED_SORT_LAMBDAS['id-desc']
same_user = False
if logged_in_user:
same_user = logged_in_user.id == user
if term:
query = bakery(lambda session: session.query(models.TorrentNameSearch))
count_query = bakery(lambda session: session.query(
sqlalchemy.func.count(models.TorrentNameSearch.id)))
else:
query = bakery(lambda session: session.query(models.Torrent))
# This is... eh. Optimize the COUNT() query since MySQL is bad at that.
# See http://docs.sqlalchemy.org/en/rel_1_1/orm/query.html#sqlalchemy.orm.query.Query.count
# Wrap the queries into the helper class to deduplicate code and
# apply filters to both in one go
count_query = bakery(lambda session: session.query(
sqlalchemy.func.count(models.Torrent.id)))
qpc = BakedPair(query, count_query)
bp = sqlalchemy.bindparam
baked_params = {}
# User view (/user/username)
if user:
qpc += lambda q: q.filter(models.Torrent.uploader_id == bp('user'))
baked_params['user'] = user
if not admin:
# Hide all DELETED torrents if regular user
qpc += lambda q: q.filter(models.Torrent.flags.op('&')
(int(models.TorrentFlags.DELETED)).is_(False))
# If logged in user is not the same as the user being viewed,
# show only torrents that aren't hidden or anonymous
#
# If logged in user is the same as the user being viewed,
# show all torrents including hidden and anonymous ones
#
# On RSS pages in user view,
# show only torrents that aren't hidden or anonymous no matter what
if not same_user or rss:
qpc += lambda q: (
q.filter(
models.Torrent.flags.op('&')(
int(models.TorrentFlags.HIDDEN | models.TorrentFlags.ANONYMOUS)
).is_(False)
)
)
# General view (homepage, general search view)
else:
if not admin:
# Hide all DELETED torrents if regular user
qpc += lambda q: q.filter(models.Torrent.flags.op('&')
(int(models.TorrentFlags.DELETED)).is_(False))
# If logged in, show all torrents that aren't hidden unless they belong to you
# On RSS pages, show all public torrents and nothing more.
if logged_in_user and not rss:
qpc += lambda q: q.filter(
(models.Torrent.flags.op('&')(int(models.TorrentFlags.HIDDEN)).is_(False)) |
(models.Torrent.uploader_id == bp('logged_in_user'))
)
baked_params['logged_in_user'] = logged_in_user
# Otherwise, show all torrents that aren't hidden
else:
qpc += lambda q: q.filter(models.Torrent.flags.op('&')
(int(models.TorrentFlags.HIDDEN)).is_(False))
if sub_cat_id:
qpc += lambda q: q.filter(
(models.Torrent.main_category_id == bp('main_cat_id')),
(models.Torrent.sub_category_id == bp('sub_cat_id'))
)
baked_params['main_cat_id'] = main_cat_id
baked_params['sub_cat_id'] = sub_cat_id
elif main_cat_id:
qpc += lambda q: q.filter(models.Torrent.main_category_id == bp('main_cat_id'))
baked_params['main_cat_id'] = main_cat_id
if filter_lambda:
qpc += filter_lambda
if term:
raise Exception('Baked search does not support search terms')
# Sort and order
query += sort_lambda
if rss:
query += lambda q: q.limit(bp('per_page'))
baked_params['per_page'] = per_page
return query(db.session()).params(**baked_params).all()
return baked_paginate(query, count_query, baked_params,
page, per_page=per_page, step=5, max_page=MAX_PAGES)
class ShoddyLRU(object):
def __init__(self, max_entries=128, expiry=60):
self.max_entries = max_entries
self.expiry = expiry
# Contains [value, last_used, expires_at]
self.entries = {}
self._lock = threading.Lock()
self._sentinel = object()
def get(self, key, default=None):
entry = self.entries.get(key)
if entry is None:
return default
now = time.time()
if now > entry[2]:
with self._lock:
del self.entries[key]
return default
entry[1] = now
return entry[0]
def put(self, key, value, expiry=None):
with self._lock:
overflow = len(self.entries) - self.max_entries
if overflow > 0:
# Pick the least recently used keys
removed_keys = [key for key, value in sorted(
self.entries.items(), key=lambda t:t[1][1])][:overflow]
for key in removed_keys:
del self.entries[key]
now = time.time()
self.entries[key] = [value, now, now + (expiry or self.expiry)]
LRU_CACHE = ShoddyLRU(256, 60)
def baked_paginate(query, count_query, params, page=1, per_page=50, max_page=None, step=5):
if page < 1:
flask.abort(404)
if max_page and page > max_page:
flask.abort(404)
bp = sqlalchemy.bindparam
ses = db.session()
# Count all items, use cache
if app.config['COUNT_CACHE_DURATION']:
query_key = (count_query._effective_key(ses), tuple(sorted(params.items())))
total_query_count = LRU_CACHE.get(query_key)
if total_query_count is None:
total_query_count = count_query(ses).params(**params).scalar()
LRU_CACHE.put(query_key, total_query_count, expiry=app.config['COUNT_CACHE_DURATION'])
else:
total_query_count = count_query(ses).params(**params).scalar()
# Grab items on current page
query += lambda q: q.limit(bp('limit')).offset(bp('offset'))
params['limit'] = per_page
params['offset'] = (page - 1) * per_page
res = query(ses).params(**params)
items = res.all()
if max_page:
total_query_count = min(total_query_count, max_page * per_page)
# Handle case where we've had no results but then have some while in cache
total_query_count = max(total_query_count, len(items))
if not items and page != 1:
flask.abort(404)
return Pagination(None, page, per_page, total_query_count, items)

View File

@ -10,7 +10,7 @@ from flask_paginate import Pagination
from nyaa import models
from nyaa.extensions import db
from nyaa.search import (DEFAULT_MAX_SEARCH_RESULT, DEFAULT_PER_PAGE, SERACH_PAGINATE_DISPLAY_MSG,
_generate_query_string, search_db, search_elastic)
_generate_query_string, search_db, search_db_baked, search_elastic)
from nyaa.utils import chain_get
from nyaa.views.account import logout
@ -186,7 +186,11 @@ def home(rss):
else: # Otherwise, use db search for everything
query_args['term'] = search_term or ''
query = search_db(**query_args)
if app.config['USE_BAKED_SEARCH']:
query = search_db_baked(**query_args)
else:
query = search_db(**query_args)
if render_as_rss:
return render_rss('Home', query, use_elastic=False, magnet_links=use_magnet_links)
else:

View File

@ -12,7 +12,7 @@ from itsdangerous import BadSignature, URLSafeSerializer
from nyaa import forms, models
from nyaa.extensions import db
from nyaa.search import (DEFAULT_MAX_SEARCH_RESULT, DEFAULT_PER_PAGE, SERACH_PAGINATE_DISPLAY_MSG,
_generate_query_string, search_db, search_elastic)
_generate_query_string, search_db, search_db_baked, search_elastic)
from nyaa.utils import admin_only, chain_get, sha1_hash
app = flask.current_app
@ -185,7 +185,10 @@ def view_user(user_name):
query_args['term'] = ''
else:
query_args['term'] = search_term or ''
query = search_db(**query_args)
if app.config['USE_BAKED_SEARCH']:
query = search_db_baked(**query_args)
else:
query = search_db(**query_args)
return flask.render_template('user.html',
use_elastic=False,
torrent_query=query,