Mirror of https://gitlab.com/SIGBUS/nyaa.git, synced 2024-12-22 14:49:59 +00:00
sync_es: move io to separate threads, config json
throughput is definitely massively improved, testing locally. hopefully it'll be enough. config moved to a separate file by ops request. lazy lazy
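For a sense of the new shape: a minimal standalone sketch of the hand-off this commit sets up (the names and numbers here are illustrative stand-ins, not the committed code; the actual classes in the diff below are BinlogReader and EsPoster, joined by a bounded Queue):

from queue import Queue, Empty
from threading import Thread

buf = Queue(maxsize=10000)   # bounded, so the reader blocks instead of growing the heap

def read_binlog():
    # producer stand-in: the real thread tails the MySQL binlog and emits one action per changed row
    for n in range(2500):
        buf.put(("binlog-pos", n), block=True)

def post_to_es():
    # consumer stand-in: batch up to a chunk size, or flush whatever arrived within the interval
    while True:
        actions = []
        while len(actions) < 1000:
            try:
                actions.append(buf.get(block=True, timeout=1))
            except Empty:
                break            # idle for a whole interval, flush what we have
        if not actions:
            break                # demo only; the real consumer loops forever
        print(f"would bulk-post {len(actions)} actions")   # stand-in for elasticsearch.helpers.bulk

reader = Thread(target=read_binlog)
poster = Thread(target=post_to_es)
reader.start()
poster.start()
reader.join()
poster.join()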
parent 6a4ad827c1
commit ea2160a49d

config_es_sync.json (new file, 11 lines)
@@ -0,0 +1,11 @@
+{
+    "save_loc": "/tmp/pos.json",
+    "mysql_host": "127.0.0.1",
+    "mysql_port": 13306,
+    "mysql_user": "root",
+    "mysql_password": "dunnolol",
+    "database": "nyaav2",
+    "internal_queue_depth": 10000,
+    "es_chunk_size": 10000,
+    "flush_interval": 5
+}
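For reference, a minimal sketch of how the script consumes this file (it mirrors the config.get() fallbacks added in the sync_es.py diff below; the JSON path is passed as the script's only argument):

import json
import sys

# usage: python sync_es.py config_es_sync.json
if len(sys.argv) != 2:
    print("need config.json location", file=sys.stderr)
    sys.exit(-1)

with open(sys.argv[1]) as f:
    config = json.load(f)

# every key has a fallback, so a partial config file still works
SAVE_LOC = config.get('save_loc', "/tmp/pos.json")
MYSQL_HOST = config.get('mysql_host', '127.0.0.1')
ES_CHUNK_SIZE = config.get('es_chunk_size', 10000)
FLUSH_INTERVAL = config.get('flush_interval', 5)   # seconds of idleness before flushing to es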
sync_es.py (290 changed lines)
@@ -21,9 +21,12 @@ when import_to_es and sync_es will be lost. Instead, you can run SHOW MASTER
 STATUS _before_ you run import_to_es. That way you'll definitely pick up any
 changes that happen while the import_to_es script is dumping stuff from the
 database into es, at the expense of redoing a (small) amount of indexing.
+
+This uses multithreading so we don't have to block on socket io (both binlog
+reading and es POSTing). asyncio soon™
 """
 from elasticsearch import Elasticsearch
-from elasticsearch.helpers import bulk
+from elasticsearch.helpers import bulk, BulkIndexError
 from pymysqlreplication import BinLogStreamReader
 from pymysqlreplication.row_event import UpdateRowsEvent, DeleteRowsEvent, WriteRowsEvent
 from datetime import datetime
@@ -33,54 +36,38 @@ import json
 import time
 import logging
 from statsd import StatsClient
+from threading import Thread
+from queue import Queue, Empty
 
 logging.basicConfig(format='%(asctime)s %(levelname)s %(name)s - %(message)s')
 
 log = logging.getLogger('sync_es')
 log.setLevel(logging.INFO)
 
+# config in json, 2lazy to argparse
+if len(sys.argv) != 2:
+    print("need config.json location", file=sys.stderr)
+    sys.exit(-1)
+with open(sys.argv[1]) as f:
+    config = json.load(f)
+
 # goes to netdata or other statsd listener
 stats = StatsClient('localhost', 8125, prefix="sync_es")
 
 #logging.getLogger('elasticsearch').setLevel(logging.DEBUG)
 
 # in prod want in /var/lib somewhere probably
-SAVE_LOC = "/var/lib/sync_es_position.json"
-MYSQL_HOST = '127.0.0.1'
-MYSQL_PORT = 3306
-MYSQL_USER = 'test'
-MYSQL_PW = 'test123'
-NT_DB = 'nyaav2'
-
-with open(SAVE_LOC) as f:
-    pos = json.load(f)
-
-es = Elasticsearch(timeout=30)
-
-stream = BinLogStreamReader(
-        # TODO parse out from config.py or something
-        connection_settings = {
-            'host': MYSQL_HOST,
-            'port': MYSQL_PORT,
-            'user': MYSQL_USER,
-            'passwd': MYSQL_PW
-        },
-        server_id=10, # arbitrary
-        # only care about this database currently
-        only_schemas=[NT_DB],
-        # these tables in the database
-        only_tables=["nyaa_torrents", "nyaa_statistics", "sukebei_torrents", "sukebei_statistics"],
-        # from our save file
-        resume_stream=True,
-        log_file=pos['log_file'],
-        log_pos=pos['log_pos'],
-        # skip the other stuff like table mapping
-        only_events=[UpdateRowsEvent, DeleteRowsEvent, WriteRowsEvent],
-        # if we're at the head of the log, block until something happens
-        # note it'd be nice to block async-style instead, but the mainline
-        # binlogreader is synchronous. there is an (unmaintained?) fork
-        # using aiomysql if anybody wants to revive that.
-        blocking=True)
+SAVE_LOC = config.get('save_loc', "/tmp/pos.json")
+MYSQL_HOST = config.get('mysql_host', '127.0.0.1')
+MYSQL_PORT = config.get('mysql_port', 3306)
+MYSQL_USER = config.get('mysql_user', 'root')
+MYSQL_PW = config.get('mysql_password', 'dunnolol')
+NT_DB = config.get('database', 'nyaav2')
+INTERNAL_QUEUE_DEPTH = config.get('internal_queue_depth', 10000)
+ES_CHUNK_SIZE = config.get('es_chunk_size', 10000)
+# seconds since no events happening to flush to es. remember this also
+# interacts with es' refresh_interval setting.
+FLUSH_INTERVAL = config.get('flush_interval', 5)
 
 def reindex_torrent(t, index_name):
     # XXX annoyingly different from import_to_es, and
@@ -145,73 +132,176 @@ def delet_this(row, index_name):
             '_type': 'torrent',
             '_id': str(row['values']['id'])}
 
-last_save = time.time()
-since_last = 0
-
-# get POST latency
-@stats.timer('post_bulk')
-def timed_bulk(events, **kwargs):
-    bulk(es, events, **kwargs)
-
-log.info(f"reading binlog from {stream.log_file}/{stream.log_pos}")
-
-for event in stream:
-    with stats.pipeline() as s:
-        s.incr('total_events')
-        s.incr(f"event.{event.table}.{type(event).__name__}")
-        s.incr('total_rows', len(event.rows))
-        s.incr(f"rows.{event.table}.{type(event).__name__}", len(event.rows))
-        # XXX not a "timer", but we get a histogram out of it
-        s.timing(f"rows_per_event.{event.table}.{type(event).__name__}", len(event.rows))
-    if event.table == "nyaa_torrents" or event.table == "sukebei_torrents":
-        if event.table == "nyaa_torrents":
-            index_name = "nyaa"
-        else:
-            index_name = "sukebei"
-        if type(event) is WriteRowsEvent:
-            timed_bulk(reindex_torrent(row['values'], index_name) for row in event.rows)
-        elif type(event) is UpdateRowsEvent:
-            # UpdateRowsEvent includes the old values too, but we don't care
-            timed_bulk(reindex_torrent(row['after_values'], index_name) for row in event.rows)
-        elif type(event) is DeleteRowsEvent:
-            # ok, bye
-            timed_bulk(delet_this(row, index_name) for row in event.rows)
-        else:
-            raise Exception(f"unknown event {type(event)}")
-    elif event.table == "nyaa_statistics" or event.table == "sukebei_statistics":
-        if event.table == "nyaa_statistics":
-            index_name = "nyaa"
-        else:
-            index_name = "sukebei"
-        if type(event) is WriteRowsEvent:
-            timed_bulk(
-                    (reindex_stats(row['values'], index_name) for row in event.rows),
-                    # statistics updates are pretty low priority, plus in certain
-                    # cases where we're really out of sync, the torrent could be missing,
-                    # causing a "document missing" error from es, with no way to suppress
-                    # that server-side. Thus ignore all errors, oh well.
-                    raise_on_error=False, stats_only=True)
-        elif type(event) is UpdateRowsEvent:
-            timed_bulk(
-                    (reindex_stats(row['after_values'], index_name) for row in event.rows),
-                    raise_on_error=False, stats_only=True)
-        elif type(event) is DeleteRowsEvent:
-            # uh ok. Assume that the torrent row will get deleted later,
-            # which will clean up the entire es "torrent" document
-            pass
-        else:
-            raise Exception(f"unknown event {type(event)}")
-    else:
-        raise Exception(f"unknown table {s.table}")
-
-    # how far we're behind, wall clock
-    stats.gauge('process_latency', int((time.time() - event.timestamp) * 1000))
-
-    since_last += 1
-    if since_last >= 10000 or (time.time() - last_save) > 10:
-        log.info(f"saving position {stream.log_file}/{stream.log_pos}, {time.time() - event.timestamp:,.3} seconds behind")
-        with stats.timer('save_pos'):
-            with open(SAVE_LOC, 'w') as f:
-                json.dump({"log_file": stream.log_file, "log_pos": stream.log_pos}, f)
-        last_save = time.time()
-        since_last = 0
+class BinlogReader(Thread):
+    # write_buf is the Queue we communicate with
+    def __init__(self, write_buf):
+        Thread.__init__(self)
+        self.write_buf = write_buf
+
+    def run(self):
+        with open(SAVE_LOC) as f:
+            pos = json.load(f)
+
+        stream = BinLogStreamReader(
+                # TODO parse out from config.py or something
+                connection_settings = {
+                    'host': MYSQL_HOST,
+                    'port': MYSQL_PORT,
+                    'user': MYSQL_USER,
+                    'passwd': MYSQL_PW
+                },
+                server_id=10, # arbitrary
+                # only care about this database currently
+                only_schemas=[NT_DB],
+                # these tables in the database
+                only_tables=["nyaa_torrents", "nyaa_statistics", "sukebei_torrents", "sukebei_statistics"],
+                # from our save file
+                resume_stream=True,
+                log_file=pos['log_file'],
+                log_pos=pos['log_pos'],
+                # skip the other stuff like table mapping
+                only_events=[UpdateRowsEvent, DeleteRowsEvent, WriteRowsEvent],
+                # if we're at the head of the log, block until something happens
+                # note it'd be nice to block async-style instead, but the mainline
+                # binlogreader is synchronous. there is an (unmaintained?) fork
+                # using aiomysql if anybody wants to revive that.
+                blocking=True)
+
+        log.info(f"reading binlog from {stream.log_file}/{stream.log_pos}")
+
+        for event in stream:
+            # save the pos of the stream and timestamp with each message, so we
+            # can commit in the other thread. and keep track of process latency
+            pos = (stream.log_file, stream.log_pos, event.timestamp)
+            with stats.pipeline() as s:
+                s.incr('total_events')
+                s.incr(f"event.{event.table}.{type(event).__name__}")
+                s.incr('total_rows', len(event.rows))
+                s.incr(f"rows.{event.table}.{type(event).__name__}", len(event.rows))
+                # XXX not a "timer", but we get a histogram out of it
+                s.timing(f"rows_per_event.{event.table}.{type(event).__name__}", len(event.rows))
+
+            if event.table == "nyaa_torrents" or event.table == "sukebei_torrents":
+                if event.table == "nyaa_torrents":
+                    index_name = "nyaa"
+                else:
+                    index_name = "sukebei"
+                if type(event) is WriteRowsEvent:
+                    for row in event.rows:
+                        self.write_buf.put(
+                                (pos, reindex_torrent(row['values'], index_name)),
+                                block=True)
+                elif type(event) is UpdateRowsEvent:
+                    # UpdateRowsEvent includes the old values too, but we don't care
+                    for row in event.rows:
+                        self.write_buf.put(
+                                (pos, reindex_torrent(row['after_values'], index_name)),
+                                block=True)
+                elif type(event) is DeleteRowsEvent:
+                    # ok, bye
+                    for row in event.rows:
+                        self.write_buf.put((pos, delet_this(row, index_name)), block=True)
+                else:
+                    raise Exception(f"unknown event {type(event)}")
+            elif event.table == "nyaa_statistics" or event.table == "sukebei_statistics":
+                if event.table == "nyaa_statistics":
+                    index_name = "nyaa"
+                else:
+                    index_name = "sukebei"
+                if type(event) is WriteRowsEvent:
+                    for row in event.rows:
+                        self.write_buf.put(
+                                (pos, reindex_stats(row['values'], index_name)),
+                                block=True)
+                elif type(event) is UpdateRowsEvent:
+                    for row in event.rows:
+                        self.write_buf.put(
+                                (pos, reindex_stats(row['after_values'], index_name)),
+                                block=True)
+                elif type(event) is DeleteRowsEvent:
+                    # uh ok. Assume that the torrent row will get deleted later,
+                    # which will clean up the entire es "torrent" document
+                    pass
+                else:
+                    raise Exception(f"unknown event {type(event)}")
+            else:
+                raise Exception(f"unknown table {s.table}")
+
+class EsPoster(Thread):
+    # read_buf is the queue of stuff to bulk post
+    def __init__(self, read_buf, chunk_size=1000, flush_interval=5):
+        Thread.__init__(self)
+        self.read_buf = read_buf
+        self.chunk_size = chunk_size
+        self.flush_interval = flush_interval
+
+    def run(self):
+        es = Elasticsearch(timeout=30)
+
+        last_save = time.time()
+        since_last = 0
+
+        while True:
+            actions = []
+            while len(actions) < self.chunk_size:
+                try:
+                    # grab next event from queue with metadata that creepily
+                    # updates, surviving outside the scope of the loop
+                    ((log_file, log_pos, timestamp), action) = \
+                        self.read_buf.get(block=True, timeout=self.flush_interval)
+                    actions.append(action)
+                except Empty:
+                    # nothing new for the whole interval
+                    break
+
+            if not actions:
+                # nothing to post
+                log.debug("no changes...")
+                continue
+
+            # XXX "time" to get histogram of no events per bulk
+            stats.timing('actions_per_bulk', len(actions))
+
+            try:
+                with stats.timer('post_bulk'):
+                    bulk(es, actions, chunk_size=self.chunk_size)
+            except BulkIndexError as bie:
+                # in certain cases where we're really out of sync, we update a
+                # stat when the torrent doc is, causing a "document missing"
+                # error from es, with no way to suppress that server-side.
+                # Thus ignore that type of error if it's the only problem
+                for e in bie.errors:
+                    try:
+                        if e['update']['error']['type'] != 'document_missing_exception':
+                            raise bie
+                    except KeyError:
+                        raise bie
+
+            # how far we're behind, wall clock
+            stats.gauge('process_latency', int((time.time() - timestamp) * 1000))
+
+            since_last += len(actions)
+            if since_last >= 10000 or (time.time() - last_save) > 10:
+                log.info(f"saving position {log_file}/{log_pos}, {time.time() - timestamp:,.3f} seconds behind")
+                with stats.timer('save_pos'):
+                    with open(SAVE_LOC, 'w') as f:
+                        json.dump({"log_file": log_file, "log_pos": log_pos}, f)
+                last_save = time.time()
+                since_last = 0
+
+# in-memory queue between binlog and es. The bigger it is, the more events we
+# can parse in memory while waiting for es to catch up, at the expense of heap.
+buf = Queue(maxsize=INTERNAL_QUEUE_DEPTH)
+
+reader = BinlogReader(buf)
+reader.daemon = True
+writer = EsPoster(buf, chunk_size=ES_CHUNK_SIZE, flush_interval=FLUSH_INTERVAL)
+writer.daemon = True
+reader.start()
+writer.start()
+
+# on the main thread, poll the queue size for monitoring
+while True:
+    stats.gauge('queue_depth', buf.qsize())
+    time.sleep(1)
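The BulkIndexError handling in EsPoster.run above boils down to one idea: stat updates can target a torrent document that doesn't exist yet when the sync is far behind, es reports that as document_missing_exception with no server-side way to suppress it, so only that error is swallowed and everything else is re-raised. A standalone sketch of the same filtering (hypothetical helper name, not the committed code), using the same elasticsearch-py bulk helper:

from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk, BulkIndexError

def post_batch(es, actions, chunk_size=10000):
    # hypothetical helper; mirrors the error filtering done in EsPoster.run
    try:
        bulk(es, actions, chunk_size=chunk_size)
    except BulkIndexError as bie:
        for err in bie.errors:
            # non-update errors have no 'update' key and fall through to the raise
            error_type = err.get('update', {}).get('error', {}).get('type')
            if error_type != 'document_missing_exception':
                raise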