
sync_es: instrument with statsd, improve logging

also fixed the save-position loop (it never reset last_save, so after the
first 30 seconds it saved on every event) and spaced it out to 10k events
instead of 100.

Notably, the number of rows per event caps out at around 5 by default
because of MySQL's default --binlog-row-event-max-size=8192; that's
how many (torrent) rows fit into a single event.
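
For reference, raising that cap is just a MySQL config change; a sketch
(the 64 KiB value is an arbitrary illustration, not something this commit
touches):

[mysqld]
# default is 8192 bytes; bigger events pack more rows each
binlog-row-event-max-size = 65536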

We could increase that, but instead I think it's finally time to
multithread this thing; both the binlog read and the ES POST should
release the GIL, so it'll actually work.
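
A minimal sketch of what that split could look like, assuming the script's
existing stream (the BinLogStreamReader) and es client, plus a hypothetical
to_actions() helper that turns one binlog event into ES bulk actions; none
of this is in the commit:

import queue
import threading

from elasticsearch.helpers import bulk

q = queue.Queue(maxsize=100)  # bounded, so the reader can't run far ahead

def read_binlog():
    # network read from mysql; blocks with the GIL released while waiting
    for event in stream:
        q.put(event)
    q.put(None)  # sentinel: stream ended

def post_es():
    # the HTTP POST to ES likewise releases the GIL, so the two overlap
    while True:
        event = q.get()
        if event is None:
            break
        bulk(es, to_actions(event))

threads = [threading.Thread(target=read_binlog), threading.Thread(target=post_es)]
for t in threads:
    t.start()
for t in threads:
    t.join()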
queue 2017-05-20 23:19:35 -06:00
parent 752a5a1f95
commit 6a4ad827c1

sync_es.py

@@ -32,12 +32,16 @@ import sys
 import json
 import time
 import logging
+from statsd import StatsClient
-logging.basicConfig()
+logging.basicConfig(format='%(asctime)s %(levelname)s %(name)s - %(message)s')
 log = logging.getLogger('sync_es')
 log.setLevel(logging.INFO)
+# goes to netdata or other statsd listener
+stats = StatsClient('localhost', 8125, prefix="sync_es")
 #logging.getLogger('elasticsearch').setLevel(logging.DEBUG)
 # in prod want in /var/lib somewhere probably
@@ -120,8 +124,8 @@ def reindex_torrent(t, index_name):
 def reindex_stats(s, index_name):
     # update the torrent at torrent_id, assumed to exist;
     # this will always be the case if you're reading the binlog
-    # in order; the foreign key constraint on torrrent_id prevents
-    # the stats row rom existing if the torrent isn't around.
+    # in order; the foreign key constraint on torrent_id prevents
+    # the stats row from existing if the torrent isn't around.
     return {
         '_op_type': 'update',
         '_index': index_name,
@@ -141,23 +145,37 @@ def delet_this(row, index_name):
         '_type': 'torrent',
         '_id': str(row['values']['id'])}
-n = 0
 last_save = time.time()
+since_last = 0
+# get POST latency
+@stats.timer('post_bulk')
+def timed_bulk(events, **kwargs):
+    bulk(es, events, **kwargs)
+log.info(f"reading binlog from {stream.log_file}/{stream.log_pos}")
 for event in stream:
+    with stats.pipeline() as s:
+        s.incr('total_events')
+        s.incr(f"event.{event.table}.{type(event).__name__}")
+        s.incr('total_rows', len(event.rows))
+        s.incr(f"rows.{event.table}.{type(event).__name__}", len(event.rows))
+        # XXX not a "timer", but we get a histogram out of it
+        s.timing(f"rows_per_event.{event.table}.{type(event).__name__}", len(event.rows))
     if event.table == "nyaa_torrents" or event.table == "sukebei_torrents":
         if event.table == "nyaa_torrents":
             index_name = "nyaa"
         else:
             index_name = "sukebei"
         if type(event) is WriteRowsEvent:
-            bulk(es, (reindex_torrent(row['values'], index_name) for row in event.rows))
+            timed_bulk(reindex_torrent(row['values'], index_name) for row in event.rows)
         elif type(event) is UpdateRowsEvent:
             # UpdateRowsEvent includes the old values too, but we don't care
-            bulk(es, (reindex_torrent(row['after_values'], index_name) for row in event.rows))
+            timed_bulk(reindex_torrent(row['after_values'], index_name) for row in event.rows)
         elif type(event) is DeleteRowsEvent:
             # ok, bye
-            bulk(es, (delet_this(row, index_name) for row in event.rows))
+            timed_bulk(delet_this(row, index_name) for row in event.rows)
         else:
             raise Exception(f"unknown event {type(event)}")
     elif event.table == "nyaa_statistics" or event.table == "sukebei_statistics":
@@ -166,11 +184,19 @@ for event in stream:
         else:
             index_name = "sukebei"
         if type(event) is WriteRowsEvent:
-            bulk(es, (reindex_stats(row['values'], index_name) for row in event.rows))
+            timed_bulk(
+                (reindex_stats(row['values'], index_name) for row in event.rows),
+                # statistics updates are pretty low priority, plus in certain
+                # cases where we're really out of sync, the torrent could be missing,
+                # causing a "document missing" error from es, with no way to suppress
+                # that server-side. Thus ignore all errors, oh well.
+                raise_on_error=False, stats_only=True)
         elif type(event) is UpdateRowsEvent:
-            bulk(es, (reindex_stats(row['after_values'], index_name) for row in event.rows))
+            timed_bulk(
+                (reindex_stats(row['after_values'], index_name) for row in event.rows),
+                raise_on_error=False, stats_only=True)
         elif type(event) is DeleteRowsEvent:
-            # uh ok. assume that the torrent row will get deleted later,
+            # uh ok. Assume that the torrent row will get deleted later,
             # which will clean up the entire es "torrent" document
             pass
         else:
@@ -178,8 +204,14 @@ for event in stream:
     else:
         raise Exception(f"unknown table {s.table}")
-    n += 1
-    if n % 100 == 0 or time.time() - last_save > 30:
-        log.info(f"saving position {stream.log_file}/{stream.log_pos}")
-        with open(SAVE_LOC, 'w') as f:
-            json.dump({"log_file": stream.log_file, "log_pos": stream.log_pos}, f)
+    # how far we're behind, wall clock
+    stats.gauge('process_latency', int((time.time() - event.timestamp) * 1000))
+    since_last += 1
+    if since_last >= 10000 or (time.time() - last_save) > 10:
+        log.info(f"saving position {stream.log_file}/{stream.log_pos}, {time.time() - event.timestamp:,.3} seconds behind")
+        with stats.timer('save_pos'):
+            with open(SAVE_LOC, 'w') as f:
+                json.dump({"log_file": stream.log_file, "log_pos": stream.log_pos}, f)
+        last_save = time.time()
+        since_last = 0
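
For context, the statsd calls in this diff come from the statsd Python
package; a self-contained sketch of the three primitives used (the
localhost:8125 target and sync_es prefix match the diff, the metric
values are made up):

import time
from statsd import StatsClient

stats = StatsClient('localhost', 8125, prefix='sync_es')

@stats.timer('post_bulk')       # decorator: times each call, reports ms
def do_work():
    time.sleep(0.01)

do_work()

with stats.pipeline() as s:     # queues metrics, sends one UDP packet on exit
    s.incr('total_events')                  # counter
    s.timing('rows_per_event.demo', 5)      # timing (ab)used as a histogram

stats.gauge('process_latency', 42)          # point-in-time value, ms here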