gpt-4chan-public/src/process_data.py

60 lines
1.6 KiB
Python
Executable File

#!/usr/bin/env python3
import json
import bs4
from loguru import logger
import multiprocessing as mp
import tqdm
from absl import app, flags
import warnings
# Suppress bs4's MarkupResemblesLocatorWarning for warnings raised from the
# bs4 module.
warnings.filterwarnings(
    action="ignore",
    category=bs4.MarkupResemblesLocatorWarning,
    module="bs4",
)

# Default input and output paths; both can be overridden on the command line
# through the --data_fn / --out_fn flags defined below.
DATA_FN = "../tmp/pol_062016-112019_labeled.ndjson"
OUT_FN = "../tmp/kek.txt"

flags.DEFINE_string(name="data_fn", default=DATA_FN, help="data file")
flags.DEFINE_string(name="out_fn", default=OUT_FN, help="output file")

# Parsed flag values; read by main() once absl has processed argv.
FLAGS = flags.FLAGS
# Adapted from: https://gist.github.com/zmwangx/ad0830ba94b1fd98f428
def text_with_newlines(elem):
    """Return the text found under *elem*, with newlines for <br>/<p> nodes.

    Iterates ``elem.descendants`` in order:

    * a node that is a ``str`` instance is appended verbatim (no stripping);
    * any other node whose ``name`` is ``'br'`` or ``'p'`` contributes a
      single newline character;
    * every other node contributes nothing itself.

    Args:
        elem: Object exposing an iterable ``descendants`` attribute whose
            items are either ``str`` instances or objects with a ``name``
            attribute (e.g. a parsed bs4 document).

    Returns:
        The concatenated text as a single ``str``.
    """
    # Collect the pieces and join once instead of growing a string with "+=".
    pieces = []
    for node in elem.descendants:
        if isinstance(node, str):
            pieces.append(node)
        elif node.name in ('br', 'p'):
            pieces.append('\n')
    return ''.join(pieces)
def parse_line(line):
    """Render one line of the input file as plain text.

    The line is decoded as a JSON object. Each entry of its ``'posts'`` list
    (a missing key is treated as an empty list) becomes ``--- <no>`` followed,
    on the next line, by the post's text. The text is taken from the post's
    ``'com'`` field, parsed with BeautifulSoup (lxml parser), flattened by
    ``text_with_newlines`` and stripped; a post without ``'com'`` gets an
    empty text.

    Args:
        line: One line of text holding a JSON object, or a blank line.

    Returns:
        The rendered posts joined by a newline. An empty string when the
        line is blank or contains no posts.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # A blank line carries no data; returning early avoids a JSONDecodeError
    # that would otherwise propagate out of the worker and abort the run.
    if not line.strip():
        return ''

    data = json.loads(line)
    rendered_posts = []
    for post in data.get('posts', []):
        try:
            if 'com' in post:
                soup = bs4.BeautifulSoup(post['com'], 'lxml')
                body = text_with_newlines(soup).strip()
            else:
                body = ''
            rendered_posts.append(f'--- {post["no"]}\n{body}')
        except Exception:
            # Best effort: a post that cannot be rendered (e.g. no "no" key)
            # is logged with its traceback and skipped; the rest are kept.
            logger.exception(f'failed to parse post {post}')
    return '\n'.join(rendered_posts)
def main(_):
    """Convert the file at FLAGS.data_fn into text written to FLAGS.out_fn.

    Every input line is handed to ``parse_line`` in a ``multiprocessing``
    pool; each result is written followed by a ``-----`` separator line.
    ``Pool.imap`` yields results in input order, so the output keeps the
    order of the input file.

    Args:
        _: Unused positional argument supplied by ``absl.app.run``.
    """
    # Open both files explicitly as UTF-8 so decoding the JSON and writing
    # the extracted text do not depend on the platform's locale encoding.
    with open(FLAGS.out_fn, 'w', encoding='utf-8') as out_f:
        with open(FLAGS.data_fn, encoding='utf-8') as in_f:
            with mp.Pool() as pool:
                # tqdm wraps the input iterator, so the progress bar counts
                # lines as they are read and dispatched to the pool.
                for parsed_line in pool.imap(parse_line, tqdm.tqdm(in_f)):
                    out_f.write(parsed_line + '\n-----\n')
# Script entry point: absl parses the command-line flags and then calls main.
if __name__ == "__main__":
    app.run(main)