# fnb-roc-website/builder.py

import lzma
import json
import os
import re
from pytz import timezone
from datetime import datetime
import bisect
from jinja2 import Environment, PackageLoader, select_autoescape

# Folder the scraper fills with one sub-folder per downloaded post.
scraper_path = "scraper/"
# Root folder that receives the generated pages.
output_path = "./static"

# Templates are resolved through the "builder" package; autoescaping is
# configured by select_autoescape() called with its default arguments.
env = Environment(loader=PackageLoader("builder"), autoescape=select_autoescape())
blog_template = env.get_template("blog_template.html")
post_template = env.get_template("post.html")

# Create the output folder. An already-existing directory is fine; anything
# else occupying that path is a fatal configuration error.
try:
    os.makedirs(output_path)
except FileExistsError as exists_error:
    if os.path.isdir(output_path):
        pass
    else:
        raise RuntimeError(f'output path {output_path} exists and is not a directory') from exists_error

def _picture_index(path: str) -> int:
    """Return the position of a picture within its post, parsed from its name.

    The index is the run of decimal digits after the last underscore of the
    file name (extension and directories stripped), e.g. ``..._2.jpg`` -> 2.
    Names without such a numeric suffix yield 0.
    NOTE(review): presumably the scraper only appends ``_<n>`` for posts with
    more than one picture -- confirm against the scraper's naming scheme.
    """
    stem = os.path.splitext(os.path.basename(path))[0]
    _, separator, suffix = stem.rpartition('_')
    if separator and suffix.isdecimal():
        return int(suffix)
    return 0


# Rendered post pages go into a "posts" sub-folder of the output directory.
# Build the path with os.path.join (plain concatenation dropped the separator)
# and make sure the folder exists before anything is written into it.
posts_output_path = os.path.join(output_path, 'posts')
os.makedirs(posts_output_path, exist_ok=True)

# All discovered posts, kept sorted from newest to oldest.
posts = []

for folder in os.listdir(scraper_path):
    folder_path = os.path.join(scraper_path, folder)
    if not os.path.isdir(folder_path):
        continue

    # One post per scraper folder; fields are filled in as its files are seen.
    # The dict is shared with `posts`, so data found after insertion still
    # ends up in the blog listing.
    post = {'pictures': []}
    for file in os.listdir(folder_path):
        filepath = os.path.join(folder_path, file)

        if file.endswith('.xz'):
            with lzma.open(filepath) as metadata_file:
                data = json.loads(metadata_file.read().decode('utf-8'))
            # checks that we're looking at a downloaded post, not the profile picture or something
            if 'node' in data and 'date' in data['node']:
                timestamp = data['node']['date']
                post['timestamp'] = timestamp  # useful for sorting
                post['datetime'] = datetime.fromtimestamp(
                    timestamp, timezone("America/New_York")
                ).strftime("%b %e, '%y")
                # use negative timestamp because that's the easiest way to sort from high to low
                bisect.insort_right(posts, post, key=lambda t: -t['timestamp'])
        elif file.endswith(('.webp', '.jpg')):
            # Keep the pictures ordered by their index within the post. The key
            # is computed from each list element, not from the current file.
            bisect.insort_left(post['pictures'], filepath, key=_picture_index)
        elif file.endswith('.txt'):
            # The metadata above is decoded as UTF-8; the caption is assumed
            # to use the same encoding.
            with open(filepath, 'r', encoding='utf-8') as caption_file:
                post['caption'] = caption_file.read()

    # Only folders that contained post metadata get their own page.
    if 'timestamp' in post:
        post_page_path = os.path.join(posts_output_path, str(post['timestamp']) + '.html')
        with open(post_page_path, 'w', encoding='utf-8') as output_file:
            output_file.write(post_template.render(post=post))

homepage_template = env.get_template("index.html")

# Build the destinations with os.path.join: concatenating onto output_path
# dropped the path separator and wrote the pages next to the output
# directory instead of inside it.
# The pages are written as UTF-8 explicitly so non-ASCII text does not depend
# on the platform's default encoding.
with open(os.path.join(output_path, 'index.html'), 'w', encoding='utf-8') as output_file:
    output_file.write(homepage_template.render())

# The blog page lists every collected post.
with open(os.path.join(output_path, 'blog.html'), 'w', encoding='utf-8') as output_file:
    output_file.write(blog_template.render(posts=posts))