from gutenberg.acquire import load_etext #type: ignore
from gutenberg.cleanup import strip_headers #type: ignore
import regex as re #type: ignore
#This will give us the text:
#text = strip_headers(load_etext(2701)).strip()

#And this is what each entry in the input listing (gutenberg_philosophy.txt) looks like after we split the HTML on '<li class="booklink">':
#<a class="link" href="/ebooks/4583" accesskey="6">
#<span class="cell leftcell with-cover">
#<img class="cover-thumb" src="/cache/epub/4583/pg4583.cover.small.jpg" alt="">
#</span>
#<span class="cell content">
#<span class="title">Dialogues Concerning Natural Religion</span>
#<span class="subtitle">David Hume</span>
#<span class="extra">474 downloads</span>
#</span>

#Our goal is to extract the id in the href, save the text to a file, run pithy on it at several densities, and append the resulting summaries to gutenberg_philosophy.md.

def get_id(href: str) -> int:
    """Return the numeric ebook id embedded in a Gutenberg link path.

    Args:
        href: A link containing ``/ebooks/<digits>``, e.g. ``"/ebooks/4583"``.

    Returns:
        The digits that follow ``/ebooks/``, converted to ``int``.

    Raises:
        ValueError: If ``href`` contains no ``/ebooks/<digits>`` segment.
    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    match = re.search(r"/ebooks/(\d+)", href)
    if match is None:
        raise ValueError(f"no /ebooks/<id> segment in href: {href!r}")
    return int(match.group(1))

def get_text(id: int):
    """Fetch the etext with the given id and return its cleaned-up text.

    The raw etext comes from ``load_etext``; ``strip_headers`` is then applied
    and surrounding whitespace is trimmed from the result.
    """
    raw_etext = load_etext(id)
    without_headers = strip_headers(raw_etext)
    return without_headers.strip()

import threading
import os

def process_line(entry: str):
    """Summarize one book entry and append the result to the markdown report.

    The entry's link and title are parsed, the book text is fetched with
    ``get_text`` and written to a scratch file under ``texts/``, and ``pithy``
    is run on that file once per density. A section with the title and one
    bullet per density is then appended to ``gutenberg_philosophy.md``.

    Args:
        entry: HTML fragment for a single book; it must contain an
            ``<a class="link" href="...">`` tag and a
            ``<span class="title">...</span>`` tag.

    Raises:
        ValueError: If the link or the title cannot be found in ``entry``.
        RuntimeError: If ``pithy`` prints nothing usable for some density.
    """
    # Local import: only this function launches the external summarizer.
    import subprocess

    # Parse everything needed from the entry *before* downloading, so that a
    # malformed entry fails fast instead of after a wasted download.
    href_match = re.search(r'<a class="link" href="(.*?)"', entry)
    title_match = re.search(r'<span class="title">(.*?)</span>', entry)
    if href_match is None or title_match is None:
        raise ValueError("entry has no book link or no title span")
    book_id = get_id(href_match.group(1))
    title = title_match.group(1)

    text = get_text(book_id)

    os.makedirs("texts", exist_ok=True)
    text_path = os.path.join("texts", str(book_id))
    densities = [0.8, 1.2, 1.6, 2, 3, 4, 5]
    # Kept as (density, summary) pairs: keying a dict by summary text would
    # silently merge densities that happen to produce the same sentence.
    summaries: list[tuple[float, str]] = []
    try:
        # Explicit UTF-8 so non-ASCII book text does not depend on the locale.
        # NOTE(review): assumes pithy reads and prints UTF-8 -- confirm.
        with open(text_path, "w", encoding="utf-8") as f:
            f.write(text)
        for density in densities:
            # Argument list (no shell) and captured stdout replace the old
            # shell redirect into a temporary ".pithy" file.
            completed = subprocess.run(
                [
                    "pithy", "-f", text_path,
                    "--density", str(density),
                    "--nobar", "--no_context",
                    "--sentences", "1",
                ],
                stdout=subprocess.PIPE,
                text=True,
                encoding="utf-8",
            )
            flattened = completed.stdout.replace("\n", " ")
            # Everything up to the first space is discarded -- presumably a
            # prefix pithy prints before the sentence; TODO confirm.
            _, separator, summary = flattened.partition(" ")
            if not separator:
                raise RuntimeError(
                    f"pithy produced no summary for book {book_id} at "
                    f"density {density} (exit code {completed.returncode})"
                )
            summaries.append((density, summary))
    finally:
        # The text file is only scratch input for pithy; remove it even when
        # something above failed.
        if os.path.exists(text_path):
            os.remove(text_path)

    # Build the whole section first and write it with a single call, so that
    # sections appended by concurrently running calls are less likely to
    # interleave.
    section = [f"## {title}\n"]
    section.extend(
        f"* **Density {density}** -> {summary}\n"
        for density, summary in summaries
    )
    with open("gutenberg_philosophy.md", "a", encoding="utf-8") as f:
        f.write("".join(section))
import time
def load_file_and_save_text(filename: str):
    """Start a ``process_line`` worker thread for every book entry in a file.

    The file is split on ``<li class="booklink">``. Each chunk that contains a
    book link is handed to ``process_line`` in its own thread, with a 2.5
    second pause between thread starts; the chunk is printed after the pause.

    Pressing Ctrl-C stops scheduling further entries. Threads that were
    already started are not joined or cancelled here.

    Args:
        filename: Path of the saved listing page to read.
    """
    with open(filename, "r") as f:
        chunks = f.read().split('<li class="booklink">')
    for chunk in chunks:
        # The text before the first delimiter (and any other chunk without a
        # book link) cannot be processed: process_line's link pattern starts
        # with this exact text. Skip it rather than spawn a thread that fails
        # immediately and then wait 2.5 seconds for nothing.
        if '<a class="link" href="' not in chunk:
            continue
        try:
            threading.Thread(target=process_line, args=(chunk,)).start()
            # The pause is inside the try block: this is where the loop spends
            # nearly all of its time, so it is where Ctrl-C actually arrives.
            time.sleep(2.5)
        except KeyboardInterrupt:
            break
        print(chunk)

# Script entry point: executed whenever this module is run or imported.
load_file_and_save_text(filename="gutenberg_philosophy.txt")