
import urllib3 #type: ignore
# Module-level urllib3 pool manager; process_page issues its GET requests
# through this single instance.
requests_pool = urllib3.PoolManager()
import os

def process_page(link):
    """Download an article, summarise it with ``pithy`` and log the results.

    The page at *link* is fetched through the module-level ``requests_pool``
    and its main text is extracted with trafilatura and written to
    ``texts/<article_id>``.  ``pithy`` is then run on that file once per
    density value, and the sentences it prints are appended to
    ``nytimes_articles.md`` under a heading for *link*.  The temporary text
    file is removed before returning, even when a step fails.

    Args:
        link: URL of the article page.  Its last non-empty path segment is
            used as the name of the temporary text file.

    Returns:
        None.  Pages from which no text could be extracted are skipped with
        a message on stdout and nothing is appended to the Markdown file.
    """
    # Local import: this function is the only place that spawns a process.
    import subprocess

    page_html = requests_pool.request('GET', link).data
    article = trafilatura.extract(
        page_html,
        favor_precision=True,
        include_comments=False,
        include_formatting=False,
        include_images=False,
        include_tables=False
    )
    if not article:
        # extract() gives back None when it finds no main text; writing that
        # to a file would raise TypeError, so skip the page explicitly.
        print(f"skipping {link}: no article text extracted")
        return

    # example url: https://www.bbc.com/future/article/20211122-could-mrna-make-us-superhuman
    # rstrip so a trailing slash does not produce an empty file name.
    article_id = link.rstrip('/').split('/')[-1]
    os.makedirs('texts', exist_ok=True)
    article_path = 'texts/' + article_id

    densities = [0.8, 1.2, 1.6, 2, 3, 4, 5]
    # Keyed by sentence, so a sentence produced at several densities is
    # listed once, with the last density that produced it.
    results: dict[str, float] = {}
    try:
        with open(article_path, "w") as f:
            f.write(article)
        for d in densities:
            # Argument list instead of a shell string: the file name comes
            # from a crawled URL and must never be interpreted by a shell.
            completed = subprocess.run(
                [
                    "pithy", "-f", article_path,
                    "--density", str(d),
                    "--nobar", "--no_context",
                    "--sentences", "1",
                ],
                stdout=subprocess.PIPE,
                text=True,
                check=False,
            )
            # Drop the first space-separated token of the output and keep
            # the remainder, with newlines flattened to spaces.
            _, _, summary = completed.stdout.replace('\n', ' ').partition(' ')
            if not summary.strip():
                # pithy printed nothing usable for this density.
                continue
            results[summary] = d
    finally:
        try:
            os.remove(article_path)
        except FileNotFoundError:
            # The file was never created (e.g. open() itself failed).
            pass

    # Build the whole section first and emit it with a single write, which
    # lowers the chance of sections interleaving when several threads append
    # to the same file.
    section = [f"## {link}\n"]
    section.extend(f"* **Density {v}** -> {k}\n" for k, v in results.items())
    with open("nytimes_articles.md", "a") as f:
        f.write(''.join(section))

from trafilatura import spider #type: ignore
import trafilatura

def get_and_filter_pages():
    """Crawl the NYT business section and return the ``.html`` links found.

    Runs trafilatura's focused crawler starting at the business section
    page, keeps the links from the second element of its result whose URL
    contains ``.html``, and appends them to ``nytimes_urls.txt`` one per
    line.

    Returns:
        The filtered list of URLs (possibly empty).
    """
    # NOTE(review): trafilatura documents ``lang`` as a language hint and
    # its examples use codes such as "en"; confirm "English" is accepted.
    crawl_result = spider.focused_crawler(
        "https://www.nytimes.com/section/business",
        max_seen_urls=150,
        max_known_urls=150,
        lang="English",
        rules=None,
    )
    # Index 1 of the crawler result holds the links to filter
    # (presumably the "known links" collection - verify against the docs).
    known_links = list(crawl_result)[1]
    pages = [p for p in known_links if ".html" in p]
    with open("nytimes_urls.txt", "a") as f:
        # The file is opened in append mode, so every URL needs its own
        # newline; otherwise the last URL of one run and the first URL of
        # the next end up on the same line.
        f.writelines(p + '\n' for p in pages)
    return pages
import time
import threading
def scrape():
    """Collect article URLs and hand each one to its own worker thread.

    For every URL returned by ``get_and_filter_pages`` the URL is printed,
    the loop pauses for 0.15 seconds, and a new thread running
    ``process_page`` on that URL is started.  The threads are not joined
    here.
    """
    for url in get_and_filter_pages():
        print(url)
        # Short pause before each launch so thread start-ups are staggered.
        time.sleep(0.15)
        worker = threading.Thread(target=process_page, args=(url,))
        worker.start()

# Kick off the crawl. NOTE(review): this call sits at module top level, so it
# also runs when the file is imported; consider an
# `if __name__ == "__main__":` guard if import-time execution is unintended.
scrape()