#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.14"
# dependencies = [
#     "beautifulsoup4>=4.12.0",
#     "selenium>=4.44.0",
# ]
# ///
import csv
import urllib.parse

from bs4 import BeautifulSoup
from selenium import webdriver

# spin up a single headless Chrome instance and reuse it for every album
options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

# read newline-separated list of album URLs
with open("list.txt", "r") as file:
    for line in file:
        url = line.strip()
        if not url:
            continue

        # break the URL down to get the username and album id;
        # these are used to name the per-album CSV file
        parsed_url = urllib.parse.urlparse(url)
        album_user = parsed_url.hostname.split(".")[0]
        album_id = parsed_url.path.split("/")[3]  # id segment of the album path
        csv_file = album_user + "_" + album_id + ".csv"

        # render the page with Selenium so JS-loaded content is present
        driver.get(url)
        page_body = driver.page_source

        # start parsing the rendered HTML
        soup = BeautifulSoup(page_body, "html.parser")

        # each photo card in the album is an <a> whose class starts with "Container"
        containers = soup.find_all(
            "a", class_=lambda c: c and c.startswith("Container")
        )

        # create the CSV file and write the header row
        with open(csv_file, mode="w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["source", "description"])

            for body in containers:
                # each image sits inside a div whose class starts with "ImgWrapper"
                img_divs = body.find_all(
                    "div", class_=lambda c: c and c.startswith("ImgWrapper")
                )
                for div in img_divs:
                    for img in div.find_all("img"):
                        # pair each image with the first description paragraph
                        # that follows it in document order
                        desc = img.find_next(
                            "p", class_=lambda c: c and c.startswith("Description")
                        )
                        # swap the 600px thumbnail size for the original asset
                        src = img.get("src", "").replace("600", "original")
                        writer.writerow([src, desc.get_text() if desc else ""])

driver.quit()
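
# --- usage sketch ---
# A minimal example of how this script is run, assuming list.txt sits next to
# the script and holds one album URL per line, with paths shaped like
# /a/album/<id> so that the fourth path segment is the album id (the exact URL
# shape is an assumption; the host and id below are hypothetical):
#
#   $ cat list.txt
#   https://someuser.example.com/a/album/abc123
#
#   $ uv run scrape.py
#
# Each URL then produces a CSV named <subdomain>_<album id>.csv, e.g.
# someuser_abc123.csv, with "source" and "description" columns.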