#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.14"
# dependencies = [
#     "beautifulsoup4>=4.12.0",
#     "selenium>=4.44.0",
# ]
# ///
import csv
import urllib.parse

from bs4 import BeautifulSoup
from selenium import webdriver

# spin up a single headless Chrome instance and reuse it for every album
options = webdriver.ChromeOptions()
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

# read newline-separated list of album URLs
with open("list.txt", "r") as file:
    for line in file:
        url = line.strip()
        if not url:
            continue

        # break the URL down to get the username and album id;
        # these are used to name the per-album CSV file
        parsed_url = urllib.parse.urlparse(url)
        album_user = parsed_url.hostname.split(".")[0]
        album_id = parsed_url.path.split("/")[3]  # id segment of the album path
        csv_file = album_user + "_" + album_id + ".csv"

        # render the page with Selenium so JS-loaded content is present
        driver.get(url)
        page_body = driver.page_source

        # start parsing the rendered HTML
        soup = BeautifulSoup(page_body, "html.parser")

        # each photo card in the album is an <a> whose class starts with "Container"
        containers = soup.find_all(
            "a", class_=lambda c: c and c.startswith("Container")
        )

        # create the CSV file and write the header row
        with open(csv_file, mode="w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["source", "description"])

            for body in containers:
                # each image sits inside a div whose class starts with "ImgWrapper"
                img_divs = body.find_all(
                    "div", class_=lambda c: c and c.startswith("ImgWrapper")
                )
                for div in img_divs:
                    for img in div.find_all("img"):
                        # pair each image with the first description paragraph
                        # that follows it in document order
                        desc = img.find_next(
                            "p", class_=lambda c: c and c.startswith("Description")
                        )
                        # swap the 600px thumbnail size for the original asset
                        src = img.get("src", "").replace("600", "original")
                        writer.writerow([src, desc.get_text() if desc else ""])

driver.quit()
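
# --- usage sketch ---
# A minimal example of how this script is run, assuming list.txt sits next to
# the script and holds one album URL per line, with paths shaped like
# /a/album/<id> so that the fourth path segment is the album id (the exact URL
# shape is an assumption; the host and id below are hypothetical):
#
#   $ cat list.txt
#   https://someuser.example.com/a/album/abc123
#
#   $ uv run scrape.py
#
# Each URL then produces a CSV named <subdomain>_<album id>.csv, e.g.
# someuser_abc123.csv, with "source" and "description" columns.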