Last active 2 hours ago

lj_albums.py Raw
1#!/usr/bin/env -S uv run --script
2# /// script
3# requires-python = ">=3.14"
4# dependencies = [
5# "bs4>=0.0.2",
6# "selenium>=4.44.0",
7# ]
8# ///
9
10import csv
11from bs4 import BeautifulSoup
12from selenium import webdriver
13from selenium.webdriver import Chrome
14from selenium.webdriver.chrome.options import Options
15import urllib.parse
16
# Scrape photo albums listed in list.txt (one album URL per line) and write
# one CSV per album with the columns "source" and "description".


def clean_album_urls(lines) -> list[str]:
    """Return the non-blank entries of *lines* with surrounding whitespace removed.

    Lines read from a file keep their trailing newline; stripping it keeps the
    newline out of the URL handed to the browser, and dropping blank lines
    avoids failing on an empty line in the list file.
    """
    return [stripped for stripped in (line.strip() for line in lines) if stripped]


def album_csv_name(url: str) -> str:
    """Build the CSV file name ``<user>_<album id>.csv`` for an album URL.

    The user is the first label of the host name and the album id is the
    third path segment (index 3 after splitting the path on "/").

    Raises:
        ValueError: if the URL has no host name or fewer than three path
            segments, so no file name can be derived from it.
    """
    parsed_url = urllib.parse.urlparse(url)
    if not parsed_url.hostname:
        raise ValueError(f"album URL has no host name: {url!r}")
    segments = parsed_url.path.split("/")
    if len(segments) < 4 or not segments[3]:
        raise ValueError(f"album URL has no album id in its path: {url!r}")
    album_user = parsed_url.hostname.split(".")[0]
    album_id = segments[3]
    return f"{album_user}_{album_id}.csv"


def class_has_prefix(prefix: str):
    """Return a BeautifulSoup attribute filter matching classes starting with *prefix*."""

    def matches(css_class):
        # BeautifulSoup passes None for tags without a class attribute.
        return bool(css_class) and css_class.startswith(prefix)

    return matches


def extract_photo_rows(soup) -> list[tuple[str, str]]:
    """Collect ``(source, description)`` rows from a parsed album page.

    Every <img> inside an "ImgWrapper*" div inside a "Container*" link is
    paired with the first "Description*" paragraph that follows it in the
    document. Images without a ``src`` attribute are skipped; images with no
    following description get an empty description.
    """
    rows = []
    for container in soup.find_all("a", {"class": class_has_prefix("Container")}):
        for wrapper in container.find_all("div", {"class": class_has_prefix("ImgWrapper")}):
            for img in wrapper.find_all("img"):
                src = img.get("src")
                if not src:
                    continue
                # NOTE(review): the nearest following description is assumed
                # to belong to this image -- confirm against the page markup.
                description = img.find_next("p", {"class": class_has_prefix("Description")})
                text = description.text if description is not None else ""
                # NOTE(review): this rewrites every "600" in the URL, not only
                # a size marker -- confirm the URL scheme before relying on it.
                rows.append((src.replace("600", "original"), text))
    return rows


def main():
    """Read list.txt and write one CSV of photo sources/descriptions per album."""
    # read newline separated list of urls
    with open("list.txt", "r") as file:
        urls = clean_album_urls(file)

    # Derive every output name first so a malformed URL fails before the
    # browser is started.
    jobs = [(url, album_csv_name(url)) for url in urls]
    if not jobs:
        return

    # selenium scraping stuff: one headless browser is shared by all albums
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    try:
        for url, csv_file in jobs:
            driver.get(url)
            soup = BeautifulSoup(driver.page_source, "html.parser")
            rows = extract_photo_rows(soup)

            # newline="" is required by the csv module; UTF-8 keeps non-ASCII
            # descriptions writable regardless of the system locale.
            with open(csv_file, mode="w", newline="", encoding="utf-8") as f:
                writer = csv.writer(f)
                writer.writerow(["source", "description"])
                writer.writerows(rows)
    finally:
        # Always shut the browser down, even if a page fails to load or parse.
        driver.quit()


if __name__ == "__main__":
    main()