lj_albums.py
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.14"
# dependencies = [
# "bs4>=0.0.2",
# "selenium>=4.44.0",
# ]
# ///
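"""Scrape LiveJournal photo albums into CSV files.

Reads album urls from list.txt (one per line), renders each album page in
headless chrome, and writes a <user>_<albumid>.csv of original-size image
sources and their descriptions.
"""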
import csv
import urllib.parse

from bs4 import BeautifulSoup
from selenium import webdriver
# read newline-separated list of album urls
with open("list.txt", "r") as file:
    for line in file:
        url = line.strip()
        if not url:
            continue  # skip blank lines

        # break down the url to get username & album id for csv file naming,
        # e.g. https://<user>.livejournal.com/photo/album/<id>
        parsed_url = urllib.parse.urlparse(url)
        album_user = parsed_url.hostname.split(".")[0]
        album_id = parsed_url.path.split("/")[3]  # get id from album url
        csv_file = album_user + "_" + album_id + ".csv"

        # selenium scraping stuff: the album grid is rendered client-side,
        # so load the page in headless chrome and hand the html to bs4
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        page_body = driver.page_source
        driver.quit()

        # start parsing
        soup = BeautifulSoup(page_body, "html.parser")

        # get the main containers for the photo album
        containers = soup.find_all("a", class_=lambda c: c and c.startswith("Container"))

        # create & open csv file
        with open(csv_file, mode="w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["source", "description"])  # headers
            for container in containers:
                # find all img wrappers/divs
                img_divs = container.find_all("div", class_=lambda c: c and c.startswith("ImgWrapper"))
                for div in img_divs:
                    for img in div.find_all("img"):
                        # ask for the original-size file instead of the 600px variant
                        src = img["src"].replace("600", "original")
                        # each photo's caption is the next Description <p> after its img
                        desc = img.find_next("p", class_=lambda c: c and c.startswith("Description"))
                        # throw it all into the csv file
                        writer.writerow([src, desc.get_text(strip=True) if desc else ""])
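
# Usage sketch (assumes uv plus chrome/chromedriver on PATH, and a list.txt
# next to this script; the url below is a made-up example):
#
#   echo "https://someuser.livejournal.com/photo/album/12345" > list.txt
#   ./lj_albums.py
#
# each album then lands in its own csv, e.g. someuser_12345.csv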