lj_albums.py
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.14"
# dependencies = [
# "bs4>=0.0.2",
# "selenium>=4.44.0",
# ]
# ///
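"""Scrape LiveJournal photo albums into CSV files.

Reads album urls from list.txt (one per line), renders each album page in
headless chrome, and writes a <user>_<albumid>.csv of original-size image
sources and their descriptions.
"""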
import csv
import urllib.parse

from bs4 import BeautifulSoup
from selenium import webdriver
# read newline-separated list of album urls
with open("list.txt", "r") as file:
    for line in file:
        url = line.strip()
        if not url:
            continue  # skip blank lines

        # break down the url to get username & album id for csv file naming,
        # e.g. https://<user>.livejournal.com/photo/album/<id>
        parsed_url = urllib.parse.urlparse(url)
        album_user = parsed_url.hostname.split(".")[0]
        album_id = parsed_url.path.split("/")[3]  # get id from album url
        csv_file = album_user + "_" + album_id + ".csv"

        # selenium scraping stuff: the album grid is rendered client-side,
        # so load the page in headless chrome and hand the html to bs4
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        driver = webdriver.Chrome(options=options)
        driver.get(url)
        page_body = driver.page_source
        driver.quit()

        # start parsing
        soup = BeautifulSoup(page_body, "html.parser")

        # get the main containers for the photo album
        containers = soup.find_all("a", class_=lambda c: c and c.startswith("Container"))

        # create & open csv file
        with open(csv_file, mode="w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["source", "description"])  # headers
            for container in containers:
                # find all img wrappers/divs
                img_divs = container.find_all("div", class_=lambda c: c and c.startswith("ImgWrapper"))
                for div in img_divs:
                    for img in div.find_all("img"):
                        # ask for the original-size file instead of the 600px variant
                        src = img["src"].replace("600", "original")
                        # each photo's caption is the next Description <p> after its img
                        desc = img.find_next("p", class_=lambda c: c and c.startswith("Description"))
                        # throw it all into the csv file
                        writer.writerow([src, desc.get_text(strip=True) if desc else ""])
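
# Usage sketch (assumes uv plus chrome/chromedriver on PATH, and a list.txt
# next to this script; the url below is a made-up example):
#
#   echo "https://someuser.livejournal.com/photo/album/12345" > list.txt
#   ./lj_albums.py
#
# each album then lands in its own csv, e.g. someuser_12345.csv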