Revision of lj_albums.py

kat revised this gist 2 hours ago. Go to revision

1 file changed, 3 insertions, 2 deletions

lj_albums.py

			@@ -60,6 +60,7 @@ with open("list.txt", "r") as file:
60	60		# find each photo description
61	61		descriptions = i.find_all_next("p", {"class" : lambda L: L and L.startswith("Description")})
62	62		# throw it all into a csv file
63		-	for src, txt in zip(img_srcs_all, descriptions):
	63	+	for src, desc in zip(img_srcs_all, descriptions):
64	64		s = src.replace("600", "original")
65		-	writer.writerow([s, txt.text])
	65	+	t = desc.text
	66	+	writer.writerow([s, t])

kat revised this gist 3 hours ago. Go to revision

1 file changed, 50 insertions, 28 deletions

lj_albums.py

			@@ -12,32 +12,54 @@ from bs4 import BeautifulSoup
12	12		from selenium import webdriver
13	13		from selenium.webdriver import Chrome
14	14		from selenium.webdriver.chrome.options import Options
	15	+	import urllib.parse
15	16
16		-	url = "https://dani-ellie.livejournal.com/photo/album/2422/"
17		-	csv_file = "album.csv"
18		-
19		-	options = webdriver.ChromeOptions()
20		-	options.add_argument('--headless')
21		-	driver = webdriver.Chrome(options=options)
22		-
23		-	driver.get(url)
24		-	page_body = driver.page_source
25		-
26		-	soup = BeautifulSoup(page_body, "html.parser")
27		-
28		-	containers = soup.find_all("a", {"class" : lambda L: L and L.startswith("Container")})
29		-
30		-	with open(csv_file, mode="w", newline="") as f:
31		-	writer = csv.writer(f)
32		-	header = ["source", "description"]
33		-	writer.writerow(header)
34		-	for body in containers:
35		-	img_divs = body.find_all("div", {"class" : lambda L: L and L.startswith("ImgWrapper")})
36		-	for t in img_divs:
37		-	img_tags = t.find_all("img")
38		-	img_srcs_all = [img["src"] for img in img_tags]
39		-	for i in img_tags:
40		-	descriptions = i.find_all_next("p", {"class" : lambda L: L and L.startswith("Description")})
41		-	for src, txt in zip(img_srcs_all, descriptions):
42		-	s = src.replace("600", "original")
43		-	writer.writerow([s, txt.text])
	17	+	# read newline separated list of urls
	18	+	with open("list.txt", "r") as file:
	19	+	for l in file:
	20	+	url = l
	21	+
	22	+	# break down url to get username & album id
	23	+	# this is for csv file naming
	24	+	parsed_url = urllib.parse.urlparse(url)
	25	+	url_path = parsed_url.path
	26	+	album_user = parsed_url.hostname.split(".")[0]
	27	+	album_id = url_path.split("/")[3] # get id from album url
	28	+
	29	+	# define csv file name
	30	+	csv_file = album_user + "_" + album_id + ".csv"
	31	+
	32	+	# selenium scraping stuff
	33	+	options = webdriver.ChromeOptions()
	34	+	options.add_argument('--headless')
	35	+	driver = webdriver.Chrome(options=options)
	36	+	driver.get(url)
	37	+	page_body = driver.page_source
	38	+
	39	+	# start parsing
	40	+	soup = BeautifulSoup(page_body, "html.parser")
	41	+
	42	+	# get main container for photo album
	43	+	containers = soup.find_all("a", {"class" : lambda L: L and L.startswith("Container")})
	44	+
	45	+	# create & open csv file
	46	+	with open(csv_file, mode="w", newline="") as f:
	47	+	writer = csv.writer(f)
	48	+	# write headers to csv file
	49	+	header = ["source", "description"]
	50	+	writer.writerow(header)
	51	+	for body in containers:
	52	+	# find all img wrappers/divs
	53	+	img_divs = body.find_all("div", {"class" : lambda L: L and L.startswith("ImgWrapper")})
	54	+	for t in img_divs:
	55	+	# find all img tags
	56	+	img_tags = t.find_all("img")
	57	+	# find all img src attributes
	58	+	img_srcs_all = [img["src"] for img in img_tags]
	59	+	for i in img_tags:
	60	+	# find each photo description
	61	+	descriptions = i.find_all_next("p", {"class" : lambda L: L and L.startswith("Description")})
	62	+	# throw it all into a csv file
	63	+	for src, txt in zip(img_srcs_all, descriptions):
	64	+	s = src.replace("600", "original")
	65	+	writer.writerow([s, txt.text])

kat revised this gist 5 hours ago. Go to revision

1 file changed, 43 insertions

lj_albums.py(file created)

		@@ -0,0 +1,43 @@
1	+	#!/usr/bin/env -S uv run --script
2	+	# /// script
3	+	# requires-python = ">=3.14"
4	+	# dependencies = [
5	+	# "bs4>=0.0.2",
6	+	# "selenium>=4.44.0",
7	+	# ]
8	+	# ///
9	+
10	+	import csv
11	+	from bs4 import BeautifulSoup
12	+	from selenium import webdriver
13	+	from selenium.webdriver import Chrome
14	+	from selenium.webdriver.chrome.options import Options
15	+
# Album page to scrape and the CSV file its rows are written to.
url = "https://dani-ellie.livejournal.com/photo/album/2422/"
csv_file = "album.csv"


def class_prefix(prefix):
    """Return a class filter that accepts class names starting with *prefix*."""
    # The filter may be called with None for tags that have no class.
    return lambda value: bool(value) and value.startswith(prefix)


def original_size_url(src):
    """Return *src* with "600" replaced by "original"."""
    # NOTE(review): this replaces every "600" in the URL, not just one;
    # presumably "600" is the size marker - confirm it cannot also occur
    # elsewhere in the image URL (e.g. inside an id).
    return src.replace("600", "original")


def fetch_page_source(page_url):
    """Load *page_url* in headless Chrome and return the page source."""
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(page_url)
        return driver.page_source
    finally:
        # Always shut the browser down; it was previously left running.
        driver.quit()


def album_rows(soup):
    """Yield ``[image_url, description]`` rows for a parsed album page.

    Walks every <a> whose class starts with "Container", every <div> inside
    it whose class starts with "ImgWrapper", and every <img> inside that.
    Each image is paired with the first following <p> whose class starts
    with "Description"; an image with no such paragraph after it is skipped.
    """
    is_description = class_prefix("Description")
    for container in soup.find_all("a", {"class": class_prefix("Container")}):
        for wrapper in container.find_all("div", {"class": class_prefix("ImgWrapper")}):
            for img in wrapper.find_all("img"):
                # Look up the description per image. Zipping the wrapper's
                # whole src list against all following descriptions once per
                # image duplicated and mispaired rows whenever a wrapper
                # held more than one <img>.
                description = img.find_next("p", {"class": is_description})
                if description is None:
                    continue
                yield [original_size_url(img["src"]), description.text]


def main():
    """Scrape the album at ``url`` and write its rows to ``csv_file``."""
    soup = BeautifulSoup(fetch_page_source(url), "html.parser")
    # Explicit UTF-8 so descriptions with non-ASCII text do not depend on
    # the platform's default encoding.
    with open(csv_file, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["source", "description"])
        writer.writerows(album_rows(soup))


if __name__ == "__main__":
    main()

Newer Older