# langchain-qa-bot / download_wikidocs.py
# feat: Add script to download content from Wikidocs
# (author: anpigon, commit e05a89c, 2.31 kB)
import time
import requests
from bs4 import BeautifulSoup
import re
from markdownify import markdownify as md
import pandas as pd
import argparse
def extract_content(url: str) -> str:
    """Fetch a single Wikidocs page and convert its content to Markdown.

    Args:
        url: Full URL of a Wikidocs page (e.g. "https://wikidocs.net/12345").

    Returns:
        The page subject heading plus the page body converted to Markdown,
        with consecutive blank lines collapsed.

    Raises:
        requests.HTTPError: if the page cannot be fetched.
    """
    # Timeout prevents an indefinite hang on a stalled connection;
    # raise_for_status was missing before — a 404/500 would silently
    # produce the string "NoneNone" instead of failing loudly.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")
    page_subject = soup.select_one("#load_content .page-subject")
    page_content = soup.select_one("#load_content .page-content")
    markdown_content = md(
        str(page_subject) + str(page_content),
        heading_style="ATX",
        bullets="-",
        strong_em_symbol="*",
        code_language="python",
        escape_asterisks=False,
        escape_underscores=False,
    )
    # Collapse pairs of newlines into one. NOTE(review): this matches exactly
    # two newlines per scan position, so a run of 3+ newlines is only partially
    # collapsed — r"\n{2,}" may have been intended; kept as-is to preserve output.
    normalized_text = re.sub(r"\n{2}", "\n", markdown_content)
    return normalized_text
def main(ebook_url):
    """Download every page of a Wikidocs ebook and save it as a parquet file.

    Args:
        ebook_url: URL of the ebook's table-of-contents page, e.g.
            "https://wikidocs.net/book/1". The final path segment is used
            as the book id in the output filename.

    Raises:
        requests.HTTPError: if the TOC page cannot be fetched.
        ValueError: if no table-of-contents entries are found.

    Side effects:
        Writes "wikidocs_<book_id>.parquet" to the current directory and
        prints a success message.
    """
    base_url = "https://wikidocs.net"
    # Extract the book id from the last URL segment; rstrip tolerates a
    # trailing slash (".../book/1/" previously yielded an empty book_id).
    book_id = ebook_url.rstrip("/").split("/")[-1]

    # Fetch the table-of-contents page (timeout avoids hanging forever).
    response = requests.get(ebook_url, timeout=30)
    response.raise_for_status()  # fail fast on HTTP errors
    soup = BeautifulSoup(response.content, "html.parser")

    # TOC entries are anchors of the form href="javascript:page(<id>)".
    toc = soup.select(".list-group-toc a[href^='javascript:page(']")

    # Collect title/link pairs for every TOC entry.
    data_list = []
    for item in toc:
        title = item.get_text(strip=True)
        page_id = item.get("href").split("page(")[-1].rstrip(")")
        link = f"{base_url}/{page_id}"
        data_list.append({"title": title, "link": link})

    # Guard: with an empty TOC, the dropna below would raise an opaque
    # KeyError on the missing "content" column.
    if not data_list:
        raise ValueError(f"No table-of-contents entries found at {ebook_url}")

    # Fetch the content of each page; the first entry (presumably the book
    # cover / TOC page itself) is intentionally skipped.
    for item in data_list[1:]:
        item["content"] = extract_content(item["link"])
        time.sleep(1)  # be polite to the server between requests

    # Build a DataFrame; rows without content (e.g. the skipped first
    # entry) are dropped.
    df = pd.DataFrame(data_list)
    df = df.dropna(subset=["content"])

    # Persist the result as a parquet file.
    parquet_filename = f"wikidocs_{book_id}.parquet"
    df.to_parquet(parquet_filename, index=False)
    print(f"파일이 성공적으로 저장되었습니다: {parquet_filename}")
if __name__ == "__main__":
# ๋ช…๋ น์–ด ์ค„ ์ธ์ž ์ฒ˜๋ฆฌ
parser = argparse.ArgumentParser(description="Wikidocs ebook URL์„ ์ž…๋ ฅํ•˜์„ธ์š”.")
parser.add_argument("ebook_url", type=str, help="Wikidocs ebook URL")
args = parser.parse_args()
main(args.ebook_url)