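"""Scrape a Wikidocs ebook: walk its table of contents, convert each page's
HTML body to Markdown, and save the result as a parquet file with the
columns title, link, and content.

Usage (the script filename is whatever this file is saved as):
    python <this_script>.py https://wikidocs.net/book/<book_id>
"""
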
import argparse
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup
from markdownify import markdownify as md


def extract_content(url: str) -> str:
    response = requests.get(url)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # The page title and body on Wikidocs live under #load_content
    page_subject = soup.select_one("#load_content .page-subject")
    page_content = soup.select_one("#load_content .page-content")
    # Skip selectors that matched nothing so a literal "None" never
    # leaks into the output
    html = "".join(str(tag) for tag in (page_subject, page_content) if tag is not None)

    # Convert the HTML to Markdown: ATX headings, '-' bullets, '*' emphasis,
    # and Python as the default language for fenced code blocks
    markdown_content = md(
        html,
        heading_style="ATX",
        bullets="-",
        strong_em_symbol="*",
        code_language="python",
        escape_asterisks=False,
        escape_underscores=False,
    )
    # Collapse the blank lines markdownify leaves between blocks
    normalized_text = re.sub(r"\n{2}", "\n", markdown_content)

    return normalized_text
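
# Quick sanity check for extract_content (hypothetical page id; needs network):
#   print(extract_content("https://wikidocs.net/1")[:300])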


def main(ebook_url):
    base_url = "https://wikidocs.net"

    # Extract the book id from the end of the URL
    book_id = ebook_url.split("/")[-1]

    # Fetch the page source
    response = requests.get(ebook_url)
    response.raise_for_status()  # Raise on HTTP errors
    soup = BeautifulSoup(response.content, "html.parser")

    # Grab only the 'a' tags from the table of contents
    toc = soup.select(".list-group-toc a[href^='javascript:page(']")

    # Collect the extracted title/link pairs
    data_list = []
    for item in toc:
        title = item.get_text(strip=True)
        # href looks like "javascript:page(12345)": pull out the page id
        page_id = item.get("href").split("page(")[-1].rstrip(")")
        link = f"{base_url}/{page_id}"
        data_list.append({"title": title, "link": link})

    # Walk the list and fetch each page's content, skipping the first TOC entry
    for item in data_list[1:]:
        item["content"] = extract_content(item["link"])
        time.sleep(1)  # Wait between page requests

    # Convert to a DataFrame; rows without content (the skipped entry) are dropped
    df = pd.DataFrame(data_list)
    df = df.dropna(subset=["content"])

    # Save the DataFrame as a parquet file
    parquet_filename = f"wikidocs_{book_id}.parquet"
    df.to_parquet(parquet_filename, index=False)

    print(f"File saved successfully: {parquet_filename}")


if __name__ == "__main__":
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description="Enter a Wikidocs ebook URL.")
    parser.add_argument("ebook_url", type=str, help="Wikidocs ebook URL")
    args = parser.parse_args()

    main(args.ebook_url)
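
# A minimal sketch for loading the output back (hypothetical book id;
# assumes a parquet engine such as pyarrow or fastparquet is installed):
#
#   import pandas as pd
#   df = pd.read_parquet("wikidocs_XXXX.parquet")
#   print(df[["title", "link"]].head())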