Diffstat (limited to 'sjdbmk/legacy_wikipedia.py')
-rw-r--r--  sjdbmk/legacy_wikipedia.py  297
 1 file changed, 297 insertions(+), 0 deletions(-)
diff --git a/sjdbmk/legacy_wikipedia.py b/sjdbmk/legacy_wikipedia.py
new file mode 100644
index 0000000..c2f60a1
--- /dev/null
+++ b/sjdbmk/legacy_wikipedia.py
@@ -0,0 +1,297 @@
+#!/usr/bin/env python3
+#
+# Legacy Daily Bulletin components that need to be replaced
+# Copyright (C) 2024 Runxi Yu <https://runxiyu.org>
+# Copyright (C) 2023-2024 Albert Tan <albert-tan@qq.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+
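+"""Scrapers for the Wikipedia content embedded in the Daily Bulletin.
+
+The On This Day scrapers write one HTML fragment per calendar day
+(otd_en-MM-DD.html, otd_zh-MM-DD.html) into the current working directory,
+while the In the News scrapers return a single HTML fragment as a string.
+Run as a script, this module regenerates the On This Day fragments inside
+the configured build directory, e.g. (assuming the sjdbmk package is
+importable):
+
+    python -m sjdbmk.legacy_wikipedia --config config.ini
+"""
+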
+from __future__ import annotations
+import re
+import os
+import copy
+import datetime
+import logging
+import argparse
+import configparser
+
+import requests
+import bs4
+
+logger = logging.getLogger(__name__)
+
+
+def get_on_this_day_zh() -> None:
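+    """Fetch the Chinese Wikipedia 历史上的今天 (On This Day) page for each
+    month and write one otd_zh-MM-DD.html fragment per day into the current
+    working directory."""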
+    months = [f"{m}月" for m in range(1, 13)]
+
+    for index, month in enumerate(months):
+        day = 1
+
+ url = "https://zh.m.wikipedia.org/zh-cn/Wikipedia:历史上的今天/" + month
+ response = requests.get(url, timeout=15)
+ html = response.text
+ soup = bs4.BeautifulSoup(html, "html.parser")
+ div_elements = soup.find_all("div", class_="selected-anniversary")
+
+        for div_element in div_elements:
+ datetime_time = datetime.datetime(2000, index + 1, day)
+ formatted_time_yearless = datetime_time.strftime("%m-%d")
+
+ p_element = div_element.find("p")
+ dl_element = div_element.find("dl")
+ event_elements = dl_element.find_all("div", class_="event")
+ ul_element = soup.new_tag("ul")
+
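+            # Wrap each event <div> in an <li> so the day's events render as a list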
+ for event in event_elements:
+ li_element = soup.new_tag("li")
+ li_element.append(event)
+ ul_element.append(li_element)
+
+            summary = (
+                str(p_element)
+                .replace("/wiki", "https://zh.wikipedia.org/zh-cn")
+                .replace('<span class="otd-year">', "<b>")
+                .replace("</span>:", ":</b>")
+            )
+            events = (
+                str(ul_element)
+                .replace("/wiki", "https://zh.wikipedia.org/zh-cn")
+                .replace("</dt><dd>", " – ")
+                .replace('<div class="event">\n<dt>', "")
+                .replace("</dd>\n</div>", "")
+            )
+            result = summary + events
+ result = re.sub(r"<small>.*?图.*?</small>", "", result)
+
+            with open(
+                "otd_zh-" + formatted_time_yearless + ".html", "w", encoding="utf-8"
+            ) as file:
+                file.write(result)
+ day += 1
+
+
+def get_on_this_day_en() -> None:
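+    """Fetch the English Wikipedia Selected Anniversaries page for each
+    month and write one otd_en-MM-DD.html fragment per day into the current
+    working directory."""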
+ months = [
+ "January",
+ "February",
+ "March",
+ "April",
+ "May",
+ "June",
+ "July",
+ "August",
+ "September",
+ "October",
+ "November",
+ "December",
+ ]
+
+    for index, month in enumerate(months):
+        day = 1
+ url = (
+ "https://en.m.wikipedia.org/wiki/Wikipedia:Selected_anniversaries/" + month
+ )
+ response = requests.get(url, timeout=15)
+ html = response.text
+ soup = bs4.BeautifulSoup(html, "html.parser")
+ p_elements = soup.find_all("p")
+
+        for p_element in p_elements:
+ try:
+ datetime_time = datetime.datetime(2000, index + 1, day)
+ formatted_time_yearless = datetime_time.strftime("%m-%d")
+ except ValueError:
+ break
+
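+            # Only the paragraph that begins with a bold "<Month> <day>" link
+            # heads a day's section; skip everything else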
+ if not re.search(
+ f'<p><b><a href="/wiki/{month}_\\d+" title="{month} \\d+">{month} \\d+</a></b',
+ str(p_element),
+ ):
+ continue
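+            # The first <ul> after the day heading lists the events; the next
+            # one lists births and deaths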
+ div_element = p_element.find_next("div")
+ ul_element = div_element.find_next_sibling("ul")
+ ul_element_2 = ul_element.find_next("ul")
+ p_element_2 = soup.new_tag("p")
+ li_contents = list(ul_element_2.find_all("li"))
+
+ for li in li_contents:
+ p_element_2.append(li)
+
+ result = (
+ str(p_element).replace("/wiki", "https://en.wikipedia.org/wiki")
+ + str(ul_element).replace("/wiki", "https://en.wikipedia.org/wiki")
+ + "\n"
+ + str(p_element_2)
+ .replace("</li><li>", "; ")
+ .replace("<li>", "<b>Births and Deaths: </b>")
+ .replace("</li>", "")
+ .replace("/wiki", "https://en.wikipedia.org/wiki")
+ )
+ result = re.sub(r" <i>.*?icture.*?</i>", "", result)
+
+            with open(
+                "otd_en-" + formatted_time_yearless + ".html", "w", encoding="utf-8"
+            ) as file:
+                file.write(result)
+ day += 1
+
+
+def get_in_the_news_en() -> str:
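+    """Scrape the In the News (mp-itn) section of the English Wikipedia main
+    page and return it as a single HTML fragment with absolute links."""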
+ url = "https://en.m.wikipedia.org/wiki/Main_Page"
+ response = requests.get(url, timeout=15)
+ html = response.text
+ soup = bs4.BeautifulSoup(html, "html.parser")
+
+    itn_div = soup.find("div", id="mp-itn")
+    assert itn_div
+    ul_element = itn_div.find_next("ul")
+ assert ul_element
+ ul_element_2 = ul_element.find_next("ul")
+ assert ul_element_2
+ div_element = ul_element_2.find_next("div")
+ assert div_element
+ ul_element_3 = div_element.find_next("ul")
+ assert ul_element_3
+
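+    # ul_element is the headline list; ul_element_2 and ul_element_3 feed the
+    # "Ongoing" and "Recent deaths" lines assembled below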
+ p_element_2 = soup.new_tag("p")
+ p_element_3 = soup.new_tag("p")
+ assert isinstance(ul_element_2, bs4.Tag)
+ assert isinstance(ul_element_3, bs4.Tag)
+ li_contents_2 = list(ul_element_2.find_all("li"))
+ li_contents_3 = list(ul_element_3.find_all("li"))
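+    # find_all("li") is recursive: an <li> nested inside another entry's
+    # sub-<ul> appears in the flat list right after its parent, so after
+    # flattening a parent we skip the duplicate that follows it.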
+ skip = False
+ for li in li_contents_2:
+ if skip:
+ skip = False
+ continue
+ if li.find("ul"):
+ new_li = copy.deepcopy(li)
+ new_li.find("ul").decompose()
+ p_element_2.append(new_li)
+ skip = True
+ else:
+ p_element_2.append(li)
+    skip = False  # reset: a trailing nested <ul> above would otherwise skip the first entry here
+    for li in li_contents_3:
+ if skip:
+ skip = False
+ continue
+ if li.find("ul"):
+ new_li = copy.deepcopy(li)
+ new_li.find("ul").decompose()
+ p_element_3.append(new_li)
+ skip = True
+ else:
+ p_element_3.append(li)
+
+ result = (
+ str(ul_element).replace("/wiki", "https://en.wikipedia.org/wiki")
+ + str(p_element_2)
+ .replace("</li><li>", "; ")
+ .replace("<li>", "<b>Ongoing: </b>")
+ .replace("</li>", "")
+ .replace("\n;", ";")
+ .replace("/wiki", "https://en.wikipedia.org/wiki")
+ .replace("</p>", "<br>")
+ + str(p_element_3)
+ .replace("</li><li>", "; ")
+ .replace("<li>", "<b>Recent deaths: </b>")
+ .replace("</li>", "")
+ .replace("\n;", ";")
+ .replace("/wiki", "https://en.wikipedia.org/wiki")
+ .replace("<p>", "")
+ )
+ result = re.sub(r" <i>\(.*?\)</i>", "", result)
+
+ return result
+
+
+def get_in_the_news_zh() -> str:
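+    """Scrape the In the News (column-itn) section of the Chinese Wikipedia
+    main page and return it as a single HTML fragment with absolute links."""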
+ url = "https://zh.m.wikipedia.org/zh-cn/Wikipedia:%E9%A6%96%E9%A1%B5"
+ response = requests.get(url, timeout=15)
+ html = response.text
+ soup = bs4.BeautifulSoup(html, "html.parser")
+
+ div_element = soup.find("div", id="column-itn")
+ assert div_element
+ ul_element = div_element.find("ul")
+ assert isinstance(ul_element, bs4.Tag)
+ ul_element_2 = ul_element.find_next("ul")
+ assert isinstance(ul_element_2, bs4.Tag)
+ ul_element_3 = ul_element_2.find_next("ul")
+ assert isinstance(ul_element_3, bs4.Tag)
+ span_element_2 = ul_element_2.find("span", class_="hlist inline")
+ span_element_3 = ul_element_3.find("span", class_="hlist inline")
+ assert span_element_2 and span_element_3
+ p_element_2 = soup.new_tag("p")
+ p_element_3 = soup.new_tag("p")
+ p_element_2.append(span_element_2)
+ p_element_3.append(span_element_3)
+
+ result = (
+ str(ul_element).replace("/wiki", "https://zh.wikipedia.org/zh-cn")
+ + str(p_element_2)
+ .replace('<span class="hlist inline">', "<b>正在发生:</b>")
+ .replace("</span>", "")
+ .replace("-", ";")
+ .replace(
+ '(<a href="/wiki/%E4%BF%84%E7%BE%85%E6%96%AF%E5%85%A5%E4%BE%B5%E7%83%8F%E5%85%8B%E8%98%AD%E6%99%82%E9%96%93%E8%BB%B8" title="俄罗斯入侵乌克兰时间轴">时间轴</a>)',
+ "",
+ )
+ .replace("/wiki", "https://zh.wikipedia.org/zh-cn")
+ + str(p_element_3)
+ .replace('<span class="hlist inline">', "<b>最近逝世:</b>")
+ .replace("</span>", "")
+ .replace("-", ";")
+ .replace("/wiki", "https://zh.wikipedia.org/zh-cn")
+ ).replace("</p><p>", "<br>")
+ result = re.sub(r"<small.*?>.*?</small>", "", result)
+
+ return result
+
+
+def main() -> None:
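+    """Parse arguments, read the configuration, chdir into the build
+    directory, and regenerate the On This Day fragments."""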
+ parser = argparse.ArgumentParser(
+ description="Legacy Wikipedia script for the Daily Bulletin"
+ )
+ parser.add_argument(
+ "--config", default="config.ini", help="path to the configuration file"
+ )
+ args = parser.parse_args()
+
+ config = configparser.ConfigParser()
+ config.read(args.config)
+
+ build_path = config["general"]["build_path"]
+ os.chdir(build_path)
+
+ logging.basicConfig(level=logging.DEBUG)
+ logger.warning("Running main() only grabs On This Day")
+ logger.info("get_on_this_day_en()")
+ get_on_this_day_en()
+ logger.info("get_on_this_day_zh()")
+ get_on_this_day_zh()
+ # logger.info("get_in_the_news_en()")
+ # get_in_the_news_en()
+ # logger.info("get_in_the_news_zh()")
+ # get_in_the_news_zh()
+
+
+if __name__ == "__main__":
+ main()