diff options
Diffstat (limited to 'sjdbmk/legacy_wikipedia.py')
-rw-r--r-- | sjdbmk/legacy_wikipedia.py | 248 |
1 files changed, 248 insertions, 0 deletions
diff --git a/sjdbmk/legacy_wikipedia.py b/sjdbmk/legacy_wikipedia.py new file mode 100644 index 0000000..2d66b20 --- /dev/null +++ b/sjdbmk/legacy_wikipedia.py @@ -0,0 +1,248 @@ +#!/usr/bin/env python3 +# +# Legacy Daily Bulletin components that need to be replaced +# Copyright (C) 2024 Runxi Yu <https://runxiyu.org> +# Copyright (C) 2023-2024 Albert Tan <albert-tan@qq.com> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# + +from __future__ import annotations +import re +import os +import copy +import datetime +import logging +import argparse +import configparser + +import requests +import bs4 + +logger = logging.getLogger(__name__) + + +def get_on_this_day_zh() -> None: + months = list(map(lambda x: str(x) + "月", range(1, 13))) + + for index in range(12): + + month = months[index] + day = 1 + + url = "https://zh.m.wikipedia.org/zh-cn/Wikipedia:历史上的今天/" + month + response = requests.get(url, timeout=15) + html = response.text + soup = bs4.BeautifulSoup(html, "html.parser") + div_elements = soup.find_all("div", class_="selected-anniversary") + + for div_element in div_elements: + + datetime_time = datetime.datetime(2000, index + 1, day) + formatted_time_yearless = datetime_time.strftime("%m-%d") + + p_element = div_element.find("p") + dl_element = div_element.find("dl") + event_elements = dl_element.find_all("div", class_="event") + ul_element = soup.new_tag("ul") + + for event in event_elements: + li_element = soup.new_tag("li") + li_element.append(event) + ul_element.append(li_element) + + result = str(p_element).replace("/wiki", "https://zh.wikipedia.org/zh-cn").replace('<span class="otd-year">', "<b>").replace("</span>:", ":</b>") + str(ul_element).replace("/wiki", "https://zh.wikipedia.org/zh-cn").replace("</dt><dd>", " – ").replace('<div class="event">\n<dt>', "").replace("</dd>\n</div>", "") + result = re.sub(r"<small>.*?图.*?</small>", "", result) + + with open("otd_zh-" + formatted_time_yearless + ".html", "w") as file: + file.write(result) + file.close() + day += 1 + + +def get_on_this_day_en() -> None: + months = [ + "January", + "February", + "March", + "April", + "May", + "June", + "July", + "August", + "September", + "October", + "November", + "December", + ] + + for index in range(12): + + month = months[index] + day = 1 + url = "https://en.m.wikipedia.org/wiki/Wikipedia:Selected_anniversaries/" + month + response = requests.get(url, timeout=15) + html = response.text + soup = bs4.BeautifulSoup(html, "html.parser") + p_elements = soup.find_all("p") + + for p_element in p_elements: + + try: + datetime_time = datetime.datetime(2000, index + 1, day) + formatted_time_yearless = datetime_time.strftime("%m-%d") + except ValueError: + break + + if not re.search( + f'<p><b><a href="/wiki/{month}_\\d+" title="{month} \\d+">{month} \\d+</a></b', + str(p_element), + ): + continue + div_element = p_element.find_next("div") + ul_element = div_element.find_next_sibling("ul") + ul_element_2 = ul_element.find_next("ul") + p_element_2 = soup.new_tag("p") + li_contents = list(ul_element_2.find_all("li")) + + for li in li_contents: + p_element_2.append(li) + + result = str(p_element).replace("/wiki", "https://en.wikipedia.org/wiki") + str(ul_element).replace("/wiki", "https://en.wikipedia.org/wiki") + "\n" + str(p_element_2).replace("</li><li>", "; ").replace("<li>", "<b>Births and Deaths: </b>").replace("</li>", "").replace("/wiki", "https://en.wikipedia.org/wiki") + result = re.sub(r" <i>.*?icture.*?</i>", "", result) + + with open("otd_en-" + formatted_time_yearless + ".html", "w") as file: + file.write(result) + file.close() + day += 1 + + +def get_in_the_news_en() -> str: + url = "https://en.m.wikipedia.org/wiki/Main_Page" + response = requests.get(url, timeout=15) + html = response.text + soup = bs4.BeautifulSoup(html, "html.parser") + + h2_element = soup.find("div", id="mp-itn") + assert h2_element + ul_element = h2_element.find_next("ul") + assert ul_element + ul_element_2 = ul_element.find_next("ul") + assert ul_element_2 + div_element = ul_element_2.find_next("div") + assert div_element + ul_element_3 = div_element.find_next("ul") + assert ul_element_3 + + p_element_2 = soup.new_tag("p") + p_element_3 = soup.new_tag("p") + assert isinstance(ul_element_2, bs4.Tag) + assert isinstance(ul_element_3, bs4.Tag) + li_contents_2 = list(ul_element_2.find_all("li")) + li_contents_3 = list(ul_element_3.find_all("li")) + skip = False + for li in li_contents_2: + if skip: + skip = False + continue + if li.find("ul"): + new_li = copy.deepcopy(li) + new_li.find("ul").decompose() + p_element_2.append(new_li) + skip = True + else: + p_element_2.append(li) + for li in li_contents_3: + if skip: + skip = False + continue + if li.find("ul"): + new_li = copy.deepcopy(li) + new_li.find("ul").decompose() + p_element_3.append(new_li) + skip = True + else: + p_element_3.append(li) + + result = str(ul_element).replace("/wiki", "https://en.wikipedia.org/wiki") + str(p_element_2).replace("</li><li>", "; ").replace("<li>", "<b>Ongoing: </b>").replace("</li>", "").replace("\n;", ";").replace("/wiki", "https://en.wikipedia.org/wiki").replace("</p>", "<br>") + str(p_element_3).replace("</li><li>", "; ").replace("<li>", "<b>Recent deaths: </b>").replace("</li>", "").replace("\n;", ";").replace("/wiki", "https://en.wikipedia.org/wiki").replace("<p>", "") + result = re.sub(r" <i>\(.*?\)</i>", "", result) + + return result + + +def get_in_the_news_zh() -> str: + url = "https://zh.m.wikipedia.org/zh-cn/Wikipedia:%E9%A6%96%E9%A1%B5" + response = requests.get(url, timeout=15) + html = response.text + soup = bs4.BeautifulSoup(html, "html.parser") + + div_element = soup.find("div", id="column-itn") + assert div_element + ul_element = div_element.find("ul") + assert isinstance(ul_element, bs4.Tag) + ul_element_2 = ul_element.find_next("ul") + assert isinstance(ul_element_2, bs4.Tag) + ul_element_3 = ul_element_2.find_next("ul") + assert isinstance(ul_element_3, bs4.Tag) + span_element_2 = ul_element_2.find("span", class_="hlist inline") + span_element_3 = ul_element_3.find("span", class_="hlist inline") + assert span_element_2 and span_element_3 + p_element_2 = soup.new_tag("p") + p_element_3 = soup.new_tag("p") + p_element_2.append(span_element_2) + p_element_3.append(span_element_3) + + result = ( + str(ul_element).replace("/wiki", "https://zh.wikipedia.org/zh-cn") + + str(p_element_2) + .replace('<span class="hlist inline">', "<b>正在发生:</b>") + .replace("</span>", "") + .replace("-", ";") + .replace( + '(<a href="/wiki/%E4%BF%84%E7%BE%85%E6%96%AF%E5%85%A5%E4%BE%B5%E7%83%8F%E5%85%8B%E8%98%AD%E6%99%82%E9%96%93%E8%BB%B8" title="俄罗斯入侵乌克兰时间轴">时间轴</a>)', + "", + ) + .replace("/wiki", "https://zh.wikipedia.org/zh-cn") + + str(p_element_3).replace('<span class="hlist inline">', "<b>最近逝世:</b>").replace("</span>", "").replace("-", ";").replace("/wiki", "https://zh.wikipedia.org/zh-cn") + ).replace("</p><p>", "<br>") + result = re.sub(r"<small.*?>.*?</small>", "", result) + + return result + + +def main() -> None: + parser = argparse.ArgumentParser(description="Legacy Wikipedia script for the Daily Bulletin") + parser.add_argument("--config", default="config.ini", help="path to the configuration file") + args = parser.parse_args() + + config = configparser.ConfigParser() + config.read(args.config) + + build_path = config["general"]["build_path"] + os.chdir(build_path) + + logging.basicConfig(level=logging.DEBUG) + logger.warning("Running main() only grabs On This Day") + logger.info("get_on_this_day_en()") + get_on_this_day_en() + logger.info("get_on_this_day_zh()") + get_on_this_day_zh() + # logger.info("get_in_the_news_en()") + # get_in_the_news_en() + # logger.info("get_in_the_news_zh()") + # get_in_the_news_zh() + + +if __name__ == "__main__": + main() |