diff options
Diffstat (limited to 'sjdbmk/weekly.py')
-rw-r--r-- | sjdbmk/weekly.py | 516 |
1 files changed, 516 insertions, 0 deletions
diff --git a/sjdbmk/weekly.py b/sjdbmk/weekly.py new file mode 100644 index 0000000..9c0a0c9 --- /dev/null +++ b/sjdbmk/weekly.py @@ -0,0 +1,516 @@ +#!/usr/bin/env python3 +# +# Weekly script to prepare the YK Pao School Daily Bulletin's week JSON data +# Copyright (C) 2024 Runxi Yu <https://runxiyu.org> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>. +# +# Some rules: +# - Pass localized aware datetime objects around. +# Minimize the use of date strings and numbers. +# NEVER used naive datetime objects. +# Frequently check if the tzinfo is correct or cast the zone. +# - Delete variables that aren't supposed to be used anymore. +# - Functions should be short. +# - Do not pass ConfigParser objects around. +# - Use meaningful variable names. +# - Always write type hints. +# - Use the logger! Try not to print. +# +# TODO: Check The Week Ahead's dates + +from __future__ import annotations +from typing import Any, Iterable, Iterator +from configparser import ConfigParser +import argparse +import logging +import subprocess +import datetime +import zoneinfo +import os +import shutil +import json +import base64 +import email +import re + +import requests +import msal # type: ignore +import pptx +import pptx.exc + +import menuparser + +logger = logging.getLogger(__name__) + + +def generate( + datetime_target: datetime.datetime, # expected to be local time + the_week_ahead_url: str, + the_week_ahead_community_time_page_number: int, + the_week_ahead_aod_page_number: int, + weekly_menu_breakfast_page_number: int, + weekly_menu_lunch_page_number: int, + weekly_menu_dinner_page_number: int, + weekly_menu_query_string: str, + weekly_menu_sender: str, + weekly_menu_subject_regex: str, + weekly_menu_subject_regex_four_groups: tuple[int, int, int, int], + graph_client_id: str, + graph_authority: str, + graph_username: str, + graph_password: str, + graph_scopes: list[str], + calendar_address: str, + soffice: str, +) -> str: + if not datetime_target.tzinfo: + raise TypeError("Naive datetimes are unsupported") + output_filename = "week-%s.json" % datetime_target.strftime("%Y%m%d") + logger.info("Output filename: %s" % output_filename) + + token: str = acquire_token( + graph_client_id, graph_authority, graph_username, graph_password, graph_scopes + ) + + calendar_response = requests.get( + "https://graph.microsoft.com/v1.0/users/%s/calendar/calendarView" + % calendar_address, + headers={"Authorization": "Bearer " + token}, + params={ + "startDateTime": datetime_target.replace(microsecond=0).isoformat(), + "endDateTime": (datetime_target + datetime.timedelta(days=7)) + .replace(microsecond=0) + .isoformat(), + }, + timeout=15, + ) + if calendar_response.status_code != 200: + raise ValueError( + "Calendar response status code is not 200", calendar_response.content + ) + calendar_object = calendar_response.json() + # pprint(calendar_object) + + the_week_ahead_filename = "the_week_ahead-%s.pptx" % datetime_target.strftime( + "%Y%m%d" + ) + if not os.path.isfile(the_week_ahead_filename): + logger.info( + "The Week Ahead doesn't seem to exist at %s, downloading" + % the_week_ahead_filename + ) + download_share_url(token, the_week_ahead_url, the_week_ahead_filename) + logger.info("Downloaded The Week Ahead to %s" % the_week_ahead_filename) + assert os.path.isfile(the_week_ahead_filename) + else: + logger.info("The Week Ahead already exists at %s" % the_week_ahead_filename) + + menu_filename = "menu-%s.xlsx" % datetime_target.strftime("%Y%m%d") + if not os.path.isfile(menu_filename): + logger.info("Menu not found, downloading") + download_menu( + token, + datetime_target, + weekly_menu_query_string, + weekly_menu_sender, + weekly_menu_subject_regex, + weekly_menu_subject_regex_four_groups, + menu_filename, + ) + assert os.path.isfile(menu_filename) + else: + logger.info("All menus already exist") + + logger.info("Beginning to parse The Week Ahead") + the_week_ahead_presentation = pptx.Presentation(the_week_ahead_filename) + try: + community_time = extract_community_time( + the_week_ahead_presentation, + the_week_ahead_community_time_page_number, + ) + except ValueError: + logger.error( + "Invalid community time! Opening The Week Ahead for manual intervention." + ) + del the_week_ahead_presentation + subprocess.run([soffice, the_week_ahead_filename], check=True) + the_week_ahead_presentation = pptx.Presentation(the_week_ahead_filename) + community_time = extract_community_time( + the_week_ahead_presentation, + the_week_ahead_community_time_page_number, + ) + del the_week_ahead_filename + + aods = extract_aods(the_week_ahead_presentation, the_week_ahead_aod_page_number) + # We're assuming the the AODs don't need manual intervention. I think that's fair. + del the_week_ahead_presentation + logger.info("Finished parsing The Week Ahead") + + logger.info("Beginning to extract menus") + menu = menuparser.parse_menus( + menu_filename, + ) + logger.info("Finished extracting menus") + + final_data = { + "start_date": datetime_target.strftime("%Y-%m-%d"), + "community_time": community_time, + "aods": aods, + "menu": menu, + } + + with open(output_filename, "w", encoding="utf-8") as fd: + json.dump(final_data, fd, ensure_ascii=False, indent="\t") + logger.info("Dumped to: %s" % output_filename) + return output_filename + + +def main() -> None: + logging.basicConfig(level=logging.INFO) + parser = argparse.ArgumentParser(description="Weekly script for the Daily Bulletin") + parser.add_argument( + "--date", + default=None, + help="the start of the week to generate for, in local time, YYYY-MM-DD; defaults to next Monday", + ) + parser.add_argument( + "--config", default="config.ini", help="path to the configuration file" + ) + args = parser.parse_args() + + if args.date: + datetime_target_naive = datetime.datetime.strptime(args.date, "%Y-%m-%d") + else: + datetime_target_naive = None + del args.date + + config = ConfigParser() + config.read(args.config) + + tzinfo = zoneinfo.ZoneInfo(config["general"]["timezone"]) + if datetime_target_naive: + datetime_target_aware = datetime_target_naive.replace(tzinfo=tzinfo) + else: + datetime_current_aware = datetime.datetime.now(tz=tzinfo) + datetime_target_aware = datetime_current_aware + datetime.timedelta( + days=((-datetime_current_aware.weekday()) % 7) + ) + del datetime_current_aware + del datetime_target_naive + logger.info("Generating for %s" % datetime_target_aware.strftime("%Y-%m-%d %Z")) + + build_path = config["general"]["build_path"] + # TODO: check if the build path exists and create it if it doesn't + os.chdir(build_path) + + the_week_ahead_url = config["the_week_ahead"]["file_url"] + the_week_ahead_community_time_page_number = int( + config["the_week_ahead"]["community_time_page_number"] + ) + the_week_ahead_aod_page_number = int(config["the_week_ahead"]["aod_page_number"]) + + weekly_menu_breakfast_page_number = int( + config["weekly_menu"]["breakfast_page_number"] + ) + weekly_menu_lunch_page_number = int(config["weekly_menu"]["lunch_page_number"]) + weekly_menu_dinner_page_number = int(config["weekly_menu"]["dinner_page_number"]) + weekly_menu_query_string = config["weekly_menu"]["query_string"] + weekly_menu_sender = config["weekly_menu"]["sender"] + weekly_menu_subject_regex = config["weekly_menu"]["subject_regex"] + weekly_menu_subject_regex_four_groups_raw = config["weekly_menu"][ + "subject_regex_four_groups" + ].split(" ") + weekly_menu_subject_regex_four_groups = tuple( + [int(z) for z in weekly_menu_subject_regex_four_groups_raw] + ) + assert len(weekly_menu_subject_regex_four_groups) == 4 + del weekly_menu_subject_regex_four_groups_raw + # weekly_menu_dessert_page_number = config["weekly_menu"]["dessert_page_number"] + + graph_client_id = config["credentials"]["client_id"] + graph_authority = config["credentials"]["authority"] + graph_username = config["credentials"]["username"] + graph_password = config["credentials"]["password"] + graph_scopes = config["credentials"]["scope"].split(" ") + + calendar_address = config["calendar"]["address"] + + soffice = config["general"]["soffice"] + + # TODO: make a function that checks the configuration + + generate( + datetime_target=datetime_target_aware, + the_week_ahead_url=the_week_ahead_url, + the_week_ahead_community_time_page_number=the_week_ahead_community_time_page_number, + the_week_ahead_aod_page_number=the_week_ahead_aod_page_number, + weekly_menu_breakfast_page_number=weekly_menu_breakfast_page_number, + weekly_menu_lunch_page_number=weekly_menu_lunch_page_number, + weekly_menu_dinner_page_number=weekly_menu_dinner_page_number, + weekly_menu_query_string=weekly_menu_query_string, + weekly_menu_sender=weekly_menu_sender, + weekly_menu_subject_regex=weekly_menu_subject_regex, + weekly_menu_subject_regex_four_groups=weekly_menu_subject_regex_four_groups, + graph_client_id=graph_client_id, + graph_authority=graph_authority, + graph_username=graph_username, + graph_password=graph_password, + graph_scopes=graph_scopes, + calendar_address=calendar_address, + soffice=soffice, + ) + # NOTE: generate() can get the timezone from datetime_target_aware + # It returns the generated filename. + + +def encode_sharing_url(url: str) -> str: + return "u!" + base64.urlsafe_b64encode(url.encode("utf-8")).decode("ascii").rstrip( + "=" + ) + + +def download_share_url( + token: str, url: str, local_filename: str, chunk_size: int = 65536 +) -> None: + + download_direct_url = requests.get( + "https://graph.microsoft.com/v1.0/shares/%s/driveItem" + % encode_sharing_url(url), + headers={"Authorization": "Bearer " + token}, + timeout=20, + ).json()["@microsoft.graph.downloadUrl"] + + with requests.get( + download_direct_url, + headers={ + "Authorization": "Bearer %s" % token, + "Accept-Encoding": "identity", + }, + stream=True, + timeout=20, + ) as r: + with open(local_filename, "wb") as fd: + shutil.copyfileobj(r.raw, fd) + fd.flush() + + +def acquire_token( + graph_client_id: str, + graph_authority: str, + graph_username: str, + graph_password: str, + graph_scopes: list[str], +) -> str: + app = msal.PublicClientApplication( + graph_client_id, + authority=graph_authority, + ) + result = app.acquire_token_by_username_password( + graph_username, graph_password, scopes=graph_scopes + ) + + if "access_token" in result: + assert isinstance(result["access_token"], str) + return result["access_token"] + raise ValueError("Authentication error in password login") + + +def search_mail(token: str, query_string: str) -> list[dict[str, Any]]: + hits = requests.post( + "https://graph.microsoft.com/v1.0/search/query", + headers={"Authorization": "Bearer " + token}, + json={ + "requests": [ + { + "entityTypes": ["message"], + "query": {"queryString": query_string}, + "from": 0, + "size": 15, + "enableTopResults": True, + } + ] + }, + timeout=20, + ).json()["value"][0]["hitsContainers"][0]["hits"] + assert isinstance(hits, list) + assert isinstance(hits[0], dict) + return hits + + +def extract_aods( + prs: pptx.presentation.Presentation, aod_page_number: int +) -> list[str]: + slide = prs.slides[aod_page_number] + aods = ["", "", "", ""] + for shape in slide.shapes: + if hasattr(shape, "text") and "Monday: " in shape.text: + slist = shape.text.split("\n") + for s in slist: + try: + day, aod = s.split(": ", 1) + except ValueError: + pass + day = day.lower() + if day == "monday": + aods[0] = aod + elif day == "tuesday": + aods[1] = aod + elif day == "wednesday": + aods[2] = aod + elif day == "thursday": + aods[3] = aod + if not all(aods): + raise ValueError( + "AOD parsing: The Week Ahead doesn't include all AOD days, or the formatting is borked" + ) + return aods + raise ValueError("AOD parsing: The Week Ahead's doesn't even include \"Monday\"") + # TODO: this is one of those places where Monday is *expected* to be the first day. + # TODO: revamp this. this is ugly! + + +def extract_community_time( + prs: pptx.presentation.Presentation, community_time_page_number: int +) -> list[list[str]]: + + slide = prs.slides[community_time_page_number] + for shape in slide.shapes: + if not shape.has_table: + continue + break + else: + raise ValueError("No shapes") + tbl = shape.table + row_count = len(tbl.rows) + col_count = len(tbl.columns) + if col_count not in [4, 5]: + raise ValueError( + "Community time parsing: The Week Ahead community time table does not have 4 or 5 columns" + ) + if col_count == 4: + logger.warning( + "Community time warning: only four columns found, assuming that Y12 has graduated" + ) + + res = [["" for c in range(col_count)] for r in range(row_count)] + + for r in range(row_count): + for c in range(col_count): + cell = tbl.cell(r, c) + if not cell.is_spanned: + t = "" + for p in cell.text_frame.paragraphs: + for pr in p.runs: + t += pr.text + t = t.strip() + if "whole school assembly" in t.lower(): + t = "Whole School Assembly" + elif ( + "tutor group check-in" in t.lower() + or "follow up day" in t.lower() + or "open session for tutor and tutee" in t.lower() + ): + t = "Tutor Time" + res[r][c] = t + if cell.is_merge_origin: + for sh in range(cell.span_height): + for sw in range(cell.span_width): + res[r + sh][c + sw] = t + + return [x[1:] for x in res[1:]] + + +def filter_mail_results_by_sender( + original: Iterable[dict[str, Any]], sender: str +) -> Iterator[dict[str, Any]]: + for hit in original: + if ( + hit["resource"]["sender"]["emailAddress"]["address"].lower() + == sender.lower() + ): + yield hit + + +# TODO: Potentially replace this with a pattern-match based on strptime(). +def filter_mail_results_by_subject_regex_groups( + original: Iterable[dict[str, Any]], + subject_regex: str, + subject_regex_groups: Iterable[int], +) -> Iterator[tuple[dict[str, Any], list[str]]]: + for hit in original: + logging.debug("Trying %s" % hit["resource"]["subject"]) + matched = re.compile(subject_regex).match(hit["resource"]["subject"]) + if matched: + yield (hit, [matched.group(group) for group in subject_regex_groups]) + + +def download_menu( + token: str, + datetime_target: datetime.datetime, + weekly_menu_query_string: str, + weekly_menu_sender: str, + weekly_menu_subject_regex: str, + weekly_menu_subject_regex_four_groups: tuple[int, int, int, int], + menu_filename: str, +) -> None: + search_results = search_mail(token, weekly_menu_query_string) + + for hit, matched_groups in filter_mail_results_by_subject_regex_groups( + filter_mail_results_by_sender(search_results, weekly_menu_sender), + weekly_menu_subject_regex, + weekly_menu_subject_regex_four_groups, + ): + try: + subject_1st_month = datetime.datetime.strptime( + matched_groups[0], "%b" # issues here are probably locales + ).month + subject_1st_day = int(matched_groups[1]) + except ValueError as exc: + raise ValueError(hit["resource"]["subject"], matched_groups[0]) from exc + if ( + subject_1st_month == datetime_target.month + and subject_1st_day == datetime_target.day + ): + break + else: + raise ValueError("No SJ-menu email found") + + with requests.get( + "https://graph.microsoft.com/v1.0/me/messages/%s/$value" % hit["hitId"], + headers={ + "Authorization": "Bearer %s" % token, + "Accept-Encoding": "identity", + }, + stream=True, + timeout=20, + ) as r: + msg = email.message_from_bytes(r.content) + + for part in msg.walk(): + if part.get_content_type() in [ + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ]: + payload = part.get_payload(decode=True) + pb = bytes(payload) + + with open(menu_filename, "wb") as w: + w.write(pb) + break + else: + raise ValueError("No proper attachment found in email") + + +if __name__ == "__main__": + main() |