aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--menuparser.py (renamed from menu.py)0
-rwxr-xr-xweekly.py307
2 files changed, 24 insertions, 283 deletions
diff --git a/menu.py b/menuparser.py
index bbdb3b5..bbdb3b5 100644
--- a/menu.py
+++ b/menuparser.py
diff --git a/weekly.py b/weekly.py
index 8b8f20b..46c0cae 100755
--- a/weekly.py
+++ b/weekly.py
@@ -47,11 +47,13 @@ import email
import re
import requests
-import msal # type: ignore
-import pptx # type: ignore
-import pptx.exc # type: ignore
+import msal # type: ignore
+import pptx
+import pptx.exc
import pypdf
+import menuparser
+
logger = logging.getLogger(__name__)
@@ -113,8 +115,7 @@ def generate(
"Calendar response status code is not 200", calendar_response.content
)
calendar_object = calendar_response.json()
- pprint(calendar_object)
- # exit(1)
+ # pprint(calendar_object)
the_week_ahead_filename = "the_week_ahead-%s.pptx" % datetime_target.strftime(
"%Y%m%d"
@@ -130,17 +131,9 @@ def generate(
else:
logger.info("The Week Ahead already exists at %s" % the_week_ahead_filename)
- menu_en_filename = "menu-%s-en.pptx" % datetime_target.strftime("%Y%m%d")
- menu_zh_filename = "menu-%s-zh.pptx" % datetime_target.strftime("%Y%m%d")
- menu_pdf_filename = "menu-%s.pdf" % datetime_target.strftime(
- "%Y%m%d"
- ) # TODO: Snacks
- if not (
- os.path.isfile(menu_en_filename)
- and os.path.isfile(menu_zh_filename)
- and os.path.isfile(menu_pdf_filename)
- ):
- logger.info("Not all menus exist, downloading")
+ menu_filename = "menu-%s.xlsx" % datetime_target.strftime("%Y%m%d")
+ if not (os.path.isfile(menu_filename)):
+ logger.info("Menu not found, downloading")
download_menu(
token,
datetime_target,
@@ -148,14 +141,10 @@ def generate(
weekly_menu_sender,
weekly_menu_subject_regex,
weekly_menu_subject_regex_four_groups,
- menu_en_filename,
- menu_zh_filename,
- menu_pdf_filename,
+ menu_filename,
)
assert (
- os.path.isfile(menu_en_filename)
- and os.path.isfile(menu_zh_filename)
- and os.path.isfile(menu_pdf_filename)
+ os.path.isfile(menu_filename)
)
else:
logger.info("All menus already exist")
@@ -186,31 +175,9 @@ def generate(
logger.info("Finished parsing The Week Ahead")
logger.info("Beginning to extract menus")
- try:
- menu = extract_pptx_menus(
- menu_en_filename,
- menu_zh_filename,
- weekly_menu_breakfast_page_number,
- weekly_menu_lunch_page_number,
- weekly_menu_dinner_page_number,
- )
- snacks = fix_snacks(extract_snacks(menu_pdf_filename))
- except MealTableShapeError as e:
- logger.error(
- "Invalid menus! Opening both PPTX menus for manual intervention.", e.args[0]
- )
- subprocess.run([soffice, menu_en_filename, menu_zh_filename], check=True)
- menu = extract_pptx_menus(
- menu_en_filename,
- menu_zh_filename,
- weekly_menu_breakfast_page_number,
- weekly_menu_lunch_page_number,
- weekly_menu_dinner_page_number,
- )
- snacks = fix_snacks(extract_snacks(menu_pdf_filename))
- del menu_en_filename
- del menu_zh_filename
- del menu_pdf_filename
+ menu = menuparser.extract(
+ menu_filename,
+ )
logger.info("Finished extracting menus")
final_data = {
@@ -218,7 +185,7 @@ def generate(
"community_time": community_time,
"aods": aods,
"menu": menu,
- "snacks": snacks,
+ "snacks": {}, # TODO
}
with open(output_filename, "w", encoding="utf-8") as fd:
@@ -399,122 +366,7 @@ def search_mail(token: str, query_string: str) -> list[dict[str, Any]]:
return hits
-def slide_to_srep(slide: pptx.slide) -> list[list[tuple[str, int, int, str]]]:
- # NOTE: Only processes FIRST table.
- for shape in slide.shapes:
- if shape.has_table:
- break
- else:
- raise ValueError("Slide doesn't contain any tables?")
- tbl = shape.table
- row_count: int = len(tbl.rows)
- col_count: int = len(tbl.columns)
- tbll = []
- for r in range(row_count):
- row: list[tuple[str, int, int, str]] = [("", 0, 0, "")] * col_count
- for c in range(col_count):
- cell_text = ""
- cell = tbl.cell(r, c)
- assert isinstance(cell.span_height, int)
- assert isinstance(cell.span_width, int)
- paragraphs = cell.text_frame.paragraphs
- for paragraph in paragraphs:
- for run in paragraph.runs:
- cell_text += run.text
- row[c] = (
- "o" if cell.is_merge_origin else ("s" if cell.is_spanned else "n"),
- cell.span_height,
- cell.span_width,
- cell_text.strip(),
- )
- tbll.append(row)
- return tbll
-
-
-def combine_parsed_meal_tables(
- en: list[list[list[str]]], cn: list[list[list[str]]]
-) -> list[list[list[list[str]]]]:
- if not equal_shapes(cn, en):
- raise MealTableShapeError(
- "Augmented menus not in the same shape",
- zero_list(en),
- zero_list(cn),
- en,
- cn,
- )
-
- c = zero_list(en)
-
- for j in range(len(en)):
- for i in range(len(en[j])):
- for k in range(len(en[j][i])):
- c[j][i][k] = {"en": en[j][i][k], "zh": cn[j][i][k]}
- return c
-
-
-def parse_meal_tables(
- tbl: list[list[tuple[str, int, int, str]]]
-) -> list[list[list[str]]]:
- windows = []
- for j in range(1, len(tbl)):
- cell = tbl[j][0]
- if cell[0] in ["o", "n"]:
- windows.append((j, j - 1 + cell[1]))
-
- daysmenus: list[list[list[str]]] = [[], [], [], [], []]
-
- if len(tbl[0]) != 6:
- logger.warning(100 * "@" + "Fewer than 5 days of menus, time to audit?")
-
- for i in range(1, len(tbl[0])):
- for s, f in windows:
- thiswindow = []
- for j in range(s, f + 1):
- if tbl[j][i][-1].strip() and (
- tbl[j][i][-1].strip().lower().replace(",", "")
- not in ["condiments selection", "葱香菜榨菜丝老干妈生抽醋"]
- ): # seriously
- thiswindow.append(
- tbl[j][i][-1]
- .replace(", ", ", ")
- .replace(",", ", ")
- .replace("Juice /", "Juice/")
- )
- daysmenus[i - 1].append(thiswindow)
- return daysmenus
-
-
-def extract_pptx_menus(
- menu_en_filename: str,
- menu_zh_filename: str,
- breakfast_page_number: int,
- lunch_page_number: int,
- dinner_page_number: int,
-) -> dict[str, list[list[list[list[str]]]]]:
- try:
- enprs = pptx.Presentation(menu_en_filename)
- zhprs = pptx.Presentation(menu_zh_filename)
- except pptx.exc.PackageNotFoundError:
- raise ValueError("Presentation path doesn't exist or is broken") from None
-
- mtable = {}
- for meal, pageno in {
- "breakfast": breakfast_page_number,
- "lunch": lunch_page_number,
- "dinner": dinner_page_number,
- }.items():
- try:
- mtable[meal] = combine_parsed_meal_tables(
- parse_meal_tables(slide_to_srep(enprs.slides[pageno])),
- parse_meal_tables(slide_to_srep(zhprs.slides[pageno])),
- )
- except MealTableShapeError:
- raise MealTableShapeError(meal) from None
- assert len(mtable) == 3
- return mtable
-
-
-def extract_aods(prs: pptx.Presentation, aod_page_number: int) -> list[str]:
+def extract_aods(prs: pptx.presentation.Presentation, aod_page_number: int) -> list[str]:
slide = prs.slides[aod_page_number]
aods = ["", "", "", ""]
for shape in slide.shapes:
@@ -545,7 +397,7 @@ def extract_aods(prs: pptx.Presentation, aod_page_number: int) -> list[str]:
def extract_community_time(
- prs: pptx.Presentation, community_time_page_number: int
+ prs: pptx.presentation.Presentation, community_time_page_number: int
) -> list[list[str]]:
slide = prs.slides[community_time_page_number]
@@ -626,9 +478,7 @@ def download_menu(
weekly_menu_sender: str,
weekly_menu_subject_regex: str,
weekly_menu_subject_regex_four_groups: tuple[int, int, int, int],
- menu_en_filename: str,
- menu_zh_filename: str,
- menu_pdf_filename: str,
+ menu_filename: str,
) -> None:
search_results = search_mail(token, weekly_menu_query_string)
@@ -639,7 +489,7 @@ def download_menu(
):
try:
subject_1st_month = datetime.datetime.strptime(
- matched_groups[0], "%b" # issues here are probably locales
+ matched_groups[0], "%b" # issues here are probably locales
).month
subject_1st_day = int(matched_groups[1])
except ValueError:
@@ -665,19 +515,17 @@ def download_menu(
for part in msg.walk():
if part.get_content_type() in [
- "application/vnd.openxmlformats-officedocument.presentationml.presentation",
- "application/pdf",
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
]:
payload = part.get_payload(decode=True)
payload_filename_encoded = part.get_filename()
if not payload_filename_encoded:
- raise ValueError("pptx/pdf doesn't have a filename, very unexpected")
+ raise ValueError("xlsx does not have a filename")
payload_filename_mix = email.header.decode_header(payload_filename_encoded)
assert len(payload_filename_mix) == 1
payload_filename_encoded, payload_filename_encoding = payload_filename_mix[
0
]
-
if payload_filename_encoding is None:
assert isinstance(payload_filename_encoded, str)
filename = payload_filename_encoded
@@ -685,120 +533,13 @@ def download_menu(
filename = payload_filename_encoded.decode(payload_filename_encoding)
else:
raise TypeError("What?")
- if (
- part.get_content_type()
- == "application/vnd.openxmlformats-officedocument.presentationml.presentation"
- ):
- if "EN" in filename:
- formatted_filename = menu_en_filename
- elif "CH" in filename or "CN" in filename or "ZH" in filename:
- formatted_filename = menu_zh_filename
- else:
- raise ValueError(
- "%s does not contain a language specification string (EN/CH/CN)"
- % filename
- )
- elif part.get_content_type() == "application/pdf":
- formatted_filename = menu_pdf_filename
- else:
- continue
pb = bytes(payload)
- with open(formatted_filename, "wb") as w:
+ with open(menu_filename, "wb") as w:
w.write(pb)
-
-
-def extract_snacks(fn: str) -> tuple[list[str], list[str], list[str]]:
-
- visitor_state: list[Optional[float]] = [None, None]
-
- def visitor_1st_run(
- text: str,
- cm: list[float],
- tm: list[float],
- fdict: Optional[pypdf.generic._data_structures.DictionaryObject],
- fsize: Optional[float],
- ) -> None:
- if "students snack" in text.lower():
- visitor_state[0], visitor_state[1] = tm[-2], tm[-1]
-
- pdf = pypdf.PdfReader(fn)
- page = pdf.pages[2]
- page.extract_text(visitor_text=visitor_1st_run)
-
- snack_state: list[int] = [0]
- morning: list[str] = []
- afternoon: list[str] = []
- evening: list[str] = []
-
- if (not visitor_state[0]) or (not visitor_state[1]):
- page = pdf.pages[3]
- page.extract_text(visitor_text=visitor_1st_run)
-
- snack_state = [0]
- morning = []
- afternoon = []
- evening = []
-
- def visitor_2nd_run(
- text: str,
- cm: list[float],
- tm: list[float],
- fdict: Optional[pypdf.generic._data_structures.DictionaryObject],
- fsize: Optional[float],
- ) -> None:
- assert visitor_state[1] is not None
- if tm[-1] < visitor_state[1]:
- tsl = text.strip().lower()
- if "morning snack" in tsl:
- snack_state[0] = 1
- elif "afternoon snack" in tsl:
- snack_state[0] = 2
- elif "evening snack" in tsl:
- snack_state[0] = 3
- elif tsl:
- match snack_state[0]:
- case 1:
- morning.append(text.strip())
- case 2:
- afternoon.append(text.strip())
- case 3:
- evening.append(text.strip())
- case _:
- pass
-
- page.extract_text(visitor_text=visitor_2nd_run)
-
- return morning, afternoon, evening
-
-
-def fix_snacks(
- extracted: tuple[list[str], list[str], list[str]]
-) -> list[list[dict[str, str]]]:
- res: list[list[dict[str, str]],] = []
- for snackset in extracted:
- sres = []
- if len(snackset) % 2 == 0:
- pass
- else:
- roasted_bread = False
- actual_snack_set = []
- for p in snackset:
- if p == "Roasted Bread":
- roasted_bread = True
- elif roasted_bread and p == "with Ham and Cheese":
- actual_snack_set.append("Roasted Bread with Ham and Cheese")
- roasted_bread = False
- else:
- actual_snack_set.append(p)
- snackset = actual_snack_set
- for i in range(0, len(snackset), 2):
- sres.append({"en": snackset[i], "zh": snackset[i + 1]})
- res.append(sres)
-
- assert len(res) == 3
- return res
+ else:
+ raise ValueError("No proper attachment found in email")
if __name__ == "__main__":