#!/usr/bin/env python # -*- coding: utf-8 -*- # System packages import pathlib from datetime import datetime from html.parser import HTMLParser from typing import List, Tuple, Optional, Set from enum import IntEnum path_project = pathlib.Path("/home/gmartin/Workspace/DiscordPrettyPrinter") class DiscordRollTok(IntEnum): """ List the different times discords tokens has been modified. """ ED_20230709 = 1 # Exports réalisés le 2023-07-09 ED_20240605 = 2 # Exports réalisés le 2024-06-05 # Tokens used to parse pseudos. PSEUDO_TAG: dict[DiscordRollTok, str] = { DiscordRollTok.ED_20230709: "username-h_Y3Us", DiscordRollTok.ED_20240605: "username__0b0e7", } # Tokens used to parse reaction counts. REACTCOUNT_TAG: dict[DiscordRollTok, str] = { DiscordRollTok.ED_20230709: "reactionCount-26U4As", DiscordRollTok.ED_20240605: "reactionCount__2c34d", } # Tokens used to parse the divider and the date. DIVIDER_TAG: dict[DiscordRollTok, str] = { DiscordRollTok.ED_20230709: "divider-IqmEqJ", DiscordRollTok.ED_20240605: "divider__01aed", } def check_tagattr(attrs: List[Tuple[str, str]], tag: str) -> bool: """Checks if the given tag, e.g. 'username-h_Y3Us', is inside the list of attributes values. """ attrs_keyvalues: List[str] = list(zip(*attrs)) if len(attrs_keyvalues): for attr_value in attrs_keyvalues[1]: if tag in attr_value: return True return False class DiscordHTMLParser(HTMLParser): def __init__(self, keep_time: bool, convert_charrefs=True): super().__init__(convert_charrefs=convert_charrefs) self.keep_time = keep_time # Parsed text to be exported. self.export_txt: str = "" self.nb_messages: int = 0 self.nb_duplicate: int = 0 # Information about what is being parsed. self.is_scan_message: bool = False self.is_scan_pseudo: bool = False self.is_scan_time: bool = False self.is_scan_reactioncount: bool = False self.is_scan_datedivider: bool = False # When parsing a time element, keep its datetime to print it in the message. self.current_dt: Optional[datetime] = None # Keep track of which message and date separator have been added. self.set_idmessage: Set[str] = set() self.set_dateseparator: Set[str] = set() def handle_starttag(self, tag, attrs): # Add to the export the tags used to format the text. match tag: case "em": if self.is_scan_message: self.export_txt += "" case "strong": if self.is_scan_message: self.export_txt += "" case "li": if check_tagattr(attrs, "chat-messages"): # Retrieve the id of the message. id_message: str = "" for key, value in attrs: if "chat-messages" in value: id_message = value.split("chat-messages-")[1] # If the message has already been seen, don't keep it. if id_message in self.set_idmessage: self.nb_duplicate += 1 else: self.set_idmessage.add(id_message) self.nb_messages += 1 self.is_scan_message = True self.export_txt += f"\n
  • " case "span": # Tell whether the pseudo is being scanned or not. if check_tagattr(attrs, PSEUDO_TAG[DiscordRollTok.ED_20240605]): self.is_scan_pseudo = True case "div": # Tell whether the reaction count is being scanned or not. if check_tagattr(attrs, REACTCOUNT_TAG[DiscordRollTok.ED_20240605]): self.is_scan_reactioncount = True elif check_tagattr(attrs, DIVIDER_TAG[DiscordRollTok.ED_20240605]): # Retrieve which date separator is being scanned. date_separator: str = "" for key, value in attrs: if key == "aria-label": date_separator = value if date_separator not in self.set_dateseparator: self.set_dateseparator.add(date_separator) self.is_scan_datedivider = True case "time": if self.is_scan_message: if not check_tagattr(attrs, "Edited"): self.is_scan_time = True for key, value in attrs: if key == "datetime": self.current_dt = datetime.strptime(value, "%Y-%m-%dT%H:%M:%S.%fZ") def handle_endtag(self, tag): # Add to the export the tags used to format the text. match tag: case "em": if self.is_scan_message: self.export_txt += "" case "strong": if self.is_scan_message: self.export_txt += "" case "li": if self.is_scan_message: self.is_scan_message = False self.export_txt += "
  • " case "span": self.is_scan_pseudo = False case "div": self.is_scan_reactioncount = False self.is_scan_datedivider = False case "time": self.is_scan_time = False self.current_dt = None def handle_data(self, data): if self.is_scan_message: if self.is_scan_reactioncount: return elif self.is_scan_time and self.keep_time: if self.current_dt is not None: self.export_txt += f"[{self.current_dt.strftime('%Y-%m-%d %H:%M')}] " self.current_dt = None elif self.is_scan_pseudo: pseudo_ansi = ''.join(i for i in data.lower() if ord(i) < 128) self.export_txt += f"" self.export_txt += data self.export_txt += "" elif not (data in [""]): data = data.replace("\n", "\n
    ") self.export_txt += data elif self.is_scan_datedivider: self.export_txt += f"
    {data}
    \n" def export_messages(self) -> str: before_export: str = "" after_export: str = "" before_export += "\n" before_export += "\n" before_export += "\n" before_export += "\n" before_export += "\n" before_export += "\n" after_export += "\n" after_export += "\n" return before_export + self.export_txt + after_export