diff --git a/htmlparser.py b/htmlparser.py new file mode 100644 index 0000000..94bb068 --- /dev/null +++ b/htmlparser.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# System packages +import pathlib +from datetime import datetime +from html.parser import HTMLParser +from typing import List, Tuple, Optional, Set +from enum import IntEnum + +path_project = pathlib.Path("/home/gmartin/Workspace/DiscordPrettyPrinter") + + +class DiscordRollTok(IntEnum): + """ + Identifies each known revision of the Discord tokens, named after the date of the exports it was seen in. + """ + ED_20230709 = 1 # Exports made on 2023-07-09 + ED_20240605 = 2 # Exports made on 2024-06-05 + +# Tokens searched for in attribute values to detect pseudos (usernames), one per token revision. +PSEUDO_TAG: dict[DiscordRollTok, str] = { + DiscordRollTok.ED_20230709: "username-h_Y3Us", + DiscordRollTok.ED_20240605: "username__0b0e7", +} +# Tokens searched for in attribute values to detect reaction counts, one per token revision. +REACTCOUNT_TAG: dict[DiscordRollTok, str] = { + DiscordRollTok.ED_20230709: "reactionCount-26U4As", + DiscordRollTok.ED_20240605: "reactionCount__2c34d", +} +# Tokens searched for in attribute values to detect the date divider, one per token revision. +DIVIDER_TAG: dict[DiscordRollTok, str] = { + DiscordRollTok.ED_20230709: "divider-IqmEqJ", + DiscordRollTok.ED_20240605: "divider__01aed", +} + + +def check_tagattr(attrs: List[Tuple[str, str]], tag: str) -> bool: + """Return True if the given tag, e.g. 'username-h_Y3Us', is a substring of at least one attribute value in attrs; return False otherwise, including when attrs is empty. + """ + attrs_keyvalues: List[str] = list(zip(*attrs)) # Transposes the (key, value) pairs: index 0 holds the keys, index 1 the values. + if len(attrs_keyvalues): + for attr_value in attrs_keyvalues[1]: + if tag in attr_value: # NOTE(review): html.parser reports None as the value of a valueless attribute, and this test would raise TypeError on it; confirm such attributes cannot occur. + return True + return False + + +class DiscordHTMLParser(HTMLParser): # HTMLParser subclass that accumulates the parsed content in export_txt. + + def __init__(self, keep_time: bool, convert_charrefs=True): + super().__init__(convert_charrefs=convert_charrefs) + + self.keep_time = keep_time # When true, handle_data writes the parsed timestamps into the export. + + # Parsed text to be exported, with the message and duplicate counters. + self.export_txt: str = "" + self.nb_messages: int = 0 + self.nb_duplicate: int = 0 + + # Flags telling which kind of element is currently being parsed. 
+ self.is_scan_message: bool = False + self.is_scan_pseudo: bool = False + self.is_scan_time: bool = False + self.is_scan_reactioncount: bool = False + self.is_scan_datedivider: bool = False + + # When parsing a time element, keep its datetime to print it in the message. + self.current_dt: Optional[datetime] = None + + # Keep track of which messages and date separators have already been added. + self.set_idmessage: Set[str] = set() + self.set_dateseparator: Set[str] = set() + + def handle_starttag(self, tag, attrs): + + # Update the scanning flags and the export according to the opening tag. + match tag: + case "em": + if self.is_scan_message: + self.export_txt += "" # NOTE(review): empty literal; the markup it held was presumably lost in extraction, confirm against the repository. + case "strong": + if self.is_scan_message: + self.export_txt += "" + case "li": + if check_tagattr(attrs, "chat-messages"): + + # Retrieve the id of the message. + id_message: str = "" + for key, value in attrs: + if "chat-messages" in value: + id_message = value.split("chat-messages-")[1] + + # If the message has already been seen, don't keep it. + if id_message in self.set_idmessage: + self.nb_duplicate += 1 + else: + self.set_idmessage.add(id_message) + + self.nb_messages += 1 + self.is_scan_message = True + self.export_txt += f"\n
  • " # NOTE(review): this literal is split by a raw line break, so its original content is presumably damaged; confirm against the repository. + case "span": + # Start scanning the pseudo when an attribute value holds the pseudo token (only the ED_20240605 revision is checked). + if check_tagattr(attrs, PSEUDO_TAG[DiscordRollTok.ED_20240605]): + self.is_scan_pseudo = True + case "div": + # Start scanning the reaction count when an attribute value holds its token (only the ED_20240605 revision is checked). + if check_tagattr(attrs, REACTCOUNT_TAG[DiscordRollTok.ED_20240605]): + self.is_scan_reactioncount = True + elif check_tagattr(attrs, DIVIDER_TAG[DiscordRollTok.ED_20240605]): + + # Retrieve which date separator is being scanned from its aria-label attribute. + date_separator: str = "" + for key, value in attrs: + if key == "aria-label": + date_separator = value + + if date_separator not in self.set_dateseparator: + self.set_dateseparator.add(date_separator) + self.is_scan_datedivider = True + case "time": + if self.is_scan_message: + if not check_tagattr(attrs, "Edited"): # Time elements with an attribute value containing "Edited" are ignored. + self.is_scan_time = True + for key, value in attrs: + if key == "datetime": + self.current_dt = datetime.strptime(value, "%Y-%m-%dT%H:%M:%S.%fZ") # Yields a naive datetime; the trailing Z is matched literally. + + def handle_endtag(self, tag): + + # Update the scanning flags and the export according to the closing tag. + match tag: + case "em": + if self.is_scan_message: + self.export_txt += "" + case "strong": + if self.is_scan_message: + self.export_txt += "" + case "li": + if self.is_scan_message: + self.is_scan_message = False + self.export_txt += "
  • " # NOTE(review): literal split by a raw line break, presumably damaged; confirm against the repository. + case "span": # Any closing span ends the pseudo scan. + self.is_scan_pseudo = False + case "div": # Any closing div ends both the reaction count scan and the date divider scan. + self.is_scan_reactioncount = False + self.is_scan_datedivider = False + case "time": + self.is_scan_time = False + self.current_dt = None + + def handle_data(self, data): # Appends the text node to the export according to the current scanning flags. + + if self.is_scan_message: + if self.is_scan_reactioncount: + return # Reaction counts are not exported. + elif self.is_scan_time and self.keep_time: + if self.current_dt is not None: + self.export_txt += f"[{self.current_dt.strftime('%Y-%m-%d %H:%M')}] " + self.current_dt = None + elif self.is_scan_pseudo: + pseudo_ansi = ''.join(i for i in data.lower() if ord(i) < 128) # Lowercased pseudo restricted to characters whose code point is below 128. + self.export_txt += f"" # NOTE(review): pseudo_ansi is not used in the visible text; this f-string presumably interpolated it before its content was lost, confirm. + self.export_txt += data + self.export_txt += "" + elif not (data in [""]): + data = data.replace("\n", "\n
    ") + self.export_txt += data + elif self.is_scan_datedivider: + self.export_txt += f"
    {data}
    \n" + + def export_messages(self) -> str: # Returns the accumulated export placed between a fixed prefix and a fixed suffix. + before_export: str = "" + after_export: str = "" + + before_export += "\n" + before_export += "\n" + before_export += "\n" + before_export += "\n" + before_export += "\n" + before_export += "\n" + after_export += "\n" + after_export += "\n" + return before_export + self.export_txt + after_export diff --git a/main.py b/main.py new file mode 100644 index 0000000..e9b7862 --- /dev/null +++ b/main.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +# System packages +import pathlib +from pathlib import Path + +# Project package +from htmlparser import DiscordHTMLParser + +path_project = pathlib.Path("/home/gmartin/Workspace/DiscordPrettyPrinter") + +if __name__ == '__main__': + parser = DiscordHTMLParser(keep_time=False) + folder_name = "sar" + file_name = "mp" + + xml_path: pathlib.Path = path_project / "input" / folder_name / (file_name + ".xml") + + with open(xml_path, "r") as file_xml: # NOTE(review): no explicit encoding, so the platform default is used; confirm it matches the exports. + xml_str = "" + for line in file_xml: + xml_str += line + parser.feed(xml_str) + # print(parser.export_messages()) + print(f"Parsed: {xml_path}") + print(f"{parser.nb_messages} messages found (and {parser.nb_duplicate} duplicates).") + + # Create the output folder if it does not exist. + path_file_output: Path = ( + path_project / "output" / folder_name / (file_name + ".html") + ) + path_file_output.parent.mkdir(parents=True, exist_ok=True) + + # Write the text returned by export_messages to the output file. + with open(path_file_output, "w") as file_htmloutput: + file_htmloutput.write(parser.export_messages())