misc: Added initial codebase.

2025-09-23 23:20:43 +02:00 · 2025-09-23 23:20:43 +02:00 · b47996b4c1
parent 17162b9cfd
commit b47996b4c1
2 changed files with 224 additions and 0 deletions
--- a/htmlparser.py
+++ b/htmlparser.py
@ -0,0 +1,187 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 # System packages
 import html
 import pathlib
 from datetime import datetime
 from enum import IntEnum
 from html.parser import HTMLParser
 from typing import List, Tuple, Optional, Set
 # Hard-coded absolute path of the project root directory.
 # NOTE(review): not referenced by any other code visible in this module — confirm it is still needed.
 path_project = pathlib.Path("/home/gmartin/Workspace/DiscordPrettyPrinter")
 class DiscordRollTok(IntEnum):
    """
    List the revisions of Discord's markup tokens known to this module.

    There is one member per date at which exports were made; the members are
    the keys of the token lookup tables defined in this module.
    """
    ED_20230709 = 1  # Exports made on 2023-07-09
    ED_20240605 = 2  # Exports made on 2024-06-05
 # Tokens used to parse pseudos (user names): the substring searched for in
 # the attribute values of <span> tags, one entry per token revision.
 PSEUDO_TAG: dict[DiscordRollTok, str] = {
    DiscordRollTok.ED_20230709: "username-h_Y3Us",
    DiscordRollTok.ED_20240605: "username__0b0e7",
 }
 # Tokens used to parse reaction counts: the substring searched for in the
 # attribute values of <div> tags, one entry per token revision.
 REACTCOUNT_TAG: dict[DiscordRollTok, str] = {
    DiscordRollTok.ED_20230709: "reactionCount-26U4As",
    DiscordRollTok.ED_20240605: "reactionCount__2c34d",
 }
 # Tokens used to parse the divider and the date: the substring searched for
 # in the attribute values of <div> tags, one entry per token revision.
 DIVIDER_TAG: dict[DiscordRollTok, str] = {
    DiscordRollTok.ED_20230709: "divider-IqmEqJ",
    DiscordRollTok.ED_20240605: "divider__01aed",
 }
 def check_tagattr(attrs: List[Tuple[str, Optional[str]]], tag: str) -> bool:
    """Tell whether ``tag`` occurs inside any attribute value of an element.

    Args:
        attrs: ``(name, value)`` pairs as received by
            ``HTMLParser.handle_starttag``. The value is ``None`` for an
            attribute written without a value (e.g. ``<div hidden>``).
        tag: Substring to look for, e.g. ``'username-h_Y3Us'``.

    Returns:
        True if at least one attribute value contains ``tag``; False
        otherwise, including when ``attrs`` is empty.
    """
    # Valueless attributes carry None, on which the ``in`` test would raise
    # TypeError, so they are skipped explicitly.
    return any(value is not None and tag in value for _name, value in attrs)
 class DiscordHTMLParser(HTMLParser):
    def __init__(self, keep_time: bool, convert_charrefs=True):
        super().__init__(convert_charrefs=convert_charrefs)
        self.keep_time = keep_time
        # Parsed text to be exported.
        self.export_txt: str = ""
        self.nb_messages: int = 0
        self.nb_duplicate: int = 0
        # Information about what is being parsed.
        self.is_scan_message: bool = False
        self.is_scan_pseudo: bool = False
        self.is_scan_time: bool = False
        self.is_scan_reactioncount: bool = False
        self.is_scan_datedivider: bool = False
        # When parsing a time element, keep its datetime to print it in the message.
        self.current_dt: Optional[datetime] = None
        # Keep track of which message and date separator have been added.
        self.set_idmessage: Set[str] = set()
        self.set_dateseparator: Set[str] = set()
    def handle_starttag(self, tag, attrs):
        # Add to the export the tags used to format the text.
        match tag:
            case "em":
                if self.is_scan_message:
                    self.export_txt += "<em>"
            case "strong":
                if self.is_scan_message:
                    self.export_txt += "<b>"
            case "li":
                if check_tagattr(attrs, "chat-messages"):
                    # Retrieve the id of the message.
                    id_message: str = ""
                    for key, value in attrs:
                        if "chat-messages" in value:
                            id_message = value.split("chat-messages-")[1]
                    # If the message has already been seen, don't keep it.
                    if id_message in self.set_idmessage:
                        self.nb_duplicate += 1
                    else:
                        self.set_idmessage.add(id_message)
                        self.nb_messages += 1
                        self.is_scan_message = True
                        self.export_txt += f"\n<li class=\"message\" id_message=\"{id_message}\">"
            case "span":
                # Tell whether the pseudo is being scanned or not.
                if check_tagattr(attrs, PSEUDO_TAG[DiscordRollTok.ED_20240605]):
                    self.is_scan_pseudo = True
            case "div":
                # Tell whether the reaction count is being scanned or not.
                if check_tagattr(attrs, REACTCOUNT_TAG[DiscordRollTok.ED_20240605]):
                    self.is_scan_reactioncount = True
                elif check_tagattr(attrs, DIVIDER_TAG[DiscordRollTok.ED_20240605]):
                    # Retrieve which date separator is being scanned.
                    date_separator: str = ""
                    for key, value in attrs:
                        if key == "aria-label":
                            date_separator = value
                    if date_separator not in self.set_dateseparator:
                        self.set_dateseparator.add(date_separator)
                        self.is_scan_datedivider = True
            case "time":
                if self.is_scan_message:
                    if not check_tagattr(attrs, "Edited"):
                        self.is_scan_time = True
                        for key, value in attrs:
                            if key == "datetime":
                                self.current_dt = datetime.strptime(value, "%Y-%m-%dT%H:%M:%S.%fZ")
    def handle_endtag(self, tag):
        # Add to the export the tags used to format the text.
        match tag:
            case "em":
                if self.is_scan_message:
                    self.export_txt += "</em>"
            case "strong":
                if self.is_scan_message:
                    self.export_txt += "</b>"
            case "li":
                if self.is_scan_message:
                    self.is_scan_message = False
                    self.export_txt += "</li>"
            case "span":
                self.is_scan_pseudo = False
            case "div":
                self.is_scan_reactioncount = False
                self.is_scan_datedivider = False
            case "time":
                self.is_scan_time = False
                self.current_dt = None
    def handle_data(self, data):
        if self.is_scan_message:
            if self.is_scan_reactioncount:
                return
            elif self.is_scan_time and self.keep_time:
                if self.current_dt is not None:
                    self.export_txt += f"<span class=\"time\">[{self.current_dt.strftime('%Y-%m-%d %H:%M')}]</span> "
                    self.current_dt = None
            elif self.is_scan_pseudo:
                pseudo_ansi = ''.join(i for i in data.lower() if ord(i) < 128)
                self.export_txt += f"<span class=\"pseudo_{pseudo_ansi}\"><b>"
                self.export_txt += data
                self.export_txt += "</b></span>"
            elif not (data in [""]):
                data = data.replace("\n", "\n<br/>")
                self.export_txt += data
        elif self.is_scan_datedivider:
            self.export_txt += f"<div class=\"dateSeparator\">{data}</div>\n"
    def export_messages(self) -> str:
        before_export: str = ""
        after_export: str = ""
        before_export += "<html>\n"
        before_export += "<head>\n"
        before_export += "<link rel=\"stylesheet\" type=\"text/css\" href=\"style.css\" />\n"
        before_export += "</head>\n"
        before_export += "<body>\n"
        before_export += "<ul>\n"
        after_export += "</ul>\n"
        after_export += "</body>\n"
        after_export += "</html>\n"
        return before_export + self.export_txt + after_export
--- a/main.py
+++ b/main.py
@ -0,0 +1,37 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 # System packages
 import pathlib
 from pathlib import Path
 # Project package
 from htmlparser import DiscordHTMLParser
 # Hard-coded absolute path of the project root; the input and output folders
 # are resolved relative to it.
 path_project = pathlib.Path("/home/gmartin/Workspace/DiscordPrettyPrinter")
 if __name__ == '__main__':
    # Convert <project>/input/<folder>/<file>.xml into
    # <project>/output/<folder>/<file>.html.
    parser = DiscordHTMLParser(keep_time=False)
    folder_name = "sar"
    file_name = "mp"
    xml_path: pathlib.Path = path_project / "input" / folder_name / (file_name + ".xml")
    # Read the whole input at once, with an explicit encoding so that the
    # result does not depend on the platform's default one.
    xml_str = xml_path.read_text(encoding="utf-8")
    parser.feed(xml_str)
    print(f"Parsed: {xml_path}")
    print(f"{parser.nb_messages} messages found (and {parser.nb_duplicate} duplicates).")
    # Creation of the output folder if it does not exist.
    path_file_output: Path = (
        path_project / "output" / folder_name / (file_name + ".html")
    )
    path_file_output.parent.mkdir(parents=True, exist_ok=True)
    # Export of the parsed text and formatted into html.
    path_file_output.write_text(parser.export_messages(), encoding="utf-8")