misc: Added initial codebase.

main
Abigaëlle Martin 2025-09-23 23:20:43 +02:00
parent 17162b9cfd
commit b47996b4c1
2 changed files with 224 additions and 0 deletions

187
htmlparser.py Normal file
View File

@ -0,0 +1,187 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# System packages
import html
import pathlib
from datetime import datetime
from enum import IntEnum
from html.parser import HTMLParser
from typing import List, Tuple, Optional, Set
# Absolute path of the project root (hard-coded).
# NOTE(review): no code visible in this module reads it - confirm it is still needed.
path_project = pathlib.Path("/home/gmartin/Workspace/DiscordPrettyPrinter")
class DiscordRollTok(IntEnum):
    """Enumerate the successive times Discord's tokens were modified.

    Each member is named after the date of the exports that were made with
    the corresponding set of tokens.
    """

    # Exports made on 2023-07-09.
    ED_20230709 = 1
    # Exports made on 2024-06-05.
    ED_20240605 = 2
# Tokens searched for in attribute values to detect the pseudo (username)
# element, keyed by token revision.
PSEUDO_TAG: dict[DiscordRollTok, str] = {
    DiscordRollTok.ED_20230709: "username-h_Y3Us",
    DiscordRollTok.ED_20240605: "username__0b0e7",
}
# Tokens searched for in attribute values to detect the reaction-count
# element, keyed by token revision.
REACTCOUNT_TAG: dict[DiscordRollTok, str] = {
    DiscordRollTok.ED_20230709: "reactionCount-26U4As",
    DiscordRollTok.ED_20240605: "reactionCount__2c34d",
}
# Tokens searched for in attribute values to detect the divider element that
# carries the date, keyed by token revision.
DIVIDER_TAG: dict[DiscordRollTok, str] = {
    DiscordRollTok.ED_20230709: "divider-IqmEqJ",
    DiscordRollTok.ED_20240605: "divider__01aed",
}
def check_tagattr(attrs: List[Tuple[str, Optional[str]]], tag: str) -> bool:
    """Tell whether ``tag`` occurs inside any attribute value of ``attrs``.

    Args:
        attrs: ``(name, value)`` pairs as handed to
            ``HTMLParser.handle_starttag``. The value is ``None`` for an
            attribute written without a value (e.g. ``<li hidden>``).
        tag: Substring to look for, e.g. ``'username-h_Y3Us'``.

    Returns:
        ``True`` if at least one attribute value contains ``tag``; ``False``
        otherwise, including when ``attrs`` is empty.
    """
    # Valueless attributes carry None, which does not support "in": skip them
    # instead of raising TypeError.
    return any(value is not None and tag in value for _, value in attrs)
class DiscordHTMLParser(HTMLParser):
def __init__(self, keep_time: bool, convert_charrefs=True):
super().__init__(convert_charrefs=convert_charrefs)
self.keep_time = keep_time
# Parsed text to be exported.
self.export_txt: str = ""
self.nb_messages: int = 0
self.nb_duplicate: int = 0
# Information about what is being parsed.
self.is_scan_message: bool = False
self.is_scan_pseudo: bool = False
self.is_scan_time: bool = False
self.is_scan_reactioncount: bool = False
self.is_scan_datedivider: bool = False
# When parsing a time element, keep its datetime to print it in the message.
self.current_dt: Optional[datetime] = None
# Keep track of which message and date separator have been added.
self.set_idmessage: Set[str] = set()
self.set_dateseparator: Set[str] = set()
def handle_starttag(self, tag, attrs):
# Add to the export the tags used to format the text.
match tag:
case "em":
if self.is_scan_message:
self.export_txt += "<em>"
case "strong":
if self.is_scan_message:
self.export_txt += "<b>"
case "li":
if check_tagattr(attrs, "chat-messages"):
# Retrieve the id of the message.
id_message: str = ""
for key, value in attrs:
if "chat-messages" in value:
id_message = value.split("chat-messages-")[1]
# If the message has already been seen, don't keep it.
if id_message in self.set_idmessage:
self.nb_duplicate += 1
else:
self.set_idmessage.add(id_message)
self.nb_messages += 1
self.is_scan_message = True
self.export_txt += f"\n<li class=\"message\" id_message=\"{id_message}\">"
case "span":
# Tell whether the pseudo is being scanned or not.
if check_tagattr(attrs, PSEUDO_TAG[DiscordRollTok.ED_20240605]):
self.is_scan_pseudo = True
case "div":
# Tell whether the reaction count is being scanned or not.
if check_tagattr(attrs, REACTCOUNT_TAG[DiscordRollTok.ED_20240605]):
self.is_scan_reactioncount = True
elif check_tagattr(attrs, DIVIDER_TAG[DiscordRollTok.ED_20240605]):
# Retrieve which date separator is being scanned.
date_separator: str = ""
for key, value in attrs:
if key == "aria-label":
date_separator = value
if date_separator not in self.set_dateseparator:
self.set_dateseparator.add(date_separator)
self.is_scan_datedivider = True
case "time":
if self.is_scan_message:
if not check_tagattr(attrs, "Edited"):
self.is_scan_time = True
for key, value in attrs:
if key == "datetime":
self.current_dt = datetime.strptime(value, "%Y-%m-%dT%H:%M:%S.%fZ")
def handle_endtag(self, tag):
# Add to the export the tags used to format the text.
match tag:
case "em":
if self.is_scan_message:
self.export_txt += "</em>"
case "strong":
if self.is_scan_message:
self.export_txt += "</b>"
case "li":
if self.is_scan_message:
self.is_scan_message = False
self.export_txt += "</li>"
case "span":
self.is_scan_pseudo = False
case "div":
self.is_scan_reactioncount = False
self.is_scan_datedivider = False
case "time":
self.is_scan_time = False
self.current_dt = None
def handle_data(self, data):
if self.is_scan_message:
if self.is_scan_reactioncount:
return
elif self.is_scan_time and self.keep_time:
if self.current_dt is not None:
self.export_txt += f"<span class=\"time\">[{self.current_dt.strftime('%Y-%m-%d %H:%M')}]</span> "
self.current_dt = None
elif self.is_scan_pseudo:
pseudo_ansi = ''.join(i for i in data.lower() if ord(i) < 128)
self.export_txt += f"<span class=\"pseudo_{pseudo_ansi}\"><b>"
self.export_txt += data
self.export_txt += "</b></span>"
elif not (data in [""]):
data = data.replace("\n", "\n<br/>")
self.export_txt += data
elif self.is_scan_datedivider:
self.export_txt += f"<div class=\"dateSeparator\">{data}</div>\n"
def export_messages(self) -> str:
before_export: str = ""
after_export: str = ""
before_export += "<html>\n"
before_export += "<head>\n"
before_export += "<link rel=\"stylesheet\" type=\"text/css\" href=\"style.css\" />\n"
before_export += "</head>\n"
before_export += "<body>\n"
before_export += "<ul>\n"
after_export += "</ul>\n"
after_export += "</body>\n"
after_export += "</html>\n"
return before_export + self.export_txt + after_export

37
main.py Normal file
View File

@ -0,0 +1,37 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# System packages
import pathlib
from pathlib import Path
# Project package
from htmlparser import DiscordHTMLParser
# Root folder that holds the "input" and "output" directories (hard-coded).
path_project = pathlib.Path("/home/gmartin/Workspace/DiscordPrettyPrinter")


def main() -> None:
    """Parse one saved conversation and write its simplified HTML export.

    Reads ``input/<folder>/<file>.xml`` under ``path_project``, feeds it to a
    ``DiscordHTMLParser``, prints the message and duplicate counts, and writes
    the result to ``output/<folder>/<file>.html``, creating the output folder
    if needed.
    """
    parser = DiscordHTMLParser(keep_time=False)
    folder_name = "sar"
    file_name = "mp"
    xml_path: pathlib.Path = path_project / "input" / folder_name / (file_name + ".xml")

    # An explicit encoding keeps the result independent of the platform
    # default. NOTE(review): UTF-8 is assumed for the saved pages - confirm.
    parser.feed(xml_path.read_text(encoding="utf-8"))

    print(f"Parsed: {xml_path}")
    print(f"{parser.nb_messages} messages found (and {parser.nb_duplicate} duplicates).")

    # Creation of the output folder if it does not exist.
    path_file_output: Path = (
        path_project / "output" / folder_name / (file_name + ".html")
    )
    path_file_output.parent.mkdir(parents=True, exist_ok=True)

    # Export of the parsed text, formatted into html.
    path_file_output.write_text(parser.export_messages(), encoding="utf-8")


if __name__ == '__main__':
    main()