misc: Added initial codebase.

parent 17162b9cfd
commit b47996b4c1

@@ -0,0 +1,187 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# System packages
import pathlib
from datetime import datetime
from html.parser import HTMLParser
from typing import List, Tuple, Optional, Set
from enum import IntEnum

path_project = pathlib.Path("/home/gmartin/Workspace/DiscordPrettyPrinter")


class DiscordRollTok(IntEnum):
    """
    Lists the different times Discord's class-name tokens have been modified.
    """
    ED_20230709 = 1  # Exports made on 2023-07-09
    ED_20240605 = 2  # Exports made on 2024-06-05

# Tokens used to parse pseudos.
PSEUDO_TAG: dict[DiscordRollTok, str] = {
    DiscordRollTok.ED_20230709: "username-h_Y3Us",
    DiscordRollTok.ED_20240605: "username__0b0e7",
}
# Tokens used to parse reaction counts.
REACTCOUNT_TAG: dict[DiscordRollTok, str] = {
    DiscordRollTok.ED_20230709: "reactionCount-26U4As",
    DiscordRollTok.ED_20240605: "reactionCount__2c34d",
}
# Tokens used to parse the divider and the date.
DIVIDER_TAG: dict[DiscordRollTok, str] = {
    DiscordRollTok.ED_20230709: "divider-IqmEqJ",
    DiscordRollTok.ED_20240605: "divider__01aed",
}


def check_tagattr(attrs: List[Tuple[str, str]], tag: str) -> bool:
    """Check whether the given token, e.g. 'username-h_Y3Us', appears in any of the attribute values."""
    attrs_keyvalues: List[Tuple[str, ...]] = list(zip(*attrs))
    if attrs_keyvalues:
        for attr_value in attrs_keyvalues[1]:
            if tag in attr_value:
                return True
    return False


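As an illustration (not something the commit adds), a quick sketch of how check_tagattr sees the (key, value) attribute pairs that HTMLParser passes to handle_starttag; the attribute values are invented, and the import assumes the file above is the htmlparser module that the second script below imports:

from htmlparser import check_tagattr

attrs = [("class", "username__0b0e7 someOtherClass"), ("id", "message-username-123")]
print(check_tagattr(attrs, "username__0b0e7"))       # True: the token appears in an attribute value
print(check_tagattr(attrs, "reactionCount__2c34d"))  # False: no attribute value contains the token
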
class DiscordHTMLParser(HTMLParser):

    def __init__(self, keep_time: bool, convert_charrefs=True):
        super().__init__(convert_charrefs=convert_charrefs)

        self.keep_time = keep_time

        # Parsed text to be exported.
        self.export_txt: str = ""
        self.nb_messages: int = 0
        self.nb_duplicate: int = 0

        # Information about what is being parsed.
        self.is_scan_message: bool = False
        self.is_scan_pseudo: bool = False
        self.is_scan_time: bool = False
        self.is_scan_reactioncount: bool = False
        self.is_scan_datedivider: bool = False

        # When parsing a time element, keep its datetime to print it in the message.
        self.current_dt: Optional[datetime] = None

        # Keep track of which messages and date separators have already been added.
        self.set_idmessage: Set[str] = set()
        self.set_dateseparator: Set[str] = set()

    def handle_starttag(self, tag, attrs):

        # Add to the export the tags used to format the text.
        match tag:
            case "em":
                if self.is_scan_message:
                    self.export_txt += "<em>"
            case "strong":
                if self.is_scan_message:
                    self.export_txt += "<b>"
            case "li":
                if check_tagattr(attrs, "chat-messages"):

                    # Retrieve the id of the message.
                    id_message: str = ""
                    for key, value in attrs:
                        if "chat-messages" in value:
                            id_message = value.split("chat-messages-")[1]

                    # If the message has already been seen, don't keep it.
                    if id_message in self.set_idmessage:
                        self.nb_duplicate += 1
                    else:
                        self.set_idmessage.add(id_message)

                        self.nb_messages += 1
                        self.is_scan_message = True
                        self.export_txt += f"\n<li class=\"message\" id_message=\"{id_message}\">"
            case "span":
                # Tell whether the pseudo is being scanned or not.
                if check_tagattr(attrs, PSEUDO_TAG[DiscordRollTok.ED_20240605]):
                    self.is_scan_pseudo = True
            case "div":
                # Tell whether the reaction count is being scanned or not.
                if check_tagattr(attrs, REACTCOUNT_TAG[DiscordRollTok.ED_20240605]):
                    self.is_scan_reactioncount = True
                elif check_tagattr(attrs, DIVIDER_TAG[DiscordRollTok.ED_20240605]):

                    # Retrieve which date separator is being scanned.
                    date_separator: str = ""
                    for key, value in attrs:
                        if key == "aria-label":
                            date_separator = value

                    if date_separator not in self.set_dateseparator:
                        self.set_dateseparator.add(date_separator)
                        self.is_scan_datedivider = True
            case "time":
                if self.is_scan_message:
                    if not check_tagattr(attrs, "Edited"):
                        self.is_scan_time = True
                        for key, value in attrs:
                            if key == "datetime":
                                self.current_dt = datetime.strptime(value, "%Y-%m-%dT%H:%M:%S.%fZ")

    def handle_endtag(self, tag):

        # Add to the export the tags used to format the text.
        match tag:
            case "em":
                if self.is_scan_message:
                    self.export_txt += "</em>"
            case "strong":
                if self.is_scan_message:
                    self.export_txt += "</b>"
            case "li":
                if self.is_scan_message:
                    self.is_scan_message = False
                    self.export_txt += "</li>"
            case "span":
                self.is_scan_pseudo = False
            case "div":
                self.is_scan_reactioncount = False
                self.is_scan_datedivider = False
            case "time":
                self.is_scan_time = False
                self.current_dt = None

    def handle_data(self, data):

        if self.is_scan_message:
            if self.is_scan_reactioncount:
                return
            elif self.is_scan_time and self.keep_time:
                if self.current_dt is not None:
                    self.export_txt += f"<span class=\"time\">[{self.current_dt.strftime('%Y-%m-%d %H:%M')}]</span> "
                    self.current_dt = None
            elif self.is_scan_pseudo:
                # Keep only ASCII characters so the pseudo can be used in a CSS class name.
                pseudo_ansi = ''.join(i for i in data.lower() if ord(i) < 128)
                self.export_txt += f"<span class=\"pseudo_{pseudo_ansi}\"><b>"
                self.export_txt += data
                self.export_txt += "</b></span>"
            elif data != "":
                data = data.replace("\n", "\n<br/>")
                self.export_txt += data
        elif self.is_scan_datedivider:
            self.export_txt += f"<div class=\"dateSeparator\">{data}</div>\n"

    def export_messages(self) -> str:
        before_export: str = ""
        after_export: str = ""

        before_export += "<html>\n"
        before_export += "<head>\n"
        before_export += "<link rel=\"stylesheet\" type=\"text/css\" href=\"style.css\" />\n"
        before_export += "</head>\n"
        before_export += "<body>\n"
        before_export += "<ul>\n"

        after_export += "</ul>\n"
        after_export += "</body>\n"
        after_export += "</html>\n"
        return before_export + self.export_txt + after_export

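A minimal sketch of how the parser could be exercised on its own; the HTML fragment below only imitates the 2024-06-05 export layout (tags and text are invented for illustration), and the import again assumes the file above is the htmlparser module:

from htmlparser import DiscordHTMLParser

fragment = (
    '<li id="chat-messages-111-222">'
    '<span class="username__0b0e7">Alice</span>'
    '<time datetime="2024-06-05T12:34:56.789000Z">Today at 12:34</time>'
    'Hello world!'
    '</li>'
)
parser = DiscordHTMLParser(keep_time=True)
parser.feed(fragment)
print(parser.export_messages())  # the message rewrapped in the <html>/<ul> skeleton built by export_messages()
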
@@ -0,0 +1,37 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# System packages
import pathlib
from pathlib import Path

# Project package
from htmlparser import DiscordHTMLParser

path_project = pathlib.Path("/home/gmartin/Workspace/DiscordPrettyPrinter")

if __name__ == '__main__':
    parser = DiscordHTMLParser(keep_time=False)
    folder_name = "sar"
    file_name = "mp"

    xml_path: pathlib.Path = path_project / "input" / folder_name / (file_name + ".xml")

    with open(xml_path, "r") as file_xml:
        xml_str = file_xml.read()
    parser.feed(xml_str)
    # print(parser.export_messages())
    print(f"Parsed: {xml_path}")
    print(f"{parser.nb_messages} messages found (and {parser.nb_duplicate} duplicates).")

    # Create the output folder if it does not exist.
    path_file_output: Path = (
        path_project / "output" / folder_name / (file_name + ".html")
    )
    path_file_output.parent.mkdir(parents=True, exist_ok=True)

    # Export the parsed text, formatted as HTML.
    with open(path_file_output, "w") as file_htmloutput:
        file_htmloutput.write(parser.export_messages())
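The script above converts a single hard-coded export; as a hedged sketch (an assumption about how the project might be used, not part of this commit), the same parser could sweep every .xml file under input/ and mirror the folder layout under output/:

import pathlib
from htmlparser import DiscordHTMLParser

path_project = pathlib.Path("/home/gmartin/Workspace/DiscordPrettyPrinter")

for xml_path in sorted((path_project / "input").rglob("*.xml")):
    parser = DiscordHTMLParser(keep_time=False)  # one parser per file, so the counters start at zero
    parser.feed(xml_path.read_text())
    rel = xml_path.relative_to(path_project / "input").with_suffix(".html")
    path_output = path_project / "output" / rel
    path_output.parent.mkdir(parents=True, exist_ok=True)  # create the output folder if needed
    path_output.write_text(parser.export_messages())
    print(f"{xml_path} -> {path_output}: {parser.nb_messages} messages, {parser.nb_duplicate} duplicates")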