#!/usr/bin/env python
# -*- coding: utf-8 -*-
# System packages
import pathlib
from datetime import datetime
from html.parser import HTMLParser
from typing import List, Tuple, Optional, Set
from enum import IntEnum
# NOTE(review): absolute path hard-coded under one user's home directory
# (/home/gmartin); presumably it must be adapted on any other machine.
path_project = pathlib.Path("/home/gmartin/Workspace/DiscordPrettyPrinter")
class DiscordRollTok(IntEnum):
    """Enumerate the successive revisions of the Discord tokens.

    Each member stands for one revision and is named after the date on which
    the corresponding exports were made.
    """

    # Exports made on 2023-07-09.
    ED_20230709 = 1
    # Exports made on 2024-06-05.
    ED_20240605 = 2
# Token used to recognise pseudos, for each revision of the Discord tokens.
PSEUDO_TAG: dict[DiscordRollTok, str] = {
    DiscordRollTok.ED_20230709: "username-h_Y3Us",
    DiscordRollTok.ED_20240605: "username__0b0e7",
}

# Token used to recognise reaction counts, for each revision.
REACTCOUNT_TAG: dict[DiscordRollTok, str] = {
    DiscordRollTok.ED_20230709: "reactionCount-26U4As",
    DiscordRollTok.ED_20240605: "reactionCount__2c34d",
}

# Token used to recognise the divider and its date, for each revision.
DIVIDER_TAG: dict[DiscordRollTok, str] = {
    DiscordRollTok.ED_20230709: "divider-IqmEqJ",
    DiscordRollTok.ED_20240605: "divider__01aed",
}
def check_tagattr(attrs: List[Tuple[str, Optional[str]]], tag: str) -> bool:
    """Return True if *tag* occurs inside the value of at least one attribute.

    Only attribute values are inspected, never attribute names. The match is
    a substring test, so 'username-h_Y3Us' is found in the value
    'username-h_Y3Us desaturate'.

    Args:
        attrs: (name, value) pairs as received by ``handle_starttag``. The
            value is None for an attribute written without a value
            (e.g. ``<li hidden>``); such attributes never match.
        tag: Text to look for, e.g. 'username-h_Y3Us'.

    Returns:
        True when some attribute value contains *tag*, False otherwise
        (including when *attrs* is empty).
    """
    # Valueless attributes carry None, on which `in` would raise TypeError.
    return any(value is not None and tag in value for _, value in attrs)
class DiscordHTMLParser(HTMLParser):
def __init__(self, keep_time: bool, convert_charrefs=True):
    """Start with an empty export, zeroed counters and all scan flags off.

    Args:
        keep_time: Stored on the instance; when true, the datetime of a
            parsed time element is written into the export.
        convert_charrefs: Passed through unchanged to ``HTMLParser``.
    """
    super().__init__(convert_charrefs=convert_charrefs)
    self.keep_time = keep_time

    # Text accumulated for the export, with the number of messages kept
    # and the number of messages skipped because they were already seen.
    self.export_txt: str = ""
    self.nb_messages: int = 0
    self.nb_duplicate: int = 0

    # Flags describing which kind of element is currently being scanned.
    self.is_scan_message: bool = False
    self.is_scan_pseudo: bool = False
    self.is_scan_time: bool = False
    self.is_scan_reactioncount: bool = False
    self.is_scan_datedivider: bool = False

    # Datetime of the time element being parsed, remembered so that it can
    # be printed in the message.
    self.current_dt: Optional[datetime] = None

    # Message ids and date separators that have already been added.
    self.set_idmessage: Set[str] = set()
    self.set_dateseparator: Set[str] = set()
def handle_starttag(self, tag, attrs):
# Dispatches on the tag name; `attrs` is iterated below as (key, value) pairs.
# NOTE(review): this body looks damaged in the text under review. Several
# literals appended to export_txt are empty strings, the "strong" and "li"
# cases appear twice, and the second group of cases (with "span", "div" and
# "time") only resets scanning state, which reads like end-tag handling.
# Markup strings and possibly a method header may be missing; the nesting of
# the statements cannot be told from this text either. Confirm against the
# original file before changing any code here.
# Add to the export the tags used to format the text.
match tag:
case "em":
if self.is_scan_message:
self.export_txt += ""
case "strong":
if self.is_scan_message:
self.export_txt += ""
case "li":
# Only elements with an attribute value containing "chat-messages" are handled.
if check_tagattr(attrs, "chat-messages"):
# Retrieve the id of the message.
# The id is whatever follows "chat-messages-" in the matching attribute value.
id_message: str = ""
for key, value in attrs:
if "chat-messages" in value:
id_message = value.split("chat-messages-")[1]
# If the message has already been seen, don't keep it.
if id_message in self.set_idmessage:
self.nb_duplicate += 1
else:
# First occurrence of this id: remember it and count the message.
self.set_idmessage.add(id_message)
self.nb_messages += 1
self.is_scan_message = True
self.export_txt += f"\n"
# NOTE(review): second "strong" case; presumably belongs to end-tag handling.
case "strong":
if self.is_scan_message:
self.export_txt += ""
# NOTE(review): second "li" case; it turns message scanning off.
case "li":
if self.is_scan_message:
self.is_scan_message = False
self.export_txt += ""
# The remaining cases only clear scanning flags (and the stored datetime).
case "span":
self.is_scan_pseudo = False
case "div":
self.is_scan_reactioncount = False
self.is_scan_datedivider = False
case "time":
self.is_scan_time = False
self.current_dt = None
def handle_data(self, data):
if self.is_scan_message:
if self.is_scan_reactioncount:
return
elif self.is_scan_time and self.keep_time:
if self.current_dt is not None:
self.export_txt += f"[{self.current_dt.strftime('%Y-%m-%d %H:%M')}] "
self.current_dt = None
elif self.is_scan_pseudo:
pseudo_ansi = ''.join(i for i in data.lower() if ord(i) < 128)
self.export_txt += f""
self.export_txt += data
self.export_txt += ""
elif not (data in [""]):
data = data.replace("\n", "\n
")
self.export_txt += data
elif self.is_scan_datedivider:
self.export_txt += f"