-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathhtml_to_plain_text.py
More file actions
107 lines (81 loc) · 2.93 KB
/
html_to_plain_text.py
File metadata and controls
107 lines (81 loc) · 2.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# ruff: noqa: D102
from dataclasses import dataclass
from functools import cache
from html.parser import HTMLParser
@dataclass
class OrderedList:
    """State of an `<ol>` which is currently open."""

    # The number to print in front of the next list item.
    num: int = 1
@dataclass
class UnorderedList:
    """State of a `<ul>` which is currently open. Carries no data, it only marks the list type."""
class PlainTextHTMLConverter(HTMLParser):
plain_text: str
list_item_stack: list[OrderedList | UnorderedList]
def __init__(self) -> None:
super().__init__()
self.plain_text = ""
self.list_item_stack = []
def handle_data(self, data: str) -> None:
self.plain_text += data
def handle_starttag( # noqa: C901 - imo the match is rated too highly
self,
tag: str,
attrs: list[tuple[str, str | None]],
) -> None:
match tag.lower():
case "br":
self.plain_text += "\n"
case "ol":
self.plain_text += "\n"
self.list_item_stack.append(OrderedList())
case "ul":
self.plain_text += "\n"
self.list_item_stack.append(UnorderedList())
case "li":
if len(self.list_item_stack) >= 1:
list_state = self.list_item_stack[-1]
match list_state:
case OrderedList():
self.plain_text += f"{list_state.num}. "
list_state.num += 1
case UnorderedList():
self.plain_text += "- "
case "img":
for name, val in attrs:
if name.lower() == "alt" and val is not None:
self.plain_text += val
break
case _:
pass
def handle_endtag(self, tag: str) -> None:
match tag.lower():
case "ol":
if isinstance(self.list_item_stack[-1], OrderedList):
self.list_item_stack.pop()
case "ul":
if isinstance(self.list_item_stack[-1], UnorderedList):
self.list_item_stack.pop()
case "li":
self.plain_text += "\n"
case _:
pass
@cache
def html_to_plain_text(html: str) -> str:
    """
    Extracts plain text from HTML-containing text. This is *NOT* input sanitisation.

    Removes most tags in place, and decodes entities - `<b>&amp;</b>` becomes `&`.

    A few tags are substituted for plain text equivalents:
    - `<br>` becomes a newline
    - `<ol><li>` becomes `1. ` (incrementing with each list item)
    - `<ul><li>` becomes `- `
    - `<img alt='xyz'>` becomes its alt text

    Intended for use when accessing a mod name/description/option/etc., which may contain HTML tags,
    but in a situation where such tags would be inappropriate.

    Results are cached per input string.

    Args:
        html: The HTML-containing text.
    Returns:
        The extracted plain text.
    """
    parser = PlainTextHTMLConverter()
    parser.feed(html)
    # The parser holds back trailing text when it can't yet tell if it ends in a partial character
    # reference (e.g. "R&D"), waiting for more input. Closing it flushes that text, which would
    # otherwise be silently dropped from the result.
    parser.close()
    return parser.plain_text