Skip to content

Commit b7ebba5

Browse files
author
Silvia
committed
add split chapter script
1 parent 7501985 commit b7ebba5

1 file changed

Lines changed: 357 additions & 0 deletions

File tree

split_chapter.py

Lines changed: 357 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,357 @@
1+
#!/usr/bin/env python3
2+
# encoding: utf-8
3+
import argparse
4+
import re
5+
import tkinter as tk
6+
from tkinter import messagebox
7+
8+
import html
9+
from html.parser import HTMLParser
10+
11+
import yaml
12+
from pymysql import connect
13+
14+
15+
class _HTMLStripper(HTMLParser):
16+
def __init__(self):
17+
super().__init__()
18+
self._parts = []
19+
20+
def handle_data(self, data):
21+
self._parts.append(data)
22+
23+
def handle_starttag(self, tag, attrs):
24+
if tag in ("p", "br", "div", "li", "tr", "h1", "h2", "h3", "h4"):
25+
self._parts.append("\n")
26+
27+
def get_text(self):
28+
return html.unescape("".join(self._parts))
29+
30+
31+
def strip_html(text):
    """Return ``text`` with its HTML markup removed.

    A falsy ``text`` (``None`` or an empty string) is treated as an empty
    document.  The work is delegated to ``_HTMLStripper``.
    """
    stripper = _HTMLStripper()
    markup = text if text else ""
    stripper.feed(markup)
    return stripper.get_text()
35+
36+
37+
def load_config(path):
    """Load the YAML properties file at ``path`` and return its parsed content.

    Args:
        path: filesystem path of the YAML configuration file.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document (``main`` treats
        it as a mapping of settings).

    Raises:
        OSError: if the file cannot be opened.
        yaml.YAMLError: if the content is not valid YAML.
    """
    # Decode explicitly as UTF-8 rather than the platform's locale encoding,
    # so non-ASCII values read the same on every OS.
    with open(path, encoding="utf-8") as f:
        return yaml.safe_load(f)
40+
41+
42+
def get_connection(cfg):
    """Open a connection to the output database described by ``cfg``.

    Reads ``db_host``, ``db_user``, ``output_database`` and the optional
    ``db_password`` (a missing or empty password is sent as ""), and connects
    with the utf8mb4 charset and autocommit switched off.
    """
    connection_args = {
        "host": cfg["db_host"],
        "user": cfg["db_user"],
        "password": cfg.get("db_password") or "",
        "database": cfg["output_database"],
        "charset": "utf8mb4",
        "use_unicode": True,
        "autocommit": False,
    }
    return connect(**connection_args)
52+
53+
54+
def fetch_chapter(conn, chapter_id):
    """Fetch one row of the ``chapters`` table as a column-name -> value dict.

    Raises:
        ValueError: if no chapter has the given id.
    """
    query = "SELECT * FROM chapters WHERE id = %s"
    with conn.cursor() as cursor:
        cursor.execute(query, (chapter_id,))
        # cursor.description carries one entry per column; its first item is the name.
        column_names = [column[0] for column in cursor.description]
        record = cursor.fetchone()
    if record is None:
        raise ValueError(f"Chapter {chapter_id} not found in output database")
    return {name: value for name, value in zip(column_names, record)}
62+
63+
64+
def db_update_chapter_text(cur, chapter_id, text):
    """Replace the ``text`` column of chapter ``chapter_id`` using cursor ``cur``.

    Nothing is committed here; that is left to the caller.
    """
    update_sql = "UPDATE chapters SET text = %s WHERE id = %s"
    cur.execute(update_sql, (text, chapter_id))
66+
67+
68+
def db_update_chapter_text_and_title(cur, chapter_id, text, title):
    """Replace both ``text`` and ``title`` of chapter ``chapter_id`` using ``cur``.

    Nothing is committed here; that is left to the caller.
    """
    update_sql = "UPDATE chapters SET text = %s, title = %s WHERE id = %s"
    bound_values = (text, title, chapter_id)
    cur.execute(update_sql, bound_values)
73+
74+
75+
def db_shift_later_chapters(cur, story_id, after_position):
    """Increment ``position`` for every chapter of ``story_id`` located after ``after_position``.

    This opens a gap at ``after_position + 1``.  Nothing is committed here.
    """
    shift_sql = "UPDATE chapters SET position = position + 1 WHERE story_id = %s AND position > %s"
    cur.execute(shift_sql, (story_id, after_position))
80+
81+
82+
def db_insert_chapter(cur, story_id, position, title, author_id, text, date, notes, url):
    """Insert a new row into ``chapters`` using cursor ``cur``.

    The values are bound in the column order of the statement, which differs
    from the parameter order of this function.  Nothing is committed here.
    """
    insert_sql = """INSERT INTO chapters (position, title, author_id, text, date, story_id, notes, url)
           VALUES (%s, %s, %s, %s, %s, %s, %s, %s)"""
    row_values = (position, title, author_id, text, date, story_id, notes, url)
    cur.execute(insert_sql, row_values)
88+
89+
90+
def db_trim_chapter(conn, chapter_id, trimmed_text):
    """Overwrite the text of chapter ``chapter_id`` with ``trimmed_text`` and commit.

    No rollback is attempted here; errors propagate to the caller.
    """
    with conn.cursor() as cursor:
        db_update_chapter_text(cursor, chapter_id, trimmed_text)
    conn.commit()
94+
95+
96+
def db_split_chapter(conn, chapter, before_text, after_text, title_part1, title_part2):
    """Split ``chapter`` into two consecutive chapters and commit.

    The existing row keeps its id and position and receives ``before_text`` /
    ``title_part1``.  A new row with ``after_text`` / ``title_part2`` is
    inserted at the next position, after later chapters of the same story
    have been moved down by one.  The new row copies ``author_id``, ``date``,
    ``notes`` and ``url`` from ``chapter``.  No rollback is attempted here.
    """
    story_id = chapter["story_id"]
    first_position = chapter["position"]
    with conn.cursor() as cursor:
        # Make room first, so the new row's position is free.
        db_shift_later_chapters(cursor, story_id, first_position)
        db_update_chapter_text_and_title(cursor, chapter["id"], before_text, title_part1)
        db_insert_chapter(
            cursor,
            story_id,
            first_position + 1,
            title_part2,
            chapter["author_id"],
            after_text,
            chapter["date"],
            chapter["notes"],
            chapter["url"],
        )
    conn.commit()
114+
115+
116+
class SplitChapterApp:
    """Tk window for splitting or trimming one chapter's HTML text.

    The chapter's raw HTML is displayed as plain text.  A click picks a split
    point and a drag selection picks the span to keep when trimming; both are
    translated from display offsets back to raw-HTML offsets before the
    database is changed.

    Args:
        root: Tk root window in which the interface is built.
        conn: open database connection, passed on to the db_* helpers.
        chapter: mapping for the chapter row.  This class reads the keys
            "id", "story_id", "position", "title" and "text"; the whole
            mapping is also handed to ``db_split_chapter``.
    """

    def __init__(self, root, conn, chapter):
        self.root = root
        self.conn = conn
        self.chapter = chapter
        self.split_index = None  # raw-HTML offset of the chosen split point

        root.title(f"Split Chapter: {chapter['title']}")

        info = tk.Frame(root)
        info.pack(fill=tk.X, padx=10, pady=5)
        tk.Label(
            info,
            text=f"Chapter ID: {chapter['id']} | Story ID: {chapter['story_id']} | Position: {chapter['position']}",
        ).pack(side=tk.LEFT)

        text_frame = tk.Frame(root)
        text_frame.pack(fill=tk.BOTH, expand=True, padx=10)

        scrollbar = tk.Scrollbar(text_frame)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)

        self.text_widget = tk.Text(
            text_frame,
            wrap=tk.WORD,
            yscrollcommand=scrollbar.set,
            width=100,
            height=40,
            cursor="ibeam",
        )
        self.text_widget.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
        scrollbar.config(command=self.text_widget.yview)

        # Display stripped text; keep a mapping from display offset → raw HTML offset
        self._display_text, self._offset_map = self._build_display(chapter["text"] or "")
        self.text_widget.insert(tk.END, self._display_text)
        # Highlight style for the clicked split point, configured once up front.
        self.text_widget.tag_config("split", background="yellow")
        self.text_widget.config(state=tk.DISABLED)
        self.text_widget.bind("<Button-1>", self.on_click)
        self.text_widget.bind("<ButtonRelease-1>", self.on_release)

        self.status_var = tk.StringVar(value="Click in the text to set a split point.")
        tk.Label(root, textvariable=self.status_var, fg="blue").pack(pady=4)

        counts_frame = tk.Frame(root)
        counts_frame.pack()
        self.before_var = tk.StringVar(value="Before: —")
        self.after_var = tk.StringVar(value="After: —")
        tk.Label(counts_frame, textvariable=self.before_var, width=30).pack(side=tk.LEFT, padx=10)
        tk.Label(counts_frame, textvariable=self.after_var, width=30).pack(side=tk.LEFT, padx=10)

        btn_frame = tk.Frame(root)
        btn_frame.pack(pady=10)
        self.split_btn = tk.Button(
            btn_frame,
            text="Split at click point",
            state=tk.DISABLED,
            command=self.do_split,
            padx=20,
        )
        self.split_btn.pack(side=tk.LEFT, padx=10)
        self.trim_btn = tk.Button(
            btn_frame,
            text="Trim to selection",
            state=tk.DISABLED,
            command=self.do_trim,
            padx=20,
        )
        self.trim_btn.pack(side=tk.LEFT, padx=10)
        tk.Button(btn_frame, text="Cancel", command=root.quit, padx=20).pack(side=tk.LEFT, padx=10)

    def _build_display(self, raw_html):
        """
        Returns (display_text, offset_map) where offset_map[display_idx] = raw_idx.
        Strips HTML tags, converts block-level tags to newlines, unescapes entities.

        Opening, closing and self-closing forms of a block tag (``<p>``,
        ``</p>``, ``<br/>``) each produce one newline mapped to the position
        of the tag's ``<``.  Every character decoded from an entity maps to
        the position of the entity's ``&``.  A ``<`` with no matching ``>``
        and an ``&`` with no ``;`` within 10 characters are kept literally.
        """
        display_chars = []
        offset_map = []  # display position i → raw HTML position
        raw = raw_html
        i = 0
        block_tags = {"p", "br", "div", "li", "tr", "h1", "h2", "h3", "h4", "hr"}
        while i < len(raw):
            if raw[i] == "<":
                end = raw.find(">", i)
                if end == -1:
                    # Unterminated "<": treat it as ordinary text.
                    display_chars.append(raw[i])
                    offset_map.append(i)
                    i += 1
                    continue
                # Strip the "/" of closing (</p>) and self-closing (<br/>) tags
                # before taking the name.  A tag with no name at all ("<>",
                # "</>") yields "" instead of raising IndexError.
                words = raw[i + 1:end].strip().lower().strip("/").split()
                tag_name = words[0].rstrip("/") if words else ""
                if tag_name in block_tags:
                    display_chars.append("\n")
                    offset_map.append(i)
                i = end + 1
            elif raw[i] == "&":
                end = raw.find(";", i)
                if end == -1 or end - i > 10:
                    # Not a plausible entity: keep the ampersand as-is.
                    display_chars.append(raw[i])
                    offset_map.append(i)
                    i += 1
                else:
                    entity = raw[i:end + 1]
                    # An unknown entity is returned unchanged by unescape(), so
                    # it may decode to several characters; all map to the "&".
                    for ch in html.unescape(entity):
                        display_chars.append(ch)
                        offset_map.append(i)
                    i = end + 1
            else:
                display_chars.append(raw[i])
                offset_map.append(i)
                i += 1
        return "".join(display_chars), offset_map

    def _display_to_raw(self, display_offset):
        """Map a display-text offset to a raw-HTML offset.

        Offsets at or past the end of the display text map to the end of the
        raw text, so a position after the last character never resolves to
        the start of the chapter.
        """
        if display_offset < len(self._offset_map):
            return self._offset_map[display_offset]
        return len(self.chapter["text"] or "")

    def on_click(self, event):
        """Record the clicked character as the split point and update the UI."""
        idx = self.text_widget.index(f"@{event.x},{event.y}")
        display_offset = self._tk_index_to_char_offset(idx, self._display_text)
        # Map display offset back to raw HTML offset
        raw_offset = self._display_to_raw(display_offset)
        self.split_index = raw_offset

        full_text = self.chapter["text"] or ""
        before_len = raw_offset
        after_len = len(full_text) - raw_offset

        self.before_var.set(f"Before: {before_len:,} chars")
        self.after_var.set(f"After: {after_len:,} chars")
        self.status_var.set(f"Split point at raw HTML offset {raw_offset:,}. Click 'Split' to confirm.")
        self.split_btn.config(state=tk.NORMAL)

        # Tag operations are not blocked by the widget's disabled state, so the
        # highlight can be moved without re-enabling editing.
        self.text_widget.tag_remove("split", "1.0", tk.END)
        self.text_widget.tag_add("split", idx)

    def _tk_index_to_char_offset(self, idx, text):
        """Convert a Tk ``"line.column"`` index into a 0-based offset in ``text``.

        Tk lines are 1-based and columns 0-based.  Each line before the
        target contributes its length plus one for its newline.  Slicing
        (rather than indexing) keeps a line number past the end of ``text``
        from raising.
        """
        line, col = map(int, idx.split("."))
        lines = text.split("\n")
        offset = sum(len(earlier) + 1 for earlier in lines[: line - 1])  # +1 for each \n
        return offset + col

    def on_release(self, event):
        """Enable the trim button when the mouse release leaves a non-empty selection."""
        try:
            sel_start = self.text_widget.index(tk.SEL_FIRST)
            sel_end = self.text_widget.index(tk.SEL_LAST)
        except tk.TclError:
            # No selection exists: SEL_FIRST / SEL_LAST cannot be resolved.
            sel_start = sel_end = None
        if sel_start != sel_end:
            self.trim_btn.config(state=tk.NORMAL)
            self.status_var.set("Text selected. Click 'Trim to selection' to keep only the selected text.")
            return
        self.trim_btn.config(state=tk.DISABLED)

    def do_trim(self):
        """Replace the chapter text with the raw HTML spanned by the current selection."""
        try:
            sel_start = self.text_widget.index(tk.SEL_FIRST)
            sel_end = self.text_widget.index(tk.SEL_LAST)
        except tk.TclError:
            messagebox.showwarning("No selection", "Please select the text you want to keep.")
            return

        start_display = self._tk_index_to_char_offset(sel_start, self._display_text)
        end_display = self._tk_index_to_char_offset(sel_end, self._display_text)

        raw = self.chapter["text"] or ""
        # Both ends use the same mapping; an offset past the display text maps
        # to the end of the raw text (never to 0, which would keep everything).
        raw_start = self._display_to_raw(start_display)
        raw_end = self._display_to_raw(end_display)
        trimmed = raw[raw_start:raw_end]

        if not trimmed.strip():
            messagebox.showwarning("Empty selection", "The selected text is empty.")
            return

        if not messagebox.askyesno(
            "Confirm trim",
            f"This will replace the chapter text with the selected {len(trimmed):,} characters.\n\nThis cannot be undone. Continue?",
        ):
            return

        try:
            db_trim_chapter(self.conn, self.chapter["id"], trimmed)
        except Exception as e:
            # Top-level GUI boundary: undo the partial work and show the error.
            self.conn.rollback()
            messagebox.showerror("Error", str(e))
        else:
            messagebox.showinfo("Done", f"Chapter {self.chapter['id']} trimmed to {len(trimmed):,} characters.")
            self.root.quit()

    def do_split(self):
        """Split the chapter at the recorded split point into "Part 1" and "Part 2"."""
        if self.split_index is None:
            return

        full_text = self.chapter["text"] or ""
        before_text = full_text[: self.split_index]
        after_text = full_text[self.split_index :]

        if not before_text.strip() or not after_text.strip():
            messagebox.showwarning("Invalid split", "Both parts must have content.")
            return

        # Drop an existing trailing "Part N" so titles do not stack up; the
        # final strip() avoids a leading space when the chapter has no title.
        base_title = re.sub(r"\s+Part \d+$", "", self.chapter["title"] or "").strip()
        title_part1 = f"{base_title} Part 1".strip()
        title_part2 = f"{base_title} Part 2".strip()

        try:
            db_split_chapter(self.conn, self.chapter, before_text, after_text, title_part1, title_part2)
        except Exception as e:
            # Top-level GUI boundary: undo the partial work and show the error.
            self.conn.rollback()
            messagebox.showerror("Error", str(e))
        else:
            messagebox.showinfo(
                "Done",
                f"Chapter split successfully.\n\n"
                f"Chapter {self.chapter['id']} → '{title_part1}'\n"
                f"New chapter (position {self.chapter['position'] + 1}) → '{title_part2}'",
            )
            self.root.quit()
333+
334+
335+
def main():
    """Command-line entry point: load one chapter and open the split/trim window.

    Parses ``-p/--properties_file`` and ``--chapter_id``, prompts for the
    database password when the config does not provide one, fetches the
    chapter and runs the Tk main loop.  The connection is closed on the way
    out even if fetching the chapter or the GUI raises.
    """
    import getpass

    parser = argparse.ArgumentParser(description="Split a chapter in the Open Doors output database")
    parser.add_argument("-p", "--properties_file", required=True, help="Path to yml config file")
    parser.add_argument("--chapter_id", required=True, type=int, help="ID of the chapter to split")
    args = parser.parse_args()

    cfg = load_config(args.properties_file)
    if not cfg.get("db_password"):
        cfg["db_password"] = getpass.getpass(f"MySQL password for {cfg['db_user']}@{cfg['db_host']}: ")
    conn = get_connection(cfg)
    try:
        chapter = fetch_chapter(conn, args.chapter_id)

        root = tk.Tk()
        root.geometry("1000x700")
        SplitChapterApp(root, conn, chapter)
        root.mainloop()
    finally:
        # Release the connection on every exit path, including a missing
        # chapter (ValueError) or an error raised while building the window.
        conn.close()
354+
355+
356+
# Run the tool only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)