-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathChapters.py
More file actions
executable file
·163 lines (145 loc) · 6.7 KB
/
Chapters.py
File metadata and controls
executable file
·163 lines (145 loc) · 6.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# -*- coding: utf-8 -*-
import codecs
import os
import re
from pip._vendor.distlib.compat import raw_input
from shared_python import Common
# TODO this code is no longer needed for eFiction and will need to be reviewed for other archive types
class Chapters(object):
    """Copy chapter text from files on disk into the ``chapters`` table.

    Chapter files are discovered by walking a folder.  Each file is matched
    to a row either by chapter id (the file name without its extension) or
    by its path relative to the walked folder (the row's ``url`` column).
    """

    def __init__(self, args, sql, log):
        """Keep references to the collaborators used during the import.

        Args:
            args: options object; ``chapters_path``,
                ``chapters_file_extensions`` and ``output_database`` are
                read from it.
            sql: database helper; ``database``, ``execute`` and
                ``execute_and_fetchall`` are used.
            log: logger; ``info``, ``warn`` and ``error`` are called.
        """
        self.args = args
        self.sql = sql
        self.log = log

    def _ends_with(self, filename, extensions):
        """Return True when ``filename`` ends with any of ``extensions``."""
        # str.endswith accepts a tuple of suffixes; an empty tuple is False.
        return filename.endswith(tuple(extensions))

    def _duplicate_entry(self, path):
        """Describe one candidate file for a duplicated chapter id."""
        return {
            "folder_name": os.path.basename(os.path.dirname(path)),
            "filename": os.path.basename(path),
            "path": path,
        }

    def _path_for_author(self, candidates, author_id):
        """Return the path of the first candidate whose folder is named after
        ``author_id``, or None when no candidate matches."""
        wanted = str(author_id)
        return next(
            (entry["path"] for entry in candidates if entry["folder_name"] == wanted),
            None,
        )

    def _relative_url(self, chapter_path, base_folder):
        """Return ``chapter_path`` relative to ``base_folder`` using ``/``
        separators, regardless of the platform the script runs on."""
        # relpath copes with a trailing separator on base_folder and with the
        # base string re-appearing deeper in the path, unlike a text replace.
        return os.path.relpath(chapter_path, base_folder).replace(os.sep, "/")

    def _resolve_duplicates_by_author_id(self, file_paths, duplicate_chapters):
        """Pick, for each duplicated chapter id, the file stored in the folder
        named after the chapter's author id.

        ``file_paths`` is updated in place.  Chapters with no database row,
        or with no candidate in a matching folder, keep the path that was
        found first.
        """
        for cid, candidates in duplicate_chapters.items():
            # The id comes from a file name and is formatted into SQL below,
            # so make sure it really is an integer first.
            try:
                chapter_id = int(cid)
            except ValueError:
                self.log.warn(
                    "Cannot resolve duplicates for non-numeric chapter id {0}".format(
                        cid
                    )
                )
                continue

            rows = self.sql.execute_and_fetchall(
                self.sql.database,
                "SELECT author_id FROM chapters WHERE id = {0}".format(chapter_id),
            )
            if not rows:
                continue

            author_id = rows[0][0]
            chosen = self._path_for_author(candidates, author_id)
            if chosen is None:
                self.log.warn(
                    "No duplicate of chapter {0} is in a folder named {1}; "
                    "keeping {2}".format(cid, author_id, file_paths[cid])
                )
            else:
                file_paths[cid] = chosen

    def _gather_and_dedupe(self, chapters_path, extensions, has_ids=False):
        """Walk ``chapters_path`` and map chapter keys to file paths.

        Args:
            chapters_path: folder to walk recursively.
            extensions: comma-separated list of file extensions, e.g.
                ``"txt, html"``.
            has_ids: True when file names (minus extension) are chapter ids;
                duplicate ids are then detected and optionally resolved
                interactively.

        Returns:
            dict mapping the file name without extension to the file's path.
        """
        self.log.info("\nFinding chapters and identifying duplicates")
        extensions = re.split(r", ?", extensions)

        file_paths = {}
        duplicate_chapters = {}
        messages = []
        sql_messages = []
        cur = 0
        for root, _, filenames in os.walk(chapters_path):
            total = len(filenames)
            # NOTE(review): the value returned by print_progress is discarded
            # here (populate_chapters assigns it back to ``cur``), so ``cur``
            # stays 0 for every folder - confirm whether that is intended.
            Common.print_progress(cur, total)
            for filename in filenames:
                file_path = os.path.join(root, filename)
                cid = os.path.splitext(filename)[0]

                if not (has_ids and self._ends_with(filename, extensions)):
                    # NOTE(review): files failing the extension check - and
                    # every file when has_ids is False - are still recorded
                    # under their bare name, so same-named files overwrite
                    # each other.  Kept as-is; confirm this is wanted.
                    file_paths[cid] = file_path
                    continue

                if cid not in file_paths:
                    file_paths[cid] = file_path
                    continue

                first_path = file_paths[cid]
                messages.append(file_path + " is a duplicate of " + first_path)
                sql_messages.append(
                    "SELECT * FROM chapters WHERE id = {0}".format(cid)
                )
                # Accumulate every candidate so a third or later copy does not
                # silently replace the ones already recorded.
                if cid not in duplicate_chapters:
                    duplicate_chapters[cid] = [self._duplicate_entry(first_path)]
                duplicate_chapters[cid].append(self._duplicate_entry(file_path))

        if duplicate_chapters:
            self.log.warn("\n".join(messages + sql_messages))
            self.log.warn(duplicate_chapters)
            folder_name_type = raw_input(
                "Resolving duplicates: pick the type of the folder name under {0} "
                "\n1 = author id\n2 = author name\n3 = skip duplicates check\n".format(
                    chapters_path
                )
            )
            if folder_name_type == "1":
                self._resolve_duplicates_by_author_id(file_paths, duplicate_chapters)
            elif folder_name_type == "2":
                self.log.warn("Not implemented")
        return file_paths

    # TODO this is no longer needed to load eFiction chapters - see if it's still useful for other archive types
    def populate_chapters(self, folder=None, extensions=None):
        """Interactively import chapter files into ``<output_database>.chapters``.

        Asks whether file names are chapter ids and which character encoding
        to use, then writes each file's contents into the ``text`` column of
        the matching row.  Rows are matched by ``id`` when file names are
        ids, otherwise by ``url`` (path relative to ``folder``) and only when
        their ``text`` is still empty.

        Args:
            folder: folder to walk; defaults to ``args.chapters_path``.
            extensions: comma-separated extensions; defaults to
                ``args.chapters_file_extensions``.
        """
        if folder is None:
            folder = self.args.chapters_path
        if extensions is None:
            extensions = self.args.chapters_file_extensions

        self.log.info("Processing chapters...")

        filenames_are_ids = raw_input("\nChapter file names are chapter ids? Y/N\n")
        has_ids = filenames_are_ids.strip().lower() == "y"

        file_paths = self._gather_and_dedupe(folder, extensions, has_ids)

        char_encoding = raw_input(
            "\n\nImporting chapters: pick character encoding (check for curly quotes):\n"
            "1 = Windows 1252\nenter = UTF-8\n"
        )
        char_encoding = "cp1252" if char_encoding.strip() == "1" else "utf8"

        if has_ids:
            query = "UPDATE {0}.chapters SET text=%s WHERE id=%s".format(
                self.args.output_database
            )
        else:
            query = "UPDATE {0}.chapters SET text=%s WHERE url=%s and text=''".format(
                self.args.output_database
            )

        cur = 0
        total = len(file_paths)
        for cid, chapter_path in file_paths.items():
            # What the row is matched on; also used to label error messages.
            label = cid if has_ids else self._relative_url(chapter_path, folder)
            try:
                cur = Common.print_progress(cur, total)
                with codecs.open(chapter_path, "r", encoding=char_encoding) as chapter_file:
                    file_contents = chapter_file.read()
                row_key = int(cid) if has_ids else label
                self.sql.execute(query, (file_contents, row_key))
            except Exception as e:
                # Best-effort import: one unreadable, undecodable or
                # unmatched file must not abort the remaining chapters.
                self.log.error(
                    "Error = chapter id: {0} - chapter: {1}\n{2}".format(
                        label, chapter_path, str(e)
                    )
                )