main.py
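"""Scrape the bitcoin-dev mailing list archive.

A short module docstring summarizing what the code below does: it downloads
thread pages from the archive at https://gnusha.org/pi/bitcoindev/, parses each
message's body and threading metadata (depth, position, parent, reply-to
author), and indexes the resulting documents into Elasticsearch.
"""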
import hashlib
import os
import re
import sys
import traceback
import urllib.request
from datetime import datetime

import requests
from bs4 import BeautifulSoup
from dateutil import tz
from dotenv import load_dotenv
from loguru import logger

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from common.elasticsearch_utils import document_view, document_add

load_dotenv()

from config.conf import DATA_DIR, INDEX_NAME
DOWNLOAD_PATH = os.path.join(DATA_DIR, "mailing-list/bitcoin-dev")
ORIGINAL_URL = "https://gnusha.org/pi/bitcoindev/"
CUSTOM_URL = "https://mailing-list.bitcoindevs.xyz/bitcoindev/"

month_dict = {
    1: "Jan", 2: "Feb", 3: "March", 4: "April", 5: "May", 6: "June",
    7: "July", 8: "Aug", 9: "Sept", 10: "Oct", 11: "Nov", 12: "Dec"
}

def save_web_page(link, file_name):
    main_url = ORIGINAL_URL + link
    html_response = requests.get(f"{ORIGINAL_URL}{link}")
    soup = BeautifulSoup(html_response.content, 'html.parser')
    main_url_anchor = soup.new_tag("a", href=main_url.replace('#t', ''), id='main_url')
    soup.body.append(main_url_anchor)
    path = os.path.join(DOWNLOAD_PATH, file_name)
    with open(path, 'w', encoding='utf-8') as file:
        logger.info(f'Downloading {file_name}')
        file.write(str(soup))

def download_dumps(path, page_visited_count, max_page_count=2):
    if page_visited_count > max_page_count:
        return
    page_visited_count += 1
    logger.info(f"Page {page_visited_count}: {path}")
    with urllib.request.urlopen(f"{path}") as f:
        soup = BeautifulSoup(f, "html.parser")
    pre_tags = soup.find_all('pre')
    if len(pre_tags) < 1:
        return
    # The archive is paginated; the 'next' link is absent on the last page
    next_anchor = soup.find('a', {'rel': 'next'})
    next_page_link = f"{ORIGINAL_URL}{next_anchor.get('href')}" if next_anchor else None
    for tag in pre_tags[1].find_all('a'):
        try:
            date = tag.next_sibling.strip()[:7]
            date = date.strip().split('-')
            if len(date) < 2:
                continue
            year = int(date[0])
            mon = int(date[1])
            month = month_dict.get(int(date[1]))
            if year < 2024 or (year == 2024 and mon == 1):
                return
            href = tag.get('href')
            file_name = f"{year}-{month}-{href.strip().split('/')[0]}.html"
            save_web_page(href, file_name)
        except Exception as e:
            logger.error(e)
            logger.error(tag)
            continue
    logger.info('----------------------------------------------------------\n')
    if next_page_link:
        download_dumps(next_page_link, page_visited_count)

def get_thread_structure(soup):
    """Parse the thread structure from the thread overview section"""
    thread_structure = []

    # Find the thread overview section
    thread_overview = None

    # Method 1: Look for <b id="t">Thread overview:</b>
    thread_b_tag = soup.find('b', id='t')
    if thread_b_tag and "Thread overview:" in thread_b_tag.text:
        # Find the parent container (usually a <pre> tag containing the thread structure)
        thread_overview = thread_b_tag.find_parent('pre')

    # Method 2: Fall back to searching in pre tags
    if not thread_overview:
        for pre_tag in soup.find_all('pre'):
            if "Thread overview:" in pre_tag.text:
                thread_overview = pre_tag
                break

    if not thread_overview:
        logger.warning("⚠️ THREADING: No thread overview section found!")
        return []

    # Split the thread overview text into lines and extract threading information
    full_text = thread_overview.text
    lines = full_text.split('\n')
    thread_structure = _parse_thread_lines_fixed(lines, thread_overview)

    logger.success(f"✅ THREADING: Total extracted {len(thread_structure)} messages")

    # Log a brief summary of the thread hierarchy for larger threads only
    if thread_structure and len(thread_structure) >= 20:
        logger.info(f"🎯 THREADING: Large thread detected ({len(thread_structure)} messages), max depth: {max(item['depth'] for item in thread_structure)}")

    return thread_structure

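# Illustrative example of the thread-overview text that the parser below
# consumes (layout approximate, not taken from a real page). Indentation and
# backticks after the timestamp encode reply depth; authors are resolved
# separately from the "#m..." anchor links inside the same <pre> block:
#
#   2024-02-01 10:15 [bitcoindev] Example subject           Alice
#   2024-02-01 12:40 ` Re: [bitcoindev] Example subject     Bob
#   2024-02-02  9:05   ` Re: [bitcoindev] Example subject   Carol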
def _parse_thread_lines_fixed(lines, thread_overview_soup):
    """Parse thread lines from the HTML structure of the thread overview."""
    thread_structure = []
    if not thread_overview_soup:
        return thread_structure

    # Build a map from anchor id to author name using the overview's anchor links
    anchor_to_author = {}
    anchor_links = thread_overview_soup.find_all('a', href=lambda href: href and href.startswith('#m'))
    for link in anchor_links:
        href = link.get('href', '')
        anchor_id = href.replace('#', '') if href.startswith('#') else None
        if not anchor_id:
            continue
        link_text = link.get_text().strip()
        if link_text.startswith('['):
            # Author is in the text node AFTER this link
            next_sibling = link.next_sibling
            if next_sibling:
                if hasattr(next_sibling, 'strip'):
                    author_text = str(next_sibling)
                elif hasattr(next_sibling, 'get_text'):
                    author_text = next_sibling.get_text()
                else:
                    author_text = str(next_sibling)
                # Clean up
                author_text = author_text.strip()
                if '\n' in author_text:
                    author_text = author_text.split('\n')[0].strip()
            else:
                author_text = ""
        else:
            author_text = link_text
        author_text = author_text.rstrip('`"\' ')
        author_text = re.sub(r'\s+via\s+Bitcoin\s+Development\s+Mailing\s+List.*$', '',
                             author_text, flags=re.IGNORECASE).strip()
        if author_text and len(author_text) >= 2:
            anchor_to_author[anchor_id] = author_text

    anchor_link_index = 0
    for line in lines:
        # Skip non-thread lines
        if ("links below jump to the message" in line or
                "Thread overview:" in line or
                "download:" in line or
                "mbox.gz" in line or
                "Atom feed" in line or
                "end of thread" in line or
                "only message in thread" in line or
                "other threads:" in line or
                not line.strip()):
            continue

        # Find timestamp pattern
        timestamp_pattern = r'(\d{4}-\d{2}-\d{2}\s+\d{1,2}:\d{2})'
        timestamp_match = re.search(timestamp_pattern, line)
        if not timestamp_match:
            continue

        timestamp = timestamp_match.group(1)
        after_timestamp = line[timestamp_match.end():]

        # Count leading spaces for depth
        space_match = re.match(r'^(\s*)', after_timestamp)
        leading_spaces = len(space_match.group(1)) if space_match else 0

        # Check for backtick (indicates reply)
        has_backtick = '`' in after_timestamp

        # Calculate thread depth
        if has_backtick:
            thread_depth = leading_spaces // 2 + 1 if leading_spaces > 0 else 1
        else:
            thread_depth = 0

        # Get anchor ID from HTML structure
        anchor_id = None
        if anchor_link_index < len(anchor_links):
            href = anchor_links[anchor_link_index].get('href', '')
            anchor_id = href.replace('#', '') if href.startswith('#') else None
            anchor_link_index += 1
        if not anchor_id:
            anchor_content = f"{timestamp}-{line[:50]}"
            anchor_id = hashlib.md5(anchor_content.encode()).hexdigest()[:32]

        # Get author from the pre-built map
        author = anchor_to_author.get(anchor_id, '')

        # Skip if we couldn't extract a valid author
        if not author or len(author) < 2:
            logger.warning(f"⚠️ THREADING: Could not extract author for anchor {anchor_id}")
            author = "Unknown Author"

        # Filter out navigation artifacts
        if re.search(r'UTC\s*\|\s*newest', author, re.IGNORECASE):
            continue
        if author.lower() in ["utc", "newest", "flat", "nested", "permalink", "raw", "reply"]:
            continue

        # Final sanitization
        author = sanitize_author(author)

        thread_structure.append({
            'timestamp': timestamp,
            'anchor_id': anchor_id,
            'author': author,
            'depth': thread_depth,
            'line': line.strip(),
            'leading_spaces': leading_spaces,
            'has_backtick': has_backtick
        })

    logger.success(f"✅ THREADING: Extracted {len(thread_structure)} messages")
    return thread_structure

def get_thread_urls_with_date(pre_tags):
    urls_dates = []
    date_time_pattern = r'\b\d{4}-\d{2}-\d{2} {1,2}(?:[01]?\d|2[0-3]):[0-5]\d\b'
    for pre_tag in reversed(pre_tags):
        if "links below jump to the message on this page" in pre_tag.text:
            anchor_tags = pre_tag.find_all('a', href=lambda href: href and '#' in href)
            for anchor in anchor_tags:
                date_search = re.search(date_time_pattern, anchor.previous_sibling.text)
                if date_search:
                    date = date_search.group()
                    original_datetime = datetime.strptime(date, '%Y-%m-%d %H:%M')
                    original_datetime = original_datetime.replace(tzinfo=tz.tzutc())
                    dt = original_datetime.isoformat(timespec='milliseconds').replace('+00:00', 'Z')
                    urls_dates.append((anchor, dt))
    # Sort the urls_dates list by datetime in ascending order (earliest first)
    urls_dates.sort(key=lambda x: x[1])
    return urls_dates

def get_year_month(date):
    date = date.strip().split('-')
    year = int(date[0])
    month = int(date[1])
    return year, month

def get_author(content_soup):
    """Extract author from the message header, not from quoted email content"""
    # Look for the pattern: <b>@ YYYY-MM-DD HH:MM Author Name</b>
    b_tags = content_soup.find_all('b')
    for b_tag in b_tags:
        text = b_tag.get_text()
        # Pattern: "@ 2025-07-12 21:36 Jameson Lopp"
        author_match = re.search(r'@\s+\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+(.+)', text)
        if author_match:
            author = author_match.group(1).strip()
            # Clean up common artifacts
            author = author.replace("via Bitcoin Development Mailing List", "").strip()
            # Decode HTML-escaped special characters
            author = author.replace("&#39;", "'").replace("&lt;", "<").replace("&gt;", ">")
            # Remove backticks
            author = author.replace("`", "").strip()
            return author

    # Fallback: try the From: line method
    text = content_soup.get_text()
    lines = text.split('\n')
    for line in lines[:15]:  # Check more lines for headers
        if line.startswith('From:') and '@' in line and 'UTC' in line:
            # Pattern: "From: Jameson Lopp @ 2025-07-12 21:36 UTC"
            from_match = re.search(r'From:\s*(.+?)\s+@\s+\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}\s+UTC', line)
            if from_match:
                author = from_match.group(1).strip()
                author = author.replace("'", "").replace("via Bitcoin Development Mailing List", "").strip()
                # Decode HTML-escaped special characters
                author = author.replace("&#39;", "'").replace("&lt;", "<").replace("&gt;", ">")
                # Remove backticks
                author = author.replace("`", "").strip()
                return author

    # Enhanced fallback: look for any line with an author pattern
    for line in lines[:20]:
        # Look for patterns like: "2025-07-14 2:07 ` Antoine Riard"
        author_pattern = re.search(r'\d{4}-\d{2}-\d{2}\s+\d{1,2}:\d{2}\s+[`\s]*(.+?)(?:\s|$)', line)
        if author_pattern:
            potential_author = author_pattern.group(1).strip()
            # Skip if it looks like a subject line or other metadata
            if not any(skip in potential_author.lower() for skip in ['[bitcoindev]', 'thread overview', 'mbox.gz', 'atom feed', '`']):
                if len(potential_author) > 3 and not potential_author.startswith('http'):
                    # Skip navigation/metadata patterns
                    if "UTC" in potential_author and "|" in potential_author and "newest" in potential_author:
                        continue
                    if potential_author.lower() in ["utc", "newest", "flat", "nested", "permalink", "raw", "reply"]:
                        continue
                    author = potential_author.replace("via Bitcoin Development Mailing List", "").strip()
                    # Decode HTML-escaped special characters
                    author = author.replace("&#39;", "'").replace("&lt;", "<").replace("&gt;", ">")
                    # Remove backticks
                    author = author.replace("`", "").strip()
                    return author

    logger.warning("⚠️ AUTHOR: Could not extract author from content")
    return "Unknown Author"

def sanitize_author(author, max_length=100):
    """
    Normalize a scraped author name: strip navigation artifacts, stray quotes,
    trailing timestamps, mailing-list suffixes, and subject-line prefixes.
    Returns "Unknown Author" if nothing usable remains.
    """
    if not author:
        return "Unknown Author"
    author = str(author).strip()

    # Remove 'UTC | newest]' pattern (navigation artifact)
    if 'UTC' in author and '|' in author and 'newest' in author:
        return "Unknown Author"

    # Remove leading/trailing quotes (some author names have these)
    author = author.strip('`"\' ')

    # Remove timestamps at the end in various formats
    author = re.sub(r'\s+\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}[+\-]\d{2}:\d{2}$', '', author)
    author = re.sub(r'\s+\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}.*$', '', author)
    author = re.sub(r'\s+\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}$', '', author)
    author = re.sub(r'\s+\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}$', '', author)

    # Remove "via Bitcoin Development Mailing List" suffix
    author = re.sub(r'\s+via\s+Bitcoin\s+Development\s+Mailing\s+List.*$', '', author, flags=re.IGNORECASE)

    # Remove common title prefixes that might have leaked in
    author = re.sub(r'^Re:\s*', '', author)
    author = re.sub(r'^\[bitcoindev\]\s*', '', author, flags=re.IGNORECASE)
    author = re.sub(r'^\[Bitcoin-development\]\s*', '', author, flags=re.IGNORECASE)
    author = re.sub(r'^\[bitcoin-dev\]\s*', '', author, flags=re.IGNORECASE)

    # Final cleanup of any remaining quotes
    author = author.strip('`"\' ')

    if len(author) > max_length:
        logger.warning(f"⚠️ AUTHOR: Suspiciously long author name ({len(author)} chars): {author[:50]}...")

    return author.strip() if author.strip() else "Unknown Author"

def href_contains_text(tag, search_text):
    return tag.name == 'a' and tag.has_attr('href') and search_text in tag['href']

def preprocess_body_text(text):
    text = text.replace("[|]", "").strip()
    text = re.sub(r'\[not found\] <[^>]+>', "", text)
    text = re.sub(re.compile(
        r'You received this message because you are subscribed to the Google Groups .+? group.\s+'
        r'To unsubscribe from this group and stop receiving emails from it, send an email to .+?\.\s+'
        r'To view this discussion on the web visit .+\.',
        re.DOTALL
    ), '', text)
    return text

def parse_dumps():
    doc = []
    for root, dirs, files in os.walk(DOWNLOAD_PATH):
        for file in reversed(files):
            logger.info(f'parsing : {file}')
            with open(f'{os.path.join(root, file)}', 'r', encoding='utf-8') as f:
                u = file[9:].replace(".html", "")
                html_content = f.read()
                soup = BeautifulSoup(html_content, 'html.parser')

                # Scrape url
                main_url = soup.find('a', id='main_url')
                main_url = main_url.get('href')

                # Scrape title
                title = soup.find_all('b')[1].text
                title = title.replace("[Bitcoin-development] ", "").replace("[bitcoin-dev] ", "").replace(
                    "[bitcoindev] ", "").replace("\t", "").strip()

                # Get thread structure for threading relationships
                thread_structure = get_thread_structure(soup)

                # Create a mapping of anchor_id to thread info
                thread_map = {}
                for thread_info in thread_structure:
                    anchor_id = thread_info['anchor_id'].replace('#', '')
                    thread_map[anchor_id] = thread_info

                urls_with_date = get_thread_urls_with_date(soup.find_all('pre'))
                for index, (url, date) in enumerate(urls_with_date):
                    try:
                        year, month = get_year_month(date)
                        if year < 2024 or (year == 2024 and month == 1):
                            continue
                        href = url.get('href')
                        tag_id = url.get('id')
                        anchor_id = href.replace('#', '')
                        content = soup.find(lambda tag: tag.name == "pre" and tag.find('a', href=f"#{tag_id}"))

                        # Scrape body
                        for c in content.find_all('b'):
                            c.decompose()
                        for c in content.find_all('u'):
                            c.decompose()
                        for c in content.find_all(lambda tag: href_contains_text(tag, href.replace("#", "")[1:])):
                            c.decompose()
                        for c in content.find_all(lambda tag: href_contains_text(tag, u)):
                            c.decompose()
                        body_text = preprocess_body_text(content.text)

                        doc_id = f"mailing-list-{year}-{month:02d}-{anchor_id}"

                        # Get threading information by matching with thread structure
                        thread_info = None
                        thread_depth = 0
                        author = None  # Will be set from the thread structure

                        # Parse document timestamp for matching
                        parsed_date = datetime.fromisoformat(date.replace('Z', '+00:00'))
                        doc_timestamp = parsed_date.strftime('%Y-%m-%d %H:%M')

                        # Find matching thread info by anchor_id first (most reliable)
                        for thread_item in thread_structure:
                            thread_anchor = thread_item['anchor_id']
                            # Try exact anchor match
                            if thread_anchor == anchor_id:
                                thread_info = thread_item
                                thread_depth = thread_item.get('depth', 0)
                                author = thread_item.get('author')  # Use thread structure author
                                break

                        # Fallback: match by author and timestamp
                        if not thread_info:
                            if not author:
                                # No anchor match, so extract the author from the message
                                # content before comparing against the thread overview
                                author = get_author(content)
                            doc_author = author.lower().strip()
                            for thread_item in thread_structure:
                                thread_author = thread_item['author'].lower().strip()
                                thread_timestamp = thread_item['timestamp']
                                # Try exact author + timestamp match
                                if thread_author == doc_author and thread_timestamp == doc_timestamp:
                                    thread_info = thread_item
                                    thread_depth = thread_item.get('depth', 0)
                                    break
                                # Try author match with close timestamp (within 1 minute)
                                elif thread_author == doc_author:
                                    try:
                                        thread_dt = datetime.strptime(thread_timestamp, '%Y-%m-%d %H:%M')
                                        doc_dt = datetime.strptime(doc_timestamp, '%Y-%m-%d %H:%M')
                                        if abs((thread_dt - doc_dt).total_seconds()) <= 60:
                                            thread_info = thread_item
                                            thread_depth = thread_item.get('depth', 0)
                                            break
                                    except ValueError:
                                        pass

                        # Fallback: extract author from content if no thread match
                        if not thread_info or not author:
                            content_author = get_author(content)
                            if not author:
                                author = content_author
                            if not thread_info:
                                logger.warning(f"⚠️ THREADING: No thread match found for '{author}' at {doc_timestamp}")

                        # Determine parent relationship and thread position
                        parent_id = None
                        reply_to_author = None

                        # Set thread_position based on thread structure order, not URL order
                        if thread_info:
                            thread_position = next((i for i, item in enumerate(thread_structure) if item == thread_info), index)
                        else:
                            thread_position = index  # Fall back to URL order if no thread match

                        if thread_depth > 0 and thread_structure and thread_info:
                            # Find the parent by looking for the previous message with depth - 1
                            target_depth = thread_depth - 1
                            current_index = next((i for i, info in enumerate(thread_structure) if info == thread_info), -1)
                            if current_index > 0:
                                for i in range(current_index - 1, -1, -1):
                                    prev_info = thread_structure[i]
                                    if prev_info['depth'] == target_depth:
                                        # Create the parent document ID based on the parent's anchor
                                        parent_anchor = prev_info['anchor_id']
                                        parent_id = f"mailing-list-{year}-{month:02d}-{parent_anchor}"
                                        reply_to_author = prev_info['author']
                                        break

                        document = {
                            "id": doc_id,
                            "authors": [author],
                            "title": title,
                            "body": body_text,
                            "body_type": "raw",
                            "created_at": date,
                            "domain": CUSTOM_URL,
                            "thread_url": main_url,
                            "url": f"{main_url}{href}",
                            # Threading fields
                            "thread_depth": thread_depth,
                            "thread_position": thread_position,
                            "parent_id": parent_id,
                            "reply_to_author": reply_to_author,
                            "anchor_id": anchor_id
                        }
                        if index == 0:
                            document['type'] = "original_post"
                        else:
                            document['type'] = "reply"

                        # Log only for deeper nested messages
                        if thread_depth > 2:
                            logger.info(f"📝 Deep thread: {author} (depth {thread_depth}) -> {reply_to_author}")

                        doc.append(document)
                    except Exception as e:
                        logger.info(f"{e} \nORIGINAL_URL: {main_url}\n{traceback.format_exc()}")
                        continue
    return doc

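# Illustrative shape of a document emitted by parse_dumps(); the field names
# match the dict built above, but the values shown here are hypothetical:
#
# {
#     "id": "mailing-list-2024-02-m1a2b3c4",
#     "authors": ["Alice Example"],
#     "title": "Example subject",
#     "body": "...",
#     "body_type": "raw",
#     "created_at": "2024-02-01T10:15:00.000Z",
#     "domain": "https://mailing-list.bitcoindevs.xyz/bitcoindev/",
#     "thread_url": "https://gnusha.org/pi/bitcoindev/...",
#     "url": "https://gnusha.org/pi/bitcoindev/...#m1a2b3c4",
#     "thread_depth": 0, "thread_position": 0,
#     "parent_id": None, "reply_to_author": None,
#     "anchor_id": "m1a2b3c4", "type": "original_post",
# }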
def index_documents(docs):
    new_docs = 0
    existing_docs = 0
    threading_docs = 0
    updated_docs = 0
    for doc in docs:
        # Check if document has threading data
        has_threading = any([
            doc.get('thread_depth', 0) > 0,
            doc.get('parent_id') is not None,
            doc.get('reply_to_author') is not None,
            doc.get('thread_depth') == 0  # Include root messages too
        ])
        if has_threading:
            threading_docs += 1

        resp = document_view(index_name=INDEX_NAME, doc_id=doc['id'])
        if not resp:
            # Process all new documents
            _ = document_add(index_name=INDEX_NAME, doc=doc, doc_id=doc['id'])
            new_docs += 1
            if has_threading and doc.get("thread_depth", 0) > 0:
                logger.success(f'✅ Added: {doc.get("authors", ["Unknown"])[0]} (depth {doc.get("thread_depth", 0)})')
        else:
            existing_docs += 1
            # Update the existing document with new threading fields
            _ = document_add(index_name=INDEX_NAME, doc=doc, doc_id=doc['id'])
            updated_docs += 1
            if has_threading and doc.get("thread_depth", 0) > 0:
                logger.success(f'✅ Updated: {doc.get("authors", ["Unknown"])[0]} (depth {doc.get("thread_depth", 0)})')

    logger.success("📊 INDEXING SUMMARY:")
    logger.success(f" 📝 Total documents processed: {len(docs)}")
    logger.success(f" ✅ New documents added: {new_docs}")
    logger.success(f" 📄 Existing documents: {existing_docs}")
    logger.success(f" 🔄 Documents updated: {updated_docs}")
    logger.success(f" 🧵 Documents with threading data: {threading_docs}")

if __name__ == "__main__":
    logger.info("🚀 Starting mailing list scraper with threading support")
    if not os.path.exists(DOWNLOAD_PATH):
        os.makedirs(DOWNLOAD_PATH)

    download_dumps(ORIGINAL_URL, page_visited_count=0)
    documents = parse_dumps()
    index_documents(documents)