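"""parselinks.py

Resolve enwiki pagelinks rows to (from_page_id, to_page_id) pairs by joining
the pagelinks and linktarget SQL dumps against a doc_info.jsonl title map,
then write the pairs to a CSV using a multiprocessing pool.
"""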
import gzip
import json
import re
import multiprocessing
import time
import os
import sys
# --- Configuration ---
PAGELINKS_FILE = "enwiki-20251101-pagelinks.sql.gz"
LINKTARGET_FILE = "enwiki-20251101-linktarget.sql.gz"
# UPDATE THIS PATH IF NEEDED
DOC_INFO_FILE = r"C:\Users\Arnav\Desktop\Programming\WikiEngine\data_files\doc_info.jsonl"
OUTPUT_CSV = "pagelinks.csv"
# Lowered to 4 to prevent disk choking during initialization
NUM_WORKERS = 4
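# doc_info.jsonl is expected to hold one JSON object per line with at least
# "title" and "id" keys (inferred from load_maps_worker below).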
# --- Regexes ---
LINKTARGET_REGEX = re.compile(r"\(([0-9]+),([0-9]+),'((?:[^'\\]|\\.)*)'\)")
PAGELINKS_REGEX = re.compile(r"\(([0-9]+),([0-9]+),([0-9]+)\)")
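# Illustrative row shapes these patterns are written for (column names are the
# assumed dump schemas, not verified here):
#   linktarget: (1234,0,'Albert_Einstein')  -> (lt_id, lt_namespace, lt_title)
#   pagelinks:  (5678,0,1234)               -> (pl_from, pl_from_namespace, pl_target_id)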
# Global Map
target_id_to_page_id = {}
def load_maps_worker():
    global target_id_to_page_id
    pid = os.getpid()
    print(f"[Worker {pid}] Step 1/2: Loading doc_info...", flush=True)
    # 1. Load Title -> PageID
    title_to_pid = {}
    count = 0
    try:
        with open(DOC_INFO_FILE, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip(): continue
                try:
                    d = json.loads(line)
                    t = d['title'].strip().replace(' ', '_')
                    title_to_pid[t] = d['id']
                    count += 1
                    if count % 2000000 == 0:
                        print(f"[Worker {pid}] Loaded {count} titles...", flush=True)
                except (json.JSONDecodeError, KeyError):
                    # Skip malformed or incomplete lines
                    continue
    except FileNotFoundError:
        print(f"[Worker {pid}] ERROR: doc_info file not found at {DOC_INFO_FILE}")
        return
    print(f"[Worker {pid}] Step 2/2: Parsing linktarget...", flush=True)
    # 2. Load TargetID -> Title -> PageID
    local_tid_map = {}
    valid_targets = 0
    if os.path.exists(LINKTARGET_FILE):
        with gzip.open(LINKTARGET_FILE, 'rt', encoding='utf-8', errors='replace') as f:
            for line in f:
                if not line.startswith('INSERT INTO'): continue
                for match in LINKTARGET_REGEX.finditer(line):
                    tid, ns, title_raw = match.groups()
                    if ns != '0': continue  # main (article) namespace only
                    # Undo common MySQL escape sequences in the dumped title
                    title = title_raw.replace(r"\'", "'").replace(r'\"', '"').replace(r'\\', '\\')
                    if title in title_to_pid:
                        local_tid_map[int(tid)] = title_to_pid[title]
                        valid_targets += 1
                        if valid_targets > 0 and valid_targets % 1000000 == 0:
                            print(f"[Worker {pid}] Resolved {valid_targets} targets...", flush=True)
    else:
        print(f"[Worker {pid}] ERROR: linktarget file not found")
        return
    target_id_to_page_id = local_tid_map
    # Free the title map; only the target-id map is needed during parsing
    del title_to_pid
    print(f"[Worker {pid}] READY. Mapped {valid_targets} link targets.", flush=True)
def parse_pagelinks_chunk(chunk):
    global target_id_to_page_id
    results = []
    for match in PAGELINKS_REGEX.finditer(chunk):
        from_id, from_ns, target_id = match.groups()
        if from_ns != '0': continue
        tid = int(target_id)
        if tid in target_id_to_page_id:
            pid = target_id_to_page_id[tid]
            results.append(f"{from_id},{pid}")
    return "\n".join(results)
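# Illustrative only, with hypothetical values: if target_id_to_page_id were
# {1234: 42}, then parse_pagelinks_chunk("VALUES (5678,0,1234),(9,1,1234);")
# would return "5678,42" (the second tuple is skipped: namespace != 0).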
def generate_chunks(filepath):
    with gzip.open(filepath, 'rt', encoding='utf-8', errors='replace') as f:
        for line in f:
            if not line.startswith('INSERT INTO'): continue
            start = line.find('VALUES ')
            if start != -1: yield line[start:]
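# Each yielded chunk is the payload of one INSERT statement, e.g.
# "VALUES (5678,0,1234),(5679,0,88),...;" (values illustrative).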
if __name__ == "__main__":
    # Quick check for files
    if not os.path.exists(PAGELINKS_FILE):
        print(f"ERROR: {PAGELINKS_FILE} missing")
        sys.exit(1)
    if not os.path.exists(LINKTARGET_FILE):
        print(f"ERROR: {LINKTARGET_FILE} missing")
        sys.exit(1)
    print(f"Starting with {NUM_WORKERS} workers. Please wait for map loading (approx 5-10 mins)...")
    start_time = time.time()
    with multiprocessing.Pool(NUM_WORKERS, initializer=load_maps_worker) as pool:
        print("Pool created. Reading chunks...")
        with open(OUTPUT_CSV, 'w', encoding='utf-8') as f:
            gen = generate_chunks(PAGELINKS_FILE)
            count = 0
            last_report = 0
            for chunk_res in pool.imap_unordered(parse_pagelinks_chunk, gen):
                if chunk_res:
                    f.write(chunk_res + '\n')
                    # chunk_res is '\n'-joined, so it holds count('\n') + 1 links
                    count += chunk_res.count('\n') + 1
                    # count grows in uneven steps, so report on crossing each
                    # 100k threshold rather than testing count % 100000 == 0
                    if count - last_report >= 100000:
                        last_report = count
                        print(f"Saved {count} links...", end='\r')
    print(f"\nDone. Total links saved: {count} in {time.time() - start_time:.1f}s")