Skip to content

Commit a497c6b

Browse files
committed
Cloned files, added jpegs, keep .gitignore
1 parent 0ea4dfa commit a497c6b

21 files changed

Lines changed: 1338 additions & 0 deletions

rekognition-ocr/.gitignore

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
.DS_Store
2+
.ipynb_checkpoints/
3+
data/.DS_Store
4+
data/images
5+
data/
6+
output/
7+
__pycache__/
8+
python/old/
9+
python/show_df.ipynb
10+
python/ocr_examples.py
11+
python/read_ocr_response.py
12+
!/data/

rekognition-ocr/Python/lib/__init__.py

Whitespace-only changes.
Lines changed: 291 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,291 @@
1+
import os
2+
import pandas as pd
3+
import re
4+
import requests
5+
from requests import get
6+
7+
class mechScraper(object):
    """
    Scrape mech data from the wiki.mwomercs.com weight-class pages.

    Each weight-class page is fetched in its ``action=edit`` form, the wiki
    markup is parsed with regular expressions, and the result is returned
    as pandas dataframes that can be written to pipe-delimited text files.
    """

    # Wiki-markup patterns. The literal "[" inside the character classes is
    # escaped so that ``re`` does not emit a "possible nested set"
    # FutureWarning; the set of matched characters is unchanged.
    _MECH_RE = re.compile(r'===\s[\w\s-]+[\s()A-Z0-9-]*\s===')
    _TONNAGE_RE = re.compile(r'Tonnage[\']*:[\s\d+]+')
    _CHASSIS_RE = re.compile(r'Var\w\wnts[\']+:[\sa-zA-Z0-9-,]+')
    _HERO_RE = re.compile(r'[\']+Hero[\']+:[,\s\[()\.\'\w-]+')
    _CHAMPION_RE = re.compile(r'[\']+Champion[\']+:\s?[+\s\[()\w-]+')
    _SPECIAL_RE = re.compile(r'[\']+Special[\']+:\s?[\/,\s\[()\w-]*')

    # Known typos in the scraped data, keyed by the text as scraped.
    _HERO_NAME_FIXES = {"Wrat": "Wrath", "Hi Ther": "Hi There"}
    _HERO_CHASSIS_FIXES = {
        "HMN-PK": "HMN-PA",
        "EBJ-ESP": "EBJ-EC",
        "MKII-DS": "MCII-DS",
    }

    # Column order of the dataframe returned by get_mech_df.
    _COLUMNS = ["mechs", "tonnage", "hero_names", "hero_chassis", "variants",
                "special_variants", "champion_variants"]

    def __init__(self):
        """
        Set initial class variables:
        - mech data urls (one per weight class)
        - default output directory
        """
        base = "https://wiki.mwomercs.com/index.php?title="
        self.light_url = base + "Light_Mechs&action=edit"
        self.medium_url = base + "Medium_Mechs&action=edit"
        self.heavy_url = base + "Heavy_Mechs&action=edit"
        self.assault_url = base + "Assault_Mechs&action=edit"
        self.output_path = "../output/"

    def get_mech_df(self, url=None):
        """
        Scrapes page data from a passed URL to extract:
        - mech names
        - mech tonnage
        - chassis variants (base, hero, special and champion)
        returns the data as a pandas dataframe (see parse_mech_page).

        Returns None, after printing a message, when no URL is supplied.
        Raises requests.HTTPError when the server answers with an error
        status, so an error page is never parsed as if it were mech data.
        """
        if not url:
            print("must pass URL")
            return None

        print("scraping " + url)
        # A timeout keeps the scrape from hanging on a stalled connection.
        page = requests.get(url, timeout=30)
        page.raise_for_status()
        return self.parse_mech_page(page.text)

    def parse_mech_page(self, page_string):
        """
        Parse the wiki markup of one weight-class page.

        page_string: the page text as returned by the wiki.

        Returns a dataframe with one row per mech and the columns
        mechs, tonnage, hero_names, hero_chassis, variants,
        special_variants and champion_variants. tonnage is kept as a
        string; the chassis columns hold lists of strings.

        Raises ValueError (from pandas) when the page does not yield the
        same number of names, tonnages, variant lists and hero entries.
        """
        mech_names = [found.group().replace("===", "").strip()
                      for found in self._MECH_RE.finditer(page_string)]
        # The tonnage is the last three characters of the match.
        mech_weights = [found.group().replace("\n", "")[-3:].strip()
                        for found in self._TONNAGE_RE.finditer(page_string)]
        # [12:] skips the "Variants''':" label; the result is a list of lists.
        chassis_variants = [
            found.group().replace("\n", "")[12:].replace(",", "").split()
            for found in self._CHASSIS_RE.finditer(page_string)
        ]

        # [11:] skips the "'''Hero''':" label.
        hero_text = [found.group().replace("\n", "")[11:].strip()
                     for found in self._HERO_RE.finditer(page_string)]
        hero_names = [self._clean_hero_name(text) for text in hero_text]
        hero_variants = [self._clean_hero_variants(text) for text in hero_text]

        champion_variants = [
            self._label_value(found.group())
            for found in self._CHAMPION_RE.finditer(page_string)
        ]
        # "n" is what the pattern captures from an "n/a" entry.
        champion_variants = [champ for champ in champion_variants
                             if champ != "n"]

        special_list = self._collect_specials(page_string)

        for variants in hero_variants:
            if variants[0] == "HMN-PK":
                variants[0] = "HMN-PA"
                print("Fixing HMN-PK: ", variants)
            else:
                variants[0] = self._HERO_CHASSIS_FIXES.get(variants[0],
                                                           variants[0])
        print()

        # Special and champion variants are listed separately from the base
        # chassis, so they are matched back to a mech by chassis designation.
        mech_dict = {
            "mechs": mech_names,
            "tonnage": mech_weights,
            "hero_names": hero_names,
            "hero_chassis": hero_variants,
            "variants": chassis_variants,
            "special_variants": [
                self._match_to_chassis(variants, special_list)
                for variants in chassis_variants
            ],
            "champion_variants": [
                self._match_to_chassis(variants, champion_variants)
                for variants in chassis_variants
            ],
        }
        return pd.DataFrame(mech_dict, columns=self._COLUMNS)

    @staticmethod
    def _label_value(text):
        """Return the text after the first colon with all spaces removed."""
        return text[text.index(":") + 1:].strip().replace(" ", "")

    def _clean_hero_name(self, text):
        """Return the hero name: the text in front of the chassis in parentheses."""
        # partition() keeps the whole text when there is no "(", whereas
        # slicing with find() == -1 would silently drop the last character.
        name = text.partition("(")[0].strip()
        # The hero pattern can run on into the following '''Special''' label.
        name = name.replace("'''Special'''", "").replace("'''Special''", "")
        name = name.strip()
        return self._HERO_NAME_FIXES.get(name, name)

    @staticmethod
    def _clean_hero_variants(text):
        """Return the hero chassis designations of one hero entry as a list."""
        # fix Archer Tempest hero typo
        if "ACR-T" in text:
            text = text.replace("ACR-T", "ARC-T")
            print("Archer Tempest fixed \n\n")
        if "(" in text:
            # take from open parenthesis to the right
            text = text[text.index("("):].replace("'''Special'''", "")

        if "," not in text:
            text = text.replace("'''Special'''", "")
            return [text.replace("(", "").replace(")", "")]

        variants = []
        for part in text.split(","):
            if "(" in part:
                part = part[part.find("(") + 1:]
            variants.append(part.replace(")", ""))
        return variants

    def _collect_specials(self, page_string):
        """Return every special variant on the page as one flat list."""
        # A flat list is used because there are fewer special variants than
        # chassis; they are matched to their chassis afterwards.
        scraped = []
        for found in self._SPECIAL_RE.finditer(page_string):
            scraped.extend(self._label_value(found.group()).split(","))

        # Fix errors in screen pull data
        special_list = []
        split_off = []
        for special in scraped:
            if special == "ACR-2R(S)":
                print("Archer special fixed")
                special = "ARC-2R(S)"
            if special == "SMNM-F(L)SMN-M(L)":
                # two variants scraped as a single run-together entry
                special = "SMNM-F(L)"
                split_off.append("SMN-M(L)")
                print("Fixing SMNM-F(L) and SMNM-F(L)")
            special_list.append(special)
        return special_list + split_off

    @staticmethod
    def _match_to_chassis(base_variants, candidates):
        """
        Return the candidates that belong to the chassis of base_variants.

        The first three letters of the first base variant are the match
        key. "IIC" must be present in both designations or in neither,
        which keeps clan IIC models apart from inner sphere variants that
        share the same letters.
        """
        if not base_variants:
            return []
        base = base_variants[0]
        clan = "IIC" in base
        mech_letters = base[:3].upper()
        return [candidate for candidate in candidates
                if candidate[:3].upper() == mech_letters
                and ("IIC" in candidate) == clan]

    @staticmethod
    def build_variant_weights(all_weights_df):
        """
        Flatten a mech dataframe to one row per chassis variant.

        all_weights_df: dataframe with the columns returned by get_mech_df.

        Returns a dataframe with the columns mech_name, tonnage and variant
        (upper-cased). Hero chassis are listed under their hero name, all
        other variants under the mech name. Duplicate rows are removed,
        keeping the first occurrence.
        """
        # (column holding the name, column holding the list of variants)
        sources = (
            ("mechs", "variants"),
            ("hero_names", "hero_chassis"),
            ("mechs", "special_variants"),
            ("mechs", "champion_variants"),
        )
        # Rows are collected in a list and converted once; concatenating a
        # dataframe per row copies the whole table on every append.
        rows = []
        for _, row in all_weights_df.iterrows():
            for name_col, variant_col in sources:
                for variant in row[variant_col]:
                    rows.append({
                        "mech_name": row[name_col],
                        "tonnage": row["tonnage"],
                        "variant": variant.upper(),
                    })

        variant_weights_df = pd.DataFrame(
            rows, columns=["mech_name", "tonnage", "variant"])
        variant_weights_df = variant_weights_df.drop_duplicates(keep="first")
        return variant_weights_df.reset_index(drop=True)

    def save_data(self, data, weight_class, output_path=None):
        """
        Writes a pandas df to disc.
        Uses the weight class as a name for pipe-delimited text file.

        data: dataframe to write.
        weight_class: file name without extension; ".txt" is appended.
        output_path: target directory, created when missing. Defaults to
            self.output_path.
        """
        if not output_path:
            output_path = self.output_path
        os.makedirs(output_path, exist_ok=True)

        print("saving data for " + weight_class)
        # os.path.join also works when output_path has no trailing separator.
        target = os.path.join(output_path, weight_class + ".txt")
        data.to_csv(target, sep="|", index=False)

    def main(self):
        """
        Scrapes URLs for mech data and compiles them to
        pandas dataframes before writing them to disk.

        Writes one file per weight class, "all_weights" with every mech and
        "variant_weights" with one row per chassis variant.
        """
        assault_mech_df = self.get_mech_df(url=self.assault_url)
        heavy_mech_df = self.get_mech_df(url=self.heavy_url)
        medium_mech_df = self.get_mech_df(url=self.medium_url)
        light_mech_df = self.get_mech_df(url=self.light_url)
        all_weights_df = pd.concat([assault_mech_df, heavy_mech_df,
                                    medium_mech_df, light_mech_df])

        self.save_data(assault_mech_df, "assault")
        self.save_data(heavy_mech_df, "heavy")
        self.save_data(medium_mech_df, "medium")
        self.save_data(light_mech_df, "light")
        self.save_data(all_weights_df, "all_weights")

        variant_weights_df = self.build_variant_weights(all_weights_df)
        self.save_data(variant_weights_df, "variant_weights")
289+
290+
if __name__ == "__main__":
    # Script entry point: scrape every weight class and write the output files.
    scraper = mechScraper()
    scraper.main()

0 commit comments

Comments
 (0)