Skip to content

Commit 83f42ff

Browse files
committed
Update profiles parser and related staged changes
1 parent d61f6d4 commit 83f42ff

6 files changed

Lines changed: 797 additions & 237 deletions

File tree

README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,11 @@ Run the tool by changing directory using `cd` to the `api-tools` directory and r
7272
| `./api-tools -scrape -headless` | Runs ChromeDP in headless mode. |
7373
| `./api-tools -o [directory]` | Sets output directory (default: `./data`). |
7474

75+
For profile scraping, you can optionally scope requests by school to reduce API load:
76+
- Set `PROFILE_SCHOOLS` to a comma/semicolon/space-separated list (example: `PROFILE_SCHOOLS=ECS;BBS;AHT`).
77+
- Then run `./api-tools -scrape -profiles` as usual.
78+
- If `PROFILE_SCHOOLS` is not set, the scraper defaults to batched `person` slug requests.
79+
7580
### Parsing Mode:
7681

7782
| Command | Description |

parser/profileLoader.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,13 @@ import (
1010
)
1111

1212
func loadProfiles(inDir string) {
13+
if LoadProfiles(inDir) {
14+
return
15+
}
16+
1317
fptr, err := os.Open(fmt.Sprintf("%s/profiles.json", inDir))
1418
if err != nil {
15-
log.Print("Couldn't find/open profiles.json in the input directory. Skipping profile load.")
19+
log.Print("Couldn't find/open profiles_raw.json or profiles.json in the input directory. Skipping profile load.")
1620
return
1721
}
1822

parser/profiles.go

Lines changed: 327 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,327 @@
1+
package parser
2+
3+
import (
4+
"encoding/json"
5+
"fmt"
6+
"log"
7+
"os"
8+
"regexp"
9+
"strings"
10+
11+
"github.com/UTDNebula/nebula-api/api/schema"
12+
"go.mongodb.org/mongo-driver/bson/primitive"
13+
)
14+
15+
// profilesRawFileName is the name of the scraped profiles file that
// LoadProfiles looks for inside the input directory.
const profilesRawFileName = "profiles_raw.json"
16+
17+
// Patterns used by parseAPILocation to split a location string into a
// building code and a room number.
//
// The letter classes are written as [A-Za-z] on purpose: the range [A-z]
// spans ASCII 0x41-0x7A and therefore also accepts the punctuation characters
// '[', '\', ']', '^', '_' and '`', which would let malformed strings through.
var (
	// apiPrimaryLocationRegex matches "<building> <floor>.<3-digit room>" with
	// an optional single trailing letter. Capture 1 is the building and
	// capture 2 is the room.
	apiPrimaryLocationRegex = regexp.MustCompile(`^(\w+)\s+(\d+\.\d{3}[A-Za-z]?)$`)

	// apiFallbackLocationRegex matches the same information written without
	// whitespace and with an optional dot. Capture 1 is the building, capture 2
	// is the floor, and capture 3 is the 3-digit room plus optional letter.
	apiFallbackLocationRegex = regexp.MustCompile(`^([A-Za-z]+)(\d+)\.?(\d{3}[A-Za-z]?)$`)
)
21+
22+
// profileIndexResponse is the top-level JSON document that LoadProfiles
// decodes from the raw profiles file: a count plus the list of profile rows.
type profileIndexResponse struct {
23+
Count int `json:"count"`
24+
Profile []profileIndexRow `json:"profile"`
25+
}
26+
27+
// profileIndexRow is a single entry of profileIndexResponse.Profile.
// LoadProfiles skips rows whose Public flag is false; the remaining fields
// feed buildProfessorFromRow and its helpers (names, titles, URIs, media and
// the nested information records).
type profileIndexRow struct {
28+
ID int `json:"id"`
29+
FullName string `json:"full_name"`
30+
FirstName string `json:"first_name"`
31+
LastName string `json:"last_name"`
32+
Slug string `json:"slug"`
33+
Public bool `json:"public"`
34+
URL string `json:"url"`
35+
Name string `json:"name"`
36+
ImageURL string `json:"image_url"`
37+
APIURL string `json:"api_url"`
38+
Media []map[string]any `json:"media"`
39+
Information []profileInformation `json:"information"`
40+
Areas []profileArea `json:"areas"`
41+
}
42+
43+
// profileDetailsResponse carries only the information and areas lists of a
// profile. NOTE(review): nothing in the visible part of this file references
// this type — confirm whether another file decodes into it.
type profileDetailsResponse struct {
44+
Information []profileInformation `json:"information"`
45+
Areas []profileArea `json:"areas"`
46+
}
47+
48+
// profileInformation wraps one information record; the actual fields live
// under the JSON "data" key.
type profileInformation struct {
49+
Data profileInformationData `json:"data"`
50+
}
51+
52+
// profileInformationData holds the string fields of one information record.
// Every field is decoded as a plain string; informationScore counts how many
// of them are non-blank, and the title, URL, email, phone and location fields
// are read by the professor-building helpers.
type profileInformationData struct {
53+
URL string `json:"url"`
54+
SecondaryURL string `json:"secondary_url"`
55+
TertiaryURL string `json:"tertiary_url"`
56+
QuaternaryURL string `json:"quaternary_url"`
57+
QuinaryURL string `json:"quinary_url"`
58+
Email string `json:"email"`
59+
Phone string `json:"phone"`
60+
Title string `json:"title"`
61+
SecondaryTitle string `json:"secondary_title"`
62+
TertiaryTitle string `json:"tertiary_title"`
63+
DistinguishedTitle string `json:"distinguished_title"`
64+
Location string `json:"location"`
65+
ProfileSummary string `json:"profile_summary"`
66+
AcceptingStudents string `json:"accepting_students"`
67+
NotAcceptingStudents string `json:"not_accepting_students"`
68+
}
69+
70+
// profileArea wraps one area record; the actual fields live under the JSON
// "data" key. The visible code decodes areas but does not read them.
type profileArea struct {
71+
Data profileAreaData `json:"data"`
72+
}
73+
74+
// profileAreaData holds the title and description of one area record.
type profileAreaData struct {
75+
Title string `json:"title"`
76+
Description string `json:"description"`
77+
}
78+
79+
// LoadProfiles reads scraped profile API data and populates the package maps.
80+
func LoadProfiles(inDir string) bool {
81+
path := fmt.Sprintf("%s/%s", inDir, profilesRawFileName)
82+
fptr, err := os.Open(path)
83+
if err != nil {
84+
return false
85+
}
86+
defer fptr.Close()
87+
88+
var response profileIndexResponse
89+
if err := json.NewDecoder(fptr).Decode(&response); err != nil {
90+
log.Printf("Failed to decode profiles JSON: %v", err)
91+
return false
92+
}
93+
94+
loadedCount := 0
95+
for _, row := range response.Profile {
96+
if !row.Public {
97+
continue
98+
}
99+
100+
prof := buildProfessorFromRow(row)
101+
if prof == nil {
102+
continue
103+
}
104+
105+
professorKey := prof.First_name + prof.Last_name
106+
if _, exists := Professors[professorKey]; exists {
107+
continue
108+
}
109+
Professors[professorKey] = prof
110+
ProfessorIDMap[prof.Id] = professorKey
111+
loadedCount++
112+
}
113+
114+
log.Printf("Loaded %d profiles from %s.", loadedCount, profilesRawFileName)
115+
return true
116+
}
117+
118+
func buildProfessorFromRow(row profileIndexRow) *schema.Professor {
119+
120+
firstName := strings.TrimSpace(row.FirstName)
121+
lastName := strings.TrimSpace(row.LastName)
122+
if firstName == "" || lastName == "" {
123+
firstName, lastName = splitFullName(row.FullName)
124+
}
125+
126+
// Ignore blank names to match the parser's existing professor population behavior.
127+
if firstName == "" || lastName == "" {
128+
return nil
129+
}
130+
131+
titles := collectTitles(row)
132+
info := bestInformationData(row.Information)
133+
134+
prof := &schema.Professor{}
135+
prof.Id = primitive.NewObjectID()
136+
prof.First_name = firstName
137+
prof.Last_name = lastName
138+
prof.Titles = titles
139+
prof.Email = strings.TrimSpace(info.Email)
140+
prof.Phone_number = strings.TrimSpace(info.Phone)
141+
prof.Office = bestLocation(row.Information)
142+
prof.Profile_uri = bestProfileURI(row)
143+
prof.Image_uri = bestImageURI(row)
144+
prof.Office_hours = []schema.Meeting{}
145+
prof.Sections = []primitive.ObjectID{}
146+
147+
return prof
148+
}
149+
150+
// splitFullName splits a whitespace-separated full name into (first, last).
//
// The last word becomes the last name and all preceding words, joined by a
// single space, become the first name. A single-word name is returned as the
// first name with an empty last name; blank input yields two empty strings.
func splitFullName(fullName string) (string, string) {
	// strings.Fields already discards leading, trailing and repeated spaces.
	words := strings.Fields(fullName)
	switch n := len(words); n {
	case 0:
		return "", ""
	case 1:
		return words[0], ""
	default:
		lastIdx := n - 1
		return strings.Join(words[:lastIdx], " "), words[lastIdx]
	}
}
160+
161+
func parseAPILocation(text string) schema.Location {
162+
normalized := strings.TrimSpace(text)
163+
if normalized == "" {
164+
return schema.Location{}
165+
}
166+
167+
var building string
168+
var room string
169+
170+
submatches := apiPrimaryLocationRegex.FindStringSubmatch(normalized)
171+
if submatches == nil {
172+
submatches = apiFallbackLocationRegex.FindStringSubmatch(strings.ReplaceAll(normalized, " ", ""))
173+
if submatches == nil {
174+
return schema.Location{}
175+
}
176+
building = submatches[1]
177+
room = fmt.Sprintf("%s.%s", submatches[2], submatches[3])
178+
} else {
179+
building = submatches[1]
180+
room = submatches[2]
181+
}
182+
183+
return schema.Location{
184+
Building: building,
185+
Room: room,
186+
Map_uri: fmt.Sprintf("https://locator.utdallas.edu/%s_%s", building, room),
187+
}
188+
}
189+
190+
func collectTitles(row profileIndexRow) []string {
191+
titles := make([]string, 0, 8)
192+
if row.Name != "" {
193+
titles = append(titles, strings.TrimSpace(row.Name))
194+
}
195+
196+
for _, info := range row.Information {
197+
for _, candidate := range []string{info.Data.Title, info.Data.SecondaryTitle, info.Data.TertiaryTitle, info.Data.DistinguishedTitle} {
198+
trimmed := strings.TrimSpace(candidate)
199+
if trimmed == "" {
200+
continue
201+
}
202+
if !containsString(titles, trimmed) {
203+
titles = append(titles, trimmed)
204+
}
205+
}
206+
}
207+
208+
return titles
209+
}
210+
211+
func bestInformationData(items []profileInformation) profileInformationData {
212+
if len(items) == 0 {
213+
return profileInformationData{}
214+
}
215+
216+
best := items[0].Data
217+
bestScore := informationScore(best)
218+
219+
for _, item := range items[1:] {
220+
score := informationScore(item.Data)
221+
if score > bestScore {
222+
best = item.Data
223+
bestScore = score
224+
}
225+
}
226+
227+
return best
228+
}
229+
230+
func informationScore(data profileInformationData) int {
231+
score := 0
232+
for _, value := range []string{
233+
data.Email,
234+
data.Phone,
235+
data.Location,
236+
data.URL,
237+
data.SecondaryURL,
238+
data.TertiaryURL,
239+
data.QuaternaryURL,
240+
data.QuinaryURL,
241+
data.Title,
242+
data.SecondaryTitle,
243+
data.TertiaryTitle,
244+
data.DistinguishedTitle,
245+
data.ProfileSummary,
246+
data.AcceptingStudents,
247+
data.NotAcceptingStudents,
248+
} {
249+
if strings.TrimSpace(value) != "" {
250+
score++
251+
}
252+
}
253+
254+
return score
255+
}
256+
257+
func bestLocation(items []profileInformation) schema.Location {
258+
for _, item := range items {
259+
location := parseAPILocation(item.Data.Location)
260+
if location.Building != "" || location.Room != "" {
261+
return location
262+
}
263+
}
264+
265+
return schema.Location{}
266+
}
267+
268+
func bestProfileURI(row profileIndexRow) string {
269+
if trimmed := strings.TrimSpace(row.URL); trimmed != "" {
270+
return trimmed
271+
}
272+
273+
for _, info := range row.Information {
274+
for _, candidate := range []string{info.Data.URL, info.Data.SecondaryURL, info.Data.TertiaryURL, info.Data.QuaternaryURL, info.Data.QuinaryURL} {
275+
trimmed := strings.TrimSpace(candidate)
276+
if trimmed != "" {
277+
return trimmed
278+
}
279+
}
280+
}
281+
282+
for _, candidate := range []string{row.APIURL} {
283+
trimmed := strings.TrimSpace(candidate)
284+
if trimmed != "" {
285+
return trimmed
286+
}
287+
}
288+
289+
return ""
290+
}
291+
292+
func bestImageURI(row profileIndexRow) string {
293+
if trimmed := strings.TrimSpace(row.ImageURL); trimmed != "" {
294+
return trimmed
295+
}
296+
297+
for _, media := range row.Media {
298+
for _, key := range []string{"url", "image_url", "src", "uri"} {
299+
if raw, exists := media[key]; exists {
300+
if str, ok := raw.(string); ok {
301+
trimmed := strings.TrimSpace(str)
302+
if trimmed != "" {
303+
return trimmed
304+
}
305+
}
306+
}
307+
}
308+
}
309+
310+
return ""
311+
}
312+
313+
func firstInformationData(items []profileInformation) profileInformationData {
314+
if len(items) == 0 {
315+
return profileInformationData{}
316+
}
317+
return items[0].Data
318+
}
319+
320+
// containsString reports whether target is exactly equal to any element of
// values. A nil or empty slice never contains anything.
func containsString(values []string, target string) bool {
	for i := 0; i < len(values); i++ {
		if values[i] == target {
			return true
		}
	}
	return false
}

0 commit comments

Comments
 (0)