|
| 1 | +package parser |
| 2 | + |
import (
	"encoding/json"
	"fmt"
	"log"
	"os"
	"path/filepath"
	"regexp"
	"strings"

	"github.com/UTDNebula/nebula-api/api/schema"
	"go.mongodb.org/mongo-driver/bson/primitive"
)
| 14 | + |
// profilesRawFileName is the name of the scraped profile index file that
// LoadProfiles looks for inside its input directory.
const profilesRawFileName = "profiles_raw.json"

var (
	// apiPrimaryLocationRegex matches "<building><whitespace><digits>.<3 digits>"
	// with an optional single ASCII letter after the room digits, for example
	// "ECSS 3.101" or "JO 4.614A". Group 1 is the building, group 2 the room.
	apiPrimaryLocationRegex = regexp.MustCompile(`^(\w+)\s+(\d+\.\d{3}[A-Za-z]?)$`)

	// apiFallbackLocationRegex matches the compact form with no whitespace and
	// an optional dot, for example "ECSS3.101" or "ECSS3101". Group 1 is the
	// building letters, group 2 the digits before the (optional) dot, group 3
	// the three room digits plus optional letter suffix.
	//
	// Both patterns use [A-Za-z] rather than [A-z]: the range A-z also covers
	// the punctuation that sits between 'Z' and 'a' in ASCII ([ \ ] ^ _ `),
	// which would let strings such as "ECS_3.101" parse as a location.
	apiFallbackLocationRegex = regexp.MustCompile(`^([A-Za-z]+)(\d+)\.?([\d]{3}[A-Za-z]?)$`)
)
| 21 | + |
// profileIndexResponse is the top-level JSON document that LoadProfiles decodes
// from the raw profiles file.
type profileIndexResponse struct {
	// Count is decoded from "count"; it is not read by the code in this file.
	Count int `json:"count"`
	// Profile holds one row per profile; LoadProfiles iterates over it.
	Profile []profileIndexRow `json:"profile"`
}
| 26 | + |
// profileIndexRow is a single entry of profileIndexResponse.Profile and the
// input from which buildProfessorFromRow derives a professor record.
type profileIndexRow struct {
	ID int `json:"id"`
	// FullName is only used as a fallback when FirstName or LastName is blank.
	FullName  string `json:"full_name"`
	FirstName string `json:"first_name"`
	LastName  string `json:"last_name"`
	Slug      string `json:"slug"`
	// Public gates loading: LoadProfiles skips rows where this is false.
	Public bool `json:"public"`
	// URL is the first choice for the professor's profile URI.
	URL string `json:"url"`
	// Name is added by collectTitles as the first entry of the title list.
	Name string `json:"name"`
	// ImageURL is the first choice for the professor's image URI.
	ImageURL string `json:"image_url"`
	// APIURL is the last-resort profile URI.
	APIURL string `json:"api_url"`
	// Media is kept loosely typed; bestImageURI probes each entry for a
	// string value under the keys "url", "image_url", "src" and "uri".
	Media []map[string]any `json:"media"`
	// Information supplies titles, contact details, office location and
	// fallback URLs.
	Information []profileInformation `json:"information"`
	// Areas is decoded but not read by the code visible in this file.
	Areas []profileArea `json:"areas"`
}
| 42 | + |
// profileDetailsResponse carries the "information" and "areas" arrays of a
// profile on their own.
// NOTE(review): nothing in the visible part of this file references this type;
// presumably it models a per-profile details payload — confirm against callers.
type profileDetailsResponse struct {
	Information []profileInformation `json:"information"`
	Areas       []profileArea        `json:"areas"`
}
| 47 | + |
// profileInformation is one element of a profile's "information" array; the
// fields of interest live under its "data" object.
type profileInformation struct {
	Data profileInformationData `json:"data"`
}
| 51 | + |
// profileInformationData is the "data" object of a profileInformation entry.
// Every field is a plain string; informationScore counts how many of them are
// non-blank in order to rank entries.
type profileInformationData struct {
	// URL fields: bestProfileURI tries them in this order as fallbacks.
	URL           string `json:"url"`
	SecondaryURL  string `json:"secondary_url"`
	TertiaryURL   string `json:"tertiary_url"`
	QuaternaryURL string `json:"quaternary_url"`
	QuinaryURL    string `json:"quinary_url"`
	// Email and Phone become the professor's contact details.
	Email string `json:"email"`
	Phone string `json:"phone"`
	// Title fields: collectTitles gathers these, trimmed and de-duplicated.
	Title              string `json:"title"`
	SecondaryTitle     string `json:"secondary_title"`
	TertiaryTitle      string `json:"tertiary_title"`
	DistinguishedTitle string `json:"distinguished_title"`
	// Location is free text that parseAPILocation tries to split into a
	// building and a room.
	Location string `json:"location"`
	// The remaining fields only contribute to informationScore in this file.
	ProfileSummary       string `json:"profile_summary"`
	AcceptingStudents    string `json:"accepting_students"`
	NotAcceptingStudents string `json:"not_accepting_students"`
}
| 69 | + |
// profileArea is one element of a profile's "areas" array; its content lives
// under the "data" object. It is decoded but not otherwise read by the code
// visible in this file.
type profileArea struct {
	Data profileAreaData `json:"data"`
}
| 73 | + |
// profileAreaData is the "data" object of a profileArea entry: a title and a
// free-text description.
type profileAreaData struct {
	Title       string `json:"title"`
	Description string `json:"description"`
}
| 78 | + |
| 79 | +// LoadProfiles reads scraped profile API data and populates the package maps. |
| 80 | +func LoadProfiles(inDir string) bool { |
| 81 | + path := fmt.Sprintf("%s/%s", inDir, profilesRawFileName) |
| 82 | + fptr, err := os.Open(path) |
| 83 | + if err != nil { |
| 84 | + return false |
| 85 | + } |
| 86 | + defer fptr.Close() |
| 87 | + |
| 88 | + var response profileIndexResponse |
| 89 | + if err := json.NewDecoder(fptr).Decode(&response); err != nil { |
| 90 | + log.Printf("Failed to decode profiles JSON: %v", err) |
| 91 | + return false |
| 92 | + } |
| 93 | + |
| 94 | + loadedCount := 0 |
| 95 | + for _, row := range response.Profile { |
| 96 | + if !row.Public { |
| 97 | + continue |
| 98 | + } |
| 99 | + |
| 100 | + prof := buildProfessorFromRow(row) |
| 101 | + if prof == nil { |
| 102 | + continue |
| 103 | + } |
| 104 | + |
| 105 | + professorKey := prof.First_name + prof.Last_name |
| 106 | + if _, exists := Professors[professorKey]; exists { |
| 107 | + continue |
| 108 | + } |
| 109 | + Professors[professorKey] = prof |
| 110 | + ProfessorIDMap[prof.Id] = professorKey |
| 111 | + loadedCount++ |
| 112 | + } |
| 113 | + |
| 114 | + log.Printf("Loaded %d profiles from %s.", loadedCount, profilesRawFileName) |
| 115 | + return true |
| 116 | +} |
| 117 | + |
| 118 | +func buildProfessorFromRow(row profileIndexRow) *schema.Professor { |
| 119 | + |
| 120 | + firstName := strings.TrimSpace(row.FirstName) |
| 121 | + lastName := strings.TrimSpace(row.LastName) |
| 122 | + if firstName == "" || lastName == "" { |
| 123 | + firstName, lastName = splitFullName(row.FullName) |
| 124 | + } |
| 125 | + |
| 126 | + // Ignore blank names to match the parser's existing professor population behavior. |
| 127 | + if firstName == "" || lastName == "" { |
| 128 | + return nil |
| 129 | + } |
| 130 | + |
| 131 | + titles := collectTitles(row) |
| 132 | + info := bestInformationData(row.Information) |
| 133 | + |
| 134 | + prof := &schema.Professor{} |
| 135 | + prof.Id = primitive.NewObjectID() |
| 136 | + prof.First_name = firstName |
| 137 | + prof.Last_name = lastName |
| 138 | + prof.Titles = titles |
| 139 | + prof.Email = strings.TrimSpace(info.Email) |
| 140 | + prof.Phone_number = strings.TrimSpace(info.Phone) |
| 141 | + prof.Office = bestLocation(row.Information) |
| 142 | + prof.Profile_uri = bestProfileURI(row) |
| 143 | + prof.Image_uri = bestImageURI(row) |
| 144 | + prof.Office_hours = []schema.Meeting{} |
| 145 | + prof.Sections = []primitive.ObjectID{} |
| 146 | + |
| 147 | + return prof |
| 148 | +} |
| 149 | + |
// splitFullName splits a whitespace-separated full name into (first, last).
// The last word becomes the last name and all preceding words, joined by a
// single space, become the first name. A single word is returned as the first
// name with an empty last name; blank input yields two empty strings.
func splitFullName(fullName string) (string, string) {
	// strings.Fields already ignores leading and trailing whitespace.
	words := strings.Fields(fullName)
	switch n := len(words); n {
	case 0:
		return "", ""
	case 1:
		return words[0], ""
	default:
		lastIdx := n - 1
		return strings.Join(words[:lastIdx], " "), words[lastIdx]
	}
}
| 160 | + |
| 161 | +func parseAPILocation(text string) schema.Location { |
| 162 | + normalized := strings.TrimSpace(text) |
| 163 | + if normalized == "" { |
| 164 | + return schema.Location{} |
| 165 | + } |
| 166 | + |
| 167 | + var building string |
| 168 | + var room string |
| 169 | + |
| 170 | + submatches := apiPrimaryLocationRegex.FindStringSubmatch(normalized) |
| 171 | + if submatches == nil { |
| 172 | + submatches = apiFallbackLocationRegex.FindStringSubmatch(strings.ReplaceAll(normalized, " ", "")) |
| 173 | + if submatches == nil { |
| 174 | + return schema.Location{} |
| 175 | + } |
| 176 | + building = submatches[1] |
| 177 | + room = fmt.Sprintf("%s.%s", submatches[2], submatches[3]) |
| 178 | + } else { |
| 179 | + building = submatches[1] |
| 180 | + room = submatches[2] |
| 181 | + } |
| 182 | + |
| 183 | + return schema.Location{ |
| 184 | + Building: building, |
| 185 | + Room: room, |
| 186 | + Map_uri: fmt.Sprintf("https://locator.utdallas.edu/%s_%s", building, room), |
| 187 | + } |
| 188 | +} |
| 189 | + |
| 190 | +func collectTitles(row profileIndexRow) []string { |
| 191 | + titles := make([]string, 0, 8) |
| 192 | + if row.Name != "" { |
| 193 | + titles = append(titles, strings.TrimSpace(row.Name)) |
| 194 | + } |
| 195 | + |
| 196 | + for _, info := range row.Information { |
| 197 | + for _, candidate := range []string{info.Data.Title, info.Data.SecondaryTitle, info.Data.TertiaryTitle, info.Data.DistinguishedTitle} { |
| 198 | + trimmed := strings.TrimSpace(candidate) |
| 199 | + if trimmed == "" { |
| 200 | + continue |
| 201 | + } |
| 202 | + if !containsString(titles, trimmed) { |
| 203 | + titles = append(titles, trimmed) |
| 204 | + } |
| 205 | + } |
| 206 | + } |
| 207 | + |
| 208 | + return titles |
| 209 | +} |
| 210 | + |
| 211 | +func bestInformationData(items []profileInformation) profileInformationData { |
| 212 | + if len(items) == 0 { |
| 213 | + return profileInformationData{} |
| 214 | + } |
| 215 | + |
| 216 | + best := items[0].Data |
| 217 | + bestScore := informationScore(best) |
| 218 | + |
| 219 | + for _, item := range items[1:] { |
| 220 | + score := informationScore(item.Data) |
| 221 | + if score > bestScore { |
| 222 | + best = item.Data |
| 223 | + bestScore = score |
| 224 | + } |
| 225 | + } |
| 226 | + |
| 227 | + return best |
| 228 | +} |
| 229 | + |
| 230 | +func informationScore(data profileInformationData) int { |
| 231 | + score := 0 |
| 232 | + for _, value := range []string{ |
| 233 | + data.Email, |
| 234 | + data.Phone, |
| 235 | + data.Location, |
| 236 | + data.URL, |
| 237 | + data.SecondaryURL, |
| 238 | + data.TertiaryURL, |
| 239 | + data.QuaternaryURL, |
| 240 | + data.QuinaryURL, |
| 241 | + data.Title, |
| 242 | + data.SecondaryTitle, |
| 243 | + data.TertiaryTitle, |
| 244 | + data.DistinguishedTitle, |
| 245 | + data.ProfileSummary, |
| 246 | + data.AcceptingStudents, |
| 247 | + data.NotAcceptingStudents, |
| 248 | + } { |
| 249 | + if strings.TrimSpace(value) != "" { |
| 250 | + score++ |
| 251 | + } |
| 252 | + } |
| 253 | + |
| 254 | + return score |
| 255 | +} |
| 256 | + |
| 257 | +func bestLocation(items []profileInformation) schema.Location { |
| 258 | + for _, item := range items { |
| 259 | + location := parseAPILocation(item.Data.Location) |
| 260 | + if location.Building != "" || location.Room != "" { |
| 261 | + return location |
| 262 | + } |
| 263 | + } |
| 264 | + |
| 265 | + return schema.Location{} |
| 266 | +} |
| 267 | + |
| 268 | +func bestProfileURI(row profileIndexRow) string { |
| 269 | + if trimmed := strings.TrimSpace(row.URL); trimmed != "" { |
| 270 | + return trimmed |
| 271 | + } |
| 272 | + |
| 273 | + for _, info := range row.Information { |
| 274 | + for _, candidate := range []string{info.Data.URL, info.Data.SecondaryURL, info.Data.TertiaryURL, info.Data.QuaternaryURL, info.Data.QuinaryURL} { |
| 275 | + trimmed := strings.TrimSpace(candidate) |
| 276 | + if trimmed != "" { |
| 277 | + return trimmed |
| 278 | + } |
| 279 | + } |
| 280 | + } |
| 281 | + |
| 282 | + for _, candidate := range []string{row.APIURL} { |
| 283 | + trimmed := strings.TrimSpace(candidate) |
| 284 | + if trimmed != "" { |
| 285 | + return trimmed |
| 286 | + } |
| 287 | + } |
| 288 | + |
| 289 | + return "" |
| 290 | +} |
| 291 | + |
| 292 | +func bestImageURI(row profileIndexRow) string { |
| 293 | + if trimmed := strings.TrimSpace(row.ImageURL); trimmed != "" { |
| 294 | + return trimmed |
| 295 | + } |
| 296 | + |
| 297 | + for _, media := range row.Media { |
| 298 | + for _, key := range []string{"url", "image_url", "src", "uri"} { |
| 299 | + if raw, exists := media[key]; exists { |
| 300 | + if str, ok := raw.(string); ok { |
| 301 | + trimmed := strings.TrimSpace(str) |
| 302 | + if trimmed != "" { |
| 303 | + return trimmed |
| 304 | + } |
| 305 | + } |
| 306 | + } |
| 307 | + } |
| 308 | + } |
| 309 | + |
| 310 | + return "" |
| 311 | +} |
| 312 | + |
| 313 | +func firstInformationData(items []profileInformation) profileInformationData { |
| 314 | + if len(items) == 0 { |
| 315 | + return profileInformationData{} |
| 316 | + } |
| 317 | + return items[0].Data |
| 318 | +} |
| 319 | + |
// containsString reports whether target occurs in values, using exact
// (case-sensitive) string comparison.
func containsString(values []string, target string) bool {
	for i := 0; i < len(values); i++ {
		if values[i] == target {
			return true
		}
	}
	return false
}
0 commit comments