Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
6d90594
Initial coursebook scraper retry setup
justinschwerdtfeger Feb 27, 2026
58d66ca
staticcheck fix - arg is already a string
justinschwerdtfeger Feb 27, 2026
725bee8
Add repeat error detection. (needs to have flag)
justinschwerdtfeger Apr 1, 2026
9f783f0
Change getting section content to error instead of panic
justinschwerdtfeger Apr 1, 2026
0cd852b
Merge remote-tracking branch 'upstream/develop' into coursebook-error…
justinschwerdtfeger Apr 1, 2026
0b193d5
Add retry flag to coursebook scraper
justinschwerdtfeger Apr 1, 2026
007fb97
Move coursebook retry from main.go to coursebook.go
justinschwerdtfeger Apr 1, 2026
fc4beef
Handle network errors inline
justinschwerdtfeger Apr 2, 2026
a096768
Merge remote-tracking branch 'upstream/develop' into coursebook-error…
justinschwerdtfeger Apr 3, 2026
4af3991
Merge pull request #154 from justinschwerdtfeger/coursebook-error-retry
justinschwerdtfeger Apr 3, 2026
dc911fc
Improve Verbose logger to output line number of calling function
justinschwerdtfeger Apr 3, 2026
2ca86d3
Simplify Coursebook scraper logic
justinschwerdtfeger Apr 3, 2026
8c1acad
Add netid error
justinschwerdtfeger Apr 3, 2026
ccc1101
Add todo for log format
justinschwerdtfeger Apr 3, 2026
d6957a4
Improve retry logic. needs more work
justinschwerdtfeger Apr 3, 2026
de60e7a
Add Panic recovery to coursebook scraper
justinschwerdtfeger Apr 8, 2026
286f2b3
Merge branch 'coursebook-retry' of github.com:UTDNebula/api-tools int…
justinschwerdtfeger Apr 8, 2026
ea956ad
Merge branch 'develop' into coursebook-retry
justinschwerdtfeger Apr 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 6 additions & 8 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import (

func main() {
// Load environment variables
godotenv.Load()
godotenv.Load() // TODO: I Don't think this does anything

// Setup flags

Expand All @@ -33,6 +33,7 @@ func main() {
term := flag.String("term", "", "Alongside -coursebook, specifies the term to scrape, i.e. 23S")
startPrefix := flag.String("startprefix", "", "Alongside -coursebook, specifies the course prefix to start scraping from, i.e. cp_span")
resume := flag.Bool("resume", false, "Alongside -coursebook, signifies that scraping should begin at the last complete prefix and should not re-scrape existing data")
retry := flag.Int("retry", 0, "Alongside -coursebook, specifies how many times to retry before quitting")

// Flag for profile scraping
scrapeProfiles := flag.Bool("profiles", false, "Alongside -scrape, signifies that professor profiles should be scraped.")
Expand Down Expand Up @@ -80,15 +81,15 @@ func main() {
dateTime := time.Now()
year, month, day := dateTime.Date()
hour, min, sec := dateTime.Clock()
logFile, err := os.Create(fmt.Sprintf("./logs/%d-%d-%dT%d-%d-%d.log", month, day, year, hour, min, sec))
logFile, err := os.Create(fmt.Sprintf("./logs/%d-%d-%dT%d-%d-%d.log", month, day, year, hour, min, sec)) // TODO: This should probably be year month day

if err != nil {
log.Fatal(err)
}

defer logFile.Close()
// Set logging output destination to a SplitWriter that writes to both the log file and stdout
log.SetOutput(utils.NewSplitWriter(logFile, os.Stdout))
// Set logging output destination to a SplitWriter that writes to both the log file and stderr
log.SetOutput(utils.NewSplitWriter(logFile, os.Stdout)) // TODO: Switch to stderr
// Do verbose logging if verbose flag specified
if *verbose {
log.SetFlags(log.Ltime | log.Lmicroseconds | log.Lshortfile | utils.Lverbose)
Expand All @@ -104,10 +105,7 @@ func main() {
case *scrapeProfiles:
scrapers.ScrapeProfiles(*outDir)
case *scrapeCoursebook:
if *term == "" {
log.Panic("No term specified for coursebook scraping! Use -term to specify.")
}
scrapers.ScrapeCoursebook(*term, *startPrefix, *outDir, *resume)
scrapers.ScrapeCoursebook(*term, *startPrefix, *outDir, *resume, *retry)
case *scrapeDiscounts:
scrapers.ScrapeDiscounts(*outDir)
case *cometCalendar:
Expand Down
220 changes: 156 additions & 64 deletions scrapers/coursebook.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,82 +30,162 @@ var (
)

const (
reqThrottle = 400 * time.Millisecond
prefixThrottle = 5 * time.Second
httpTimeout = 10 * time.Second
reqThrottle = 400 * time.Millisecond
prefixThrottle = 5 * time.Second
httpTimeout = 10 * time.Second
getSectionContentRetryCount = 8
getSectionIdsForPrefixRetryCount = 8
)

// ScrapeCoursebook scrapes utd coursebook for the provided term (semester)
func ScrapeCoursebook(term string, startPrefix string, outDir string, resume bool) {
// ScrapeCoursebook Scrapes utd coursebook for provided term with specified options
func ScrapeCoursebook(term string, startPrefix string, outDir string, resume bool, retry int) {
// Make sure variables are set right before starting scraping
if term == "" {
log.Fatal("Coursebook Scraping Setup Failed: No term specified for coursebook scraping! Use -term to specify.")
}
if startPrefix != "" && !prefixRegex.MatchString(startPrefix) {
log.Fatalf("Invalid starting prefix %s, must match format cp_{abcde}", startPrefix)
log.Fatalf("Coursebook Scraping Setup Failed: invalid starting prefix %s, must match format cp_{abcde}", startPrefix)
}
if !termRegex.MatchString(term) {
log.Fatalf("Invalid term %s, must match format {00-99}{s/f/u}", term)
log.Fatalf("Coursebook Scraping Setup Failed: invalid term %s, must match format {00-99}{s/f/u}", term)
}
_, err := utils.GetEnv("LOGIN_NETID")
if err != nil {
log.Fatalf("Coursebook Scraping Setup Failed: LOGIN_NETID environment variable was missing. Enter a valid UTD Net ID in .env")
}
_, err = utils.GetEnv("LOGIN_PASSWORD")
if err != nil {
log.Fatalf("Coursebook Scraping Setup Failed: LOGIN_PASSWORD environment variable was missing. Enter a valid Password for UTD Net ID in .env")
}

// if retry is set to something other than 0, and an unexpected error or panic makes it back here, we ca
var lastErr error = nil
repeatErrCount := 0
panicCount := 0
for repeatErrCount <= retry && panicCount <= retry {
err := func() (funcErr error) {
// Recover potential panic from the below and retry
defer func() {
if r := recover(); r != nil {
panicCount++
funcErr = fmt.Errorf("coursebook Scraping Panicked: %v (panic %d of %d)", r, panicCount, retry+1)
}
}()

scraper, err := newCoursebookScraper(term, outDir)
if err != nil {
return err
}
defer scraper.chromedpCancel()

return scraper.Scrape(startPrefix, resume)
}()

// No error, scraped successfully
if err == nil {
return
}

// Context canceled Error (such as when closing chromedp window)
if err.Error() == "context canceled" {
log.Fatalf("Coursebook Scraping Canceled, Exiting")
}

/* Retry Coursebook Scraping */
log.Printf("Coursebook Scraping Failed: %v", err)

if fmt.Sprintf("%v", lastErr) == fmt.Sprintf("%v", err) {
repeatErrCount++
} else {
repeatErrCount = 1
}

lastErr = err

// TODO: ensure all panics are reasonable, and should not be retried
// TODO: Improve retry logic with more exponential retry
}

scraper := newCoursebookScraper(term, outDir)
defer scraper.chromedpCancel()
if retry != 0 {
log.Fatalf("Coursebook Scraping Failed %d times in a row with the same error, Exiting", retry+1)
}
}

// Scrape begins the scraping process for all prefixes
func (s *coursebookScraper) Scrape(startPrefix string, resume bool) error {
if resume && startPrefix == "" {
// providing a starting prefix overrides the resume flag
startPrefix = scraper.lastCompletePrefix()
var err error
startPrefix, err = s.lastCompletePrefix()
if err != nil {
return fmt.Errorf("failed to get last complete prefix while resuming: %v", err)
}
}

log.Printf("[Begin Scrape] Starting scrape for term %s with %d prefixes", term, len(scraper.prefixes))
log.Printf("[Begin Scrape] Starting scrape for term %s with %d prefixes", s.term, len(s.prefixes))

totalTime := time.Now()
for i, prefix := range scraper.prefixes {
for i, prefix := range s.prefixes {
if startPrefix != "" && strings.Compare(prefix, startPrefix) < 0 {
continue
}

start := time.Now()
if err := scraper.ensurePrefixFolder(prefix); err != nil {
log.Fatal(err)
if err := s.scrapePrefix(prefix, resume, i); err != nil {
return err
}
}
log.Printf("[Scrape Complete] Finished scraping term %s in %v. Total sections %d: Total retries %d", s.term, time.Since(totalTime), s.totalScrapedSections, s.reqRetries)

var sectionIds []string
var err error
if err := s.validate(); err != nil {
log.Panicf("Validating failed: %v", err)
}

// if resume we skip existing entries otherwise overwrite them
if resume {
sectionIds, err = scraper.getMissingIdsForPrefix(prefix)
} else {
sectionIds, err = scraper.getSectionIdsForPrefix(prefix)
}
return nil
}

if err != nil {
log.Fatalf("Error getting section ids for %s ", prefix)
}
// scrapePrefix scrapes all sections for a single prefix
func (s *coursebookScraper) scrapePrefix(prefix string, resume bool, index int) error {
start := time.Now()
if err := s.ensurePrefixFolder(prefix); err != nil {
log.Panic(err)
}

if len(sectionIds) == 0 {
log.Printf("No sections found for %s ", prefix)
continue
}
var sectionIds []string
var err error

log.Printf("[Scrape Prefix] %s (%d/%d): Found %d sections to scrape.", prefix, i+1, len(scraper.prefixes), len(sectionIds))
// if resume we skip existing entries otherwise overwrite them
if resume {
sectionIds, err = s.getMissingIdsForPrefix(prefix)
} else {
sectionIds, err = s.getSectionIdsForPrefix(prefix)
}

for _, sectionId := range sectionIds {
content, err := scraper.getSectionContent(sectionId)
if err != nil {
log.Fatalf("Error getting section content for section %s: %v", sectionId, err)
}
if err := scraper.writeSection(prefix, sectionId, content); err != nil {
log.Fatalf("Error writing section %s: %v", sectionId, err)
}
time.Sleep(reqThrottle)
}
if err != nil {
log.Panicf("Error getting section ids for %s ", prefix)
}

// At the end of the prefix loop
log.Printf("[End Prefix] %s: Scraped %d sections in %v.", prefix, len(sectionIds), time.Since(start))
time.Sleep(prefixThrottle)
if len(sectionIds) == 0 {
log.Printf("No sections found for %s ", prefix)
return nil
}
log.Printf("[Scrape Complete] Finished scraping term %s in %v. Total sections %d: Total retries %d", term, time.Since(totalTime), scraper.totalScrapedSections, scraper.reqRetries)

if err := scraper.validate(); err != nil {
log.Fatal("Validating failed: ", err)
log.Printf("[Scrape Prefix] %s (%d/%d): Found %d sections to scrape.", prefix, index+1, len(s.prefixes), len(sectionIds))

for _, sectionId := range sectionIds {
content, err := s.getSectionContent(sectionId)
if err != nil {
return fmt.Errorf("error getting section content for section %s: %v", sectionId, err)
}
if err := s.writeSection(prefix, sectionId, content); err != nil {
log.Panicf("Error writing section %s: %v", sectionId, err)
}
time.Sleep(reqThrottle)
}

// At the end of the prefix loop
log.Printf("[End Prefix] %s: Scraped %d sections in %v.", prefix, len(sectionIds), time.Since(start))
time.Sleep(prefixThrottle)
return nil
}

type coursebookScraper struct {
Expand All @@ -124,38 +204,45 @@ type coursebookScraper struct {
totalScrapedSections int
}

func newCoursebookScraper(term string, outDir string) *coursebookScraper {
func newCoursebookScraper(term string, outDir string) (*coursebookScraper, error) {
ctx, cancel := utils.InitChromeDp()
httpClient := &http.Client{
Timeout: httpTimeout,
}

//prefixes in alphabetical order for skip prefix flag
prefixes := utils.GetCoursePrefixes(ctx)
prefixes, err := utils.GetCoursePrefixes(ctx)
if err != nil {
return nil, err
}
sort.Strings(prefixes)
coursebookHeaders, err := utils.RefreshToken(ctx)
if err != nil {
return nil, err
}
return &coursebookScraper{
chromedpCtx: ctx,
chromedpCancel: cancel,
httpClient: httpClient,
prefixes: prefixes,
coursebookHeaders: utils.RefreshToken(ctx),
coursebookHeaders: coursebookHeaders,
term: term,
outDir: outDir,
prefixIdsCache: make(map[string][]string),
}
}, nil
}

// lastCompletePrefix returns the last prefix (alphabetical order) that contains
// html files for all of its section ids. returns an empty string if there are no
// complete prefixes
func (s *coursebookScraper) lastCompletePrefix() string {
func (s *coursebookScraper) lastCompletePrefix() (string, error) {
if err := s.ensureOutputFolder(); err != nil {
log.Fatal(err)
return "", err
}

dir, err := os.ReadDir(filepath.Join(s.outDir, s.term))
if err != nil {
log.Fatalf("failed to read output directory: %v", err)
return "", fmt.Errorf("failed to read output directory: %w", err)
}

foundPrefixes := make([]string, 0, len(s.prefixes))
Expand All @@ -169,18 +256,17 @@ func (s *coursebookScraper) lastCompletePrefix() string {
for _, prefix := range foundPrefixes {
missing, err := s.getMissingIdsForPrefix(prefix)
if err != nil {
log.Fatalf("Failed to get ids: %v", err)
return "", fmt.Errorf("failed to get ids: %w", err)
}
if len(missing) == 0 {
return prefix
return prefix, nil
}
time.Sleep(reqThrottle)
}
return ""
return "", nil
}

// ensurePrefixFolder creates {outDir}/term if it does not exist

func (s *coursebookScraper) ensureOutputFolder() error {
if err := os.MkdirAll(filepath.Join(s.outDir, s.term), 0755); err != nil {
return fmt.Errorf("failed to create term forlder: %w", err)
Expand Down Expand Up @@ -208,7 +294,7 @@ func (s *coursebookScraper) writeSection(prefix string, id string, content strin
// retries up to 3 times, each time refreshing the token and waiting longer
func (s *coursebookScraper) getSectionContent(id string) (string, error) {
queryStr := fmt.Sprintf("id=%s&req=b30da8ab21637dbef35fd7682f48e1c1W0ypMhaj%%2FdsnYn3Wa03BrxSNgCeyvLfvucSTobcSXRf38SWaUaNfMjJQn%%2BdcabF%%2F7ZuG%%2BdKqHAqmrxEKyg8AdB0FqVGcz4rkff3%%2B3SIUIt8%%3D&action=info", id)
response, err := s.req(queryStr, 3, id)
response, err := s.req(queryStr, getSectionContentRetryCount, id)
if err != nil {
return "", fmt.Errorf("get section content for id %s failed: %w", id, err)
}
Expand All @@ -235,7 +321,7 @@ func (s *coursebookScraper) getMissingIdsForPrefix(prefix string) ([]string, err

dir, err := os.ReadDir(path)
if err != nil {
log.Panicf("Failed to access folder %s: %v", path, err)
return sectionIds, fmt.Errorf("failed to access folder %s: %w", path, err)
}

foundIds := make(map[string]bool)
Expand Down Expand Up @@ -264,7 +350,7 @@ func (s *coursebookScraper) getSectionIdsForPrefix(prefix string) ([]string, err
sections := make([]string, 0, 100)
for _, clevel := range []string{"clevel_u", "clevel_g"} {
queryStr := fmt.Sprintf("action=search&s%%5B%%5D=term_%s&s%%5B%%5D=%s&s%%5B%%5D=%s", s.term, prefix, clevel)
content, err := s.req(queryStr, 10, fmt.Sprintf("%s:%s", prefix, clevel))
content, err := s.req(queryStr, getSectionIdsForPrefixRetryCount, fmt.Sprintf("%s:%s", prefix, clevel))
if err != nil {
return nil, fmt.Errorf("failed to fetch sections: %s", err)
}
Expand All @@ -285,7 +371,7 @@ func (s *coursebookScraper) req(queryStr string, retries int, reqName string) (s
err := utils.Retry(func() error {
req, err := http.NewRequest("POST", "https://coursebook.utdallas.edu/clips/clip-cb11-hat.zog", strings.NewReader(queryStr))
if err != nil {
log.Fatalf("Http request failed: %v", err)
return fmt.Errorf("http request failed: %w", err)
}
req.Header = s.coursebookHeaders

Expand All @@ -310,7 +396,13 @@ func (s *coursebookScraper) req(queryStr string, retries int, reqName string) (s
return err
}, retries, func(numRetries int) {
utils.VPrintf("[Request Retry] Attempt %d of %d for request %s", numRetries, retries, reqName)
s.coursebookHeaders = utils.RefreshToken(s.chromedpCtx)
coursebookHeaders, err := utils.RefreshToken(s.chromedpCtx)
if err != nil {
utils.VPrintf("[Token Refresh Failed] Failed to refresh token during retry for request %s: %v", reqName, err)
} else {
s.coursebookHeaders = coursebookHeaders
}

s.reqRetries++

//back off exponentially
Expand Down Expand Up @@ -345,7 +437,7 @@ func (s *coursebookScraper) validate() error {
log.Printf("[Validation] Missing %d sections for %s", len(ids), prefix)

if err := s.ensurePrefixFolder(prefix); err != nil {
log.Fatal(err)
log.Panic(err)
}

for _, id := range ids {
Expand Down
Loading
Loading