1- /* Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2020, 2025 Vladimir Panteleev <vladimir@thecybershadow.net>
1+ /* Copyright (C) 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2020, 2025, 2026 Vladimir Panteleev <vladimir@thecybershadow.net>
22 *
33 * This program is free software: you can redistribute it and/or modify
44 * it under the terms of the GNU Affero General Public License as
@@ -57,12 +57,13 @@ import dfeed.web.web.user : user, userSettings;
/// One entry of a post's journey timeline.
///
/// Field order matters: events are built with positional struct literals,
/// so new fields must only ever be appended at the end.
struct JourneyEvent
{
	/// Timestamp text taken from the log line the event came from.
	string timestamp;
	/// Event kind: "captcha", "spam_check", "moderation", "posted", "info",
	/// "log_file", "approval", "page_visit", "referrer".
	string type;
	/// Short human-readable label.
	string message;
	/// true for success, false for failure.
	bool success;
	/// Additional details, like the spamicity value.
	string details;
	/// Log file name.
	string sourceFile;
	/// Line number in the log file (1-based).
	int lineNumber;
	/// For page_visit events, the URL to link to.
	string url;
}
6768
6869JourneyEvent[] parsePostingJourney (string messageID)
@@ -208,6 +209,126 @@ JourneyEvent[] parsePostingJourney(string messageID)
208209 }
209210 }
210211
/// Collects page visits and external referrers from the Web log that shares
/// its "<prefix> - " file name prefix with `postProcessLogFile`.
///
/// The events are returned separately; nothing is appended to the main
/// events array here.
///
/// Params:
///   postProcessLogFile = path matching "<prefix> - PostProcess-<id>.log";
///                        "<prefix> - Web.log" is the file that gets read
///   ip                 = client IP to match; an empty value yields no events
///   userAgent          = User-Agent to match; only enforced when both this
///                        value and the logged one are non-empty
/// Returns: matching events in the order they appear in the log, or an empty
///          array when the path does not match or the Web log does not exist.
JourneyEvent[] parseWebLog(string postProcessLogFile, string ip, string userAgent)
{
	// Host part of an absolute URL ("scheme://host/..."), or null when the
	// text has no "://" separator.
	static string hostOf(string u)
	{
		auto sep = u.indexOf("://");
		if (sep < 0)
			return null;
		auto afterScheme = u[sep + 3 .. $];
		auto slash = afterScheme.indexOf("/");
		return slash >= 0 ? afterScheme[0 .. slash] : afterScheme;
	}

	JourneyEvent[] webEvents;

	if (ip.length == 0)
		return webEvents;

	// Replace "PostProcess-xxx.log" with "Web.log" to get the Web log with the same prefix
	auto webLogFile = postProcessLogFile.matchFirst(`^(.* - )PostProcess-[a-z]+\.log$`);
	if (!webLogFile)
		return webEvents;

	auto logPath = webLogFile[1] ~ "Web.log";
	if (!exists(logPath))
		return webEvents;

	auto content = cast(string) read(logPath);
	auto logFileName = baseName(logPath);
	int lineNum = 0;

	// Referrers already reported, so each one is listed only once
	bool[string] seenReferrers;

	foreach (line; content.split("\n"))
	{
		lineNum++;
		if (line.length < 30 || line[0] != '[')
			continue;

		// Log line: [timestamp] \tIP\tSTATUS\tTIME\tMETHOD\tURL\tCONTENT-TYPE[\tREFERER\tUSER-AGENT]
		auto closeBracket = line.indexOf("]");
		if (closeBracket < 0)
			continue;
		// The slice below skips "]" plus one separator character; a line that
		// ends right at the bracket would otherwise be sliced out of bounds.
		if (cast(size_t) closeBracket + 2 > line.length)
			continue;
		auto timestamp = line[1 .. closeBracket];

		// [0]=empty, [1]=IP, [2]=STATUS, [3]=TIME, [4]=METHOD, [5]=URL,
		// [6]=CONTENT-TYPE, [7]=REFERER, [8]=USER-AGENT
		auto fields = line[closeBracket + 2 .. $].split("\t");
		if (fields.length < 7)
			continue;

		auto logIP = fields[1];
		auto status = fields[2];
		auto method = fields[4];
		auto url = fields[5];
		auto contentType = fields[6];
		string referer = fields.length > 7 ? fields[7] : "-";
		string logUserAgent = fields.length > 8 ? fields[8] : "";

		if (logIP != ip)
			continue;

		// The User-Agent narrows the match but is not required
		if (userAgent.length > 0 && logUserAgent.length > 0 && logUserAgent != userAgent)
			continue;

		// Keep text/html pages (GET and POST), plus POST requests answered
		// with a 3xx redirect, which may carry no content type
		if (method != "GET" && method != "POST")
			continue;
		bool isRedirect = status.length >= 1 && status[0] == '3';
		if (!contentType.startsWith("text/html") && !(method == "POST" && isRedirect))
			continue;

		// Skip static resources
		if (url.canFind("/static/"))
			continue;

		// Show only the path of absolute URLs
		auto ourHost = hostOf(url);
		string displayPath = url;
		auto schemeSep = url.indexOf("://");
		if (schemeSep >= 0)
		{
			auto pathStart = url.indexOf("/", schemeSep + 3);
			if (pathStart >= 0)
				displayPath = url[pathStart .. $];
		}

		if (referer != "-" && referer.length > 0)
		{
			// Compare host against host: a substring search would treat any
			// foreign URL that merely mentions our host name as internal.
			bool isExternal = ourHost.length > 0 && hostOf(referer) != ourHost;

			if (isExternal && referer !in seenReferrers)
			{
				seenReferrers[referer] = true;
				auto refEvt = JourneyEvent(timestamp, "referrer", "External referrer", true, "", logFileName, lineNum);
				// The Referer header is client-controlled and `url` ends up
				// in an href, so only http(s) values become links; anything
				// else (e.g. "javascript:") is kept as plain detail text.
				if (referer.startsWith("http://") || referer.startsWith("https://"))
					refEvt.url = referer;
				else
					refEvt.details = referer;
				webEvents ~= refEvt;
			}
		}

		auto eventMessage = method == "POST" ? "Form submission" : "Page visit";
		auto visitEvt = JourneyEvent(timestamp, "page_visit", eventMessage, true, displayPath, logFileName, lineNum);
		visitEvt.url = url;
		webEvents ~= visitEvt;
	}

	return webEvents;
}
327+
328+ // Track IP and User-Agent for web log correlation
329+ string userIP;
330+ string userAgent;
331+
211332 // Parse each log file
212333 foreach (ref related; relatedLogs)
213334 {
@@ -244,8 +365,13 @@ JourneyEvent[] parsePostingJourney(string messageID)
244365 // Parse different event types
245366 if (message.startsWith(" IP: " ))
246367 {
368+ userIP = message[4 .. $];
247369 events ~= JourneyEvent(timestamp, " info" , " IP Address" , true , message[4 .. $], logFileName, lineNum);
248370 }
371+ else if (message.startsWith(" [Header] User-Agent: " ))
372+ {
373+ userAgent = message[21 .. $];
374+ }
249375 else if (message.startsWith(" CAPTCHA OK" ))
250376 {
251377 events ~= JourneyEvent(timestamp, " captcha" , " CAPTCHA solved successfully" , true , " " , logFileName, lineNum);
@@ -321,10 +447,79 @@ JourneyEvent[] parsePostingJourney(string messageID)
321447 }
322448 }
323449
324- // Search for approval event (added last so it appears in chronological order)
450+ // Search for approval event
325451 if (primaryLog ! is null )
326452 searchBannedLog(postID, primaryLog);
327453
454+ // Parse Web.log for page visits (returns separate array, doesn't modify events)
455+ JourneyEvent[] webEvents;
456+ if (primaryLog ! is null )
457+ webEvents = parseWebLog(primaryLog, userIP, userAgent);
458+
// Interleave web events between PostProcess log sections based on timestamps
if (webEvents.length > 0)
{
	import std.algorithm.mutation : SwapStrategy;

	// Stable sort: a referrer event and its page visit come from the same
	// log line and share a timestamp, so ties must keep their log order.
	webEvents.sort!((a, b) => a.timestamp < b.timestamp, SwapStrategy.stable);

	// A section is a log_file header plus every event up to the next header
	struct LogSection
	{
		size_t startIdx;       // Index of the log_file header in events
		size_t endIdx;         // Index after the last event of the section
		string firstTimestamp; // First non-empty timestamp after the header
	}
	LogSection[] sections;

	// Events ahead of the first header belong to no section; remember where
	// they end so they can be carried over instead of being dropped.
	size_t firstHeader = 0;
	while (firstHeader < events.length && events[firstHeader].type != "log_file")
		firstHeader++;

	size_t pos = firstHeader;
	while (pos < events.length)
	{
		// events[pos] is a log_file header here
		LogSection section;
		section.startIdx = pos;
		size_t next = pos + 1;
		while (next < events.length && events[next].type != "log_file")
		{
			if (section.firstTimestamp.length == 0 && events[next].timestamp.length > 0)
				section.firstTimestamp = events[next].timestamp;
			next++;
		}
		section.endIdx = next;
		sections ~= section;
		pos = next;
	}

	// Rebuild events with web events interleaved
	JourneyEvent[] newEvents;
	newEvents ~= events[0 .. firstHeader];
	size_t webIdx = 0;

	foreach (ref section; sections)
	{
		// Web events that occurred before this section's first event; a
		// section without any timestamp takes all remaining web events.
		while (webIdx < webEvents.length &&
			(section.firstTimestamp.length == 0 || webEvents[webIdx].timestamp < section.firstTimestamp))
		{
			newEvents ~= webEvents[webIdx];
			webIdx++;
		}

		newEvents ~= events[section.startIdx .. section.endIdx];
	}

	// Remaining web events go after all sections
	newEvents ~= webEvents[webIdx .. $];

	events = newEvents;
}
522+
328523 return events;
329524}
330525
@@ -348,10 +543,13 @@ void renderJourneyTimeline(JourneyEvent[] events)
348543 ` .journey-event.spam_detail { border-left-color: #A85; background: #FFFAF5; padding: 0.33em 0.75em; }` ~
349544 ` .journey-event.log_file { border-left-color: #85A; background: #F5F5F5; }` ~
350545 ` .journey-event.approval { border-left-color: #5A5; background: #F0FFF0; }` ~
546+ ` .journey-event.page_visit { border-left-color: #59A; background: #F5FAFF; }` ~
547+ ` .journey-event.referrer { border-left-color: #A59; background: #FAF5FF; }` ~
351548 ` .journey-event.log_file:not(:first-child) { margin-top: 1em; border-top: 2px solid #E6E6E6; }` ~
352549 ` .journey-timestamp { color: #666; font-size: 0.95em; }` ~
353550 ` .journey-message { font-weight: bold; }` ~
354551 ` .journey-details { color: #666; font-size: 0.95em; margin-top: 0.25em; }` ~
552+ ` .journey-details a { color: #369; }` ~
355553 ` .journey-source { color: #999; font-size: 0.9em; float: right; }` ~
356554 ` </style>` ~
357555 ` <div class="journey-timeline">` ~
@@ -368,6 +566,10 @@ void renderJourneyTimeline(JourneyEvent[] events)
368566 cssClass = " spam_detail" ;
369567 else if (event.type == " approval" )
370568 cssClass = " approval" ;
569+ else if (event.type == " page_visit" )
570+ cssClass = " page_visit" ;
571+ else if (event.type == " referrer" )
572+ cssClass = " referrer" ;
371573 else if (event.success)
372574 cssClass = " success" ;
373575 else if (event.type == " info" )
@@ -396,10 +598,25 @@ void renderJourneyTimeline(JourneyEvent[] events)
396598 html.putEncodedEntities(event.message);
397599 html.put(` </span>` );
398600
399- if (event.details.length > 0 )
601+ if (event.details.length > 0 || event.url.length > 0 )
400602 {
401603 html.put(` <div class="journey-details">` );
402- html.putEncodedEntities(event.details);
604+ if (event.url.length > 0 )
605+ {
606+ html.put(` <a href="` );
607+ html.putEncodedEntities(event.url);
608+ html.put(` " target="_blank" rel="noopener">` );
609+ // For page visits, show the path; for referrers, show the full URL
610+ if (event.details.length > 0 )
611+ html.putEncodedEntities(event.details);
612+ else
613+ html.putEncodedEntities(event.url);
614+ html.put(` </a>` );
615+ }
616+ else
617+ {
618+ html.putEncodedEntities(event.details);
619+ }
403620 html.put(` </div>` );
404621 }
405622 html.put(` </div>` );
0 commit comments