Skip to content

Commit aa02d1a

Browse files
authored
Merge pull request #1236 from kennethrioja/gray-scott-ingestor
[HEP Training ingestors] added a custom event ingestor (Gray Scott events)
2 parents 8d6831e + ebd45cb commit aa02d1a

8 files changed

Lines changed: 228 additions & 7 deletions

File tree

lib/ingestors/github_ingestor.rb

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ def to_material(repo_data) # rubocop:disable Metrics/AbcSize
104104
github_io_homepage = github_io_homepage? repo_data['homepage']
105105
url = github_io_homepage ? repo_data['homepage'] : repo_data['html_url']
106106
redirected_url = get_redirected_url(url)
107-
html = get_html(redirected_url)
107+
html = get_html_from_url(redirected_url)
108108

109109
material = OpenStruct.new
110110
material.title = repo_data['name'].titleize
@@ -131,11 +131,6 @@ def github_io_homepage?(homepage)
131131
url.host&.downcase&.end_with?('.github.io')
132132
end
133133

134-
def get_html(url)
135-
response = HTTParty.get(url, follow_redirects: true, headers: { 'User-Agent' => config[:user_agent] })
136-
Nokogiri::HTML(response.body)
137-
end
138-
139134
# DEFINITION – Opens the GitHub homepage, fetches the text of the first 3 <p> tags longer than 50 characters
140135
# and joins them with a 'Read more...' link at the end of the description
141136
# Some of the first <p> tags were not descriptive, thus skipping them
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
require 'icalendar'
2+
require 'nokogiri'
3+
require 'open-uri'
4+
require 'tzinfo'
5+
6+
module Ingestors
  module Heptraining
    # Ingests Gray Scott events from an iCalendar (.ics) feed.
    #
    # Every VEVENT is turned into one event. The link found in the VEVENT points to a
    # JavaScript-driven redirect page, so the ingestor resolves the target page itself and
    # scrapes it for the description and the speakers.
    class GrayScottIngestor < Ingestor
      def self.config
        {
          key: 'gray_scott_event',
          title: 'Gray Scott Events API',
          category: :events,
          user_agent: 'TeSS Gray Scott ingestor'
        }
      end

      # Parses the calendar found at +url+ and appends one event per VEVENT to @events.
      def read(url)
        @verbose = false
        process_gray_scott(url)
      end

      private

      # Downloads and parses the calendar, then processes each VEVENT.
      # Raises 'Not found' when the feed contains no event at all.
      def process_gray_scott(url)
        events = Icalendar::Event.parse(open_url(url, raise: true).set_encoding('utf-8'))
        raise 'Not found' if events.nil? || events.empty?

        events.each { |calevent| process_calevent(calevent, url) }
      end

      # Builds one event from a VEVENT, preferring the scraped page description over the ICS
      # one. The second argument (the feed URL) is accepted for call compatibility but unused.
      def process_calevent(calevent, _url)
        gs_url = gray_scott_url(calevent)
        # Without a link there is no page to scrape. An empty document keeps the code below
        # uniform: the ICS description is used and no speaker is found.
        html = gs_url ? get_html_from_url(get_gray_scott_redirection(gs_url)) : Nokogiri::HTML('')

        event = OpenStruct.new
        event.title = calevent.summary.to_s
        event.url = gs_url
        html_description = html.css('.paragraphStyle').text.to_s.strip
        event.description = html_description.empty? ? calevent.description.to_s : html_description

        event.end = calevent.dtend&.to_time&.utc
        dtstart = calevent.dtstart
        unless dtstart.nil?
          event.start = dtstart.to_time&.utc
          tzid = dtstart.ical_params['tzid']
          event.timezone = tzid.first.to_s unless tzid.nil? || tzid.empty?
        end
        # LOCATION may contain markup (e.g. an <a> tag); keep its text only
        event.venue = clean_html(calevent.location.to_s)
        # Comma-separated when the page lists several speakers
        event.organizer = html.css('h3:contains("Speakers") + ul li a').map { |link| link.text.strip }.join(', ')

        @events << event
      end

      # Returns the event link of a VEVENT, or nil when there is none.
      # The link is read from the first custom property whose name contains 'http' (presumably
      # because the feed puts the link on a line of its own); surrounding slashes and
      # whitespace are stripped from its value before 'https://' is prepended.
      def gray_scott_url(calevent)
        link = calevent.custom_properties.find { |key, _| key.include?('http') }
        link&.last&.first&.strip&.gsub(%r{^[/\s]+|[/\s]+$}, '')&.prepend('https://')
      end

      # Resolves the JavaScript redirection done by the page at +url+.
      #
      # The page embeds a `var dictReference = {...}` table mapping the `label` query parameter
      # to a page name located next to the redirect page. Returns the URL of that page, or
      # +url+ unchanged whenever the target cannot be determined (no label, no table,
      # unparsable table, unknown label), so the caller always gets a URL it can fetch.
      def get_gray_scott_redirection(url)
        uri = URI.parse(url)
        # `uri.query` is nil when the URL has no query string; CGI.parse needs a String
        label = CGI.parse(uri.query.to_s)['label']&.first
        return url unless label

        script = get_html_from_url(url).css('script').find { |s| s.content.include?('var dictReference') }
        dict_match = script&.content&.match(/var\s+dictReference\s*=\s*({[^}]+})/)
        return url unless dict_match

        matched_value = parse_dict_reference(dict_match[1])[label]
        return url unless matched_value

        # Swap the last path segment (the redirect page) for the target page
        "#{uri.scheme}://#{uri.host}#{uri.path.sub(%r{/[^/]+$}, '')}/#{matched_value}"
      end

      # Parses the JavaScript object literal as JSON; returns an empty Hash when it is not
      # valid JSON, so the caller falls back to the original URL.
      def parse_dict_reference(literal)
        JSON.parse(literal)
      rescue JSON::ParserError
        {}
      end

      # Returns the text content of an HTML fragment, stripped of surrounding whitespace.
      def clean_html(html)
        Nokogiri::HTML::DocumentFragment.parse(html).text.strip
      end
    end
  end
end

lib/ingestors/ingestor.rb

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,11 @@ def open_url(url, raise: false, token: nil)
8484
end
8585
end
8686

87+
# Fetches +url+ (following redirects, sending the ingestor's configured User-Agent)
# and returns the response body parsed with Nokogiri::HTML.
def get_html_from_url(url)
  request_headers = { 'User-Agent' => config[:user_agent] }
  Nokogiri::HTML(HTTParty.get(url, follow_redirects: true, headers: request_headers).body)
end
91+
8792
# Some URLs automatically redirect the user to another webpage
8893
# This method gets a URL and returns the last redirected URL (as shown by a 30X response or a `meta[http-equiv="Refresh"]` tag)
8994
def get_redirected_url(url, limit = 5) # rubocop:disable Metrics/AbcSize

lib/ingestors/ingestor_factory.rb

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ def self.ingestors
1313
Ingestors::ZenodoIngestor,
1414
Ingestors::OaiPmhIngestor,
1515
Ingestors::GithubIngestor,
16-
] + taxila_ingestors + llm_ingestors
16+
] + taxila_ingestors + llm_ingestors + heptraining_ingestors
1717
end
1818

1919
def self.taxila_ingestors
@@ -51,6 +51,12 @@ def self.llm_ingestors
5151
]
5252
end
5353

54+
# Ingestor classes living in the Heptraining namespace.
def self.heptraining_ingestors
  [Ingestors::Heptraining::GrayScottIngestor]
end
59+
5460
def self.ingestor_config
5561
@ingestor_config ||= ingestors.map do |i|
5662
[i.config[:key], i.config.merge(ingestor: i)]
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
BEGIN:VCALENDAR
2+
VERSION:2.0
3+
PRODID:-//PhoenixTex2Html//gray_scott_2026_webinars/
4+
BEGIN:VEVENT
5+
CLASS:PUBLIC
6+
DTSTAMP:20260212T103600
7+
UID:TH8WMR_PNR0012_20260212T103600
8+
DTSTART;TZID=Europe/Paris:20260226T100000
9+
DTEND;TZID=Europe/Paris:20260226T113000
10+
SUMMARY:Memory allocation, why and how to profile applications
11+
12+
LOCATION:Registration : <a id="0" href="https://teratec.webex.com/blabla">https://teratec.webex.com/blabla</a>
13+
14+
DESCRIPTION:Memory allocation, why and how to profile applications
15+
\n
16+
https://cta-lapp.pages.in2p3.fr/cours/gray_scott_revolutions/grayscott2026/redirect.html?label=sec_gray_scott_webinar_memory_allocation_memory_profiling\n
17+
BEGIN:VALARM
18+
TRIGGER:-PT10M
19+
ACTION:DISPLAY
20+
DESCRIPTION:Reminder
21+
END:VALARM
22+
END:VEVENT
23+
END:VCALENDAR
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
2+
<!DOCTYPE html>
3+
<html class="js sidebar-visible navy" lang="fr">
4+
<head>
5+
<meta charset="UTF-8">
6+
<title>Memory allocation, why and how to profile applications
7+
</title>
8+
<meta content="text/html; charset=UTF-8" http-equiv="Content-Type">
9+
<meta name="description" content="Memory allocation, why and how to profile applications
10+
">
11+
<meta name="viewport" content="width=device-width, initial-scale=1">
12+
<meta name="theme-color" content="rgba(0, 0, 0, 0)">
13+
<link rel="stylesheet" href="variables.css">
14+
<link rel="stylesheet" href="dark_style.css" />
15+
<link rel="stylesheet" href="general.css">
16+
<link rel="stylesheet" href="chrome.css">
17+
<link rel="stylesheet" href="highlight.css" disabled="">
18+
<link rel="stylesheet" href="tomorrow-night.css">
19+
<link rel="stylesheet" href="ayu-highlight.css" disabled="">
20+
<!-- Fonts -->
21+
<link rel="stylesheet" href="font-awesome.css">
22+
<link rel="stylesheet" href="fonts.css">
23+
<!-- <script src="" async></script> -->
24+
<!-- <script src=""></script> -->
25+
</head>
26+
<body>
27+
28+
<a id="450" href="invitation/gray_scott_webinar_memory_allocation_memory_profiling.ics"><div class="rendezvousStyle"></div></a><b>Date</b> : 26/02/2026<br />
29+
<b>Location</b> : Registration : <a id="458" href="https://teratec.webex.com/webappng/sites/teratec/webinar/webinarSeries/register/0465b64b919540de9910a5b84077b878">https://teratec.webex.com/webappng/sites/teratec/webinar/webinarSeries/register/0465b64b919540de9910a5b84077b878</a>
30+
<br />
31+
<b>Start at</b> : 10:00<br />
32+
<b>Stop at</b> : 11:30 <h3 id="466">Speakers</h3>
33+
<ul>
34+
<li><a href="2-3-5-4513.html">Someone
35+
</a></li>
36+
<li><a href="2-3-5-4513.html">SomeoneElse
37+
</a></li>
38+
</ul>
39+
<h3 id="471">Description</h3>
40+
<p id="472" class="paragraphStyle">
41+
Sometimes memory has become a major problem in applications, with its bandwidth but also by the incresing size needed by more and more complex and dynamic applications. So, how to track these errors and point problematic patterns ? How to find where the memory is consumed when the application reaches the hardware limit ? After my PhD on memory management in HPC context (NUMA, parallel, etc) I had the opportunity to develop two profilers (malloc and numa) now open-sources for C/C++/Fortran and Rust. I will briefly present these tools with some examples and expected observations.
42+
</p>
43+
44+
</body>
45+
</html>
46+
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
2+
<!DOCTYPE html>
3+
<html lang="fr">
4+
<head>
5+
<meta charset="utf-8" />
6+
<title>Page redirection</title>
7+
<link rel="stylesheet" href="dark_style.css" />
8+
<script type="text/javascript">
9+
function redirectionWithLabelReference(){
10+
var parameters = location.search.substring(1).split("?");
11+
var tmp = parameters[0].split("=");
12+
referenceName = unescape(tmp[1]);
13+
var dictReference = {
14+
"sec_gray_scott_webinar_memory_allocation_memory_profiling": "1-1-5-1-449.html"
15+
};
16+
if(referenceName in dictReference){
17+
document.location.href=dictReference[referenceName];
18+
}else{
19+
document.location.href="index.html";
20+
}
21+
}
22+
</script>
23+
</head>
24+
<body onLoad="setTimeout('redirectionWithLabelReference()', 1000)">
25+
<div>Dans 2 secondes vous allez être redirigé vers la page que vous avez demandée... normalement</div>
26+
</body>
27+
</html>
28+
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
require 'test_helper'
2+
3+
# Checks that the Gray Scott ingestor turns the stubbed ICS feed, redirect page and event page
# into a persisted event.
class GrayScottIngestorTest < ActiveSupport::TestCase
  setup do
    @ingestor = Ingestors::Heptraining::GrayScottIngestor.new
    @user = users(:regular_user)
    @content_provider = content_providers(:another_portal_provider)

    # Stubbed in order: the calendar feed, the JavaScript redirect page, the event page
    {
      'https://cta-lapp.pages.in2p3.fr/COURS/GRAY_SCOTT_REVOLUTIONS/GrayScott2026/invitation/gray_scott_2026_webinars.ics' =>
        'heptraining/grayscott/grayscott-event.ics',
      'https://cta-lapp.pages.in2p3.fr/cours/gray_scott_revolutions/grayscott2026/redirect.html?label=sec_gray_scott_webinar_memory_allocation_memory_profiling' =>
        'heptraining/grayscott/grayscott-redirect.html',
      'https://cta-lapp.pages.in2p3.fr/cours/gray_scott_revolutions/grayscott2026/1-1-5-1-449.html' =>
        'heptraining/grayscott/grayscott-page.html'
    }.each { |stubbed_url, fixture| webmock(stubbed_url, fixture) }
  end

  teardown do
    reset_timezone
  end

  test 'should read Gray Scott ics' do
    @ingestor.read('https://cta-lapp.pages.in2p3.fr/COURS/GRAY_SCOTT_REVOLUTIONS/GrayScott2026/invitation/gray_scott_2026_webinars.ics')
    @ingestor.write(@user, @content_provider)

    sample = @ingestor.events.find { |event| event.title == 'Memory allocation, why and how to profile applications' }
    assert sample.persisted?

    assert_equal sample.url, 'https://cta-lapp.pages.in2p3.fr/cours/gray_scott_revolutions/grayscott2026/redirect.html?label=sec_gray_scott_webinar_memory_allocation_memory_profiling'
    assert_includes sample.description, 'Sometimes memory has become a major problem in applications'
    # The fixture schedules the event 10:00-11:30 Europe/Paris; the expected values are in UTC
    assert_equal sample.end, '2026-02-26 10:30:00 +0000'
    assert_equal sample.start, '2026-02-26 09:00:00 +0000'
    assert_equal sample.timezone, 'Paris'
    assert_includes sample.venue, 'teratec.webex.com'
    assert_equal sample.organizer, 'Someone, SomeoneElse'
  end

  private

  # Stubs a GET on +url+ with the content of the given ingestion fixture file.
  def webmock(url, filename)
    body = Rails.root.join('test', 'fixtures', 'files', 'ingestion', filename).read
    WebMock.stub_request(:get, url).to_return(status: 200, headers: {}, body: body)
  end
end

0 commit comments

Comments
 (0)