forked from alshedivat/al-folio
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexternal-posts.rb
More file actions
124 lines (109 loc) · 4.03 KB
/
external-posts.rb
File metadata and controls
124 lines (109 loc) · 4.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
require 'feedjira'
require 'httparty'
require 'jekyll'
require 'nokogiri'
require 'time'
module ExternalPosts
class ExternalPostsGenerator < Jekyll::Generator
safe true
priority :high
# Generator entry point: walks every entry of the `external_sources` site
# setting and imports its posts.
#
# A source that has an `rss_url` key is read as a feed; otherwise a source
# that has a `posts` key is read as an explicit list of pages. A source with
# neither key is only announced on stdout.
#
# @param site [Object] the site handed to the generator; read via `site.config`
# @return [nil] when `external_sources` is not configured; otherwise the
#   configured list of sources
def generate(site)
  sources = site.config['external_sources']
  return if sources.nil?

  sources.each do |source|
    puts "Fetching external posts from #{source['name']}:"
    # The feed URL takes precedence when a source declares both keys.
    next fetch_from_rss(site, source) if source['rss_url']

    fetch_from_urls(site, source) if source['posts']
  end
end
# Downloads the feed at `src['rss_url']`, parses it with Feedjira and hands
# its entries to `process_entries`.
#
# The import is best-effort: a download failure and a parse failure are both
# reported on stdout and make the method return instead of raising.
#
# @param site [Object] forwarded untouched to `process_entries`
# @param src [Hash] source settings; `rss_url` is read here
# @return [nil] when the feed could not be downloaded or parsed, or when the
#   response has no body; otherwise whatever `process_entries` returns
def fetch_from_rss(site, src)
  rss_url = src['rss_url']

  begin
    xml = HTTParty.get(rss_url).body
  rescue StandardError => e
    # Connection problems (refused connection, DNS failure, timeout, ...) get
    # the same log-and-skip treatment as a malformed feed below.
    puts "Error fetching RSS feed from #{rss_url} - #{e.message}"
    return
  end
  return if xml.nil?

  begin
    feed = Feedjira.parse(xml)
  rescue StandardError => e
    puts "Error parsing RSS feed from #{rss_url} - #{e.message}"
    return
  end

  process_entries(site, src, feed.entries)
end
# Creates one document per feed entry.
#
# @param site [Object] forwarded to `create_document`
# @param src [Hash] source settings; `src['name']` labels every document and
#   the whole hash is forwarded as well
# @param entries [Enumerable] feed entries responding to `url`, `title`,
#   `content`, `summary` and `published`
# @return the `entries` collection, as returned by `each`
def process_entries(site, src, entries)
  entries.each do |entry|
    link = entry.url
    puts "...fetching #{link}"

    fields = {
      title: entry.title,
      content: entry.content,
      summary: entry.summary,
      published: entry.published
    }
    create_document(site, src['name'], link, fields, src)
  end
end
# Builds an in-memory post for an externally hosted article and appends it to
# the site's `posts` collection.
#
# @param site [Object] must respond to `in_source_dir` and `collections`
# @param source_name [String] stored as `external_source`; also used for the
#   fallback slug
# @param url [String] address of the article, stored as `redirect`
# @param content [Hash] reads `:title`, `:content`, `:summary`, `:published`;
#   `:title` may be nil
# @param src [Hash] source settings; non-empty Array values under
#   `categories` and `tags` are copied onto the document
# @return the `docs` array of the posts collection, with the new document
#   appended
def create_document(site, source_name, url, content, src = {})
  slug = external_post_slug(content[:title], source_name, url)
  path = site.in_source_dir("_posts/#{slug}.md")
  doc = Jekyll::Document.new(
    path, { :site => site, :collection => site.collections['posts'] }
  )

  doc.data['external_source'] = source_name
  doc.data['title'] = content[:title]
  doc.data['feed_content'] = content[:content]
  doc.data['description'] = content[:summary]
  doc.data['date'] = content[:published]
  doc.data['redirect'] = url

  # Apply default categories and tags from the source configuration; anything
  # that is not a non-empty Array is ignored.
  %w[categories tags].each do |key|
    values = src[key]
    doc.data[key] = values if values.is_a?(Array) && !values.empty?
  end

  doc.content = content[:content]
  site.collections['posts'].docs << doc
end

# Chooses the file-name slug for an external post: the slugified title, or
# "<slugified source name>-<last URL path segment>" when the title cannot
# produce a usable slug.
def external_post_slug(title, source_name, url)
  title = title.to_s # entries without a title arrive as nil

  # `\w` only matches ASCII letters, digits and underscore, so a title that is
  # missing, blank, punctuation-only or written entirely in non-ASCII
  # characters has nothing to build a slug from.
  return external_slug_fragment(title) if title.match?(/\w/)

  "#{external_slug_fragment(source_name)}-#{url.split('/').last}"
end

# Lower-cases the text, turns spaces into dashes and drops every character
# that is neither an ASCII word character nor a dash.
def external_slug_fragment(text)
  text.downcase.strip.gsub(' ', '-').gsub(/[^\w-]/, '')
end
# Imports every page listed under `src['posts']`.
#
# For each listed post the page is summarised by `fetch_content_from_url`,
# the configured `published_date` is converted by `parse_published_date` and
# stored under `:published`, and the result goes to `create_document`.
#
# @param site [Object] forwarded to `create_document`
# @param src [Hash] source settings; reads `posts` (each with `url` and
#   `published_date`) and `name`
# @return the `src['posts']` collection, as returned by `each`
def fetch_from_urls(site, src)
  src['posts'].each do |entry|
    link = entry['url']
    puts "...fetching #{link}"

    # The page is fetched first; the date is parsed only afterwards.
    page = fetch_content_from_url(link).tap do |fields|
      fields[:published] = parse_published_date(entry['published_date'])
    end
    create_document(site, src['name'], link, page, src)
  end
end
# Normalises the `published_date` of a manually listed post to a UTC Time.
#
# Strings are parsed with `Time.parse`; Date and Time values are converted.
# Time is accepted because a configuration loader may already have turned an
# unquoted timestamp into a Time object (presumably the case for YAML
# timestamps; confirm against the site's loader).
#
# @param published_date [String, Date, Time] the configured value
# @return [Time] the same instant expressed in UTC
# @raise [RuntimeError] for any other type, including nil
# @raise [ArgumentError] from `Time.parse` when a String cannot be parsed
def parse_published_date(published_date)
  case published_date
  when String
    Time.parse(published_date).utc
  when Time
    # `getutc` returns a new object; `utc` would convert the caller's Time in place.
    published_date.getutc
  when Date
    published_date.to_time.utc
  else
    raise "Invalid date format for #{published_date}"
  end
end
# Downloads a page and extracts what is needed to list it as an external post.
#
# @param url [String] address of the page
# @return [Hash] with
#   - `:title`   text of `<head><title>`, stripped; '' when the page has none
#   - `:content` text of every `<p>` element, concatenated without separator
#   - `:summary` `content` attribute of the first matching description meta
#     tag (`name="description"`, then `name="og:description"`, then
#     `property="og:description"`), or nil
#   The published date is not part of this hash; `fetch_from_urls` adds it.
def fetch_content_from_url(url)
  html = HTTParty.get(url).body
  parsed_html = Nokogiri::HTML(html)

  # `at(...)&.text.strip` would still call `strip` on nil for a page without
  # a <title>, because `&.` only guards the call it is attached to.
  title_node = parsed_html.at('head title')
  title = title_node ? title_node.text.strip : ''

  description = parsed_html.at('head meta[name="description"]')&.attr('content')
  description ||= parsed_html.at('head meta[name="og:description"]')&.attr('content')
  description ||= parsed_html.at('head meta[property="og:description"]')&.attr('content')

  {
    title: title,
    content: parsed_html.search('p').map(&:text).join,
    summary: description
  }
end
end
end