init commit
This commit is contained in:
232
lib/url_to_article/article_extractor.rb
Normal file
232
lib/url_to_article/article_extractor.rb
Normal file
@@ -0,0 +1,232 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
require "nokogiri"
|
||||
require "reverse_markdown"
|
||||
require "net/http"
|
||||
require "uri"
|
||||
require "timeout"
|
||||
|
||||
module UrlToArticle
  # Fetches a web page and extracts the main article content as Markdown,
  # together with basic metadata (title, byline, site name, description).
  #
  # Usage:
  #   result = UrlToArticle::ArticleExtractor.extract("https://example.com/post")
  #   result.title    # => "Some headline"
  #   result.markdown # => "Body converted to Markdown..."
  class ArticleExtractor
    # Tags that are almost never article content
    NOISE_SELECTORS = %w[
      script style noscript iframe nav footer header
      .navigation .nav .menu .sidebar .widget .ad .advertisement
      .cookie-banner .cookie-notice .popup .modal .overlay
      .social-share .share-buttons .related-posts .comments
      #comments #sidebar #navigation #footer #header
      [role=navigation] [role=banner] [role=contentinfo]
      [aria-label=navigation] [aria-label=footer]
    ].freeze

    # Candidate content selectors tried in order (most specific first)
    ARTICLE_SELECTORS = %w[
      article[class*=content]
      article[class*=post]
      article[class*=article]
      article
      [role=main]
      main
      .post-content
      .article-content
      .entry-content
      .article-body
      .story-body
      .post-body
      .content-body
      .page-content
      #article-body
      #post-content
      #main-content
    ].freeze

    # Value object returned by #extract. All fields may be nil except :url.
    Result = Struct.new(:title, :byline, :site_name, :description, :markdown, :url, keyword_init: true)

    # Convenience entry point: extract the article at +url+.
    #
    # @param url [String] absolute http(s) URL
    # @return [Result]
    def self.extract(url)
      new(url).extract
    end

    # @param url [String] absolute http(s) URL
    # @raise [URI::InvalidURIError] if +url+ cannot be parsed
    def initialize(url)
      @url = url
      @uri = URI.parse(url)
    end

    # Fetches the page, extracts metadata, locates and cleans the main
    # content node, and converts it to (possibly truncated) Markdown.
    #
    # @return [Result]
    # @raise [RuntimeError] on non-success HTTP responses
    # @raise [Timeout::Error] if the fetch exceeds the configured timeout
    def extract
      html = fetch_html
      doc = Nokogiri::HTML(html)

      title = extract_title(doc)
      byline = extract_byline(doc)
      site_name = extract_site_name(doc)
      description = extract_description(doc)
      content_node = find_content_node(doc)

      clean_node!(content_node)
      markdown = node_to_markdown(content_node)
      markdown = truncate(markdown)

      Result.new(
        title: title,
        byline: byline,
        site_name: site_name,
        description: description,
        markdown: markdown,
        url: @url
      )
    end

    private

    # Fetches the raw HTML body, following at most one redirect.
    # The whole operation is bounded by the site-setting timeout.
    def fetch_html
      Timeout.timeout(SiteSetting.url_to_article_fetch_timeout) do
        http = Net::HTTP.new(@uri.host, @uri.port)
        http.use_ssl = @uri.scheme == "https"
        http.open_timeout = 5
        http.read_timeout = SiteSetting.url_to_article_fetch_timeout

        request = Net::HTTP::Get.new(@uri.request_uri)
        request["User-Agent"] = "Mozilla/5.0 (compatible; Discourse URL-to-Article Bot/1.0)"
        request["Accept"] = "text/html,application/xhtml+xml"
        request["Accept-Language"] = "en-US,en;q=0.9"

        response = http.request(request)

        # Follow one redirect. Resolve against the original URL so that
        # relative Location headers (e.g. "/new-path") work too — a bare
        # URI.parse on a relative location yields a host-less URI.
        if response.is_a?(Net::HTTPRedirection) && response["location"]
          @uri = URI.join(@url, response["location"])
          http = Net::HTTP.new(@uri.host, @uri.port)
          http.use_ssl = @uri.scheme == "https"
          response = http.get(@uri.request_uri, "User-Agent" => request["User-Agent"])
        end

        raise "HTTP #{response.code}" unless response.is_a?(Net::HTTPSuccess)
        # Scrub invalid byte sequences so later string operations on the
        # body (strip, gsub, regex matching) cannot raise EncodingError.
        response.body.force_encoding("UTF-8").scrub
      end
    end

    # Best-available page title: og:title, twitter:title, first <h1>,
    # <title>, then the host name as a last resort.
    def extract_title(doc)
      # Try OG title first, then twitter:title, then <title>
      og = doc.at_css('meta[property="og:title"]')&.attr("content")
      return og.strip if og.present?

      tw = doc.at_css('meta[name="twitter:title"]')&.attr("content")
      return tw.strip if tw.present?

      h1 = doc.at_css("h1")&.text
      return h1.strip if h1.present?

      doc.at_css("title")&.text&.strip || @uri.host
    end

    # First non-empty author candidate found in common locations, or nil.
    def extract_byline(doc)
      candidates = [
        doc.at_css('meta[name="author"]')&.attr("content"),
        doc.at_css('[rel="author"]')&.text,
        doc.at_css(".author")&.text,
        doc.at_css('[class*="byline"]')&.text,
        doc.at_css("address")&.text,
      ]
      candidates.compact.map(&:strip).reject(&:empty?).first
    end

    # og:site_name, falling back to the hostname without a leading "www.".
    def extract_site_name(doc)
      doc.at_css('meta[property="og:site_name"]')&.attr("content")&.strip ||
        @uri.host.sub(/^www\./, "")
    end

    # og:description, falling back to the standard meta description; nil if neither exists.
    def extract_description(doc)
      doc.at_css('meta[property="og:description"]')&.attr("content")&.strip ||
        doc.at_css('meta[name="description"]')&.attr("content")&.strip
    end

    # Returns the node most likely to hold the article body: the first
    # known selector with substantial text, else a density-scored fallback.
    def find_content_node(doc)
      # Try known article selectors
      ARTICLE_SELECTORS.each do |sel|
        node = doc.at_css(sel)
        next unless node
        text = node.text.strip
        # Make sure it has meaningful content (>200 chars of text)
        return node if text.length > 200
      end

      # Fallback: score all <div> and <section> blocks by text density
      score_and_pick(doc)
    end

    # Scores container elements by text length minus a per-tag penalty
    # (nav-heavy markup has many tags and little text) and returns the
    # highest-scoring node, falling back to <body> or the document itself.
    def score_and_pick(doc)
      candidates = doc.css("div, section, td").map do |node|
        text = node.text.strip
        next if text.length < 150

        # Score = text length - penalize nodes with lots of tags (nav-heavy)
        tag_count = node.css("*").size.to_f
        text_length = text.length.to_f
        score = text_length - (tag_count * 3)

        [score, node]
      end.compact

      # max_by is O(n); sorting the whole list just to take the first was wasteful.
      best = candidates.max_by(&:first)
      best&.last || doc.at_css("body") || doc
    end

    # Destructively strips noise, hidden/empty elements, and disallowed
    # attributes from +node+, and absolutizes relative img/a URLs.
    def clean_node!(node)
      return unless node

      # Remove noise elements
      NOISE_SELECTORS.each do |sel|
        node.css(sel).each(&:remove)
      end

      # Remove hidden elements
      node.css("[style]").each do |el|
        el.remove if el["style"] =~ /display\s*:\s*none|visibility\s*:\s*hidden/i
      end

      # Remove empty tags (except br, img, hr)
      node.css("span, div, p, section").each do |el|
        el.remove if el.text.strip.empty? && el.css("img, video, audio, iframe").empty?
      end

      # Strip all attributes except allowed ones on certain tags
      allowed = {
        "a" => %w[href title],
        "img" => %w[src alt title width height],
        "td" => %w[colspan rowspan],
        "th" => %w[colspan rowspan scope],
        "ol" => %w[start type],
        "li" => %w[value],
        "code" => %w[class],
        "pre" => %w[class],
      }
      node.css("*").each do |el|
        tag = el.name.downcase
        permitted = allowed[tag] || []
        # Snapshot the attribute names before removing, so the removal
        # cannot interfere with the collection being iterated.
        el.attribute_nodes.map(&:name).each do |attr|
          el.remove_attribute(attr) unless permitted.include?(attr)
        end

        # Make relative image URLs absolute; leave the original value in
        # place if the URL is malformed (only URI errors are swallowed —
        # the old `rescue nil` modifier hid every StandardError).
        if tag == "img" && el["src"] && !el["src"].start_with?("http", "//", "data:")
          begin
            el["src"] = URI.join(@url, el["src"]).to_s
          rescue URI::Error
            # keep the original relative src
          end
        end
        if tag == "a" && el["href"] && !el["href"].start_with?("http", "//", "#", "mailto:")
          begin
            el["href"] = URI.join(@url, el["href"]).to_s
          rescue URI::Error
            # keep the original relative href
          end
        end
      end
    end

    # Converts the cleaned node to GitHub-flavored Markdown, collapsing
    # runs of 3+ newlines into a single blank line.
    def node_to_markdown(node)
      return "" unless node
      ReverseMarkdown.convert(node.to_html, unknown_tags: :bypass, github_flavored: true)
        .gsub(/\n{3,}/, "\n\n") # collapse excessive blank lines
        .strip
    end

    # Truncates +text+ to the configured maximum length, appending a
    # visible truncation notice when content was cut.
    def truncate(text)
      max = SiteSetting.url_to_article_max_content_length
      return text if text.length <= max
      text[0...max] + "\n\n*[Content truncated — visit the original article for the full text.]*"
    end
  end
end
|
||||
Reference in New Issue
Block a user