233 lines
6.9 KiB
Ruby
233 lines
6.9 KiB
Ruby
|
|
# frozen_string_literal: true
|
||
|
|
|
||
|
|
require "nokogiri"
|
||
|
|
require "reverse_markdown"
|
||
|
|
require "net/http"
|
||
|
|
require "uri"
|
||
|
|
require "timeout"
|
||
|
|
|
||
|
|
# Extracts the main article content from a web page and converts it to
# Markdown, together with basic metadata (title, byline, site name,
# description).
#
# Usage:
#   result = UrlToArticle::ArticleExtractor.extract("https://example.com/post")
#   result.title     # => "Some headline"
#   result.markdown  # => "..."
module UrlToArticle
  class ArticleExtractor
    # Tags/selectors that are almost never article content; removed from the
    # chosen content node before Markdown conversion.
    NOISE_SELECTORS = %w[
      script style noscript iframe nav footer header
      .navigation .nav .menu .sidebar .widget .ad .advertisement
      .cookie-banner .cookie-notice .popup .modal .overlay
      .social-share .share-buttons .related-posts .comments
      #comments #sidebar #navigation #footer #header
      [role=navigation] [role=banner] [role=contentinfo]
      [aria-label=navigation] [aria-label=footer]
    ].freeze

    # Candidate content selectors, tried in order (most specific first).
    ARTICLE_SELECTORS = %w[
      article[class*=content]
      article[class*=post]
      article[class*=article]
      article
      [role=main]
      main
      .post-content
      .article-content
      .entry-content
      .article-body
      .story-body
      .post-body
      .content-body
      .page-content
      #article-body
      #post-content
      #main-content
    ].freeze

    # Attributes preserved per tag when sanitizing; every other attribute is
    # stripped. Hoisted to a constant so it is not re-allocated per call.
    ALLOWED_ATTRIBUTES = {
      "a" => %w[href title],
      "img" => %w[src alt title width height],
      "td" => %w[colspan rowspan],
      "th" => %w[colspan rowspan scope],
      "ol" => %w[start type],
      "li" => %w[value],
      "code" => %w[class],
      "pre" => %w[class],
    }.freeze

    # Value object returned by #extract.
    Result = Struct.new(:title, :byline, :site_name, :description, :markdown, :url, keyword_init: true)

    # Convenience wrapper: extract an article from +url+ in one call.
    #
    # @param url [String] absolute URL of the page
    # @return [Result]
    def self.extract(url)
      new(url).extract
    end

    # @param url [String] absolute URL of the page to extract
    # @raise [URI::InvalidURIError] if +url+ cannot be parsed
    def initialize(url)
      @url = url
      @uri = URI.parse(url)
    end

    # Fetches the page, locates the main content node, cleans it, and
    # converts it to Markdown.
    #
    # @return [Result] extracted metadata and Markdown body
    # @raise [RuntimeError] on non-2xx HTTP responses
    # @raise [Timeout::Error] when the fetch exceeds the configured timeout
    def extract
      doc = Nokogiri::HTML(fetch_html)

      content_node = find_content_node(doc)
      clean_node!(content_node)

      Result.new(
        title: extract_title(doc),
        byline: extract_byline(doc),
        site_name: extract_site_name(doc),
        description: extract_description(doc),
        markdown: truncate(node_to_markdown(content_node)),
        url: @url
      )
    end

    private

    # Downloads the page body, following at most one redirect. The whole
    # operation is additionally bounded by the configured fetch timeout.
    # Invalid UTF-8 bytes are scrubbed so downstream HTML parsing and
    # Markdown conversion never raise encoding errors.
    def fetch_html
      Timeout.timeout(SiteSetting.url_to_article_fetch_timeout) do
        response = perform_get(@uri)

        # Follow one redirect. Location may be relative (RFC 7231 §7.1.2),
        # so resolve it against the current URI before re-requesting.
        if response.is_a?(Net::HTTPRedirection) && response["location"]
          @uri = @uri.merge(response["location"])
          response = perform_get(@uri)
        end

        raise "HTTP #{response.code}" unless response.is_a?(Net::HTTPSuccess)

        response.body.force_encoding("UTF-8").scrub
      end
    end

    # Issues a single GET against +uri+ with browser-like headers and
    # per-connection timeouts (applied to redirects too, unlike before).
    def perform_get(uri)
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = uri.scheme == "https"
      http.open_timeout = 5
      http.read_timeout = SiteSetting.url_to_article_fetch_timeout

      request = Net::HTTP::Get.new(uri.request_uri)
      request["User-Agent"] = "Mozilla/5.0 (compatible; Discourse URL-to-Article Bot/1.0)"
      request["Accept"] = "text/html,application/xhtml+xml"
      request["Accept-Language"] = "en-US,en;q=0.9"

      http.request(request)
    end

    # Best-available page title: og:title, twitter:title, first <h1>,
    # <title>, then the host name as a last resort.
    def extract_title(doc)
      og = presence(doc.at_css('meta[property="og:title"]')&.attr("content"))
      return og if og

      tw = presence(doc.at_css('meta[name="twitter:title"]')&.attr("content"))
      return tw if tw

      h1 = presence(doc.at_css("h1")&.text)
      return h1 if h1

      presence(doc.at_css("title")&.text) || @uri.host
    end

    # First non-empty author hint among common byline locations, or nil.
    def extract_byline(doc)
      [
        doc.at_css('meta[name="author"]')&.attr("content"),
        doc.at_css('[rel="author"]')&.text,
        doc.at_css(".author")&.text,
        doc.at_css('[class*="byline"]')&.text,
        doc.at_css("address")&.text,
      ].map { |candidate| presence(candidate) }.compact.first
    end

    # og:site_name if present, otherwise the host without a leading "www.".
    def extract_site_name(doc)
      presence(doc.at_css('meta[property="og:site_name"]')&.attr("content")) ||
        @uri.host.sub(/^www\./, "")
    end

    # og:description, falling back to the standard meta description; nil
    # when neither yields non-blank text.
    def extract_description(doc)
      presence(doc.at_css('meta[property="og:description"]')&.attr("content")) ||
        presence(doc.at_css('meta[name="description"]')&.attr("content"))
    end

    # Strips +value+ and returns it, or nil when blank. Local stand-in for
    # ActiveSupport's #presence so the class works without ActiveSupport
    # loaded (the previous code called nil.present? on plain Ruby).
    def presence(value)
      stripped = value&.strip
      stripped unless stripped.nil? || stripped.empty?
    end

    # Returns the node most likely to hold the article body.
    def find_content_node(doc)
      # Prefer known article containers holding a meaningful amount of text
      # (>200 chars), so an empty <article> shell is skipped.
      ARTICLE_SELECTORS.each do |sel|
        node = doc.at_css(sel)
        return node if node && node.text.strip.length > 200
      end

      # Fallback: score generic containers by text density.
      score_and_pick(doc)
    end

    # Picks the <div>/<section>/<td> with the best text-to-markup ratio:
    # score = text length minus 3 points per descendant tag, which penalizes
    # nav-heavy blocks. Falls back to <body> (or the document) when nothing
    # has at least 150 chars of text.
    def score_and_pick(doc)
      scored = doc.css("div, section, td").map do |node|
        text = node.text.strip
        next if text.length < 150

        score = text.length - (node.css("*").size * 3)
        [score, node]
      end.compact

      best = scored.max_by(&:first)
      best&.last || doc.at_css("body") || doc
    end

    # Removes noise, hidden and empty elements from +node+, whitelists
    # attributes, and absolutizes relative img/link URLs. Mutates in place;
    # a nil node is ignored.
    def clean_node!(node)
      return unless node

      # Remove elements that are never article content.
      NOISE_SELECTORS.each { |sel| node.css(sel).each(&:remove) }

      # Remove inline-hidden elements.
      node.css("[style]").each do |el|
        el.remove if el["style"] =~ /display\s*:\s*none|visibility\s*:\s*hidden/i
      end

      # Remove empty containers that hold no text and no embedded media.
      node.css("span, div, p, section").each do |el|
        el.remove if el.text.strip.empty? && el.css("img, video, audio, iframe").empty?
      end

      sanitize_elements!(node)
    end

    # Strips non-whitelisted attributes from every element and rewrites
    # relative image/link URLs to absolute ones.
    def sanitize_elements!(node)
      node.css("*").each do |el|
        tag = el.name.downcase
        permitted = ALLOWED_ATTRIBUTES[tag] || []
        # el.attributes returns a fresh Hash, so removing while iterating
        # its keys is safe.
        el.attributes.each_key do |attr|
          el.remove_attribute(attr) unless permitted.include?(attr)
        end

        absolutize!(el, "src", %w[http // data:]) if tag == "img"
        absolutize!(el, "href", %w[http // # mailto:]) if tag == "a"
      end
    end

    # Rewrites a relative URL in el[attr] against the article URL. Values
    # already starting with one of +skip_prefixes+ are left alone. On a
    # malformed URL the original value is kept — the previous inline
    # `rescue nil` assigned nil to the attribute, destroying it.
    def absolutize!(el, attr, skip_prefixes)
      value = el[attr]
      return if value.nil? || value.start_with?(*skip_prefixes)

      el[attr] = URI.join(@url, value).to_s
    rescue URI::Error
      nil # keep the original (possibly relative) value
    end

    # Converts the cleaned node to GitHub-flavored Markdown, collapsing
    # runs of 3+ newlines into a single blank line.
    def node_to_markdown(node)
      return "" unless node

      ReverseMarkdown.convert(node.to_html, unknown_tags: :bypass, github_flavored: true)
        .gsub(/\n{3,}/, "\n\n") # collapse excessive blank lines
        .strip
    end

    # Caps +text+ at the configured maximum length, appending a notice when
    # content was cut off.
    def truncate(text)
      max = SiteSetting.url_to_article_max_content_length
      return text if text.length <= max

      text[0...max] + "\n\n*[Content truncated — visit the original article for the full text.]*"
    end
  end
end
|