Initial commit

This commit is contained in:
2026-03-18 11:10:07 -04:00
commit b1ef516348
8 changed files with 730 additions and 0 deletions

View File

@@ -0,0 +1,232 @@
# frozen_string_literal: true
require "nokogiri"
require "reverse_markdown"
require "net/http"
require "uri"
require "timeout"
# Extracts the main readable content of a web page and converts it to
# Markdown, along with basic metadata (title, byline, site name,
# description). Designed for use from a Discourse plugin — configuration
# is read from SiteSetting.
module UrlToArticle
  class ArticleExtractor
    # Selectors for elements that are almost never article content and
    # should be stripped before Markdown conversion.
    NOISE_SELECTORS = %w[
      script style noscript iframe nav footer header
      .navigation .nav .menu .sidebar .widget .ad .advertisement
      .cookie-banner .cookie-notice .popup .modal .overlay
      .social-share .share-buttons .related-posts .comments
      #comments #sidebar #navigation #footer #header
      [role=navigation] [role=banner] [role=contentinfo]
      [aria-label=navigation] [aria-label=footer]
    ].freeze

    # Candidate content selectors, tried in order from most to least
    # specific; the first match with enough text wins.
    ARTICLE_SELECTORS = %w[
      article[class*=content]
      article[class*=post]
      article[class*=article]
      article
      [role=main]
      main
      .post-content
      .article-content
      .entry-content
      .article-body
      .story-body
      .post-body
      .content-body
      .page-content
      #article-body
      #post-content
      #main-content
    ].freeze

    # Minimum amount of stripped text (in characters) a node must contain
    # to be considered article content.
    MIN_ARTICLE_TEXT_LENGTH = 200

    # Minimum text length for a node to participate in density scoring.
    MIN_CANDIDATE_TEXT_LENGTH = 150

    # Penalty per descendant tag when scoring candidates — tag-heavy
    # nodes (navigation, widgets) are demoted relative to text-heavy ones.
    TAG_PENALTY = 3

    # Extraction result. All fields are strings except +markdown+, which
    # is the converted (possibly truncated) article body.
    Result = Struct.new(:title, :byline, :site_name, :description, :markdown, :url, keyword_init: true)

    # Convenience wrapper: extract the article at +url+ in one call.
    # @return [Result]
    def self.extract(url)
      new(url).extract
    end

    # @param url [String] absolute URL of the page to extract
    # @raise [URI::InvalidURIError] if +url+ cannot be parsed
    def initialize(url)
      @url = url
      @uri = URI.parse(url)
    end

    # Fetches the page, locates the main content node, cleans it, and
    # converts it to Markdown.
    #
    # @return [Result]
    # @raise [Timeout::Error] if the fetch exceeds the configured timeout
    # @raise [RuntimeError] on non-success HTTP responses
    def extract
      html = fetch_html
      doc = Nokogiri::HTML(html)

      content_node = find_content_node(doc)
      clean_node!(content_node)

      Result.new(
        title: extract_title(doc),
        byline: extract_byline(doc),
        site_name: extract_site_name(doc),
        description: extract_description(doc),
        markdown: truncate(node_to_markdown(content_node)),
        url: @url
      )
    end

    private

    # Fetches the page body as a UTF-8 string, following at most one
    # redirect.
    #
    # NOTE(review): the URL is fetched exactly as given — there is no
    # SSRF guard (no private-IP / localhost filtering). Confirm callers
    # validate the host before invoking this class.
    def fetch_html
      Timeout.timeout(SiteSetting.url_to_article_fetch_timeout) do
        response = perform_get(@uri)

        # Follow a single redirect. The Location header may be relative
        # (RFC 7231 allows it), so resolve it against the current URI
        # with URI#merge — parsing it standalone would yield a host-less
        # URI and break Net::HTTP.new.
        if response.is_a?(Net::HTTPRedirection) && response["location"]
          @uri = @uri.merge(response["location"])
          response = perform_get(@uri)
        end

        raise "HTTP #{response.code}" unless response.is_a?(Net::HTTPSuccess)

        # Scrub invalid byte sequences so later string operations cannot
        # raise ArgumentError on malformed input.
        response.body.force_encoding("UTF-8").scrub
      end
    end

    # Issues one GET request to +uri+ with browser-like headers and the
    # configured timeouts. Used for both the initial request and the
    # redirect follow-up, so the redirected request keeps the same
    # headers (the previous implementation dropped them).
    def perform_get(uri)
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = uri.scheme == "https"
      http.open_timeout = 5
      http.read_timeout = SiteSetting.url_to_article_fetch_timeout

      request = Net::HTTP::Get.new(uri.request_uri)
      request["User-Agent"] = "Mozilla/5.0 (compatible; Discourse URL-to-Article Bot/1.0)"
      request["Accept"] = "text/html,application/xhtml+xml"
      request["Accept-Language"] = "en-US,en;q=0.9"

      http.request(request)
    end

    # Best-available page title: og:title, then twitter:title, then the
    # first <h1>, then <title>, falling back to the host name.
    def extract_title(doc)
      og = doc.at_css('meta[property="og:title"]')&.attr("content")
      return og.strip if og.present?

      tw = doc.at_css('meta[name="twitter:title"]')&.attr("content")
      return tw.strip if tw.present?

      h1 = doc.at_css("h1")&.text
      return h1.strip if h1.present?

      doc.at_css("title")&.text&.strip || @uri.host
    end

    # First non-empty author string from common author markers, or nil.
    def extract_byline(doc)
      candidates = [
        doc.at_css('meta[name="author"]')&.attr("content"),
        doc.at_css('[rel="author"]')&.text,
        doc.at_css(".author")&.text,
        doc.at_css('[class*="byline"]')&.text,
        doc.at_css("address")&.text,
      ]
      candidates.compact.map(&:strip).reject(&:empty?).first
    end

    # og:site_name, or the host with any leading "www." removed.
    def extract_site_name(doc)
      doc.at_css('meta[property="og:site_name"]')&.attr("content")&.strip ||
        @uri.host.sub(/^www\./, "")
    end

    # og:description, then the standard meta description; nil if neither
    # is present.
    def extract_description(doc)
      doc.at_css('meta[property="og:description"]')&.attr("content")&.strip ||
        doc.at_css('meta[name="description"]')&.attr("content")&.strip
    end

    # Finds the node most likely to hold the article body: first the
    # known selectors (requiring meaningful text), then a text-density
    # scoring fallback.
    def find_content_node(doc)
      ARTICLE_SELECTORS.each do |sel|
        node = doc.at_css(sel)
        next unless node

        # Require meaningful content so an empty <article> shell does
        # not shadow the real body further down the selector list.
        return node if node.text.strip.length > MIN_ARTICLE_TEXT_LENGTH
      end

      score_and_pick(doc)
    end

    # Fallback content detection: score every <div>, <section> and <td>
    # by text length minus a per-tag penalty (nav-heavy nodes contain
    # many tags and little text) and return the highest scorer. Falls
    # back to <body>, then the document itself.
    def score_and_pick(doc)
      candidates =
        doc.css("div, section, td").filter_map do |node|
          text = node.text.strip
          next if text.length < MIN_CANDIDATE_TEXT_LENGTH

          score = text.length.to_f - (node.css("*").size * TAG_PENALTY)
          [score, node]
        end

      best = candidates.max_by { |score, _| score }
      best&.last || doc.at_css("body") || doc
    end

    # Destructively removes noise, hidden and empty elements from +node+,
    # strips disallowed attributes, and rewrites relative img/anchor URLs
    # to absolute ones.
    def clean_node!(node)
      return unless node

      NOISE_SELECTORS.each { |sel| node.css(sel).each(&:remove) }

      # Remove inline-hidden elements.
      node.css("[style]").each do |el|
        el.remove if el["style"] =~ /display\s*:\s*none|visibility\s*:\s*hidden/i
      end

      # Remove empty containers unless they wrap embedded media.
      node.css("span, div, p, section").each do |el|
        el.remove if el.text.strip.empty? && el.css("img, video, audio, iframe").empty?
      end

      # Attribute allow-list per tag; everything else is stripped so the
      # Markdown converter sees only clean, predictable markup.
      allowed = {
        "a" => %w[href title],
        "img" => %w[src alt title width height],
        "td" => %w[colspan rowspan],
        "th" => %w[colspan rowspan scope],
        "ol" => %w[start type],
        "li" => %w[value],
        "code" => %w[class],
        "pre" => %w[class],
      }

      node.css("*").each do |el|
        tag = el.name.downcase
        permitted = allowed[tag] || []
        el.attributes.each_key do |attr|
          el.remove_attribute(attr) unless permitted.include?(attr)
        end

        absolutize!(el, "src") if tag == "img" && el["src"] && !el["src"].start_with?("http", "//", "data:")
        absolutize!(el, "href") if tag == "a" && el["href"] && !el["href"].start_with?("http", "//", "#", "mailto:")
      end
    end

    # Rewrites a relative URL in +el+[+attr+] to an absolute URL based on
    # the article URL. On unparseable values the attribute is left
    # unchanged — the previous `rescue nil` modifier assigned nil, which
    # Nokogiri rejects with a TypeError.
    def absolutize!(el, attr)
      el[attr] = URI.join(@url, el[attr]).to_s
    rescue URI::Error
      # Unparseable relative URL: keep the original value.
    end

    # Converts the cleaned node to GitHub-flavored Markdown, collapsing
    # runs of blank lines.
    def node_to_markdown(node)
      return "" unless node

      ReverseMarkdown.convert(node.to_html, unknown_tags: :bypass, github_flavored: true)
        .gsub(/\n{3,}/, "\n\n") # collapse excessive blank lines
        .strip
    end

    # Truncates +text+ to the configured maximum length, appending a
    # visible truncation notice when anything was cut.
    def truncate(text)
      max = SiteSetting.url_to_article_max_content_length
      return text if text.length <= max

      text[0...max] + "\n\n*[Content truncated — visit the original article for the full text.]*"
    end
  end
end