2026-03-18 11:10:07 -04:00
|
|
|
# frozen_string_literal: true
|
|
|
|
|
|
2026-03-18 11:28:55 -04:00
|
|
|
# Uses only Nokogiri (already bundled with Discourse) — no extra gems needed.
|
|
|
|
|
|
2026-03-18 11:10:07 -04:00
|
|
|
require "nokogiri"
|
|
|
|
|
require "net/http"
|
|
|
|
|
require "uri"
|
|
|
|
|
require "timeout"
|
|
|
|
|
|
|
|
|
|
module UrlToArticle
|
|
|
|
|
class ArticleExtractor
|
|
|
|
|
NOISE_SELECTORS = %w[
|
|
|
|
|
script style noscript iframe nav footer header
|
|
|
|
|
.navigation .nav .menu .sidebar .widget .ad .advertisement
|
|
|
|
|
.cookie-banner .cookie-notice .popup .modal .overlay
|
|
|
|
|
.social-share .share-buttons .related-posts .comments
|
|
|
|
|
#comments #sidebar #navigation #footer #header
|
|
|
|
|
[role=navigation] [role=banner] [role=contentinfo]
|
|
|
|
|
[aria-label=navigation] [aria-label=footer]
|
|
|
|
|
].freeze
|
|
|
|
|
|
|
|
|
|
ARTICLE_SELECTORS = %w[
|
|
|
|
|
article[class*=content]
|
|
|
|
|
article[class*=post]
|
|
|
|
|
article[class*=article]
|
|
|
|
|
article
|
|
|
|
|
[role=main]
|
|
|
|
|
main
|
|
|
|
|
.post-content
|
|
|
|
|
.article-content
|
|
|
|
|
.entry-content
|
|
|
|
|
.article-body
|
|
|
|
|
.story-body
|
|
|
|
|
.post-body
|
|
|
|
|
.content-body
|
|
|
|
|
.page-content
|
|
|
|
|
#article-body
|
|
|
|
|
#post-content
|
|
|
|
|
#main-content
|
|
|
|
|
].freeze
|
|
|
|
|
|
|
|
|
|
Result = Struct.new(:title, :byline, :site_name, :description, :markdown, :url, keyword_init: true)
|
|
|
|
|
|
|
|
|
|
# Convenience entry point: fetch +url+ and extract its article content.
#
# @param url [String] absolute URL of the page to extract
# @return [Result]
def self.extract(url)
  new(url).extract
end
|
|
|
|
|
|
|
|
|
|
# @param url [String] absolute URL of the article
# @raise [URI::InvalidURIError] if +url+ is not parseable
def initialize(url)
  @url = url
  @uri = URI.parse(url)
end
|
|
|
|
|
|
|
|
|
|
# Runs the full pipeline: fetch HTML, pull page metadata, locate and
# sanitize the main content node, and render it as truncated Markdown.
#
# Metadata is extracted BEFORE the content node is cleaned, since
# #clean_node! mutates the document in place.
#
# @return [Result]
def extract
  doc = Nokogiri::HTML(fetch_html)

  metadata = {
    title: extract_title(doc),
    byline: extract_byline(doc),
    site_name: extract_site_name(doc),
    description: extract_description(doc),
  }

  content = find_content_node(doc)
  clean_node!(content)
  body_markdown = truncate(node_to_markdown(content))

  Result.new(url: @url, markdown: body_markdown, **metadata)
end
|
|
|
|
|
|
|
|
|
|
private
|
|
|
|
|
|
2026-03-18 11:28:55 -04:00
|
|
|
# ------------------------------------------------------------------ #
|
|
|
|
|
# HTTP fetch
|
|
|
|
|
# ------------------------------------------------------------------ #
|
|
|
|
|
|
2026-03-18 11:10:07 -04:00
|
|
|
# Fetches the raw HTML body for @uri, following at most one redirect.
#
# Bug fix: a redirect's Location header may be a *relative* reference
# (RFC 7231 §7.1.2); the previous `URI.parse` produced a host-less URI
# for those. Resolve it against the current URI with URI.join instead.
#
# @return [String] response body, tagged as UTF-8 when possible
# @raise [RuntimeError] when the final response is not a 2xx
# @raise [Timeout::Error] when the overall fetch exceeds the site setting
def fetch_html
  Timeout.timeout(SiteSetting.url_to_article_fetch_timeout) do
    response = do_get(@uri)

    if response.is_a?(Net::HTTPRedirection) && response["location"]
      # Resolve relative Location headers against the URI we just fetched.
      @uri = URI.join(@uri.to_s, response["location"])
      response = do_get(@uri)
    end

    raise "HTTP #{response.code}" unless response.is_a?(Net::HTTPSuccess)

    body = response.body
    begin
      # Tag as UTF-8 for Nokogiri; best effort — keep original bytes if
      # tagging fails (e.g. frozen string).
      body = body.force_encoding(Encoding::UTF_8)
    rescue StandardError
      # leave encoding untouched
    end
    body
  end
end
|
|
|
|
|
|
2026-03-18 11:28:55 -04:00
|
|
|
# Performs a single GET request against +uri+ with browser-ish headers.
#
# @param uri [URI] target to fetch
# @return [Net::HTTPResponse] the raw response (redirects NOT followed)
def do_get(uri)
  connection = Net::HTTP.new(uri.host, uri.port)
  connection.use_ssl = (uri.scheme == "https")
  connection.open_timeout = 5
  connection.read_timeout = SiteSetting.url_to_article_fetch_timeout

  request = Net::HTTP::Get.new(uri.request_uri)
  {
    "User-Agent" => "Mozilla/5.0 (compatible; Discourse/url-to-article)",
    "Accept" => "text/html,application/xhtml+xml",
    "Accept-Language" => "en-US,en;q=0.9",
  }.each { |name, value| request[name] = value }

  connection.request(request)
end
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------ #
|
|
|
|
|
# Metadata extraction
|
|
|
|
|
# ------------------------------------------------------------------ #
|
|
|
|
|
|
2026-03-18 11:10:07 -04:00
|
|
|
# Picks the best available page title: og:title, then twitter:title,
# then the first <h1>, then <title>, finally the host name.
#
# @param doc [Nokogiri::HTML::Document]
# @return [String]
def extract_title(doc)
  candidates = [
    doc.at_css('meta[property="og:title"]')&.attr("content"),
    doc.at_css('meta[name="twitter:title"]')&.attr("content"),
    doc.at_css("h1")&.text,
  ]

  candidates.each do |candidate|
    return candidate.strip if candidate.present?
  end

  doc.at_css("title")&.text&.strip || @uri.host
end
|
|
|
|
|
|
|
|
|
|
# Finds an author byline from common metadata/markup conventions.
#
# @param doc [Nokogiri::HTML::Document]
# @return [String, nil] first non-empty candidate, stripped
def extract_byline(doc)
  candidates = [
    doc.at_css('meta[name="author"]')&.attr("content"),
    doc.at_css('[rel="author"]')&.text,
    doc.at_css(".author")&.text,
    doc.at_css('[class*="byline"]')&.text,
    doc.at_css("address")&.text,
  ]

  candidates.compact.lazy.map(&:strip).find { |text| !text.empty? }
end
|
|
|
|
|
|
|
|
|
|
# Site name from og:site_name, falling back to the host without "www.".
#
# @param doc [Nokogiri::HTML::Document]
# @return [String]
def extract_site_name(doc)
  og_name = doc.at_css('meta[property="og:site_name"]')&.attr("content")
  og_name ? og_name.strip : @uri.host.sub(/^www\./, "")
end
|
|
|
|
|
|
|
|
|
|
# Page description from og:description, else the standard meta description.
#
# @param doc [Nokogiri::HTML::Document]
# @return [String, nil]
def extract_description(doc)
  ['meta[property="og:description"]', 'meta[name="description"]'].each do |selector|
    content = doc.at_css(selector)&.attr("content")
    return content.strip if content
  end
  nil
end
|
|
|
|
|
|
2026-03-18 11:28:55 -04:00
|
|
|
# ------------------------------------------------------------------ #
|
|
|
|
|
# Content node selection
|
|
|
|
|
# ------------------------------------------------------------------ #
|
|
|
|
|
|
2026-03-18 11:10:07 -04:00
|
|
|
# Locates the main article container: first ARTICLE_SELECTORS match with
# more than 200 chars of text wins; otherwise fall back to heuristic
# scoring over generic containers.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Nokogiri::XML::Node]
def find_content_node(doc)
  selected =
    ARTICLE_SELECTORS.lazy.filter_map do |selector|
      candidate = doc.at_css(selector)
      candidate if candidate && candidate.text.strip.length > 200
    end.first

  selected || score_and_pick(doc)
end
|
|
|
|
|
|
|
|
|
|
# Heuristic fallback: score every div/section/td by text length minus a
# penalty of 3 per descendant element (text-dense, markup-light nodes
# win). Containers under 150 chars are ignored. Ties keep the first
# candidate encountered, matching Enumerable#max_by semantics.
#
# @param doc [Nokogiri::HTML::Document]
# @return [Nokogiri::XML::Node] best container, else <body>, else doc
def score_and_pick(doc)
  winner = nil
  best_score = nil

  doc.css("div, section, td").each do |candidate|
    text_length = candidate.text.strip.length
    next if text_length < 150

    descendant_count = candidate.css("*").size.to_f
    score = text_length - (descendant_count * 3)

    if best_score.nil? || score > best_score
      best_score = score
      winner = candidate
    end
  end

  winner || doc.at_css("body") || doc
end
|
|
|
|
|
|
2026-03-18 11:28:55 -04:00
|
|
|
# ------------------------------------------------------------------ #
|
|
|
|
|
# Node cleaning
|
|
|
|
|
# ------------------------------------------------------------------ #
|
|
|
|
|
|
2026-03-18 11:10:07 -04:00
|
|
|
# Sanitizes the content node in place:
#   1. removes NOISE_SELECTORS matches,
#   2. drops inline-hidden elements (display:none / visibility:hidden),
#   3. prunes empty containers with no embedded media,
#   4. strips all attributes except a small per-tag allowlist,
#   5. absolutizes relative img src / a href against @url.
#
# Fix: the relative-URL step previously used an inline `rescue nil`,
# which swallowed every StandardError and assigned nil (serialized as an
# empty attribute) on failure. It now rescues URI::Error specifically
# and removes the attribute when the URL cannot be resolved.
#
# @param node [Nokogiri::XML::Node, nil]
# @return [void]
def clean_node!(node)
  return unless node

  NOISE_SELECTORS.each { |sel| node.css(sel).each(&:remove) }

  node.css("[style]").each do |el|
    el.remove if el["style"] =~ /display\s*:\s*none|visibility\s*:\s*hidden/i
  end

  node.css("span, div, p, section").each do |el|
    el.remove if el.text.strip.empty? && el.css("img, video, audio, iframe").empty?
  end

  # Per-tag attribute allowlist; every other attribute is stripped.
  allowed_attrs = {
    "a" => %w[href title],
    "img" => %w[src alt title],
    "td" => %w[colspan rowspan],
    "th" => %w[colspan rowspan scope],
    "ol" => %w[start],
    "li" => %w[value],
    "code" => %w[class],
    "pre" => %w[class],
  }

  node.css("*").each do |el|
    tag = el.name.downcase
    permitted = allowed_attrs[tag] || []
    el.attributes.each_key { |attr| el.remove_attribute(attr) unless permitted.include?(attr) }

    if tag == "img" && el["src"] && !el["src"].to_s.start_with?("http", "//", "data:")
      absolutize_attribute!(el, "src")
    end
    if tag == "a" && el["href"] && !el["href"].to_s.start_with?("http", "//", "#", "mailto:")
      absolutize_attribute!(el, "href")
    end
  end
end

# Resolves el[attr] against @url; removes the attribute when the value
# cannot be joined into a valid absolute URL.
def absolutize_attribute!(el, attr)
  el[attr] = URI.join(@url, el[attr]).to_s
rescue URI::Error
  el.remove_attribute(attr)
end
|
|
|
|
|
|
2026-03-18 11:28:55 -04:00
|
|
|
# ------------------------------------------------------------------ #
|
|
|
|
|
# HTML → Markdown (zero external dependencies)
|
|
|
|
|
# ------------------------------------------------------------------ #
|
|
|
|
|
|
2026-03-18 11:10:07 -04:00
|
|
|
# Renders a cleaned DOM node as Markdown, collapsing runs of 3+ newlines
# to a single blank line.
#
# @param node [Nokogiri::XML::Node, nil]
# @return [String] "" when node is nil
def node_to_markdown(node)
  return "" unless node

  rendered = convert_node(node, context: {})
  rendered.strip.gsub(/\n{3,}/, "\n\n")
end
|
|
|
|
|
|
|
|
|
|
# Dispatches Markdown conversion by DOM node type: text nodes have their
# whitespace collapsed, elements go through convert_element, documents
# recurse over children, everything else (comments, CDATA, …) is dropped.
#
# @param node [Nokogiri::XML::Node, nil]
# @param context [Hash] conversion flags threaded through recursion
# @return [String]
def convert_node(node, context:)
  return "" if node.nil?

  node_kind = node.node_type
  if node_kind == Nokogiri::XML::Node::TEXT_NODE
    node.text.gsub(/[[:space:]]+/, " ")
  elsif node_kind == Nokogiri::XML::Node::ELEMENT_NODE
    convert_element(node, context: context)
  elsif node_kind == Nokogiri::XML::Node::DOCUMENT_NODE
    children_md(node, context)
  else
    ""
  end
end
|
|
|
|
|
|
|
|
|
|
# Concatenates the Markdown of every child of +node+.
#
# @return [String]
def children_md(node, context)
  node.children.reduce(+"") { |acc, child| acc << convert_node(child, context: context) }
end
|
|
|
|
|
|
|
|
|
|
# Converts one HTML element to Markdown, recursing via children_md.
#
# Fix: the "img" branch returned "" on both arms of the ternary (and
# computed `alt` without using it), so every image — including those
# inside <figure> — was silently dropped. It now emits the Markdown
# image literal ![alt](src), mirroring the "a" branch's link handling.
#
# @param node [Nokogiri::XML::Element]
# @param context [Hash] flags threaded through recursion (e.g. :pre)
# @return [String]
def convert_element(node, context:) # rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity
  tag = node.name.downcase

  case tag
  when "h1" then "\n\n# #{children_md(node, context).strip}\n\n"
  when "h2" then "\n\n## #{children_md(node, context).strip}\n\n"
  when "h3" then "\n\n### #{children_md(node, context).strip}\n\n"
  when "h4" then "\n\n#### #{children_md(node, context).strip}\n\n"
  when "h5" then "\n\n##### #{children_md(node, context).strip}\n\n"
  when "h6" then "\n\n###### #{children_md(node, context).strip}\n\n"
  when "p" then "\n\n#{children_md(node, context).strip}\n\n"
  when "br" then "  \n"
  when "hr" then "\n\n---\n\n"

  when "strong", "b" then "**#{children_md(node, context).strip}**"
  when "em", "i" then "*#{children_md(node, context).strip}*"
  when "del", "s" then "~~#{children_md(node, context).strip}~~"

  when "code"
    # Inside <pre> the fence already provides code formatting.
    context[:pre] ? children_md(node, context) : "`#{children_md(node, context)}`"

  when "pre"
    # Fence language comes from a `language-xxx` class on the inner <code>.
    lang = node.at_css("code")&.attr("class")&.match(/language-(\w+)/)&.[](1) || ""
    "\n\n```#{lang}\n#{node.text}\n```\n\n"

  when "blockquote"
    quoted = children_md(node, context).strip.gsub(/^/, "> ")
    "\n\n#{quoted}\n\n"

  when "a"
    href = node["href"].to_s.strip
    text = children_md(node, context).strip
    href.empty? ? text : "[#{text}](#{href})"

  when "img"
    src = node["src"].to_s.strip
    alt = node["alt"].to_s.strip
    src.empty? ? "" : "![#{alt}](#{src})"

  when "ul"
    items = node.css("> li").map do |li|
      "- #{children_md(li, context).strip}"
    end.join("\n")
    "\n\n#{items}\n\n"

  when "ol"
    start = (node["start"] || 1).to_i
    items = node.css("> li").each_with_index.map do |li, idx|
      "#{start + idx}. #{children_md(li, context).strip}"
    end.join("\n")
    "\n\n#{items}\n\n"

  when "table"
    convert_table(node, context: context)

  when "figure"
    img_el = node.at_css("img")
    cap = node.at_css("figcaption")&.text&.strip
    img_md = img_el ? convert_element(img_el, context: context) : ""
    cap_md = cap ? "\n*#{cap}*" : ""
    "\n\n#{img_md}#{cap_md}\n\n"

  # Non-content elements contribute nothing.
  when "script", "style", "noscript", "button", "input",
       "select", "textarea", "iframe", "object", "embed",
       "head", "link", "meta"
    ""

  else
    # Unknown wrappers are transparent: render their children.
    children_md(node, context)
  end
end
|
|
|
|
|
|
|
|
|
|
# Renders an HTML table as a GitHub-flavored Markdown table. The first
# row becomes the header; the separator width follows its cell count.
# Pipe characters inside cells are escaped.
#
# @param table [Nokogiri::XML::Element]
# @return [String] "" when the table has no rows
def convert_table(table, context:)
  all_rows = table.css("tr")
  return "" if all_rows.empty?

  rendered_rows = all_rows.map do |tr|
    cells = tr.css("th, td").map { |cell| children_md(cell, context).strip.gsub("|", "\\|") }
    "| #{cells.join(" | ")} |"
  end

  column_count = all_rows.first.css("th, td").size
  divider = "| #{Array.new(column_count, "---").join(" | ")} |"

  header_row, *body_rows = rendered_rows
  "\n\n#{header_row}\n#{divider}\n#{body_rows.join("\n")}\n\n"
end
|
|
|
|
|
|
2026-03-18 11:28:55 -04:00
|
|
|
# ------------------------------------------------------------------ #
|
|
|
|
|
# Helpers
|
|
|
|
|
# ------------------------------------------------------------------ #
|
|
|
|
|
|
2026-03-18 11:10:07 -04:00
|
|
|
# Caps +text+ at the site-configured maximum length, appending a notice
# when content was cut.
#
# @param text [String]
# @return [String]
def truncate(text)
  limit = SiteSetting.url_to_article_max_content_length
  if text.length > limit
    "#{text[0...limit]}\n\n*[Content truncated — visit the original article for the full text.]*"
  else
    text
  end
end
|
|
|
|
|
end
|
|
|
|
|
end
|