diff --git a/lib/url_to_article/article_extractor.rb b/lib/url_to_article/article_extractor.rb
index a1b87be..d671399 100644
--- a/lib/url_to_article/article_extractor.rb
+++ b/lib/url_to_article/article_extractor.rb
@@ -1,14 +1,14 @@
# frozen_string_literal: true
+# Uses only Nokogiri (already bundled with Discourse) — no extra gems needed.
+
require "nokogiri"
-require "reverse_markdown"
require "net/http"
require "uri"
require "timeout"
module UrlToArticle
class ArticleExtractor
- # Tags that are almost never article content
NOISE_SELECTORS = %w[
script style noscript iframe nav footer header
.navigation .nav .menu .sidebar .widget .ad .advertisement
@@ -19,7 +19,6 @@ module UrlToArticle
[aria-label=navigation] [aria-label=footer]
].freeze
- # Candidate content selectors tried in order
ARTICLE_SELECTORS = %w[
article[class*=content]
article[class*=post]
@@ -52,61 +51,70 @@ module UrlToArticle
end
def extract
- html = fetch_html
- doc = Nokogiri::HTML(html)
-
- title = extract_title(doc)
- byline = extract_byline(doc)
- site_name = extract_site_name(doc)
+ html = fetch_html
+ doc = Nokogiri::HTML(html)
+ title = extract_title(doc)
+ byline = extract_byline(doc)
+ site_name = extract_site_name(doc)
description = extract_description(doc)
- content_node = find_content_node(doc)
+ content_node = find_content_node(doc)
clean_node!(content_node)
markdown = node_to_markdown(content_node)
markdown = truncate(markdown)
Result.new(
- title: title,
- byline: byline,
- site_name: site_name,
+ title: title,
+ byline: byline,
+ site_name: site_name,
description: description,
- markdown: markdown,
- url: @url
+ markdown: markdown,
+ url: @url,
)
end
private
+ # ------------------------------------------------------------------ #
+ # HTTP fetch
+ # ------------------------------------------------------------------ #
+
def fetch_html
Timeout.timeout(SiteSetting.url_to_article_fetch_timeout) do
- http = Net::HTTP.new(@uri.host, @uri.port)
- http.use_ssl = @uri.scheme == "https"
- http.open_timeout = 5
- http.read_timeout = SiteSetting.url_to_article_fetch_timeout
+ response = do_get(@uri)
- request = Net::HTTP::Get.new(@uri.request_uri)
- request["User-Agent"] = "Mozilla/5.0 (compatible; Discourse URL-to-Article Bot/1.0)"
- request["Accept"] = "text/html,application/xhtml+xml"
- request["Accept-Language"] = "en-US,en;q=0.9"
-
- response = http.request(request)
-
- # Follow one redirect
if response.is_a?(Net::HTTPRedirection) && response["location"]
- redirect_uri = URI.parse(response["location"])
- @uri = redirect_uri
- http = Net::HTTP.new(@uri.host, @uri.port)
- http.use_ssl = @uri.scheme == "https"
- response = http.get(@uri.request_uri, "User-Agent" => request["User-Agent"])
+ @uri = URI.parse(response["location"])
+ response = do_get(@uri)
end
raise "HTTP #{response.code}" unless response.is_a?(Net::HTTPSuccess)
- response.body.force_encoding("UTF-8")
+
+ body = response.body
+ body = body.force_encoding("UTF-8") rescue body
+ body
end
end
+ def do_get(uri)
+ http = Net::HTTP.new(uri.host, uri.port)
+ http.use_ssl = uri.scheme == "https"
+ http.open_timeout = 5
+ http.read_timeout = SiteSetting.url_to_article_fetch_timeout
+
+ req = Net::HTTP::Get.new(uri.request_uri)
+ req["User-Agent"] = "Mozilla/5.0 (compatible; Discourse/url-to-article)"
+ req["Accept"] = "text/html,application/xhtml+xml"
+ req["Accept-Language"] = "en-US,en;q=0.9"
+
+ http.request(req)
+ end
+
+ # ------------------------------------------------------------------ #
+ # Metadata extraction
+ # ------------------------------------------------------------------ #
+
def extract_title(doc)
- # Try OG title first, then twitter:title, then
og = doc.at_css('meta[property="og:title"]')&.attr("content")
return og.strip if og.present?
@@ -120,14 +128,13 @@ module UrlToArticle
end
def extract_byline(doc)
- candidates = [
+ [
doc.at_css('meta[name="author"]')&.attr("content"),
doc.at_css('[rel="author"]')&.text,
doc.at_css(".author")&.text,
doc.at_css('[class*="byline"]')&.text,
doc.at_css("address")&.text,
- ]
- candidates.compact.map(&:strip).reject(&:empty?).first
+ ].compact.map(&:strip).reject(&:empty?).first
end
def extract_site_name(doc)
@@ -140,89 +147,193 @@ module UrlToArticle
doc.at_css('meta[name="description"]')&.attr("content")&.strip
end
+ # ------------------------------------------------------------------ #
+ # Content node selection
+ # ------------------------------------------------------------------ #
+
def find_content_node(doc)
- # Try known article selectors
ARTICLE_SELECTORS.each do |sel|
node = doc.at_css(sel)
- next unless node
- text = node.text.strip
- # Make sure it has meaningful content (>200 chars of text)
- return node if text.length > 200
+ return node if node && node.text.strip.length > 200
end
-
- # Fallback: score all and blocks by text density
score_and_pick(doc)
end
def score_and_pick(doc)
- candidates = doc.css("div, section, td").map do |node|
- text = node.text.strip
- next if text.length < 150
-
- # Score = text length - penalize nodes with lots of tags (nav-heavy)
+ best = doc.css("div, section, td").filter_map do |node|
+ text_len = node.text.strip.length
+ next if text_len < 150
tag_count = node.css("*").size.to_f
- text_length = text.length.to_f
- score = text_length - (tag_count * 3)
-
+ score = text_len - (tag_count * 3)
[score, node]
- end.compact.sort_by { |s, _| -s }
+ end.max_by { |score, _| score }
- candidates.first&.last || doc.at_css("body") || doc
+ best&.last || doc.at_css("body") || doc
end
+ # ------------------------------------------------------------------ #
+ # Node cleaning
+ # ------------------------------------------------------------------ #
+
def clean_node!(node)
return unless node
- # Remove noise elements
- NOISE_SELECTORS.each do |sel|
- node.css(sel).each(&:remove)
- end
+ NOISE_SELECTORS.each { |sel| node.css(sel).each(&:remove) }
- # Remove hidden elements
node.css("[style]").each do |el|
el.remove if el["style"] =~ /display\s*:\s*none|visibility\s*:\s*hidden/i
end
- # Remove empty tags (except br, img, hr)
node.css("span, div, p, section").each do |el|
el.remove if el.text.strip.empty? && el.css("img, video, audio, iframe").empty?
end
- # Strip all attributes except allowed ones on certain tags
- allowed = {
- "a" => %w[href title],
- "img" => %w[src alt title width height],
- "td" => %w[colspan rowspan],
- "th" => %w[colspan rowspan scope],
- "ol" => %w[start type],
- "li" => %w[value],
+ allowed_attrs = {
+ "a" => %w[href title],
+ "img" => %w[src alt title],
+ "td" => %w[colspan rowspan],
+ "th" => %w[colspan rowspan scope],
+ "ol" => %w[start],
+ "li" => %w[value],
"code" => %w[class],
"pre" => %w[class],
}
- node.css("*").each do |el|
- tag = el.name.downcase
- permitted = allowed[tag] || []
- el.attributes.each_key do |attr|
- el.remove_attribute(attr) unless permitted.include?(attr)
- end
- # Make relative image URLs absolute
- if tag == "img" && el["src"] && !el["src"].start_with?("http", "//", "data:")
- el["src"] = URI.join(@url, el["src"]).to_s rescue nil
+ node.css("*").each do |el|
+ tag = el.name.downcase
+ permitted = allowed_attrs[tag] || []
+ el.attributes.each_key { |attr| el.remove_attribute(attr) unless permitted.include?(attr) }
+
+ if tag == "img" && el["src"] && !el["src"].to_s.start_with?("http", "//", "data:")
+ el["src"] = (URI.join(@url, el["src"]).to_s rescue nil)
end
- if tag == "a" && el["href"] && !el["href"].start_with?("http", "//", "#", "mailto:")
- el["href"] = URI.join(@url, el["href"]).to_s rescue nil
+ if tag == "a" && el["href"] && !el["href"].to_s.start_with?("http", "//", "#", "mailto:")
+ el["href"] = (URI.join(@url, el["href"]).to_s rescue nil)
end
end
end
+ # ------------------------------------------------------------------ #
+ # HTML → Markdown (zero external dependencies)
+ # ------------------------------------------------------------------ #
+
def node_to_markdown(node)
return "" unless node
- ReverseMarkdown.convert(node.to_html, unknown_tags: :bypass, github_flavored: true)
- .gsub(/\n{3,}/, "\n\n") # collapse excessive blank lines
- .strip
+ convert_node(node, context: {}).strip.gsub(/\n{3,}/, "\n\n")
end
+ def convert_node(node, context:)
+ return "" unless node
+
+ case node.node_type
+ when Nokogiri::XML::Node::TEXT_NODE
+ node.text.gsub(/[[:space:]]+/, " ")
+ when Nokogiri::XML::Node::ELEMENT_NODE
+ convert_element(node, context: context)
+ when Nokogiri::XML::Node::DOCUMENT_NODE
+ node.children.map { |c| convert_node(c, context: context) }.join
+ else
+ ""
+ end
+ end
+
+ def children_md(node, context)
+ node.children.map { |c| convert_node(c, context: context) }.join
+ end
+
+ def convert_element(node, context:) # rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity
+ tag = node.name.downcase
+
+ case tag
+ when "h1" then "\n\n# #{children_md(node, context).strip}\n\n"
+ when "h2" then "\n\n## #{children_md(node, context).strip}\n\n"
+ when "h3" then "\n\n### #{children_md(node, context).strip}\n\n"
+ when "h4" then "\n\n#### #{children_md(node, context).strip}\n\n"
+ when "h5" then "\n\n##### #{children_md(node, context).strip}\n\n"
+ when "h6" then "\n\n###### #{children_md(node, context).strip}\n\n"
+ when "p" then "\n\n#{children_md(node, context).strip}\n\n"
+ when "br" then " \n"
+ when "hr" then "\n\n---\n\n"
+
+ when "strong", "b" then "**#{children_md(node, context).strip}**"
+ when "em", "i" then "*#{children_md(node, context).strip}*"
+ when "del", "s" then "~~#{children_md(node, context).strip}~~"
+
+ when "code"
+ context[:pre] ? children_md(node, context) : "`#{children_md(node, context)}`"
+
+ when "pre"
+ lang = node.at_css("code")&.attr("class")&.match(/language-(\w+)/)&.[](1) || ""
+ "\n\n```#{lang}\n#{node.text}\n```\n\n"
+
+ when "blockquote"
+ quoted = children_md(node, context).strip.gsub(/^/, "> ")
+ "\n\n#{quoted}\n\n"
+
+ when "a"
+ href = node["href"].to_s.strip
+ text = children_md(node, context).strip
+ href.empty? ? text : "[#{text}](#{href})"
+
+ when "img"
+ src = node["src"].to_s.strip
+ alt = node["alt"].to_s.strip
+      src.empty? ? "" : "![#{alt}](#{src})"
+
+ when "ul"
+ items = node.css("> li").map do |li|
+ "- #{children_md(li, context).strip}"
+ end.join("\n")
+ "\n\n#{items}\n\n"
+
+ when "ol"
+ start = (node["start"] || 1).to_i
+ items = node.css("> li").each_with_index.map do |li, idx|
+ "#{start + idx}. #{children_md(li, context).strip}"
+ end.join("\n")
+ "\n\n#{items}\n\n"
+
+ when "table"
+ convert_table(node, context: context)
+
+ when "figure"
+ img_el = node.at_css("img")
+ cap = node.at_css("figcaption")&.text&.strip
+ img_md = img_el ? convert_element(img_el, context: context) : ""
+ cap_md = cap ? "\n*#{cap}*" : ""
+ "\n\n#{img_md}#{cap_md}\n\n"
+
+ when "script", "style", "noscript", "button", "input",
+ "select", "textarea", "iframe", "object", "embed",
+ "head", "link", "meta"
+ ""
+
+ else
+ children_md(node, context)
+ end
+ end
+
+ def convert_table(table, context:)
+ rows = table.css("tr")
+ return "" if rows.empty?
+
+ md_rows = rows.map do |row|
+ cells = row.css("th, td").map do |cell|
+ children_md(cell, context).strip.gsub("|", "\\|")
+ end
+ "| #{cells.join(" | ")} |"
+ end
+
+ cols = rows.first.css("th, td").size
+ sep = "| #{Array.new(cols, "---").join(" | ")} |"
+
+ "\n\n#{md_rows.first}\n#{sep}\n#{md_rows[1..].join("\n")}\n\n"
+ end
+
+ # ------------------------------------------------------------------ #
+ # Helpers
+ # ------------------------------------------------------------------ #
+
def truncate(text)
max = SiteSetting.url_to_article_max_content_length
return text if text.length <= max
diff --git a/plugin.rb b/plugin.rb
index 9c5b22f..b11f95f 100644
--- a/plugin.rb
+++ b/plugin.rb
@@ -6,9 +6,6 @@
# authors: Your Name
# url: https://github.com/yourname/discourse-url-to-article
-gem "nokogiri", "1.16.4"
-gem "reverse_markdown", "2.1.1"
-
enabled_site_setting :url_to_article_enabled
after_initialize do