From 09d46a9d10f5983c42c88bf07efb4737cd8013b5 Mon Sep 17 00:00:00 2001 From: robert Date: Wed, 18 Mar 2026 11:28:55 -0400 Subject: [PATCH] Removed dependencies. --- lib/url_to_article/article_extractor.rb | 275 +++++++++++++++++------- plugin.rb | 3 - 2 files changed, 193 insertions(+), 85 deletions(-) diff --git a/lib/url_to_article/article_extractor.rb b/lib/url_to_article/article_extractor.rb index a1b87be..d671399 100644 --- a/lib/url_to_article/article_extractor.rb +++ b/lib/url_to_article/article_extractor.rb @@ -1,14 +1,14 @@ # frozen_string_literal: true +# Uses only Nokogiri (already bundled with Discourse) — no extra gems needed. + require "nokogiri" -require "reverse_markdown" require "net/http" require "uri" require "timeout" module UrlToArticle class ArticleExtractor - # Tags that are almost never article content NOISE_SELECTORS = %w[ script style noscript iframe nav footer header .navigation .nav .menu .sidebar .widget .ad .advertisement @@ -19,7 +19,6 @@ module UrlToArticle [aria-label=navigation] [aria-label=footer] ].freeze - # Candidate content selectors tried in order ARTICLE_SELECTORS = %w[ article[class*=content] article[class*=post] @@ -52,61 +51,70 @@ module UrlToArticle end def extract - html = fetch_html - doc = Nokogiri::HTML(html) - - title = extract_title(doc) - byline = extract_byline(doc) - site_name = extract_site_name(doc) + html = fetch_html + doc = Nokogiri::HTML(html) + title = extract_title(doc) + byline = extract_byline(doc) + site_name = extract_site_name(doc) description = extract_description(doc) - content_node = find_content_node(doc) + content_node = find_content_node(doc) clean_node!(content_node) markdown = node_to_markdown(content_node) markdown = truncate(markdown) Result.new( - title: title, - byline: byline, - site_name: site_name, + title: title, + byline: byline, + site_name: site_name, description: description, - markdown: markdown, - url: @url + markdown: markdown, + url: @url, ) end private + # ------------------------------------------------------------------ # + # HTTP fetch + # ------------------------------------------------------------------ # + def fetch_html Timeout.timeout(SiteSetting.url_to_article_fetch_timeout) do - http = Net::HTTP.new(@uri.host, @uri.port) - http.use_ssl = @uri.scheme == "https" - http.open_timeout = 5 - http.read_timeout = SiteSetting.url_to_article_fetch_timeout + response = do_get(@uri) - request = Net::HTTP::Get.new(@uri.request_uri) - request["User-Agent"] = "Mozilla/5.0 (compatible; Discourse URL-to-Article Bot/1.0)" - request["Accept"] = "text/html,application/xhtml+xml" - request["Accept-Language"] = "en-US,en;q=0.9" - - response = http.request(request) - - # Follow one redirect if response.is_a?(Net::HTTPRedirection) && response["location"] - redirect_uri = URI.parse(response["location"]) - @uri = redirect_uri - http = Net::HTTP.new(@uri.host, @uri.port) - http.use_ssl = @uri.scheme == "https" - response = http.get(@uri.request_uri, "User-Agent" => request["User-Agent"]) + @uri = URI.parse(response["location"]) + response = do_get(@uri) end raise "HTTP #{response.code}" unless response.is_a?(Net::HTTPSuccess) - response.body.force_encoding("UTF-8") + + body = response.body + body = body.force_encoding("UTF-8") rescue body + body end end + def do_get(uri) + http = Net::HTTP.new(uri.host, uri.port) + http.use_ssl = uri.scheme == "https" + http.open_timeout = 5 + http.read_timeout = SiteSetting.url_to_article_fetch_timeout + + req = Net::HTTP::Get.new(uri.request_uri) + req["User-Agent"] = "Mozilla/5.0 (compatible; Discourse/url-to-article)" + req["Accept"] = "text/html,application/xhtml+xml" + req["Accept-Language"] = "en-US,en;q=0.9" + + http.request(req) + end + + # ------------------------------------------------------------------ # + # Metadata extraction + # ------------------------------------------------------------------ # + def extract_title(doc) - # Try OG title first, then twitter:title, then og = doc.at_css('meta[property="og:title"]')&.attr("content") return og.strip if og.present? @@ -120,14 +128,13 @@ module UrlToArticle end def extract_byline(doc) - candidates = [ + [ doc.at_css('meta[name="author"]')&.attr("content"), doc.at_css('[rel="author"]')&.text, doc.at_css(".author")&.text, doc.at_css('[class*="byline"]')&.text, doc.at_css("address")&.text, - ] - candidates.compact.map(&:strip).reject(&:empty?).first + ].compact.map(&:strip).reject(&:empty?).first end def extract_site_name(doc) @@ -140,89 +147,193 @@ module UrlToArticle doc.at_css('meta[name="description"]')&.attr("content")&.strip end + # ------------------------------------------------------------------ # + # Content node selection + # ------------------------------------------------------------------ # + def find_content_node(doc) - # Try known article selectors ARTICLE_SELECTORS.each do |sel| node = doc.at_css(sel) - next unless node - text = node.text.strip - # Make sure it has meaningful content (>200 chars of text) - return node if text.length > 200 + return node if node && node.text.strip.length > 200 end - - # Fallback: score all <div> and <section> blocks by text density score_and_pick(doc) end def score_and_pick(doc) - candidates = doc.css("div, section, td").map do |node| - text = node.text.strip - next if text.length < 150 - - # Score = text length - penalize nodes with lots of tags (nav-heavy) + best = doc.css("div, section, td").filter_map do |node| + text_len = node.text.strip.length + next if text_len < 150 tag_count = node.css("*").size.to_f - text_length = text.length.to_f - score = text_length - (tag_count * 3) - + score = text_len - (tag_count * 3) [score, node] - end.compact.sort_by { |s, _| -s } + end.max_by { |score, _| score } - candidates.first&.last || doc.at_css("body") || doc + best&.last || doc.at_css("body") || doc end + # ------------------------------------------------------------------ # + # Node cleaning + # ------------------------------------------------------------------ # + def clean_node!(node) return unless node - # Remove noise elements - NOISE_SELECTORS.each do |sel| - node.css(sel).each(&:remove) - end + NOISE_SELECTORS.each { |sel| node.css(sel).each(&:remove) } - # Remove hidden elements node.css("[style]").each do |el| el.remove if el["style"] =~ /display\s*:\s*none|visibility\s*:\s*hidden/i end - # Remove empty tags (except br, img, hr) node.css("span, div, p, section").each do |el| el.remove if el.text.strip.empty? && el.css("img, video, audio, iframe").empty? end - # Strip all attributes except allowed ones on certain tags - allowed = { - "a" => %w[href title], - "img" => %w[src alt title width height], - "td" => %w[colspan rowspan], - "th" => %w[colspan rowspan scope], - "ol" => %w[start type], - "li" => %w[value], + allowed_attrs = { + "a" => %w[href title], + "img" => %w[src alt title], + "td" => %w[colspan rowspan], + "th" => %w[colspan rowspan scope], + "ol" => %w[start], + "li" => %w[value], "code" => %w[class], "pre" => %w[class], } - node.css("*").each do |el| - tag = el.name.downcase - permitted = allowed[tag] || [] - el.attributes.each_key do |attr| - el.remove_attribute(attr) unless permitted.include?(attr) - end - # Make relative image URLs absolute - if tag == "img" && el["src"] && !el["src"].start_with?("http", "//", "data:") - el["src"] = URI.join(@url, el["src"]).to_s rescue nil + node.css("*").each do |el| + tag = el.name.downcase + permitted = allowed_attrs[tag] || [] + el.attributes.each_key { |attr| el.remove_attribute(attr) unless permitted.include?(attr) } + + if tag == "img" && el["src"] && !el["src"].to_s.start_with?("http", "//", "data:") + el["src"] = (URI.join(@url, el["src"]).to_s rescue nil) end - if tag == "a" && el["href"] && !el["href"].start_with?("http", "//", "#", "mailto:") - el["href"] = URI.join(@url, el["href"]).to_s rescue nil + if tag == "a" && el["href"] && !el["href"].to_s.start_with?("http", "//", "#", "mailto:") + el["href"] = (URI.join(@url, el["href"]).to_s rescue nil) end end end + # ------------------------------------------------------------------ # + # HTML → Markdown (zero external dependencies) + # ------------------------------------------------------------------ # + def node_to_markdown(node) return "" unless node - ReverseMarkdown.convert(node.to_html, unknown_tags: :bypass, github_flavored: true) - .gsub(/\n{3,}/, "\n\n") # collapse excessive blank lines - .strip + convert_node(node, context: {}).strip.gsub(/\n{3,}/, "\n\n") end + def convert_node(node, context:) + return "" unless node + + case node.node_type + when Nokogiri::XML::Node::TEXT_NODE + node.text.gsub(/[[:space:]]+/, " ") + when Nokogiri::XML::Node::ELEMENT_NODE + convert_element(node, context: context) + when Nokogiri::XML::Node::DOCUMENT_NODE + node.children.map { |c| convert_node(c, context: context) }.join + else + "" + end + end + + def children_md(node, context) + node.children.map { |c| convert_node(c, context: context) }.join + end + + def convert_element(node, context:) # rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity + tag = node.name.downcase + + case tag + when "h1" then "\n\n# #{children_md(node, context).strip}\n\n" + when "h2" then "\n\n## #{children_md(node, context).strip}\n\n" + when "h3" then "\n\n### #{children_md(node, context).strip}\n\n" + when "h4" then "\n\n#### #{children_md(node, context).strip}\n\n" + when "h5" then "\n\n##### #{children_md(node, context).strip}\n\n" + when "h6" then "\n\n###### #{children_md(node, context).strip}\n\n" + when "p" then "\n\n#{children_md(node, context).strip}\n\n" + when "br" then " \n" + when "hr" then "\n\n---\n\n" + + when "strong", "b" then "**#{children_md(node, context).strip}**" + when "em", "i" then "*#{children_md(node, context).strip}*" + when "del", "s" then "~~#{children_md(node, context).strip}~~" + + when "code" + context[:pre] ? children_md(node, context) : "`#{children_md(node, context)}`" + + when "pre" + lang = node.at_css("code")&.attr("class")&.match(/language-(\w+)/)&.[](1) || "" + "\n\n```#{lang}\n#{node.text}\n```\n\n" + + when "blockquote" + quoted = children_md(node, context).strip.gsub(/^/, "> ") + "\n\n#{quoted}\n\n" + + when "a" + href = node["href"].to_s.strip + text = children_md(node, context).strip + href.empty? ? text : "[#{text}](#{href})" + + when "img" + src = node["src"].to_s.strip + alt = node["alt"].to_s.strip + src.empty? ? "" : "![#{alt}](#{src})" + + when "ul" + items = node.css("> li").map do |li| + "- #{children_md(li, context).strip}" + end.join("\n") + "\n\n#{items}\n\n" + + when "ol" + start = (node["start"] || 1).to_i + items = node.css("> li").each_with_index.map do |li, idx| + "#{start + idx}. #{children_md(li, context).strip}" + end.join("\n") + "\n\n#{items}\n\n" + + when "table" + convert_table(node, context: context) + + when "figure" + img_el = node.at_css("img") + cap = node.at_css("figcaption")&.text&.strip + img_md = img_el ? convert_element(img_el, context: context) : "" + cap_md = cap ? "\n*#{cap}*" : "" + "\n\n#{img_md}#{cap_md}\n\n" + + when "script", "style", "noscript", "button", "input", + "select", "textarea", "iframe", "object", "embed", + "head", "link", "meta" + "" + + else + children_md(node, context) + end + end + + def convert_table(table, context:) + rows = table.css("tr") + return "" if rows.empty? + + md_rows = rows.map do |row| + cells = row.css("th, td").map do |cell| + children_md(cell, context).strip.gsub("|", "\\|") + end + "| #{cells.join(" | ")} |" + end + + cols = rows.first.css("th, td").size + sep = "| #{Array.new(cols, "---").join(" | ")} |" + + "\n\n#{md_rows.first}\n#{sep}\n#{md_rows[1..].join("\n")}\n\n" + end + + # ------------------------------------------------------------------ # + # Helpers + # ------------------------------------------------------------------ # + def truncate(text) max = SiteSetting.url_to_article_max_content_length return text if text.length <= max diff --git a/plugin.rb b/plugin.rb index 9c5b22f..b11f95f 100644 --- a/plugin.rb +++ b/plugin.rb @@ -6,9 +6,6 @@ # authors: Your Name # url: https://github.com/yourname/discourse-url-to-article -gem "nokogiri", "1.16.4" -gem "reverse_markdown", "2.1.1" - enabled_site_setting :url_to_article_enabled after_initialize do