# frozen_string_literal: true

# Uses only Nokogiri (already bundled with Discourse) — no extra gems needed.
require "nokogiri"
require "net/http"
require "uri"
require "timeout"

module UrlToArticle
  # Fetches a web page, strips boilerplate (navigation, ads, comments, …),
  # picks the most article-like container, and renders it as Markdown together
  # with basic metadata (title, byline, site name, description).
  #
  # Usage: UrlToArticle::ArticleExtractor.extract(url) # => Result
  class ArticleExtractor
    # Elements that are almost never part of the article body.
    NOISE_SELECTORS = %w[
      script style noscript iframe nav footer header
      .navigation .nav .menu .sidebar .widget .ad .advertisement
      .cookie-banner .cookie-notice .popup .modal .overlay
      .social-share .share-buttons .related-posts .comments
      #comments #sidebar #navigation #footer #header
      [role=navigation] [role=banner] [role=contentinfo]
      [aria-label=navigation] [aria-label=footer]
    ].freeze

    # Candidate containers for the main article, most specific first.
    ARTICLE_SELECTORS = %w[
      article[class*=content] article[class*=post] article[class*=article]
      article [role=main] main
      .post-content .article-content .entry-content .article-body
      .story-body .post-body .content-body .page-content
      #article-body #post-content #main-content
    ].freeze

    # Maximum number of HTTP redirects followed before giving up.
    MAX_REDIRECTS = 3

    Result =
      Struct.new(:title, :byline, :site_name, :description, :markdown, :url, keyword_init: true)

    # Convenience entry point.
    #
    # @param url [String] absolute URL of the article to extract
    # @return [Result]
    def self.extract(url)
      new(url).extract
    end

    def initialize(url)
      @url = url
      @uri = URI.parse(url)
    end

    # Fetches the page and returns a Result with metadata and a Markdown body.
    #
    # @return [Result]
    # @raise [RuntimeError] on non-success HTTP responses ("HTTP <code>")
    # @raise [Timeout::Error] when the overall fetch exceeds the site setting
    def extract
      html = fetch_html
      doc = Nokogiri::HTML(html)

      title = extract_title(doc)
      byline = extract_byline(doc)
      site_name = extract_site_name(doc)
      description = extract_description(doc)

      content_node = find_content_node(doc)
      clean_node!(content_node)

      markdown = node_to_markdown(content_node)
      markdown = truncate(markdown)

      Result.new(
        title: title,
        byline: byline,
        site_name: site_name,
        description: description,
        markdown: markdown,
        url: @url,
      )
    end

    private

    # ------------------------------------------------------------------ #
    # HTTP fetch
    # ------------------------------------------------------------------ #

    # Downloads the page body as a UTF-8 string, following a bounded number
    # of redirects. Invalid byte sequences are scrubbed so later string
    # operations cannot raise on malformed input.
    def fetch_html
      Timeout.timeout(SiteSetting.url_to_article_fetch_timeout) do
        response = do_get(@uri)

        # Location may be relative (RFC 7231 §7.1.2), so resolve it against
        # the URI we just requested rather than parsing it in isolation.
        redirects = 0
        while response.is_a?(Net::HTTPRedirection) && response["location"] &&
                redirects < MAX_REDIRECTS
          @uri = URI.join(@uri.to_s, response["location"])
          response = do_get(@uri)
          redirects += 1
        end

        raise "HTTP #{response.code}" unless response.is_a?(Net::HTTPSuccess)

        # Servers frequently declare the wrong charset; treat the body as
        # UTF-8 and drop any invalid byte sequences.
        response.body.force_encoding(Encoding::UTF_8).scrub
      end
    end

    # Performs a single GET request against +uri+ and returns the raw
    # Net::HTTPResponse (redirects are handled by the caller).
    def do_get(uri)
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = uri.scheme == "https"
      http.open_timeout = 5
      http.read_timeout = SiteSetting.url_to_article_fetch_timeout

      req = Net::HTTP::Get.new(uri.request_uri)
      req["User-Agent"] = "Mozilla/5.0 (compatible; Discourse/url-to-article)"
      req["Accept"] = "text/html,application/xhtml+xml"
      req["Accept-Language"] = "en-US,en;q=0.9"

      http.request(req)
    end

    # ------------------------------------------------------------------ #
    # Metadata extraction
    # ------------------------------------------------------------------ #

    # Best-effort title: OpenGraph, then Twitter card, then the first <h1>,
    # then <title>, finally falling back to the host name.
    def extract_title(doc)
      og = doc.at_css('meta[property="og:title"]')&.attr("content")
      return og.strip if og.present?

      tw = doc.at_css('meta[name="twitter:title"]')&.attr("content")
      return tw.strip if tw.present?

      h1 = doc.at_css("h1")&.text
      return h1.strip if h1.present?

      doc.at_css("title")&.text&.strip || @uri.host
    end

    # First non-empty author hint found among common byline locations.
    def extract_byline(doc)
      [
        doc.at_css('meta[name="author"]')&.attr("content"),
        doc.at_css('[rel="author"]')&.text,
        doc.at_css(".author")&.text,
        doc.at_css('[class*="byline"]')&.text,
        doc.at_css("address")&.text,
      ].compact.map(&:strip).reject(&:empty?).first
    end

    # og:site_name, or the host with a leading "www." removed.
    def extract_site_name(doc)
      doc.at_css('meta[property="og:site_name"]')&.attr("content")&.strip ||
        @uri.host.sub(/^www\./, "")
    end

    # og:description, falling back to the standard meta description.
    def extract_description(doc)
      doc.at_css('meta[property="og:description"]')&.attr("content")&.strip ||
        doc.at_css('meta[name="description"]')&.attr("content")&.strip
    end

    # ------------------------------------------------------------------ #
    # Content node selection
    # ------------------------------------------------------------------ #

    # Returns the node most likely to contain the article: the first
    # ARTICLE_SELECTORS match with substantial text, otherwise the highest
    # scoring generic container.
    def find_content_node(doc)
      ARTICLE_SELECTORS.each do |sel|
        node = doc.at_css(sel)
        return node if node && node.text.strip.length > 200
      end
      score_and_pick(doc)
    end

    # Heuristic fallback: reward text length, penalise markup density, and
    # pick the best-scoring <div>/<section>/<td>. Falls back to <body>.
    def score_and_pick(doc)
      best =
        doc
          .css("div, section, td")
          .filter_map do |node|
            text_len = node.text.strip.length
            next if text_len < 150
            tag_count = node.css("*").size.to_f
            score = text_len - (tag_count * 3)
            [score, node]
          end
          .max_by { |score, _| score }

      best&.last || doc.at_css("body") || doc
    end

    # ------------------------------------------------------------------ #
    # Node cleaning
    # ------------------------------------------------------------------ #

    # Mutates +node+ in place: removes noise/hidden/empty elements, strips
    # all attributes except a small allow-list, and absolutises relative
    # image/link URLs against the article URL.
    def clean_node!(node)
      return unless node

      NOISE_SELECTORS.each { |sel| node.css(sel).each(&:remove) }

      # Drop elements hidden via inline styles.
      node.css("[style]").each do |el|
        el.remove if el["style"] =~ /display\s*:\s*none|visibility\s*:\s*hidden/i
      end

      # Drop empty wrappers that carry no media.
      node.css("span, div, p, section").each do |el|
        el.remove if el.text.strip.empty? && el.css("img, video, audio, iframe").empty?
      end

      allowed_attrs = {
        "a" => %w[href title],
        "img" => %w[src alt title],
        "td" => %w[colspan rowspan],
        "th" => %w[colspan rowspan scope],
        "ol" => %w[start],
        "li" => %w[value],
        "code" => %w[class],
        "pre" => %w[class],
      }

      node.css("*").each do |el|
        tag = el.name.downcase
        permitted = allowed_attrs[tag] || []
        el.attributes.each_key { |attr| el.remove_attribute(attr) unless permitted.include?(attr) }

        # Resolve relative URLs; explicit scheme prefixes (not a bare "http"
        # prefix, which would also match e.g. "httpx:") mark absolute ones.
        if tag == "img" && el["src"] &&
             !el["src"].to_s.start_with?("http://", "https://", "//", "data:")
          el["src"] = (URI.join(@url, el["src"]).to_s rescue nil)
        end
        if tag == "a" && el["href"] &&
             !el["href"].to_s.start_with?("http://", "https://", "//", "#", "mailto:")
          el["href"] = (URI.join(@url, el["href"]).to_s rescue nil)
        end
      end
    end

    # ------------------------------------------------------------------ #
    # HTML → Markdown (zero external dependencies)
    # ------------------------------------------------------------------ #

    # Entry point for conversion: renders +node+ and collapses runs of
    # blank lines down to a single paragraph break.
    def node_to_markdown(node)
      return "" unless node
      convert_node(node, context: {}).strip.gsub(/\n{3,}/, "\n\n")
    end

    # Dispatches on node type; text nodes have their whitespace collapsed.
    def convert_node(node, context:)
      return "" unless node

      case node.node_type
      when Nokogiri::XML::Node::TEXT_NODE
        node.text.gsub(/[[:space:]]+/, " ")
      when Nokogiri::XML::Node::ELEMENT_NODE
        convert_element(node, context: context)
      when Nokogiri::XML::Node::DOCUMENT_NODE
        node.children.map { |c| convert_node(c, context: context) }.join
      else
        ""
      end
    end

    # Concatenated Markdown of all children of +node+.
    def children_md(node, context)
      node.children.map { |c| convert_node(c, context: context) }.join
    end

    # Converts a single element to Markdown. Unknown tags fall through to
    # their children; UI-only tags are dropped entirely.
    def convert_element(node, context:) # rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity
      tag = node.name.downcase

      case tag
      when "h1" then "\n\n# #{children_md(node, context).strip}\n\n"
      when "h2" then "\n\n## #{children_md(node, context).strip}\n\n"
      when "h3" then "\n\n### #{children_md(node, context).strip}\n\n"
      when "h4" then "\n\n#### #{children_md(node, context).strip}\n\n"
      when "h5" then "\n\n##### #{children_md(node, context).strip}\n\n"
      when "h6" then "\n\n###### #{children_md(node, context).strip}\n\n"
      when "p" then "\n\n#{children_md(node, context).strip}\n\n"
      when "br" then "  \n"
      when "hr" then "\n\n---\n\n"
      when "strong", "b" then "**#{children_md(node, context).strip}**"
      when "em", "i" then "*#{children_md(node, context).strip}*"
      when "del", "s" then "~~#{children_md(node, context).strip}~~"
      when "code"
        # Inside a fenced block the backticks are supplied by the <pre> case.
        context[:pre] ? children_md(node, context) : "`#{children_md(node, context)}`"
      when "pre"
        lang = node.at_css("code")&.attr("class")&.match(/language-(\w+)/)&.[](1) || ""
        "\n\n```#{lang}\n#{node.text}\n```\n\n"
      when "blockquote"
        quoted = children_md(node, context).strip.gsub(/^/, "> ")
        "\n\n#{quoted}\n\n"
      when "a"
        href = node["href"].to_s.strip
        text = children_md(node, context).strip
        href.empty? ? text : "[#{text}](#{href})"
      when "img"
        src = node["src"].to_s.strip
        alt = node["alt"].to_s.strip
        src.empty? ? "" : "![#{alt}](#{src})"
      when "ul"
        items = node.css("> li").map { |li| "- #{children_md(li, context).strip}" }.join("\n")
        "\n\n#{items}\n\n"
      when "ol"
        start = (node["start"] || 1).to_i
        items =
          node
            .css("> li")
            .each_with_index
            .map { |li, idx| "#{start + idx}. #{children_md(li, context).strip}" }
            .join("\n")
        "\n\n#{items}\n\n"
      when "table"
        convert_table(node, context: context)
      when "figure"
        img_el = node.at_css("img")
        cap = node.at_css("figcaption")&.text&.strip
        img_md = img_el ? convert_element(img_el, context: context) : ""
        cap_md = cap ? "\n*#{cap}*" : ""
        "\n\n#{img_md}#{cap_md}\n\n"
      when "script", "style", "noscript", "button", "input", "select", "textarea", "iframe",
           "object", "embed", "head", "link", "meta"
        ""
      else
        children_md(node, context)
      end
    end

    # Renders an HTML table as a GFM pipe table. The first row is used as the
    # header; pipe characters inside cells are escaped.
    def convert_table(table, context:)
      rows = table.css("tr")
      return "" if rows.empty?

      md_rows =
        rows.map do |row|
          cells = row.css("th, td").map { |cell| children_md(cell, context).strip.gsub("|", "\\|") }
          "| #{cells.join(" | ")} |"
        end

      cols = rows.first.css("th, td").size
      sep = "| #{Array.new(cols, "---").join(" | ")} |"

      "\n\n#{md_rows.first}\n#{sep}\n#{md_rows[1..].join("\n")}\n\n"
    end

    # ------------------------------------------------------------------ #
    # Helpers
    # ------------------------------------------------------------------ #

    # Caps the Markdown at the configured length, appending a notice when
    # content was cut off.
    def truncate(text)
      max = SiteSetting.url_to_article_max_content_length
      return text if text.length <= max
      text[0...max] + "\n\n*[Content truncated — visit the original article for the full text.]*"
    end
  end
end