# frozen_string_literal: true

require "nokogiri"
require "reverse_markdown"
require "net/http"
require "uri"
require "timeout"

module UrlToArticle
  # Fetches a web page and extracts its main article content as Markdown,
  # together with common metadata (title, byline, site name, description).
  #
  # Usage:
  #   result = ArticleExtractor.extract("https://example.com/post")
  #   result.markdown #=> String
  class ArticleExtractor
    # Tags/selectors that are almost never article content; stripped wholesale.
    NOISE_SELECTORS = %w[
      script style noscript iframe nav footer header
      .navigation .nav .menu .sidebar .widget
      .ad .advertisement .cookie-banner .cookie-notice
      .popup .modal .overlay .social-share .share-buttons
      .related-posts .comments #comments #sidebar #navigation
      #footer #header
      [role=navigation] [role=banner] [role=contentinfo]
      [aria-label=navigation] [aria-label=footer]
    ].freeze

    # Candidate content selectors, tried in order from most to least specific.
    ARTICLE_SELECTORS = %w[
      article[class*=content] article[class*=post] article[class*=article]
      article [role=main] main
      .post-content .article-content .entry-content .article-body
      .story-body .post-body .content-body .page-content
      #article-body #post-content #main-content
    ].freeze

    Result =
      Struct.new(:title, :byline, :site_name, :description, :markdown, :url, keyword_init: true)

    # Convenience entry point: extract(url) -> Result.
    def self.extract(url)
      new(url).extract
    end

    def initialize(url)
      @url = url
      @uri = URI.parse(url)
    end

    # Fetches the page, locates the main content node, cleans it, and returns
    # a Result. Raises on HTTP failure or fetch timeout.
    def extract
      html = fetch_html
      doc = Nokogiri::HTML(html)

      title = extract_title(doc)
      byline = extract_byline(doc)
      site_name = extract_site_name(doc)
      description = extract_description(doc)

      content_node = find_content_node(doc)
      clean_node!(content_node)
      markdown = node_to_markdown(content_node)
      markdown = truncate(markdown)

      Result.new(
        title: title,
        byline: byline,
        site_name: site_name,
        description: description,
        markdown: markdown,
        url: @url,
      )
    end

    private

    # Performs the GET request, following at most one redirect.
    # Returns the response body forced to UTF-8 with invalid byte sequences
    # scrubbed (pages frequently mis-declare their charset).
    def fetch_html
      Timeout.timeout(SiteSetting.url_to_article_fetch_timeout) do
        response = perform_get(@uri)

        # Follow one redirect. Location may be a relative reference
        # (RFC 7231 §7.1.2), so resolve it against the current URI.
        if response.is_a?(Net::HTTPRedirection) && response["location"]
          @uri = URI.join(@uri.to_s, response["location"])
          response = perform_get(@uri)
        end

        raise "HTTP #{response.code}" unless response.is_a?(Net::HTTPSuccess)

        response.body.force_encoding("UTF-8").scrub
      end
    end

    # Issues a single GET to +uri+ with browser-like headers. Used for both
    # the initial request and the redirect follow-up so headers stay consistent.
    def perform_get(uri)
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = uri.scheme == "https"
      http.open_timeout = 5
      http.read_timeout = SiteSetting.url_to_article_fetch_timeout

      request = Net::HTTP::Get.new(uri.request_uri)
      request["User-Agent"] = "Mozilla/5.0 (compatible; Discourse URL-to-Article Bot/1.0)"
      request["Accept"] = "text/html,application/xhtml+xml"
      request["Accept-Language"] = "en-US,en;q=0.9"

      http.request(request)
    end

    # Title preference: og:title, then twitter:title, then first <h1>,
    # then <title>, falling back to the host name.
    def extract_title(doc)
      og = doc.at_css('meta[property="og:title"]')&.attr("content")
      return og.strip if og.present?

      tw = doc.at_css('meta[name="twitter:title"]')&.attr("content")
      return tw.strip if tw.present?

      h1 = doc.at_css("h1")&.text
      return h1.strip if h1.present?

      doc.at_css("title")&.text&.strip || @uri.host
    end

    # Returns the first non-empty author candidate, or nil.
    def extract_byline(doc)
      candidates = [
        doc.at_css('meta[name="author"]')&.attr("content"),
        doc.at_css('[rel="author"]')&.text,
        doc.at_css(".author")&.text,
        doc.at_css('[class*="byline"]')&.text,
        doc.at_css("address")&.text,
      ]
      candidates.compact.map(&:strip).reject(&:empty?).first
    end

    def extract_site_name(doc)
      doc.at_css('meta[property="og:site_name"]')&.attr("content")&.strip ||
        @uri.host.sub(/^www\./, "")
    end

    def extract_description(doc)
      doc.at_css('meta[property="og:description"]')&.attr("content")&.strip ||
        doc.at_css('meta[name="description"]')&.attr("content")&.strip
    end

    # Tries known article selectors first; falls back to density scoring.
    def find_content_node(doc)
      ARTICLE_SELECTORS.each do |sel|
        node = doc.at_css(sel)
        next unless node

        text = node.text.strip
        # Only accept a match with meaningful content (>200 chars of text).
        return node if text.length > 200
      end

      score_and_pick(doc)
    end

    # Scores all <div>/<section>/<td> blocks by text density and returns the
    # best one. Score = text length minus a penalty for tag-heavy (nav-like)
    # nodes. Falls back to <body>, then the document itself.
    def score_and_pick(doc)
      best =
        doc
          .css("div, section, td")
          .map do |node|
            text = node.text.strip
            next if text.length < 150

            tag_count = node.css("*").size.to_f
            score = text.length.to_f - (tag_count * 3)
            [score, node]
          end
          .compact
          .max_by { |score, _| score }

      best&.last || doc.at_css("body") || doc
    end

    # Mutates +node+ in place: strips noise, hidden and empty elements, and
    # all attributes except a small allowlist; rewrites relative URLs absolute.
    def clean_node!(node)
      return unless node

      # Remove noise elements.
      NOISE_SELECTORS.each { |sel| node.css(sel).each(&:remove) }

      # Remove elements hidden via inline styles.
      node.css("[style]").each do |el|
        el.remove if el["style"] =~ /display\s*:\s*none|visibility\s*:\s*hidden/i
      end

      # Remove empty containers unless they hold embedded media.
      node.css("span, div, p, section").each do |el|
        el.remove if el.text.strip.empty? && el.css("img, video, audio, iframe").empty?
      end

      # Strip all attributes except the allowlisted ones per tag.
      allowed = {
        "a" => %w[href title],
        "img" => %w[src alt title width height],
        "td" => %w[colspan rowspan],
        "th" => %w[colspan rowspan scope],
        "ol" => %w[start type],
        "li" => %w[value],
        "code" => %w[class],
        "pre" => %w[class],
      }

      node.css("*").each do |el|
        tag = el.name.downcase
        permitted = allowed[tag] || []
        el.attributes.each_key do |attr|
          el.remove_attribute(attr) unless permitted.include?(attr)
        end

        absolutize!(el, "src") if tag == "img" && relative_url?(el["src"], "data:")
        absolutize!(el, "href") if tag == "a" && relative_url?(el["href"], "#", "mailto:")
      end
    end

    # True when +value+ is present and not already absolute or one of the
    # additional exempt prefixes ("data:", "#", "mailto:", ...).
    def relative_url?(value, *extra_prefixes)
      value && !value.start_with?("http", "//", *extra_prefixes)
    end

    # Rewrites el[attr] to an absolute URL. On malformed URLs the original
    # value is left untouched (the old inline `rescue nil` nulled it out).
    def absolutize!(el, attr)
      el[attr] = URI.join(@url, el[attr]).to_s
    rescue URI::Error
      # Keep the original (relative) value rather than destroying it.
    end

    # Converts the cleaned node to GitHub-flavored Markdown, collapsing runs
    # of 3+ newlines into a single blank line.
    def node_to_markdown(node)
      return "" unless node

      ReverseMarkdown
        .convert(node.to_html, unknown_tags: :bypass, github_flavored: true)
        .gsub(/\n{3,}/, "\n\n")
        .strip
    end

    # Caps the markdown at the configured maximum length, appending a notice
    # when content was cut.
    def truncate(text)
      max = SiteSetting.url_to_article_max_content_length
      return text if text.length <= max

      text[0...max] + "\n\n*[Content truncated — visit the original article for the full text.]*"
    end
  end
end