init commit
This commit is contained in:
232
lib/url_to_article/article_extractor.rb
Normal file
232
lib/url_to_article/article_extractor.rb
Normal file
@@ -0,0 +1,232 @@
|
||||
# frozen_string_literal: true
|
||||
|
||||
require "nokogiri"
|
||||
require "reverse_markdown"
|
||||
require "net/http"
|
||||
require "uri"
|
||||
require "timeout"
|
||||
|
||||
module UrlToArticle
  # Fetches a web page and extracts the main article content as Markdown,
  # together with basic metadata (title, byline, site name, description).
  #
  # Usage:
  #   result = UrlToArticle::ArticleExtractor.extract("https://example.com/post")
  #   result.title    # => "Some headline"
  #   result.markdown # => "Body converted to Markdown..."
  class ArticleExtractor
    # Tags that are almost never article content
    NOISE_SELECTORS = %w[
      script style noscript iframe nav footer header
      .navigation .nav .menu .sidebar .widget .ad .advertisement
      .cookie-banner .cookie-notice .popup .modal .overlay
      .social-share .share-buttons .related-posts .comments
      #comments #sidebar #navigation #footer #header
      [role=navigation] [role=banner] [role=contentinfo]
      [aria-label=navigation] [aria-label=footer]
    ].freeze

    # Candidate content selectors tried in order (most specific first)
    ARTICLE_SELECTORS = %w[
      article[class*=content]
      article[class*=post]
      article[class*=article]
      article
      [role=main]
      main
      .post-content
      .article-content
      .entry-content
      .article-body
      .story-body
      .post-body
      .content-body
      .page-content
      #article-body
      #post-content
      #main-content
    ].freeze

    # Value object returned by #extract. All fields may be nil except :url.
    Result = Struct.new(:title, :byline, :site_name, :description, :markdown, :url, keyword_init: true)

    # Convenience entry point: extract the article at +url+.
    #
    # @param url [String] absolute http(s) URL
    # @return [Result]
    def self.extract(url)
      new(url).extract
    end

    # @param url [String] absolute http(s) URL
    # @raise [URI::InvalidURIError] if +url+ cannot be parsed
    def initialize(url)
      @url = url
      @uri = URI.parse(url)
    end

    # Fetches the page, extracts metadata, locates and cleans the main
    # content node, and converts it to (possibly truncated) Markdown.
    #
    # @return [Result]
    # @raise [RuntimeError] on non-success HTTP responses
    # @raise [Timeout::Error] if the fetch exceeds the configured timeout
    def extract
      html = fetch_html
      doc = Nokogiri::HTML(html)

      title = extract_title(doc)
      byline = extract_byline(doc)
      site_name = extract_site_name(doc)
      description = extract_description(doc)
      content_node = find_content_node(doc)

      clean_node!(content_node)
      markdown = node_to_markdown(content_node)
      markdown = truncate(markdown)

      Result.new(
        title: title,
        byline: byline,
        site_name: site_name,
        description: description,
        markdown: markdown,
        url: @url
      )
    end

    private

    # Fetches the raw HTML body, following at most one redirect.
    # The whole operation is bounded by the site-setting timeout.
    def fetch_html
      Timeout.timeout(SiteSetting.url_to_article_fetch_timeout) do
        http = Net::HTTP.new(@uri.host, @uri.port)
        http.use_ssl = @uri.scheme == "https"
        http.open_timeout = 5
        http.read_timeout = SiteSetting.url_to_article_fetch_timeout

        request = Net::HTTP::Get.new(@uri.request_uri)
        request["User-Agent"] = "Mozilla/5.0 (compatible; Discourse URL-to-Article Bot/1.0)"
        request["Accept"] = "text/html,application/xhtml+xml"
        request["Accept-Language"] = "en-US,en;q=0.9"

        response = http.request(request)

        # Follow one redirect. Resolve against the original URL so that
        # relative Location headers (e.g. "/new-path") work too — a bare
        # URI.parse on a relative location yields a host-less URI.
        if response.is_a?(Net::HTTPRedirection) && response["location"]
          @uri = URI.join(@url, response["location"])
          http = Net::HTTP.new(@uri.host, @uri.port)
          http.use_ssl = @uri.scheme == "https"
          response = http.get(@uri.request_uri, "User-Agent" => request["User-Agent"])
        end

        raise "HTTP #{response.code}" unless response.is_a?(Net::HTTPSuccess)
        # Scrub invalid byte sequences so later string operations on the
        # body (strip, gsub, regex matching) cannot raise EncodingError.
        response.body.force_encoding("UTF-8").scrub
      end
    end

    # Best-available page title: og:title, twitter:title, first <h1>,
    # <title>, then the host name as a last resort.
    def extract_title(doc)
      # Try OG title first, then twitter:title, then <title>
      og = doc.at_css('meta[property="og:title"]')&.attr("content")
      return og.strip if og.present?

      tw = doc.at_css('meta[name="twitter:title"]')&.attr("content")
      return tw.strip if tw.present?

      h1 = doc.at_css("h1")&.text
      return h1.strip if h1.present?

      doc.at_css("title")&.text&.strip || @uri.host
    end

    # First non-empty author candidate found in common locations, or nil.
    def extract_byline(doc)
      candidates = [
        doc.at_css('meta[name="author"]')&.attr("content"),
        doc.at_css('[rel="author"]')&.text,
        doc.at_css(".author")&.text,
        doc.at_css('[class*="byline"]')&.text,
        doc.at_css("address")&.text,
      ]
      candidates.compact.map(&:strip).reject(&:empty?).first
    end

    # og:site_name, falling back to the hostname without a leading "www.".
    def extract_site_name(doc)
      doc.at_css('meta[property="og:site_name"]')&.attr("content")&.strip ||
        @uri.host.sub(/^www\./, "")
    end

    # og:description, falling back to the standard meta description; nil if neither exists.
    def extract_description(doc)
      doc.at_css('meta[property="og:description"]')&.attr("content")&.strip ||
        doc.at_css('meta[name="description"]')&.attr("content")&.strip
    end

    # Returns the node most likely to hold the article body: the first
    # known selector with substantial text, else a density-scored fallback.
    def find_content_node(doc)
      # Try known article selectors
      ARTICLE_SELECTORS.each do |sel|
        node = doc.at_css(sel)
        next unless node
        text = node.text.strip
        # Make sure it has meaningful content (>200 chars of text)
        return node if text.length > 200
      end

      # Fallback: score all <div> and <section> blocks by text density
      score_and_pick(doc)
    end

    # Scores container elements by text length minus a per-tag penalty
    # (nav-heavy markup has many tags and little text) and returns the
    # highest-scoring node, falling back to <body> or the document itself.
    def score_and_pick(doc)
      candidates = doc.css("div, section, td").map do |node|
        text = node.text.strip
        next if text.length < 150

        # Score = text length - penalize nodes with lots of tags (nav-heavy)
        tag_count = node.css("*").size.to_f
        text_length = text.length.to_f
        score = text_length - (tag_count * 3)

        [score, node]
      end.compact

      # max_by is O(n); sorting the whole list just to take the first was wasteful.
      best = candidates.max_by(&:first)
      best&.last || doc.at_css("body") || doc
    end

    # Destructively strips noise, hidden/empty elements, and disallowed
    # attributes from +node+, and absolutizes relative img/a URLs.
    def clean_node!(node)
      return unless node

      # Remove noise elements
      NOISE_SELECTORS.each do |sel|
        node.css(sel).each(&:remove)
      end

      # Remove hidden elements
      node.css("[style]").each do |el|
        el.remove if el["style"] =~ /display\s*:\s*none|visibility\s*:\s*hidden/i
      end

      # Remove empty tags (except br, img, hr)
      node.css("span, div, p, section").each do |el|
        el.remove if el.text.strip.empty? && el.css("img, video, audio, iframe").empty?
      end

      # Strip all attributes except allowed ones on certain tags
      allowed = {
        "a" => %w[href title],
        "img" => %w[src alt title width height],
        "td" => %w[colspan rowspan],
        "th" => %w[colspan rowspan scope],
        "ol" => %w[start type],
        "li" => %w[value],
        "code" => %w[class],
        "pre" => %w[class],
      }
      node.css("*").each do |el|
        tag = el.name.downcase
        permitted = allowed[tag] || []
        # Snapshot the attribute names before removing, so the removal
        # cannot interfere with the collection being iterated.
        el.attribute_nodes.map(&:name).each do |attr|
          el.remove_attribute(attr) unless permitted.include?(attr)
        end

        # Make relative image URLs absolute; leave the original value in
        # place if the URL is malformed (only URI errors are swallowed —
        # the old `rescue nil` modifier hid every StandardError).
        if tag == "img" && el["src"] && !el["src"].start_with?("http", "//", "data:")
          begin
            el["src"] = URI.join(@url, el["src"]).to_s
          rescue URI::Error
            # keep the original relative src
          end
        end
        if tag == "a" && el["href"] && !el["href"].start_with?("http", "//", "#", "mailto:")
          begin
            el["href"] = URI.join(@url, el["href"]).to_s
          rescue URI::Error
            # keep the original relative href
          end
        end
      end
    end

    # Converts the cleaned node to GitHub-flavored Markdown, collapsing
    # runs of 3+ newlines into a single blank line.
    def node_to_markdown(node)
      return "" unless node
      ReverseMarkdown.convert(node.to_html, unknown_tags: :bypass, github_flavored: true)
        .gsub(/\n{3,}/, "\n\n") # collapse excessive blank lines
        .strip
    end

    # Truncates +text+ to the configured maximum length, appending a
    # visible truncation notice when content was cut.
    def truncate(text)
      max = SiteSetting.url_to_article_max_content_length
      return text if text.length <= max
      text[0...max] + "\n\n*[Content truncated — visit the original article for the full text.]*"
    end
  end
end
|
||||
Reference in New Issue
Block a user