233 lines
6.9 KiB
Ruby
233 lines
6.9 KiB
Ruby
|
|
# frozen_string_literal: true
|
||
|
|
|
||
|
|
require "nokogiri"
|
||
|
|
require "reverse_markdown"
|
||
|
|
require "net/http"
|
||
|
|
require "uri"
|
||
|
|
require "timeout"
|
||
|
|
|
||
|
|
# Extracts the main article content from a web page and converts it to
# Markdown, together with basic metadata (title, byline, site name,
# description).
#
# Usage:
#   result = UrlToArticle::ArticleExtractor.extract("https://example.com/post")
#   result.title     # => "Some headline"
#   result.markdown  # => "..."
module UrlToArticle
  class ArticleExtractor
    # Tags/selectors that are almost never article content; removed from the
    # chosen content node before Markdown conversion.
    NOISE_SELECTORS = %w[
      script style noscript iframe nav footer header
      .navigation .nav .menu .sidebar .widget .ad .advertisement
      .cookie-banner .cookie-notice .popup .modal .overlay
      .social-share .share-buttons .related-posts .comments
      #comments #sidebar #navigation #footer #header
      [role=navigation] [role=banner] [role=contentinfo]
      [aria-label=navigation] [aria-label=footer]
    ].freeze

    # Candidate content selectors, tried in order (most specific first).
    ARTICLE_SELECTORS = %w[
      article[class*=content]
      article[class*=post]
      article[class*=article]
      article
      [role=main]
      main
      .post-content
      .article-content
      .entry-content
      .article-body
      .story-body
      .post-body
      .content-body
      .page-content
      #article-body
      #post-content
      #main-content
    ].freeze

    # Attributes preserved per tag when sanitizing; every other attribute is
    # stripped. Hoisted to a constant so it is not re-allocated per call.
    ALLOWED_ATTRIBUTES = {
      "a" => %w[href title],
      "img" => %w[src alt title width height],
      "td" => %w[colspan rowspan],
      "th" => %w[colspan rowspan scope],
      "ol" => %w[start type],
      "li" => %w[value],
      "code" => %w[class],
      "pre" => %w[class],
    }.freeze

    # Value object returned by #extract.
    Result = Struct.new(:title, :byline, :site_name, :description, :markdown, :url, keyword_init: true)

    # Convenience wrapper: extract an article from +url+ in one call.
    #
    # @param url [String] absolute URL of the page
    # @return [Result]
    def self.extract(url)
      new(url).extract
    end

    # @param url [String] absolute URL of the page to extract
    # @raise [URI::InvalidURIError] if +url+ cannot be parsed
    def initialize(url)
      @url = url
      @uri = URI.parse(url)
    end

    # Fetches the page, locates the main content node, cleans it, and
    # converts it to Markdown.
    #
    # @return [Result] extracted metadata and Markdown body
    # @raise [RuntimeError] on non-2xx HTTP responses
    # @raise [Timeout::Error] when the fetch exceeds the configured timeout
    def extract
      doc = Nokogiri::HTML(fetch_html)

      content_node = find_content_node(doc)
      clean_node!(content_node)

      Result.new(
        title: extract_title(doc),
        byline: extract_byline(doc),
        site_name: extract_site_name(doc),
        description: extract_description(doc),
        markdown: truncate(node_to_markdown(content_node)),
        url: @url
      )
    end

    private

    # Downloads the page body, following at most one redirect. The whole
    # operation is additionally bounded by the configured fetch timeout.
    # Invalid UTF-8 bytes are scrubbed so downstream HTML parsing and
    # Markdown conversion never raise encoding errors.
    def fetch_html
      Timeout.timeout(SiteSetting.url_to_article_fetch_timeout) do
        response = perform_get(@uri)

        # Follow one redirect. Location may be relative (RFC 7231 §7.1.2),
        # so resolve it against the current URI before re-requesting.
        if response.is_a?(Net::HTTPRedirection) && response["location"]
          @uri = @uri.merge(response["location"])
          response = perform_get(@uri)
        end

        raise "HTTP #{response.code}" unless response.is_a?(Net::HTTPSuccess)

        response.body.force_encoding("UTF-8").scrub
      end
    end

    # Issues a single GET against +uri+ with browser-like headers and
    # per-connection timeouts (applied to redirects too, unlike before).
    def perform_get(uri)
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = uri.scheme == "https"
      http.open_timeout = 5
      http.read_timeout = SiteSetting.url_to_article_fetch_timeout

      request = Net::HTTP::Get.new(uri.request_uri)
      request["User-Agent"] = "Mozilla/5.0 (compatible; Discourse URL-to-Article Bot/1.0)"
      request["Accept"] = "text/html,application/xhtml+xml"
      request["Accept-Language"] = "en-US,en;q=0.9"

      http.request(request)
    end

    # Best-available page title: og:title, twitter:title, first <h1>,
    # <title>, then the host name as a last resort.
    def extract_title(doc)
      og = presence(doc.at_css('meta[property="og:title"]')&.attr("content"))
      return og if og

      tw = presence(doc.at_css('meta[name="twitter:title"]')&.attr("content"))
      return tw if tw

      h1 = presence(doc.at_css("h1")&.text)
      return h1 if h1

      presence(doc.at_css("title")&.text) || @uri.host
    end

    # First non-empty author hint among common byline locations, or nil.
    def extract_byline(doc)
      [
        doc.at_css('meta[name="author"]')&.attr("content"),
        doc.at_css('[rel="author"]')&.text,
        doc.at_css(".author")&.text,
        doc.at_css('[class*="byline"]')&.text,
        doc.at_css("address")&.text,
      ].map { |candidate| presence(candidate) }.compact.first
    end

    # og:site_name if present, otherwise the host without a leading "www.".
    def extract_site_name(doc)
      presence(doc.at_css('meta[property="og:site_name"]')&.attr("content")) ||
        @uri.host.sub(/^www\./, "")
    end

    # og:description, falling back to the standard meta description; nil
    # when neither yields non-blank text.
    def extract_description(doc)
      presence(doc.at_css('meta[property="og:description"]')&.attr("content")) ||
        presence(doc.at_css('meta[name="description"]')&.attr("content"))
    end

    # Strips +value+ and returns it, or nil when blank. Local stand-in for
    # ActiveSupport's #presence so the class works without ActiveSupport
    # loaded (the previous code called nil.present? on plain Ruby).
    def presence(value)
      stripped = value&.strip
      stripped unless stripped.nil? || stripped.empty?
    end

    # Returns the node most likely to hold the article body.
    def find_content_node(doc)
      # Prefer known article containers holding a meaningful amount of text
      # (>200 chars), so an empty <article> shell is skipped.
      ARTICLE_SELECTORS.each do |sel|
        node = doc.at_css(sel)
        return node if node && node.text.strip.length > 200
      end

      # Fallback: score generic containers by text density.
      score_and_pick(doc)
    end

    # Picks the <div>/<section>/<td> with the best text-to-markup ratio:
    # score = text length minus 3 points per descendant tag, which penalizes
    # nav-heavy blocks. Falls back to <body> (or the document) when nothing
    # has at least 150 chars of text.
    def score_and_pick(doc)
      scored = doc.css("div, section, td").map do |node|
        text = node.text.strip
        next if text.length < 150

        score = text.length - (node.css("*").size * 3)
        [score, node]
      end.compact

      best = scored.max_by(&:first)
      best&.last || doc.at_css("body") || doc
    end

    # Removes noise, hidden and empty elements from +node+, whitelists
    # attributes, and absolutizes relative img/link URLs. Mutates in place;
    # a nil node is ignored.
    def clean_node!(node)
      return unless node

      # Remove elements that are never article content.
      NOISE_SELECTORS.each { |sel| node.css(sel).each(&:remove) }

      # Remove inline-hidden elements.
      node.css("[style]").each do |el|
        el.remove if el["style"] =~ /display\s*:\s*none|visibility\s*:\s*hidden/i
      end

      # Remove empty containers that hold no text and no embedded media.
      node.css("span, div, p, section").each do |el|
        el.remove if el.text.strip.empty? && el.css("img, video, audio, iframe").empty?
      end

      sanitize_elements!(node)
    end

    # Strips non-whitelisted attributes from every element and rewrites
    # relative image/link URLs to absolute ones.
    def sanitize_elements!(node)
      node.css("*").each do |el|
        tag = el.name.downcase
        permitted = ALLOWED_ATTRIBUTES[tag] || []
        # el.attributes returns a fresh Hash, so removing while iterating
        # its keys is safe.
        el.attributes.each_key do |attr|
          el.remove_attribute(attr) unless permitted.include?(attr)
        end

        absolutize!(el, "src", %w[http // data:]) if tag == "img"
        absolutize!(el, "href", %w[http // # mailto:]) if tag == "a"
      end
    end

    # Rewrites a relative URL in el[attr] against the article URL. Values
    # already starting with one of +skip_prefixes+ are left alone. On a
    # malformed URL the original value is kept — the previous inline
    # `rescue nil` assigned nil to the attribute, destroying it.
    def absolutize!(el, attr, skip_prefixes)
      value = el[attr]
      return if value.nil? || value.start_with?(*skip_prefixes)

      el[attr] = URI.join(@url, value).to_s
    rescue URI::Error
      nil # keep the original (possibly relative) value
    end

    # Converts the cleaned node to GitHub-flavored Markdown, collapsing
    # runs of 3+ newlines into a single blank line.
    def node_to_markdown(node)
      return "" unless node

      ReverseMarkdown.convert(node.to_html, unknown_tags: :bypass, github_flavored: true)
        .gsub(/\n{3,}/, "\n\n") # collapse excessive blank lines
        .strip
    end

    # Caps +text+ at the configured maximum length, appending a notice when
    # content was cut off.
    def truncate(text)
      max = SiteSetting.url_to_article_max_content_length
      return text if text.length <= max

      text[0...max] + "\n\n*[Content truncated — visit the original article for the full text.]*"
    end
  end
end
|