From 09d46a9d10f5983c42c88bf07efb4737cd8013b5 Mon Sep 17 00:00:00 2001
From: robert <robert@myemail.cloud>
Date: Wed, 18 Mar 2026 11:28:55 -0400
Subject: [PATCH] Removed dependencies.

---
 lib/url_to_article/article_extractor.rb | 275 +++++++++++++++++-------
 plugin.rb                               |   3 -
 2 files changed, 193 insertions(+), 85 deletions(-)

diff --git a/lib/url_to_article/article_extractor.rb b/lib/url_to_article/article_extractor.rb
index a1b87be..d671399 100644
--- a/lib/url_to_article/article_extractor.rb
+++ b/lib/url_to_article/article_extractor.rb
@@ -1,14 +1,14 @@
 # frozen_string_literal: true
 
+# Uses only Nokogiri (already bundled with Discourse) — no extra gems needed.
+
 require "nokogiri"
-require "reverse_markdown"
 require "net/http"
 require "uri"
 require "timeout"
 
 module UrlToArticle
   class ArticleExtractor
-    # Tags that are almost never article content
     NOISE_SELECTORS = %w[
       script style noscript iframe nav footer header
       .navigation .nav .menu .sidebar .widget .ad .advertisement
@@ -19,7 +19,6 @@ module UrlToArticle
       [aria-label=navigation] [aria-label=footer]
     ].freeze
 
-    # Candidate content selectors tried in order
     ARTICLE_SELECTORS = %w[
       article[class*=content]
       article[class*=post]
@@ -52,61 +51,70 @@ module UrlToArticle
     end
 
     def extract
-      html = fetch_html
-      doc = Nokogiri::HTML(html)
-
-      title     = extract_title(doc)
-      byline    = extract_byline(doc)
-      site_name = extract_site_name(doc)
+      html        = fetch_html # response body string returned by the HTTP fetch
+      doc         = Nokogiri::HTML(html)
+      title       = extract_title(doc)
+      byline      = extract_byline(doc)
+      site_name   = extract_site_name(doc)
       description = extract_description(doc)
-      content_node = find_content_node(doc)
 
+      content_node = find_content_node(doc) # clean_node! below mutates this node in place
       clean_node!(content_node)
       markdown = node_to_markdown(content_node)
       markdown = truncate(markdown)
 
       Result.new(
-        title: title,
-        byline: byline,
-        site_name: site_name,
+        title:       title,
+        byline:      byline,
+        site_name:   site_name,
         description: description,
-        markdown: markdown,
-        url: @url
+        markdown:    markdown,
+        url:         @url,
       )
     end
 
     private
 
+    # ------------------------------------------------------------------ #
+    # HTTP fetch
+    # ------------------------------------------------------------------ #
+
     def fetch_html
       Timeout.timeout(SiteSetting.url_to_article_fetch_timeout) do
-        http = Net::HTTP.new(@uri.host, @uri.port)
-        http.use_ssl = @uri.scheme == "https"
-        http.open_timeout = 5
-        http.read_timeout = SiteSetting.url_to_article_fetch_timeout
+        response = do_get(@uri)
 
-        request = Net::HTTP::Get.new(@uri.request_uri)
-        request["User-Agent"] = "Mozilla/5.0 (compatible; Discourse URL-to-Article Bot/1.0)"
-        request["Accept"] = "text/html,application/xhtml+xml"
-        request["Accept-Language"] = "en-US,en;q=0.9"
-
-        response = http.request(request)
-
-        # Follow one redirect
         if response.is_a?(Net::HTTPRedirection) && response["location"]
-          redirect_uri = URI.parse(response["location"])
-          @uri = redirect_uri
-          http = Net::HTTP.new(@uri.host, @uri.port)
-          http.use_ssl = @uri.scheme == "https"
-          response = http.get(@uri.request_uri, "User-Agent" => request["User-Agent"])
+          @uri = URI.join(@uri.to_s, response["location"]) # Location may be relative; resolve it against the URI just requested
+          response = do_get(@uri) # at most one redirect is followed; a second redirect fails the HTTPSuccess check below
         end
 
         raise "HTTP #{response.code}" unless response.is_a?(Net::HTTPSuccess)
-        response.body.force_encoding("UTF-8")
+
+        body = response.body || String.new # nil-safe: fall back to a fresh, unfrozen empty string
+        body.force_encoding(Encoding::UTF_8) # relabels the bytes as UTF-8; no transcoding is performed
+        body
       end
     end
 
+    def do_get(uri) # performs one GET for +uri+ and returns the Net::HTTP response; redirects are not followed here
+      http              = Net::HTTP.new(uri.host, uri.port)
+      http.use_ssl      = uri.scheme == "https" # TLS only for https URIs
+      http.open_timeout = 5 # seconds allowed for opening the connection
+      http.read_timeout = SiteSetting.url_to_article_fetch_timeout
+
+      req                  = Net::HTTP::Get.new(uri.request_uri)
+      req["User-Agent"]    = "Mozilla/5.0 (compatible; Discourse/url-to-article)"
+      req["Accept"]        = "text/html,application/xhtml+xml"
+      req["Accept-Language"] = "en-US,en;q=0.9"
+
+      http.request(req) # the response object is the method's return value
+    end
+
+    # ------------------------------------------------------------------ #
+    # Metadata extraction
+    # ------------------------------------------------------------------ #
+
     def extract_title(doc)
-      # Try OG title first, then twitter:title, then <title>
       og = doc.at_css('meta[property="og:title"]')&.attr("content")
       return og.strip if og.present?
 
@@ -120,14 +128,13 @@ module UrlToArticle
     end
 
     def extract_byline(doc)
-      candidates = [
+      [ # candidate sources in priority order; the first non-blank one is returned
         doc.at_css('meta[name="author"]')&.attr("content"),
         doc.at_css('[rel="author"]')&.text,
         doc.at_css(".author")&.text,
         doc.at_css('[class*="byline"]')&.text,
         doc.at_css("address")&.text,
-      ]
-      candidates.compact.map(&:strip).reject(&:empty?).first
+      ].compact.map(&:strip).reject(&:empty?).first # nil when no candidate yields text
     end
 
     def extract_site_name(doc)
@@ -140,89 +147,193 @@ module UrlToArticle
         doc.at_css('meta[name="description"]')&.attr("content")&.strip
     end
 
+    # ------------------------------------------------------------------ #
+    # Content node selection
+    # ------------------------------------------------------------------ #
+
     def find_content_node(doc)
-      # Try known article selectors
       ARTICLE_SELECTORS.each do |sel|
         node = doc.at_css(sel)
-        next unless node
-        text = node.text.strip
-        # Make sure it has meaningful content (>200 chars of text)
-        return node if text.length > 200
+        return node if node && node.text.strip.length > 200 # first selector whose match has more than 200 chars of stripped text wins
       end
-
-      # Fallback: score all <div> and <section> blocks by text density
       score_and_pick(doc)
     end
 
     def score_and_pick(doc)
-      candidates = doc.css("div, section, td").map do |node|
-        text = node.text.strip
-        next if text.length < 150
-
-        # Score = text length - penalize nodes with lots of tags (nav-heavy)
+      best = doc.css("div, section, td").filter_map do |node| # filter_map drops the nil produced by `next`
+        text_len = node.text.strip.length
+        next if text_len < 150 # blocks with under 150 chars of stripped text are not candidates
         tag_count = node.css("*").size.to_f
-        text_length = text.length.to_f
-        score = text_length - (tag_count * 3)
-
+        score = text_len - (tag_count * 3) # text length minus 3 points per descendant element
         [score, node]
-      end.compact.sort_by { |s, _| -s }
+      end.max_by { |score, _| score } # highest-scoring [score, node] pair, or nil when there are no candidates
 
-      candidates.first&.last || doc.at_css("body") || doc
+      best&.last || doc.at_css("body") || doc # fall back to <body>, then to the document itself
     end
 
+    # ------------------------------------------------------------------ #
+    # Node cleaning
+    # ------------------------------------------------------------------ #
+
     def clean_node!(node)
       return unless node
 
-      # Remove noise elements
-      NOISE_SELECTORS.each do |sel|
-        node.css(sel).each(&:remove)
-      end
+      NOISE_SELECTORS.each { |sel| node.css(sel).each(&:remove) } # drop every descendant matching a noise selector
 
-      # Remove hidden elements
       node.css("[style]").each do |el|
         el.remove if el["style"] =~ /display\s*:\s*none|visibility\s*:\s*hidden/i
       end
 
-      # Remove empty tags (except br, img, hr)
       node.css("span, div, p, section").each do |el|
         el.remove if el.text.strip.empty? && el.css("img, video, audio, iframe").empty?
       end
 
-      # Strip all attributes except allowed ones on certain tags
-      allowed = {
-        "a"   => %w[href title],
-        "img" => %w[src alt title width height],
-        "td"  => %w[colspan rowspan],
-        "th"  => %w[colspan rowspan scope],
-        "ol"  => %w[start type],
-        "li"  => %w[value],
+      allowed_attrs = { # per-tag attribute allow-list; tags not listed here keep no attributes
+        "a"    => %w[href title],
+        "img"  => %w[src alt title],
+        "td"   => %w[colspan rowspan],
+        "th"   => %w[colspan rowspan scope],
+        "ol"   => %w[start],
+        "li"   => %w[value],
         "code" => %w[class],
         "pre"  => %w[class],
       }
-      node.css("*").each do |el|
-        tag = el.name.downcase
-        permitted = allowed[tag] || []
-        el.attributes.each_key do |attr|
-          el.remove_attribute(attr) unless permitted.include?(attr)
-        end
 
-        # Make relative image URLs absolute
-        if tag == "img" && el["src"] && !el["src"].start_with?("http", "//", "data:")
-          el["src"] = URI.join(@url, el["src"]).to_s rescue nil
+      node.css("*").each do |el| # descendants only; attributes on `node` itself are left alone
+        tag       = el.name.downcase
+        permitted = allowed_attrs[tag] || []
+        el.attributes.each_key { |attr| el.remove_attribute(attr) unless permitted.include?(attr) }
+
+        if tag == "img" && el["src"] && !el["src"].to_s.start_with?("http", "//", "data:") # relative image URL
+          el["src"] = (URI.join(@url, el["src"]).to_s rescue nil) # resolve against @url; nil is assigned if joining raises
         end
-        if tag == "a" && el["href"] && !el["href"].start_with?("http", "//", "#", "mailto:")
-          el["href"] = URI.join(@url, el["href"]).to_s rescue nil
+        if tag == "a" && el["href"] && !el["href"].to_s.start_with?("http", "//", "#", "mailto:") # relative link URL
+          el["href"] = (URI.join(@url, el["href"]).to_s rescue nil) # resolve against @url; nil is assigned if joining raises
         end
       end
     end
 
+    # ------------------------------------------------------------------ #
+    # HTML → Markdown (zero external dependencies)
+    # ------------------------------------------------------------------ #
+
     def node_to_markdown(node)
       return "" unless node
-      ReverseMarkdown.convert(node.to_html, unknown_tags: :bypass, github_flavored: true)
-        .gsub(/\n{3,}/, "\n\n")  # collapse excessive blank lines
-        .strip
+      convert_node(node, context: {}).strip.gsub(/\n{3,}/, "\n\n") # trim, then collapse runs of 3+ newlines to a single blank line
     end
 
+    def convert_node(node, context:) # returns the Markdown string for +node+; +context+ is passed through to element converters
+      return "" unless node
+
+      case node.node_type
+      when Nokogiri::XML::Node::TEXT_NODE
+        node.text.gsub(/[[:space:]]+/, " ") # collapse each whitespace run to one space
+      when Nokogiri::XML::Node::ELEMENT_NODE
+        convert_element(node, context: context)
+      when Nokogiri::XML::Node::DOCUMENT_NODE, Nokogiri::XML::Node::HTML_DOCUMENT_NODE # Nokogiri::HTML documents report HTML_DOCUMENT_NODE
+        children_md(node, context)
+      else
+        "" # any other node type contributes nothing
+      end
+    end
+
+    def children_md(node, context) # Markdown for all children of +node+, in document order
+      node.children.map { |c| convert_node(c, context: context) }.join # joined with no separator
+    end
+
+    def convert_element(node, context:) # rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity
+      tag = node.name.downcase # dispatch key: each branch below returns the Markdown string for that tag
+
+      case tag
+      when "h1" then "\n\n# #{children_md(node, context).strip}\n\n"
+      when "h2" then "\n\n## #{children_md(node, context).strip}\n\n"
+      when "h3" then "\n\n### #{children_md(node, context).strip}\n\n"
+      when "h4" then "\n\n#### #{children_md(node, context).strip}\n\n"
+      when "h5" then "\n\n##### #{children_md(node, context).strip}\n\n"
+      when "h6" then "\n\n###### #{children_md(node, context).strip}\n\n"
+      when "p"  then "\n\n#{children_md(node, context).strip}\n\n"
+      when "br" then "  \n" # two trailing spaces + newline is a Markdown hard line break
+      when "hr" then "\n\n---\n\n"
+
+      when "strong", "b" then "**#{children_md(node, context).strip}**"
+      when "em", "i"     then "*#{children_md(node, context).strip}*"
+      when "del", "s"    then "~~#{children_md(node, context).strip}~~"
+
+      when "code"
+        context[:pre] ? children_md(node, context) : "`#{children_md(node, context)}`" # NOTE(review): nothing in the visible code sets context[:pre], so the backtick form is always used
+
+      when "pre"
+        lang = node.at_css("code")&.attr("class")&.match(/language-(\w+)/)&.[](1) || "" # fence language from a `language-xxx` class on a nested <code>
+        "\n\n```#{lang}\n#{node.text}\n```\n\n" # uses the raw text of the node, so nested markup is not converted
+
+      when "blockquote"
+        quoted = children_md(node, context).strip.gsub(/^/, "> ") # prefix every line with "> "
+        "\n\n#{quoted}\n\n"
+
+      when "a"
+        href = node["href"].to_s.strip
+        text = children_md(node, context).strip
+        href.empty? ? text : "[#{text}](#{href})" # links without an href degrade to plain text
+
+      when "img"
+        src = node["src"].to_s.strip
+        alt = node["alt"].to_s.strip
+        src.empty? ? "" : "![#{alt}](#{src})" # images without a src are dropped
+
+      when "ul"
+        items = node.css("> li").map do |li| # direct <li> children only
+          "- #{children_md(li, context).strip}"
+        end.join("\n")
+        "\n\n#{items}\n\n"
+
+      when "ol"
+        start = (node["start"] || 1).to_i # numbering begins at the start attribute, or 1 when it is absent
+        items = node.css("> li").each_with_index.map do |li, idx|
+          "#{start + idx}. #{children_md(li, context).strip}"
+        end.join("\n")
+        "\n\n#{items}\n\n"
+
+      when "table"
+        convert_table(node, context: context)
+
+      when "figure"
+        img_el  = node.at_css("img") # only the first <img> inside the figure is rendered
+        cap     = node.at_css("figcaption")&.text&.strip
+        img_md  = img_el ? convert_element(img_el, context: context) : ""
+        cap_md  = cap ? "\n*#{cap}*" : "" # caption goes on its own line, in italics
+        "\n\n#{img_md}#{cap_md}\n\n"
+
+      when "script", "style", "noscript", "button", "input",
+           "select", "textarea", "iframe", "object", "embed",
+           "head", "link", "meta"
+        ""
+
+      else
+        children_md(node, context) # any other tag: emit the children only, dropping the tag itself
+      end
+    end
+
+    def convert_table(table, context:) # renders +table+ as a pipe table; the first <tr> is used as the header row
+      rows = table.css("tr")
+      return "" if rows.empty?
+
+      md_rows = rows.map do |row|
+        cells = row.css("th, td").map do |cell|
+          children_md(cell, context).strip.gsub(/\s*\n\s*/, " ").gsub("|", "\\|") # a row must stay on one line: flatten line breaks, then escape pipes
+        end
+        "| #{cells.join(" | ")} |"
+      end
+
+      cols = rows.first.css("th, td").size # separator width follows the first row's cell count
+      sep  = "| #{Array.new(cols, "---").join(" | ")} |"
+
+      "\n\n#{md_rows.first}\n#{sep}\n#{md_rows[1..].join("\n")}\n\n"
+    end
+
+    # ------------------------------------------------------------------ #
+    # Helpers
+    # ------------------------------------------------------------------ #
+
     def truncate(text)
       max = SiteSetting.url_to_article_max_content_length
       return text if text.length <= max
diff --git a/plugin.rb b/plugin.rb
index 9c5b22f..b11f95f 100644
--- a/plugin.rb
+++ b/plugin.rb
@@ -6,9 +6,6 @@
 # authors: Your Name
 # url: https://github.com/yourname/discourse-url-to-article
 
-gem "nokogiri", "1.16.4"
-gem "reverse_markdown", "2.1.1"
-
 enabled_site_setting :url_to_article_enabled
 
 after_initialize do