# frozen_string_literal: true

# Uses only Nokogiri (already bundled with Discourse) — no extra gems needed.
require "nokogiri"
require "net/http"
require "uri"
require "timeout"

module UrlToArticle
  # Fetches a web page, strips boilerplate (navigation, ads, comments, …),
  # picks the most article-like container, and renders it as Markdown together
  # with basic metadata (title, byline, site name, description).
  #
  # Usage: UrlToArticle::ArticleExtractor.extract(url) # => Result
  class ArticleExtractor
    # Elements that are almost never part of the article body.
    NOISE_SELECTORS = %w[
      script style noscript iframe nav footer header
      .navigation .nav .menu .sidebar .widget .ad .advertisement
      .cookie-banner .cookie-notice .popup .modal .overlay
      .social-share .share-buttons .related-posts .comments
      #comments #sidebar #navigation #footer #header
      [role=navigation] [role=banner] [role=contentinfo]
      [aria-label=navigation] [aria-label=footer]
    ].freeze

    # Candidate containers for the main article, most specific first.
    ARTICLE_SELECTORS = %w[
      article[class*=content] article[class*=post] article[class*=article]
      article [role=main] main
      .post-content .article-content .entry-content .article-body
      .story-body .post-body .content-body .page-content
      #article-body #post-content #main-content
    ].freeze

    # Maximum number of HTTP redirects followed before giving up.
    MAX_REDIRECTS = 3

    Result =
      Struct.new(:title, :byline, :site_name, :description, :markdown, :url, keyword_init: true)

    # Convenience entry point.
    #
    # @param url [String] absolute URL of the article to extract
    # @return [Result]
    def self.extract(url)
      new(url).extract
    end

    def initialize(url)
      @url = url
      @uri = URI.parse(url)
    end

    # Fetches the page and returns a Result with metadata and a Markdown body.
    #
    # @return [Result]
    # @raise [RuntimeError] on non-success HTTP responses ("HTTP <code>")
    # @raise [Timeout::Error] when the overall fetch exceeds the site setting
    def extract
      html = fetch_html
      doc = Nokogiri::HTML(html)

      title = extract_title(doc)
      byline = extract_byline(doc)
      site_name = extract_site_name(doc)
      description = extract_description(doc)

      content_node = find_content_node(doc)
      clean_node!(content_node)

      markdown = node_to_markdown(content_node)
      markdown = truncate(markdown)

      Result.new(
        title: title,
        byline: byline,
        site_name: site_name,
        description: description,
        markdown: markdown,
        url: @url,
      )
    end

    private

    # ------------------------------------------------------------------ #
    # HTTP fetch
    # ------------------------------------------------------------------ #

    # Downloads the page body as a UTF-8 string, following a bounded number
    # of redirects. Invalid byte sequences are scrubbed so later string
    # operations cannot raise on malformed input.
    def fetch_html
      Timeout.timeout(SiteSetting.url_to_article_fetch_timeout) do
        response = do_get(@uri)

        # Location may be relative (RFC 7231 §7.1.2), so resolve it against
        # the URI we just requested rather than parsing it in isolation.
        redirects = 0
        while response.is_a?(Net::HTTPRedirection) && response["location"] &&
                redirects < MAX_REDIRECTS
          @uri = URI.join(@uri.to_s, response["location"])
          response = do_get(@uri)
          redirects += 1
        end

        raise "HTTP #{response.code}" unless response.is_a?(Net::HTTPSuccess)

        # Servers frequently declare the wrong charset; treat the body as
        # UTF-8 and drop any invalid byte sequences.
        response.body.force_encoding(Encoding::UTF_8).scrub
      end
    end

    # Performs a single GET request against +uri+ and returns the raw
    # Net::HTTPResponse (redirects are handled by the caller).
    def do_get(uri)
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = uri.scheme == "https"
      http.open_timeout = 5
      http.read_timeout = SiteSetting.url_to_article_fetch_timeout

      req = Net::HTTP::Get.new(uri.request_uri)
      req["User-Agent"] = "Mozilla/5.0 (compatible; Discourse/url-to-article)"
      req["Accept"] = "text/html,application/xhtml+xml"
      req["Accept-Language"] = "en-US,en;q=0.9"

      http.request(req)
    end

    # ------------------------------------------------------------------ #
    # Metadata extraction
    # ------------------------------------------------------------------ #

    # Best-effort title: OpenGraph, then Twitter card, then the first <h1>,
    # then <title>, finally falling back to the host name.
    def extract_title(doc)
      og = doc.at_css('meta[property="og:title"]')&.attr("content")
      return og.strip if og.present?

      tw = doc.at_css('meta[name="twitter:title"]')&.attr("content")
      return tw.strip if tw.present?

      h1 = doc.at_css("h1")&.text
      return h1.strip if h1.present?

      doc.at_css("title")&.text&.strip || @uri.host
    end

    # First non-empty author hint found among common byline locations.
    def extract_byline(doc)
      [
        doc.at_css('meta[name="author"]')&.attr("content"),
        doc.at_css('[rel="author"]')&.text,
        doc.at_css(".author")&.text,
        doc.at_css('[class*="byline"]')&.text,
        doc.at_css("address")&.text,
      ].compact.map(&:strip).reject(&:empty?).first
    end

    # og:site_name, or the host with a leading "www." removed.
    def extract_site_name(doc)
      doc.at_css('meta[property="og:site_name"]')&.attr("content")&.strip ||
        @uri.host.sub(/^www\./, "")
    end

    # og:description, falling back to the standard meta description.
    def extract_description(doc)
      doc.at_css('meta[property="og:description"]')&.attr("content")&.strip ||
        doc.at_css('meta[name="description"]')&.attr("content")&.strip
    end

    # ------------------------------------------------------------------ #
    # Content node selection
    # ------------------------------------------------------------------ #

    # Returns the node most likely to contain the article: the first
    # ARTICLE_SELECTORS match with substantial text, otherwise the highest
    # scoring generic container.
    def find_content_node(doc)
      ARTICLE_SELECTORS.each do |sel|
        node = doc.at_css(sel)
        return node if node && node.text.strip.length > 200
      end
      score_and_pick(doc)
    end

    # Heuristic fallback: reward text length, penalise markup density, and
    # pick the best-scoring <div>/<section>/<td>. Falls back to <body>.
    def score_and_pick(doc)
      best =
        doc
          .css("div, section, td")
          .filter_map do |node|
            text_len = node.text.strip.length
            next if text_len < 150
            tag_count = node.css("*").size.to_f
            score = text_len - (tag_count * 3)
            [score, node]
          end
          .max_by { |score, _| score }

      best&.last || doc.at_css("body") || doc
    end

    # ------------------------------------------------------------------ #
    # Node cleaning
    # ------------------------------------------------------------------ #

    # Mutates +node+ in place: removes noise/hidden/empty elements, strips
    # all attributes except a small allow-list, and absolutises relative
    # image/link URLs against the article URL.
    def clean_node!(node)
      return unless node

      NOISE_SELECTORS.each { |sel| node.css(sel).each(&:remove) }

      # Drop elements hidden via inline styles.
      node.css("[style]").each do |el|
        el.remove if el["style"] =~ /display\s*:\s*none|visibility\s*:\s*hidden/i
      end

      # Drop empty wrappers that carry no media.
      node.css("span, div, p, section").each do |el|
        el.remove if el.text.strip.empty? && el.css("img, video, audio, iframe").empty?
      end

      allowed_attrs = {
        "a" => %w[href title],
        "img" => %w[src alt title],
        "td" => %w[colspan rowspan],
        "th" => %w[colspan rowspan scope],
        "ol" => %w[start],
        "li" => %w[value],
        "code" => %w[class],
        "pre" => %w[class],
      }

      node.css("*").each do |el|
        tag = el.name.downcase
        permitted = allowed_attrs[tag] || []
        el.attributes.each_key { |attr| el.remove_attribute(attr) unless permitted.include?(attr) }

        # Resolve relative URLs; explicit scheme prefixes (not a bare "http"
        # prefix, which would also match e.g. "httpx:") mark absolute ones.
        if tag == "img" && el["src"] &&
             !el["src"].to_s.start_with?("http://", "https://", "//", "data:")
          el["src"] = (URI.join(@url, el["src"]).to_s rescue nil)
        end
        if tag == "a" && el["href"] &&
             !el["href"].to_s.start_with?("http://", "https://", "//", "#", "mailto:")
          el["href"] = (URI.join(@url, el["href"]).to_s rescue nil)
        end
      end
    end

    # ------------------------------------------------------------------ #
    # HTML → Markdown (zero external dependencies)
    # ------------------------------------------------------------------ #

    # Entry point for conversion: renders +node+ and collapses runs of
    # blank lines down to a single paragraph break.
    def node_to_markdown(node)
      return "" unless node
      convert_node(node, context: {}).strip.gsub(/\n{3,}/, "\n\n")
    end

    # Dispatches on node type; text nodes have their whitespace collapsed.
    def convert_node(node, context:)
      return "" unless node

      case node.node_type
      when Nokogiri::XML::Node::TEXT_NODE
        node.text.gsub(/[[:space:]]+/, " ")
      when Nokogiri::XML::Node::ELEMENT_NODE
        convert_element(node, context: context)
      when Nokogiri::XML::Node::DOCUMENT_NODE
        node.children.map { |c| convert_node(c, context: context) }.join
      else
        ""
      end
    end

    # Concatenated Markdown of all children of +node+.
    def children_md(node, context)
      node.children.map { |c| convert_node(c, context: context) }.join
    end

    # Converts a single element to Markdown. Unknown tags fall through to
    # their children; UI-only tags are dropped entirely.
    def convert_element(node, context:) # rubocop:disable Metrics/MethodLength, Metrics/CyclomaticComplexity
      tag = node.name.downcase

      case tag
      when "h1" then "\n\n# #{children_md(node, context).strip}\n\n"
      when "h2" then "\n\n## #{children_md(node, context).strip}\n\n"
      when "h3" then "\n\n### #{children_md(node, context).strip}\n\n"
      when "h4" then "\n\n#### #{children_md(node, context).strip}\n\n"
      when "h5" then "\n\n##### #{children_md(node, context).strip}\n\n"
      when "h6" then "\n\n###### #{children_md(node, context).strip}\n\n"
      when "p" then "\n\n#{children_md(node, context).strip}\n\n"
      when "br" then "  \n"
      when "hr" then "\n\n---\n\n"
      when "strong", "b" then "**#{children_md(node, context).strip}**"
      when "em", "i" then "*#{children_md(node, context).strip}*"
      when "del", "s" then "~~#{children_md(node, context).strip}~~"
      when "code"
        # Inside a fenced block the backticks are supplied by the <pre> case.
        context[:pre] ? children_md(node, context) : "`#{children_md(node, context)}`"
      when "pre"
        lang = node.at_css("code")&.attr("class")&.match(/language-(\w+)/)&.[](1) || ""
        "\n\n```#{lang}\n#{node.text}\n```\n\n"
      when "blockquote"
        quoted = children_md(node, context).strip.gsub(/^/, "> ")
        "\n\n#{quoted}\n\n"
      when "a"
        href = node["href"].to_s.strip
        text = children_md(node, context).strip
        href.empty? ? text : "[#{text}](#{href})"
      when "img"
        src = node["src"].to_s.strip
        alt = node["alt"].to_s.strip
        src.empty? ? "" : "![#{alt}](#{src})"
      when "ul"
        items = node.css("> li").map { |li| "- #{children_md(li, context).strip}" }.join("\n")
        "\n\n#{items}\n\n"
      when "ol"
        start = (node["start"] || 1).to_i
        items =
          node
            .css("> li")
            .each_with_index
            .map { |li, idx| "#{start + idx}. #{children_md(li, context).strip}" }
            .join("\n")
        "\n\n#{items}\n\n"
      when "table"
        convert_table(node, context: context)
      when "figure"
        img_el = node.at_css("img")
        cap = node.at_css("figcaption")&.text&.strip
        img_md = img_el ? convert_element(img_el, context: context) : ""
        cap_md = cap ? "\n*#{cap}*" : ""
        "\n\n#{img_md}#{cap_md}\n\n"
      when "script", "style", "noscript", "button", "input", "select", "textarea", "iframe",
           "object", "embed", "head", "link", "meta"
        ""
      else
        children_md(node, context)
      end
    end

    # Renders an HTML table as a GFM pipe table. The first row is used as the
    # header; pipe characters inside cells are escaped.
    def convert_table(table, context:)
      rows = table.css("tr")
      return "" if rows.empty?

      md_rows =
        rows.map do |row|
          cells = row.css("th, td").map { |cell| children_md(cell, context).strip.gsub("|", "\\|") }
          "| #{cells.join(" | ")} |"
        end

      cols = rows.first.css("th, td").size
      sep = "| #{Array.new(cols, "---").join(" | ")} |"

      "\n\n#{md_rows.first}\n#{sep}\n#{md_rows[1..].join("\n")}\n\n"
    end

    # ------------------------------------------------------------------ #
    # Helpers
    # ------------------------------------------------------------------ #

    # Caps the Markdown at the configured length, appending a notice when
    # content was cut off.
    def truncate(text)
      max = SiteSetting.url_to_article_max_content_length
      return text if text.length <= max
      text[0...max] + "\n\n*[Content truncated — visit the original article for the full text.]*"
    end
  end
end