From b1ef5163489ddffccddf3289be979ec5010ee0a2 Mon Sep 17 00:00:00 2001 From: robert Date: Wed, 18 Mar 2026 11:10:07 -0400 Subject: [PATCH] init commit --- README.md | 110 +++++++++ .../url_to_article/articles_controller.rb | 63 +++++ .../discourse/initializers/url-to-article.js | 197 +++++++++++++++ assets/stylesheets/url-to-article.scss | 51 ++++ config/locales/client.en.yml | 11 + config/settings.yml | 31 +++ lib/url_to_article/article_extractor.rb | 232 ++++++++++++++++++ plugin.rb | 35 +++ 8 files changed, 730 insertions(+) create mode 100644 README.md create mode 100644 app/controllers/url_to_article/articles_controller.rb create mode 100644 assets/javascripts/discourse/initializers/url-to-article.js create mode 100644 assets/stylesheets/url-to-article.scss create mode 100644 config/locales/client.en.yml create mode 100644 config/settings.yml create mode 100644 lib/url_to_article/article_extractor.rb create mode 100644 plugin.rb diff --git a/README.md b/README.md new file mode 100644 index 0000000..53bc105 --- /dev/null +++ b/README.md @@ -0,0 +1,110 @@ +# discourse-url-to-article + +A Discourse plugin that detects when a URL is pasted into the **topic title** field and offers to scrape the page, extracting the article content (Γ  la browser Reader Mode) and populating the **composer body** with a clean Markdown rendering. 
+ +--- + +## Features + +- πŸ”— Detects a bare URL typed/pasted into the topic title +- πŸ“„ Extracts article content using a Readability-style heuristic (no external API needed) +- ✍️ Populates the topic body with clean Markdown: heading, byline, description, full article text +- πŸ›‘οΈ SSRF protection: blocks requests to private/loopback addresses +- βš™οΈ Configurable: auto-populate mode, allowlist/blocklist, timeout, content length cap +- 🌐 Works with most article-style pages (news, blogs, documentation) + +--- + +## Installation + +Add the plugin to your `app.yml`: + +```yaml +hooks: + after_code: + - exec: + cd: $home/plugins + cmd: + - git clone https://github.com/yourname/discourse-url-to-article.git +``` + +Then rebuild: `./launcher rebuild app` + +--- + +## Site Settings + +| Setting | Default | Description | +|---|---|---| +| `url_to_article_enabled` | `true` | Enable/disable the plugin | +| `url_to_article_auto_populate` | `false` | Populate body automatically without button click | +| `url_to_article_max_content_length` | `50000` | Max chars extracted from a page | +| `url_to_article_fetch_timeout` | `10` | Seconds before HTTP fetch times out | +| `url_to_article_allowed_domains` | *(blank = all)* | Comma-separated domain allowlist | +| `url_to_article_blocked_domains` | `localhost,127.0.0.1,…` | SSRF blocklist | + +--- + +## How It Works + +### Frontend (Ember.js) + +`initializers/url-to-article.js` hooks into the `composer-editor` component and observes the `composer.model.title` property via Ember's observer system. When the title matches a bare URL pattern: + +1. A dismissible bar appears above the editor offering to import the article. +2. On click (or automatically if `auto_populate` is on), it POSTs to `/url-to-article/extract`. +3. The response populates `composer.model.reply` (body) and optionally updates the title. + +### Backend (Ruby) + +`ArticleExtractor` in `lib/url_to_article/article_extractor.rb`: + +1. 
**Fetches** the HTML via `Net::HTTP` with a browser-like User-Agent (follows one redirect). +2. **Extracts metadata** from Open Graph / Twitter Card / standard `` tags. +3. **Finds the content node** by trying a list of known semantic selectors (`article`, `[role=main]`, `.post-content`, etc.), then falling back to a text-density scoring algorithm over all `
` and `
` elements. +4. **Cleans the node**: removes nav, ads, scripts, hidden elements; strips non-essential attributes; makes relative URLs absolute. +5. **Converts to Markdown** using the `reverse_markdown` gem. + +### Security + +- Only authenticated users can call `/url-to-article/extract`. +- Only `http`/`https` schemes are allowed. +- Configurable domain blocklist (loopback/private addresses blocked by default). +- Optional allowlist to restrict to specific domains. + +--- + +## Output Format + +The body is written as: + +```markdown +> **Site Name** β€” *Author Name* +> Source: + +*Article description or lead paragraph.* + +--- + +## Article Heading + +Full article text in Markdown... +``` + +--- + +## Extending + +### Custom extraction logic + +Subclass or monkey-patch `UrlToArticle::ArticleExtractor` in a separate plugin to add site-specific selectors or post-processing. + +### Paywall / JS-rendered sites + +For sites that require JavaScript rendering, replace the `fetch_html` method with a call to a headless browser service (e.g. Browserless, Splash) or a third-party extraction API (Diffbot, Mercury Parser API). + +--- + +## License + +MIT diff --git a/app/controllers/url_to_article/articles_controller.rb b/app/controllers/url_to_article/articles_controller.rb new file mode 100644 index 0000000..992accc --- /dev/null +++ b/app/controllers/url_to_article/articles_controller.rb @@ -0,0 +1,63 @@ +# frozen_string_literal: true + +module UrlToArticle + class ArticlesController < ::ApplicationController + requires_login + before_action :ensure_enabled! + before_action :validate_url! 
+ + def extract + result = ArticleExtractor.extract(@url) + + render json: { + title: result.title, + byline: result.byline, + site_name: result.site_name, + description: result.description, + markdown: result.markdown, + url: result.url, + } + rescue => e + Rails.logger.warn("[url-to-article] Extraction failed for #{@url}: #{e.message}") + render json: { error: "Could not extract article: #{e.message}" }, status: :unprocessable_entity + end + + private + + def ensure_enabled! + raise Discourse::NotFound unless SiteSetting.url_to_article_enabled + end + + def validate_url! + raw = params.require(:url) + + begin + uri = URI.parse(raw) + rescue URI::InvalidURIError + return render json: { error: "Invalid URL" }, status: :bad_request + end + + unless %w[http https].include?(uri.scheme) + return render json: { error: "Only http/https URLs are supported" }, status: :bad_request + end + + # SSRF protection β€” block private/loopback addresses + blocked_domains = SiteSetting.url_to_article_blocked_domains + .split(",").map(&:strip).reject(&:empty?) + + if blocked_domains.any? { |d| uri.host&.include?(d) } + return render json: { error: "Domain not allowed" }, status: :forbidden + end + + # Optionally enforce an allowlist + allowed_domains = SiteSetting.url_to_article_allowed_domains + .split(",").map(&:strip).reject(&:empty?) + + if allowed_domains.any? && !allowed_domains.any? 
{ |d| uri.host&.end_with?(d) } + return render json: { error: "Domain not in allowlist" }, status: :forbidden + end + + @url = raw + end + end +end diff --git a/assets/javascripts/discourse/initializers/url-to-article.js b/assets/javascripts/discourse/initializers/url-to-article.js new file mode 100644 index 0000000..34b710c --- /dev/null +++ b/assets/javascripts/discourse/initializers/url-to-article.js @@ -0,0 +1,197 @@ +import { apiInitializer } from "discourse/lib/api"; +import { debounce } from "@ember/runloop"; +import { ajax } from "discourse/lib/ajax"; +import I18n from "I18n"; + +const URL_REGEX = /^(https?:\/\/[^\s/$.?#][^\s]*)$/i; +const DEBOUNCE_MS = 600; + +export default apiInitializer("1.8.0", (api) => { + if (!api.container.lookup("site-settings:main").url_to_article_enabled) { + return; + } + + // ----------------------------------------------------------------------- + // Inject a helper button + status banner into the composer + // ----------------------------------------------------------------------- + api.modifyClass("component:composer-editor", { + pluginId: "url-to-article", + + didInsertElement() { + this._super(...arguments); + this._setupUrlToArticle(); + }, + + willDestroyElement() { + this._super(...arguments); + this._teardownUrlToArticle(); + }, + + _setupUrlToArticle() { + // Watch the title field β€” it lives outside the composer-editor DOM, + // so we observe via the composer model's `title` property. 
+ const composer = this.get("composer"); + if (!composer) return; + + this._titleObserver = () => this._onTitleChanged(); + composer.addObserver("model.title", this, "_titleObserver"); + }, + + _teardownUrlToArticle() { + const composer = this.get("composer"); + if (!composer) return; + composer.removeObserver("model.title", this, "_titleObserver"); + }, + + _onTitleChanged() { + const title = this.get("composer.model.title") || ""; + const match = title.trim().match(URL_REGEX); + + if (!match) { + this._hideArticleBar(); + return; + } + + const url = match[1]; + + if (this._lastDetectedUrl === url) return; // Same URL β€” no-op + this._lastDetectedUrl = url; + + const autoPopulate = api.container + .lookup("site-settings:main") + .url_to_article_auto_populate; + + if (autoPopulate) { + debounce(this, "_fetchAndPopulate", url, DEBOUNCE_MS); + } else { + this._showArticleBar(url); + } + }, + + // ---- Bar UI ------------------------------------------------------- + + _showArticleBar(url) { + this._hideArticleBar(); // remove any existing bar first + + const bar = document.createElement("div"); + bar.className = "url-to-article-bar"; + bar.dataset.url = url; + bar.innerHTML = ` + πŸ“„ + + + + `; + + bar.querySelector(".url-to-article-btn").addEventListener("click", () => { + this._fetchAndPopulate(url); + }); + + bar.querySelector(".url-to-article-dismiss").addEventListener("click", () => { + this._hideArticleBar(); + this._lastDetectedUrl = null; // Allow re-detection if title changes + }); + + const toolbarEl = this.element.querySelector(".d-editor-container"); + if (toolbarEl) { + toolbarEl.insertAdjacentElement("afterbegin", bar); + } + }, + + _hideArticleBar() { + this.element?.querySelectorAll(".url-to-article-bar").forEach((el) => el.remove()); + }, + + _setStatus(message, type = "info") { + const bar = this.element?.querySelector(".url-to-article-bar"); + if (!bar) return; + + let status = bar.querySelector(".url-to-article-status"); + if (!status) { + status 
= document.createElement("span"); + status.className = "url-to-article-status"; + bar.appendChild(status); + } + status.textContent = message; + status.className = `url-to-article-status url-to-article-status--${type}`; + }, + + // ---- Fetch & populate --------------------------------------------- + + async _fetchAndPopulate(url) { + const bar = this.element?.querySelector(".url-to-article-bar"); + const btn = bar?.querySelector(".url-to-article-btn"); + + if (btn) { + btn.disabled = true; + btn.textContent = I18n.t("url_to_article.fetching"); + } + this._setStatus(I18n.t("url_to_article.fetching"), "info"); + + try { + const data = await ajax("/url-to-article/extract", { + type: "POST", + data: { url }, + }); + + if (data.error) { + throw new Error(data.error); + } + + this._populateComposer(data); + this._setStatus(I18n.t("url_to_article.success"), "success"); + + // Auto-hide bar after 3 seconds on success + setTimeout(() => this._hideArticleBar(), 3000); + } catch (err) { + const msg = err.jqXHR?.responseJSON?.error || err.message || I18n.t("url_to_article.error_generic"); + this._setStatus(`${I18n.t("url_to_article.error_prefix")} ${msg}`, "error"); + if (btn) { + btn.disabled = false; + btn.textContent = I18n.t("url_to_article.retry_button"); + } + } + }, + + _populateComposer(data) { + const composerModel = this.get("composer.model"); + if (!composerModel) return; + + // Build the article body in Markdown + const lines = []; + + // Attribution header + const siteName = data.site_name ? `**${data.site_name}**` : ""; + const byline = data.byline ? 
` β€” *${data.byline}*` : ""; + if (siteName || byline) { + lines.push(`> ${siteName}${byline}`); + lines.push(`> ${I18n.t("url_to_article.source_label")}: <${data.url}>`); + lines.push(""); + } else { + lines.push(`> ${I18n.t("url_to_article.source_label")}: <${data.url}>`); + lines.push(""); + } + + if (data.description) { + lines.push(`*${data.description}*`); + lines.push(""); + lines.push("---"); + lines.push(""); + } + + lines.push(data.markdown || ""); + + const body = lines.join("\n"); + + // Only set title if it's still the raw URL (avoid overwriting edited titles) + const currentTitle = composerModel.get("title") || ""; + if (currentTitle.trim() === data.url || currentTitle.trim() === "") { + composerModel.set("title", data.title || data.url); + } + + composerModel.set("reply", body); + }, + }); +}); diff --git a/assets/stylesheets/url-to-article.scss b/assets/stylesheets/url-to-article.scss new file mode 100644 index 0000000..4692a5d --- /dev/null +++ b/assets/stylesheets/url-to-article.scss @@ -0,0 +1,51 @@ +/* URL-to-Article plugin styles */ + +.url-to-article-bar { + display: flex; + align-items: center; + gap: 0.5rem; + padding: 0.5rem 0.75rem; + margin-bottom: 0.5rem; + background: var(--tertiary-low, #e8f4ff); + border: 1px solid var(--tertiary-medium, #8bc2f0); + border-radius: var(--d-border-radius, 4px); + font-size: var(--font-down-1); + flex-wrap: wrap; +} + +.url-to-article-icon { + font-size: 1.1em; + flex-shrink: 0; +} + +.url-to-article-label { + flex: 1; + min-width: 8rem; + color: var(--primary-medium); + font-weight: 500; +} + +.url-to-article-btn { + flex-shrink: 0; +} + +.url-to-article-dismiss { + flex-shrink: 0; + padding: 0.25rem 0.5rem !important; + color: var(--primary-medium) !important; +} + +.url-to-article-status { + font-style: italic; + font-size: var(--font-down-1); + + &.url-to-article-status--info { + color: var(--tertiary); + } + &.url-to-article-status--success { + color: var(--success); + } + 
  &.url-to-article-status--error {
    color: var(--danger);
  }
}
diff --git a/config/locales/client.en.yml b/config/locales/client.en.yml
new file mode 100644
index 0000000..b4fb857
--- /dev/null
+++ b/config/locales/client.en.yml
@@ -0,0 +1,11 @@
en:
  url_to_article:
    bar_label: "URL detected — import as article?"
    fetch_button: "Import Article"
    retry_button: "Retry"
    fetching: "Fetching…"
    dismiss: "Dismiss"
    success: "Article imported!"
    error_generic: "Unknown error"
    error_prefix: "Error:"
    source_label: "Source"
diff --git a/config/settings.yml b/config/settings.yml
new file mode 100644
index 0000000..d2d6e4c
--- /dev/null
+++ b/config/settings.yml
@@ -0,0 +1,31 @@
plugins:
  url_to_article_enabled:
    default: true
    client: true
    type: bool

  url_to_article_auto_populate:
    default: false
    client: true
    type: bool
    description: "Automatically populate the body when a URL is detected in the title (no button click needed)"

  url_to_article_max_content_length:
    default: 50000
    type: integer
    description: "Maximum number of characters to extract from a page"

  url_to_article_fetch_timeout:
    default: 10
    type: integer
    description: "Seconds to wait when fetching a URL"

  url_to_article_allowed_domains:
    default: ""
    type: string
    description: "Comma-separated list of allowed domains. Leave blank to allow all."
+ + url_to_article_blocked_domains: + default: "localhost,127.0.0.1,0.0.0.0,::1" + type: string + description: "Comma-separated list of blocked domains (SSRF protection)" diff --git a/lib/url_to_article/article_extractor.rb b/lib/url_to_article/article_extractor.rb new file mode 100644 index 0000000..a1b87be --- /dev/null +++ b/lib/url_to_article/article_extractor.rb @@ -0,0 +1,232 @@ +# frozen_string_literal: true + +require "nokogiri" +require "reverse_markdown" +require "net/http" +require "uri" +require "timeout" + +module UrlToArticle + class ArticleExtractor + # Tags that are almost never article content + NOISE_SELECTORS = %w[ + script style noscript iframe nav footer header + .navigation .nav .menu .sidebar .widget .ad .advertisement + .cookie-banner .cookie-notice .popup .modal .overlay + .social-share .share-buttons .related-posts .comments + #comments #sidebar #navigation #footer #header + [role=navigation] [role=banner] [role=contentinfo] + [aria-label=navigation] [aria-label=footer] + ].freeze + + # Candidate content selectors tried in order + ARTICLE_SELECTORS = %w[ + article[class*=content] + article[class*=post] + article[class*=article] + article + [role=main] + main + .post-content + .article-content + .entry-content + .article-body + .story-body + .post-body + .content-body + .page-content + #article-body + #post-content + #main-content + ].freeze + + Result = Struct.new(:title, :byline, :site_name, :description, :markdown, :url, keyword_init: true) + + def self.extract(url) + new(url).extract + end + + def initialize(url) + @url = url + @uri = URI.parse(url) + end + + def extract + html = fetch_html + doc = Nokogiri::HTML(html) + + title = extract_title(doc) + byline = extract_byline(doc) + site_name = extract_site_name(doc) + description = extract_description(doc) + content_node = find_content_node(doc) + + clean_node!(content_node) + markdown = node_to_markdown(content_node) + markdown = truncate(markdown) + + Result.new( + title: title, + 
byline: byline, + site_name: site_name, + description: description, + markdown: markdown, + url: @url + ) + end + + private + + def fetch_html + Timeout.timeout(SiteSetting.url_to_article_fetch_timeout) do + http = Net::HTTP.new(@uri.host, @uri.port) + http.use_ssl = @uri.scheme == "https" + http.open_timeout = 5 + http.read_timeout = SiteSetting.url_to_article_fetch_timeout + + request = Net::HTTP::Get.new(@uri.request_uri) + request["User-Agent"] = "Mozilla/5.0 (compatible; Discourse URL-to-Article Bot/1.0)" + request["Accept"] = "text/html,application/xhtml+xml" + request["Accept-Language"] = "en-US,en;q=0.9" + + response = http.request(request) + + # Follow one redirect + if response.is_a?(Net::HTTPRedirection) && response["location"] + redirect_uri = URI.parse(response["location"]) + @uri = redirect_uri + http = Net::HTTP.new(@uri.host, @uri.port) + http.use_ssl = @uri.scheme == "https" + response = http.get(@uri.request_uri, "User-Agent" => request["User-Agent"]) + end + + raise "HTTP #{response.code}" unless response.is_a?(Net::HTTPSuccess) + response.body.force_encoding("UTF-8") + end + end + + def extract_title(doc) + # Try OG title first, then twitter:title, then + og = doc.at_css('meta[property="og:title"]')&.attr("content") + return og.strip if og.present? + + tw = doc.at_css('meta[name="twitter:title"]')&.attr("content") + return tw.strip if tw.present? + + h1 = doc.at_css("h1")&.text + return h1.strip if h1.present? 
+ + doc.at_css("title")&.text&.strip || @uri.host + end + + def extract_byline(doc) + candidates = [ + doc.at_css('meta[name="author"]')&.attr("content"), + doc.at_css('[rel="author"]')&.text, + doc.at_css(".author")&.text, + doc.at_css('[class*="byline"]')&.text, + doc.at_css("address")&.text, + ] + candidates.compact.map(&:strip).reject(&:empty?).first + end + + def extract_site_name(doc) + doc.at_css('meta[property="og:site_name"]')&.attr("content")&.strip || + @uri.host.sub(/^www\./, "") + end + + def extract_description(doc) + doc.at_css('meta[property="og:description"]')&.attr("content")&.strip || + doc.at_css('meta[name="description"]')&.attr("content")&.strip + end + + def find_content_node(doc) + # Try known article selectors + ARTICLE_SELECTORS.each do |sel| + node = doc.at_css(sel) + next unless node + text = node.text.strip + # Make sure it has meaningful content (>200 chars of text) + return node if text.length > 200 + end + + # Fallback: score all <div> and <section> blocks by text density + score_and_pick(doc) + end + + def score_and_pick(doc) + candidates = doc.css("div, section, td").map do |node| + text = node.text.strip + next if text.length < 150 + + # Score = text length - penalize nodes with lots of tags (nav-heavy) + tag_count = node.css("*").size.to_f + text_length = text.length.to_f + score = text_length - (tag_count * 3) + + [score, node] + end.compact.sort_by { |s, _| -s } + + candidates.first&.last || doc.at_css("body") || doc + end + + def clean_node!(node) + return unless node + + # Remove noise elements + NOISE_SELECTORS.each do |sel| + node.css(sel).each(&:remove) + end + + # Remove hidden elements + node.css("[style]").each do |el| + el.remove if el["style"] =~ /display\s*:\s*none|visibility\s*:\s*hidden/i + end + + # Remove empty tags (except br, img, hr) + node.css("span, div, p, section").each do |el| + el.remove if el.text.strip.empty? && el.css("img, video, audio, iframe").empty? 
+ end + + # Strip all attributes except allowed ones on certain tags + allowed = { + "a" => %w[href title], + "img" => %w[src alt title width height], + "td" => %w[colspan rowspan], + "th" => %w[colspan rowspan scope], + "ol" => %w[start type], + "li" => %w[value], + "code" => %w[class], + "pre" => %w[class], + } + node.css("*").each do |el| + tag = el.name.downcase + permitted = allowed[tag] || [] + el.attributes.each_key do |attr| + el.remove_attribute(attr) unless permitted.include?(attr) + end + + # Make relative image URLs absolute + if tag == "img" && el["src"] && !el["src"].start_with?("http", "//", "data:") + el["src"] = URI.join(@url, el["src"]).to_s rescue nil + end + if tag == "a" && el["href"] && !el["href"].start_with?("http", "//", "#", "mailto:") + el["href"] = URI.join(@url, el["href"]).to_s rescue nil + end + end + end + + def node_to_markdown(node) + return "" unless node + ReverseMarkdown.convert(node.to_html, unknown_tags: :bypass, github_flavored: true) + .gsub(/\n{3,}/, "\n\n") # collapse excessive blank lines + .strip + end + + def truncate(text) + max = SiteSetting.url_to_article_max_content_length + return text if text.length <= max + text[0...max] + "\n\n*[Content truncated β€” visit the original article for the full text.]*" + end + end +end diff --git a/plugin.rb b/plugin.rb new file mode 100644 index 0000000..9c5b22f --- /dev/null +++ b/plugin.rb @@ -0,0 +1,35 @@ +# frozen_string_literal: true + +# name: discourse-url-to-article +# about: Scrapes a URL pasted into the topic title and populates the composer body with the article content +# version: 0.1.0 +# authors: Your Name +# url: https://github.com/yourname/discourse-url-to-article + +gem "nokogiri", "1.16.4" +gem "reverse_markdown", "2.1.1" + +enabled_site_setting :url_to_article_enabled + +after_initialize do + require_relative "lib/url_to_article/article_extractor" + + module ::UrlToArticle + PLUGIN_NAME = "discourse-url-to-article" + + class Engine < ::Rails::Engine + engine_name 
PLUGIN_NAME
      # Keep controller/route lookup inside the UrlToArticle namespace.
      isolate_namespace UrlToArticle
    end
  end

  # Loaded after the engine exists so the controller can reopen the namespace.
  require_relative "app/controllers/url_to_article/articles_controller"

  UrlToArticle::Engine.routes.draw do
    post "/extract" => "articles#extract"
  end

  # Mounted at /url-to-article, so the frontend POSTs to /url-to-article/extract.
  Discourse::Application.routes.append do
    mount UrlToArticle::Engine, at: "/url-to-article"
  end
end