// content.js - Extracts page content and converts to Markdown

// Guard against double-injection (MV3 scripting.executeScript can fire multiple times)
if (window.__memosClipperLoaded) {
  /* skip */
} else {
  window.__memosClipperLoaded = true;

  (function () {
    // ── Shared constants ──────────────────────────────────────────────────────

    // Elements stripped from the cloned subtree before conversion —
    // comprehensive list covering real-world sites.
    const REMOVE_SELECTORS = [
      // Semantic structural chrome
      'script', 'style', 'noscript', 'template',
      'nav', 'header', 'footer', 'aside',
      // ARIA roles for chrome
      '[role="navigation"]', '[role="banner"]', '[role="complementary"]',
      '[role="contentinfo"]', '[role="search"]', '[role="toolbar"]',
      '[role="menubar"]', '[role="menu"]', '[role="dialog"]',
      // Common class/id patterns for site chrome
      '[class*="navbar"]', '[class*="nav-bar"]', '[class*="site-nav"]',
      '[class*="site-header"]', '[class*="site-footer"]',
      '[class*="page-header"]', '[class*="page-footer"]',
      '[id*="navbar"]', '[id*="site-nav"]', '[id*="site-header"]', '[id*="site-footer"]',
      // Ads and tracking
      '[class*="advertisement"]', '[class*="advert"]', '[class*=" ad-"]',
      '[class*="google-ad"]', '[class*="sponsored"]',
      '[id*="advertisement"]', '[id*="google_ad"]',
      // Cookie banners, popups, overlays
      '[class*="cookie"]', '[id*="cookie"]',
      '[class*="consent"]', '[id*="consent"]',
      '[class*="gdpr"]', '[id*="gdpr"]',
      '[class*="popup"]', '[class*="modal"]', '[class*="overlay"]',
      '[class*="banner"]', '[id*="banner"]',
      // Social / share widgets
      '[class*="share-bar"]', '[class*="social-bar"]', '[class*="share-buttons"]',
      '[class*="sharing"]',
      // Subscription / newsletter prompts
      '[class*="newsletter"]', '[class*="subscribe"]',
      // Comments sections
      '[id="comments"]', '[class*="comments-section"]', '[id*="disqus"]',
      // Related / recommended articles
      '[class*="related-posts"]', '[class*="recommended"]', '[class*="more-articles"]',
      // Sidebar
      '[class*="sidebar"]', '[id*="sidebar"]',
      // Print/hidden
      '[hidden]', '[aria-hidden="true"]',
    ].join(', ');

    // Elements hidden through an inline style attribute (with and without a
    // space after the colon, for both display and visibility).
    const INLINE_HIDDEN_SELECTORS = [
      '[style*="display:none"]',
      '[style*="display: none"]',
      '[style*="visibility:hidden"]',
      '[style*="visibility: hidden"]',
    ].join(',');

    // Inline tags mapped to the Markdown marker that wraps their content.
    const INLINE_MARKERS = new Map([
      ["strong", "**"],
      ["b", "**"],
      ["em", "_"],
      ["i", "_"],
      ["s", "~~"],
      ["del", "~~"],
    ]);

    // Presentational / interactive / chrome tags that produce no Markdown.
    const SKIPPED_TAGS = new Set([
      "svg", "canvas", "video", "audio", "iframe",
      "button", "input", "select", "textarea", "form",
      "nav", "header", "footer", "aside",
    ]);

    // Four spaces per nesting level: enough for a child list to nest under
    // both "- " and "1. " parent markers.
    const LIST_INDENT = "    ";

    // ── Small helpers ─────────────────────────────────────────────────────────

    /**
     * Resolves a possibly-relative URL against the current page.
     * @param {string} raw - URL as written in the attribute.
     * @returns {string} Absolute URL, or `raw` unchanged when it cannot be parsed.
     */
    function toAbsoluteUrl(raw) {
      try {
        return new URL(raw, location.href).href;
      } catch {
        // Unparseable URL: fall back to the attribute value as written.
        return raw;
      }
    }

    /**
     * Wraps inline content in a Markdown marker, keeping surrounding whitespace
     * outside the markers (e.g. "foo " becomes "**foo** ", not "**foo **").
     * @param {string} marker - Marker placed on both sides, e.g. "**".
     * @param {string} inner - Already-converted inner Markdown.
     * @returns {string} Wrapped text, or `inner` unchanged when it has no
     *   non-whitespace content (so no bare "****" is emitted).
     */
    function wrapInline(marker, inner) {
      const core = inner.trim();
      if (!core) return inner;
      const start = inner.indexOf(core);
      const lead = inner.slice(0, start);
      const trail = inner.slice(start + core.length);
      return `${lead}${marker}${core}${marker}${trail}`;
    }

    /**
     * Renders the element children of a list as Markdown list items.
     * @param {Element} listEl - The <ul>/<ol> element.
     * @param {{listDepth: number}} ctx - Current conversion context.
     * @param {(index: number) => string} markerFor - Returns the item marker.
     * @returns {string} The list surrounded by blank lines.
     */
    function listToMd(listEl, ctx, markerFor) {
      const indent = LIST_INDENT.repeat(ctx.listDepth);
      const itemCtx = { ...ctx, listDepth: ctx.listDepth + 1 };
      const lines = Array.from(listEl.children)
        .map((item, i) => `${indent}${markerFor(i)} ${nodeToMd(item, itemCtx).trim()}`)
        .join("\n");
      return `\n\n${lines}\n\n`;
    }

    // ── Turndown-lite: a minimal but solid HTML→Markdown converter ────────────

    /**
     * Converts an element's subtree to Markdown. Works on a deep clone, so the
     * element passed in is never modified.
     * @param {Element} element - Root of the content to convert.
     * @returns {string} Trimmed Markdown with runs of blank lines collapsed.
     */
    function htmlToMarkdown(element) {
      const clone = element.cloneNode(true);

      // Remove unwanted elements (site chrome, ads, overlays, ...).
      clone.querySelectorAll(REMOVE_SELECTORS).forEach((el) => el.remove());

      // Also remove elements that are visually hidden via inline style.
      clone.querySelectorAll(INLINE_HIDDEN_SELECTORS).forEach((el) => el.remove());

      // NOTE(review): this collapse also applies inside fenced code blocks, so
      // code containing several consecutive blank lines loses some of them.
      return nodeToMd(clone).replace(/\n{3,}/g, "\n\n").trim();
    }

    /**
     * Recursively converts a DOM node to Markdown.
     * @param {Node} node - Node to convert.
     * @param {{listDepth: number}} [ctx] - Conversion context; `listDepth` is
     *   the nesting level used to indent list items.
     * @returns {string} Markdown for the node ("" for non-text, non-element
     *   nodes and for skipped tags).
     */
    function nodeToMd(node, ctx = { listDepth: 0 }) {
      if (node.nodeType === Node.TEXT_NODE) {
        return node.textContent.replace(/\s+/g, " ");
      }
      if (node.nodeType !== Node.ELEMENT_NODE) return "";

      const tag = node.tagName.toLowerCase();
      const children = () =>
        Array.from(node.childNodes)
          .map((c) => nodeToMd(c, ctx))
          .join("");

      if (SKIPPED_TAGS.has(tag)) return "";

      if (INLINE_MARKERS.has(tag)) {
        return wrapInline(INLINE_MARKERS.get(tag), children());
      }

      const heading = /^h([1-6])$/.exec(tag);
      if (heading) {
        const text = children().trim();
        // An empty heading would otherwise emit a bare "# " line.
        return text ? `\n\n${"#".repeat(Number(heading[1]))} ${text}\n\n` : "";
      }

      switch (tag) {
        case "p":
          return `\n\n${children().trim()}\n\n`;
        case "br":
          // Two trailing spaces are required for a Markdown hard line break.
          return "  \n";
        case "hr":
          return "\n\n---\n\n";
        case "code": {
          const text = node.textContent;
          // Use a double-backtick delimiter when the code itself has a backtick.
          return text.includes("`") ? `\`\`${text}\`\`` : `\`${text}\``;
        }
        case "pre": {
          const codeEl = node.querySelector("code");
          const lang = codeEl
            ? (codeEl.className.match(/language-(\S+)/) || [])[1] || ""
            : "";
          // Drop one trailing newline so the fence is not preceded by a blank line.
          const text = (codeEl || node).textContent.replace(/\n$/, "");
          return `\n\n\`\`\`${lang}\n${text}\n\`\`\`\n\n`;
        }
        case "blockquote": {
          const quoted = children()
            .trim()
            .split("\n")
            .map((line) => `> ${line}`)
            .join("\n");
          return `\n\n${quoted}\n\n`;
        }
        case "a": {
          const href = node.getAttribute("href") || "";
          const text = children().trim();
          if (!text) return href;
          // No href (e.g. a named anchor): resolving "" would yield the page
          // URL, so emit plain text instead of a link to the page itself.
          if (!href) return text;
          return `[${text}](${toAbsoluteUrl(href)})`;
        }
        case "img": {
          const src = node.getAttribute("src") || "";
          // An empty src would resolve to the page URL; emit nothing instead.
          if (!src) return "";
          const alt = node.getAttribute("alt") || "";
          return `![${alt}](${toAbsoluteUrl(src)})`;
        }
        case "ul":
          return listToMd(node, ctx, () => "-");
        case "ol":
          return listToMd(node, ctx, (i) => `${i + 1}.`);
        case "li":
          return children();
        case "table":
          return convertTable(node);
        case "figure": {
          const img = node.querySelector("img");
          const caption = node.querySelector("figcaption");
          let md = img ? nodeToMd(img, ctx) : children();
          if (caption) md += `\n*${caption.textContent.trim()}*`;
          return `\n\n${md}\n\n`;
        }
        default:
          return children();
      }
    }

    /**
     * Converts a <table> to a pipe table, treating the first row as the header.
     * Rows and cells that belong to nested tables are ignored.
     * @param {Element} table - The table element.
     * @returns {string} Markdown table surrounded by blank lines, or "" when
     *   the table has no rows of its own.
     */
    function convertTable(table) {
      const rows = Array.from(table.querySelectorAll("tr")).filter(
        (tr) => tr.closest("table") === table
      );
      if (!rows.length) return "";

      const cellsOf = (tr) =>
        Array.from(tr.querySelectorAll("th,td")).filter((cell) => cell.closest("tr") === tr);

      const toRow = (tr) => {
        const cells = cellsOf(tr).map((cell) =>
          // Collapse whitespace: a newline inside a cell would split the row.
          cell.textContent.trim().replace(/\s+/g, " ").replace(/\|/g, "\\|")
        );
        return `| ${cells.join(" | ")} |`;
      };

      const header = toRow(rows[0]);
      const sep = `| ${cellsOf(rows[0]).map(() => "---").join(" | ")} |`;
      const body = rows.slice(1).map(toRow).join("\n");
      return `\n\n${header}\n${sep}\n${body}\n\n`;
    }

    // ── Extract images from HTML ──────────────────────────────────────────────

    /**
     * Collects the images under an element, de-duplicated by absolute URL.
     * @param {Element} element - Root to search.
     * @returns {{src: string, alt: string}[]} Images in document order.
     */
    function extractImages(element) {
      const seen = new Set();
      return Array.from(element.querySelectorAll("img"))
        .filter((img) => {
          const src = img.getAttribute("src") || "";
          if (!src) return false;
          // skip tiny icons / tracking pixels by rendered size
          const w = img.naturalWidth || img.width || 0;
          const h = img.naturalHeight || img.height || 0;
          if (w > 0 && w < 32 && h > 0 && h < 32) return false;
          // skip 1x1 gif trackers
          if (src.startsWith("data:image/gif")) return false;
          return true;
        })
        .map((img) => ({
          src: toAbsoluteUrl(img.getAttribute("src") || ""),
          alt: img.getAttribute("alt") || "",
        }))
        .filter((img) => {
          if (seen.has(img.src)) return false;
          seen.add(img.src);
          return true;
        });
    }

    // ── Message handler ───────────────────────────────────────────────────────

    // Answers { action: "getContent", mode } with
    // { ok: true, markdown, images, title, url } or { ok: false, error }.
    chrome.runtime.onMessage.addListener((msg, _sender, sendResponse) => {
      if (msg.action === "getContent") {
        try {
          let markdown = "";
          let images = [];
          const title = document.title || location.href;

          if (msg.mode === "selection") {
            const sel = window.getSelection();
            if (sel && sel.rangeCount > 0) {
              // Convert a detached copy of the selected range.
              const frag = sel.getRangeAt(0).cloneContents();
              const div = document.createElement("div");
              div.appendChild(frag);
              markdown = htmlToMarkdown(div);
              images = extractImages(div);
            }
          } else {
            // full page — prefer article/main, fall back to body
            const root =
              document.querySelector("article") ||
              document.querySelector("main") ||
              document.querySelector('[role="main"]') ||
              document.body;
            markdown = htmlToMarkdown(root);
            images = extractImages(root);
          }

          // Prepend source line. Brackets/backslashes in the title are escaped
          // so they cannot terminate the link label early; the `title` field
          // of the response stays unescaped.
          const safeTitle = title.replace(/[\[\]\\]/g, "\\$&");
          const sourceNote = `> Source: [${safeTitle}](${location.href})\n\n`;
          markdown = sourceNote + markdown;

          sendResponse({ ok: true, markdown, images, title, url: location.href });
        } catch (e) {
          sendResponse({ ok: false, error: e.message });
        }
        // sendResponse has already been called synchronously above; true is
        // returned following the async-response convention.
        return true;
      }
    });
  })();
} // end guard