Files
memos-chrome-extension/src/content.js
2026-03-18 19:58:05 +01:00

304 lines
12 KiB
JavaScript

// content.js - Extracts page content and converts to Markdown
// Guard against double-injection (MV3 scripting.executeScript can fire multiple times)
if (window.__memosClipperLoaded) { /* skip */ } else {
window.__memosClipperLoaded = true;
(function () {
// ── Turndown-lite: a minimal but solid HTML→Markdown converter ──────────────
// Selectors for page chrome that should never end up in a clipped note.
const CHROME_SELECTOR = [
  // Semantic structural chrome
  'script', 'style', 'noscript', 'template',
  'nav', 'header', 'footer', 'aside',
  // ARIA roles for chrome
  '[role="navigation"]', '[role="banner"]', '[role="complementary"]',
  '[role="contentinfo"]', '[role="search"]', '[role="toolbar"]',
  '[role="menubar"]', '[role="menu"]', '[role="dialog"]',
  // Common class/id patterns for site chrome
  '[class*="navbar"]', '[class*="nav-bar"]', '[class*="site-nav"]',
  '[class*="site-header"]', '[class*="site-footer"]',
  '[class*="page-header"]', '[class*="page-footer"]',
  '[id*="navbar"]', '[id*="site-nav"]', '[id*="site-header"]', '[id*="site-footer"]',
  // Ads and tracking ("ad-" is matched both as the first class and as a later class)
  '[class*="advertisement"]', '[class*="advert"]', '[class*=" ad-"]', '[class^="ad-"]',
  '[class*="google-ad"]', '[class*="sponsored"]',
  '[id*="advertisement"]', '[id*="google_ad"]',
  // Cookie banners, popups, overlays
  '[class*="cookie"]', '[id*="cookie"]',
  '[class*="consent"]', '[id*="consent"]',
  '[class*="gdpr"]', '[id*="gdpr"]',
  '[class*="popup"]', '[class*="modal"]', '[class*="overlay"]',
  '[class*="banner"]', '[id*="banner"]',
  // Social / share widgets
  '[class*="share-bar"]', '[class*="social-bar"]', '[class*="share-buttons"]',
  '[class*="sharing"]',
  // Subscription / newsletter prompts
  '[class*="newsletter"]', '[class*="subscribe"]',
  // Comments sections
  '[id="comments"]', '[class*="comments-section"]', '[id*="disqus"]',
  // Related / recommended articles
  '[class*="related-posts"]', '[class*="recommended"]', '[class*="more-articles"]',
  // Sidebar
  '[class*="sidebar"]', '[id*="sidebar"]',
  // Print/hidden
  '[hidden]', '[aria-hidden="true"]',
].join(', ');

// Elements hidden through an inline style, with and without a space after the colon.
const INLINE_HIDDEN_SELECTOR = [
  '[style*="display:none"]', '[style*="display: none"]',
  '[style*="visibility:hidden"]', '[style*="visibility: hidden"]',
].join(',');

/**
 * Converts a DOM subtree to Markdown.
 *
 * Works on a deep clone, so the live page is never modified. In full-page
 * mode the clone is first stripped of site chrome (navigation, ads, banners,
 * hidden elements, link-dense blocks). In selection mode only non-content
 * tags (script/style/noscript/template) are dropped, because the user picked
 * the content explicitly.
 *
 * @param {Element} element - Root of the subtree to convert.
 * @param {boolean} [isSelection=false] - True when `element` wraps a user selection.
 * @param {boolean} [stripLinks=false] - True to emit link text without URLs.
 * @returns {string} Markdown with runs of 3+ newlines collapsed and outer whitespace trimmed.
 */
function htmlToMarkdown(element, isSelection = false, stripLinks = false) {
  const clone = element.cloneNode(true);

  if (isSelection) {
    // In selection mode, we still want to remove script/style tags if any
    clone.querySelectorAll('script, style, noscript, template').forEach((el) => el.remove());
  } else {
    clone.querySelectorAll(CHROME_SELECTOR).forEach((el) => el.remove());
    clone.querySelectorAll(INLINE_HIDDEN_SELECTOR).forEach((el) => el.remove());

    // Remove link-dense blocks (navigation menus, ad link lists, etc.).
    // Candidates are collected before anything is removed so the density
    // figures are not skewed by earlier removals.
    const pageTextLength = (clone.textContent || '').trim().length;
    const linkDenseCandidates = Array.from(
      clone.querySelectorAll('nav, aside, header, footer, div, section')
    ).filter((el) => {
      // Skip if inside the primary content container
      if (el.closest('article, main, [role="main"]')) return false;
      const totalText = (el.textContent || '').trim().length;
      if (totalText < 30) return false; // too short to judge
      // A block holding most of the page text is the content itself (e.g. a
      // page that is one long list of links), not chrome around it.
      if (totalText > pageTextLength * 0.5) return false;
      const linkText = Array.from(el.querySelectorAll('a'))
        .reduce((sum, a) => sum + (a.textContent || '').trim().length, 0);
      if (linkText / totalText <= 0.65) return false;
      // Require that the element has little direct (non-link) text of its own
      const directText = Array.from(el.childNodes)
        .filter((n) => n.nodeType === Node.TEXT_NODE)
        .reduce((sum, n) => sum + n.textContent.trim().length, 0);
      return directText < totalText * 0.25;
    });

    // Remove outermost candidates only. `isConnected` cannot be used here:
    // the clone is detached from the document, so it is false for every node
    // in it. `contains` tells us whether the candidate is still in the clone
    // or already went away with a removed ancestor.
    linkDenseCandidates.forEach((el) => {
      if (clone.contains(el)) el.remove();
    });
  }

  return nodeToMd(clone, { listDepth: 0, ordered: false, index: 0 }, stripLinks)
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
// Indentation added per list nesting level. Four spaces reaches the content
// column of both "- " and "1. " parents, so nested lists stay nested.
const LIST_INDENT = "    ";

/** Resolves `raw` against the page URL; returns `raw` unchanged if it cannot be parsed. */
function resolveUrl(raw) {
  try {
    return new URL(raw, location.href).href;
  } catch {
    return raw;
  }
}

/**
 * Builds a backtick delimiter that cannot be closed by the content itself:
 * one backtick longer than the longest run in `text`, at least `minLength`.
 */
function backtickFence(text, minLength) {
  const longestRun = (text.match(/`+/g) || [])
    .reduce((max, run) => Math.max(max, run.length), 0);
  return "`".repeat(Math.max(minLength, longestRun + 1));
}

/**
 * Wraps `inner` in an emphasis marker, keeping leading/trailing whitespace
 * outside the markers (Markdown does not treat "**bold **" as emphasis).
 * Empty or whitespace-only content is returned unwrapped.
 */
function wrapInline(inner, marker) {
  const core = inner.trim();
  if (!core) return inner;
  const lead = inner.slice(0, inner.length - inner.trimStart().length);
  const trail = inner.slice(inner.trimEnd().length);
  return `${lead}${marker}${core}${marker}${trail}`;
}

/**
 * Recursively converts a DOM node to Markdown.
 *
 * @param {Node} node - Text or element node; other node types yield "".
 * @param {{listDepth: number, ordered: boolean, index: number}} [ctx] -
 *   Rendering context; `listDepth` controls the indentation of list items.
 * @param {boolean} [stripLinks=false] - True to emit anchor text without URLs.
 * @returns {string} Markdown fragment. Block elements are padded with blank
 *   lines; the caller is expected to collapse and trim them.
 */
function nodeToMd(node, ctx = { listDepth: 0, ordered: false, index: 0 }, stripLinks = false) {
  if (node.nodeType === Node.TEXT_NODE) {
    return node.textContent.replace(/\s+/g, " ");
  }
  if (node.nodeType !== Node.ELEMENT_NODE) return "";

  const tag = node.tagName.toLowerCase();
  const children = () =>
    Array.from(node.childNodes)
      .map((c) => nodeToMd(c, ctx, stripLinks))
      .join("");
  // Renders each element child as one list item at the current depth.
  const list = (markerFor) => {
    const indent = LIST_INDENT.repeat(ctx.listDepth);
    const itemCtx = { ...ctx, listDepth: ctx.listDepth + 1 };
    const lines = Array.from(node.children).map(
      (item, i) => `${indent}${markerFor(i)} ${nodeToMd(item, itemCtx, stripLinks).trim()}`
    );
    return `\n\n${lines.join("\n")}\n\n`;
  };

  switch (tag) {
    case "h1":
    case "h2":
    case "h3":
    case "h4":
    case "h5":
    case "h6":
      return `\n\n${"#".repeat(Number(tag[1]))} ${children().trim()}\n\n`;
    case "p": return `\n\n${children().trim()}\n\n`;
    // Two trailing spaces are required for a Markdown hard line break.
    case "br": return "  \n";
    case "hr": return "\n\n---\n\n";
    case "strong":
    case "b": return wrapInline(children(), "**");
    case "em":
    case "i": return wrapInline(children(), "_");
    case "s":
    case "del": return wrapInline(children(), "~~");
    case "code": {
      const text = node.textContent;
      const fence = backtickFence(text, 1);
      // Content with backticks needs a longer delimiter plus padding spaces.
      return fence.length === 1 ? `\`${text}\`` : `${fence} ${text} ${fence}`;
    }
    case "pre": {
      const codeEl = node.querySelector("code");
      const lang = codeEl
        ? (codeEl.className.match(/language-(\S+)/) || [])[1] || ""
        : "";
      // Drop one trailing newline so the block does not end with an empty line.
      const text = (codeEl || node).textContent.replace(/\n$/, "");
      const fence = backtickFence(text, 3);
      return `\n\n${fence}${lang}\n${text}\n${fence}\n\n`;
    }
    case "blockquote": return `\n\n${children()
      .trim()
      .split("\n")
      .map((l) => `> ${l}`)
      .join("\n")}\n\n`;
    case "a": {
      const text = children().trim();
      if (stripLinks) return text; // just the anchor text, no URL
      const href = node.getAttribute("href") || "";
      if (!text) return href;
      // An anchor without href (e.g. a named anchor) is not a link; resolving
      // "" would silently point it at the current page.
      if (!href) return text;
      return `[${text}](${resolveUrl(href)})`;
    }
    case "img": {
      const src = node.getAttribute("src") || "";
      // new URL("", base) does not throw, so an empty src must be rejected
      // explicitly or the image would link to the page itself.
      if (!src) return "";
      const alt = node.getAttribute("alt") || "";
      return `![${alt}](${resolveUrl(src)})`;
    }
    case "ul": return list(() => "-");
    case "ol": return list((i) => `${i + 1}.`);
    case "li": return children();
    case "table": return convertTable(node, stripLinks);
    case "figcaption": {
      const caption = node.textContent.trim();
      return caption ? `\n*${caption}*` : "";
    }
    case "figure": {
      const img = node.querySelector("img");
      // Without an image the children are rendered as-is; the figcaption
      // case above formats the caption, so it must not be appended again.
      if (!img) return `\n\n${children()}\n\n`;
      const caption = node.querySelector("figcaption");
      const captionMd = caption ? nodeToMd(caption, ctx, stripLinks) : "";
      return `\n\n${nodeToMd(img, ctx, stripLinks)}${captionMd}\n\n`;
    }
    // skip presentational / hidden
    case "svg":
    case "canvas":
    case "video":
    case "audio":
    case "iframe":
    case "button":
    case "input":
    case "select":
    case "textarea":
    case "form":
    case "nav":
    case "header":
    case "footer":
    case "aside":
      return "";
    default:
      return children();
  }
}
/**
 * Converts an HTML table to a Markdown pipe table.
 *
 * The first row that has cells becomes the header row. Only rows and cells
 * that belong to `table` itself are used; a table nested inside a cell is
 * rendered as part of that cell's content instead of being flattened into
 * the outer table.
 *
 * @param {Element} table - The <table> element.
 * @param {boolean} [stripLinks=false] - True to emit anchor text without URLs.
 * @returns {string} Markdown table padded with blank lines, or "" when the
 *   table has no rows with cells.
 */
function convertTable(table, stripLinks = false) {
  // querySelectorAll also returns rows of nested tables; keep our own rows only.
  const ownRows = Array.from(table.querySelectorAll("tr"))
    .filter((tr) => tr.closest("table") === table);
  // Direct th/td children only, for the same reason. Rows without cells
  // would render as "|  |" and are skipped.
  const grid = ownRows
    .map((tr) => Array.from(tr.children).filter((c) => /^t[hd]$/i.test(c.tagName)))
    .filter((cells) => cells.length > 0);
  if (!grid.length) return "";

  const renderCell = (cell) =>
    nodeToMd(cell, { listDepth: 0, ordered: false, index: 0 }, stripLinks)
      // A pipe-table row must stay on one line, so block-level content
      // (paragraphs, <br>, lists) is flattened to single spaces.
      .replace(/\s+/g, " ")
      .trim()
      .replace(/\|/g, "\\|");
  const toRow = (cells) => `| ${cells.map(renderCell).join(" | ")} |`;

  const [headerCells, ...bodyRows] = grid;
  const separator = `| ${headerCells.map(() => "---").join(" | ")} |`;
  const lines = [toRow(headerCells), separator, ...bodyRows.map(toRow)];
  return `\n\n${lines.join("\n")}\n\n`;
}
// ── Extract images from HTML ────────────────────────────────────────────────
/**
 * Collects the images found under `element`.
 *
 * Skipped: images without a `src`, images whose width and height are both
 * known and below 32px, and `data:image/gif` sources. Remaining sources are
 * resolved against the page URL and de-duplicated; the first occurrence of a
 * URL wins.
 *
 * @param {Element} element - Root to search for <img> elements.
 * @returns {Array<{src: string, alt: string}>} Images in document order.
 */
function extractImages(element) {
  const collected = new Map();

  for (const node of element.querySelectorAll("img")) {
    const rawSrc = node.getAttribute("src") || "";
    if (!rawSrc) continue;

    // skip tiny icons / tracking pixels by rendered size
    const width = node.naturalWidth || node.width || 0;
    const height = node.naturalHeight || node.height || 0;
    const isTiny = width > 0 && width < 32 && height > 0 && height < 32;
    if (isTiny) continue;

    // skip 1x1 gif trackers
    if (rawSrc.startsWith("data:image/gif")) continue;

    // Best effort: an unparsable source is kept as written.
    let resolved = rawSrc;
    try {
      resolved = new URL(rawSrc, location.href).href;
    } catch {}

    if (collected.has(resolved)) continue;
    collected.set(resolved, { src: resolved, alt: node.getAttribute("alt") || "" });
  }

  return [...collected.values()];
}
// ── Message handler ─────────────────────────────────────────────────────────
// Answers "getContent" requests with the page (or the current selection)
// converted to Markdown, plus the images found in the same subtree.
// Response shape: { ok: true, markdown, images, title, url } on success,
// { ok: false, error } when the conversion throws.
chrome.runtime.onMessage.addListener((msg, _sender, sendResponse) => {
  // Any other action is left unanswered.
  if (msg.action !== "getContent") return;

  try {
    const title = document.title || location.href;
    const stripLinks = Boolean(msg.stripLinks);
    let body = "";
    let images = [];

    if (msg.mode !== "selection") {
      // full page — prefer article/main, fall back to body
      const root =
        document.querySelector("article") ||
        document.querySelector("main") ||
        document.querySelector('[role="main"]') ||
        document.body;
      body = htmlToMarkdown(root, false, stripLinks);
      images = extractImages(root);
    } else {
      const selection = window.getSelection();
      // With no selection, body stays empty and only the source line is sent.
      if (selection && selection.rangeCount > 0) {
        // Wrap the copied range in a detached <div> so it can be converted
        // like any other element.
        const holder = document.createElement("div");
        holder.appendChild(selection.getRangeAt(0).cloneContents());
        body = htmlToMarkdown(holder, true, stripLinks);
        images = extractImages(holder);
      }
    }

    // Prepend source line
    const markdown = `> Source: [${title}](${location.href})\n\n${body}`;
    sendResponse({ ok: true, markdown, images, title, url: location.href });
  } catch (e) {
    sendResponse({ ok: false, error: e.message });
  }

  // Returned for every handled request, although the response above has
  // already been sent synchronously at this point.
  return true;
});
})();
} // end guard