Files
memos-chrome-extension/content.js
2026-03-14 21:21:53 +01:00

267 lines
9.5 KiB
JavaScript

// content.js - Extracts page content and converts to Markdown
// Guard against double-injection (MV3 scripting.executeScript can fire multiple times)
if (window.__memosClipperLoaded) { /* skip */ } else {
window.__memosClipperLoaded = true;
(function () {
// ── Turndown-lite: a minimal but solid HTML→Markdown converter ──────────────
/**
 * Converts a DOM subtree to Markdown after stripping site chrome.
 *
 * The element is deep-cloned first, so the removals below never touch the
 * live page. Elements matching the "chrome" selectors (navigation, ads,
 * cookie banners, sidebars, ...) and elements hidden via inline style are
 * removed from the clone before it is handed to `nodeToMd`.
 *
 * @param {Element} element - Root of the content to convert.
 * @returns {string} Markdown with runs of three or more newlines collapsed
 *   to a single blank line and leading/trailing whitespace trimmed.
 */
function htmlToMarkdown(element) {
  const clone = element.cloneNode(true);

  // Remove unwanted elements — comprehensive list covering real-world sites
  const removeSelectors = [
    // Semantic structural chrome
    'script', 'style', 'noscript', 'template',
    'nav', 'header', 'footer', 'aside',
    // ARIA roles for chrome
    '[role="navigation"]', '[role="banner"]', '[role="complementary"]',
    '[role="contentinfo"]', '[role="search"]', '[role="toolbar"]',
    '[role="menubar"]', '[role="menu"]', '[role="dialog"]',
    // Common class/id patterns for site chrome
    '[class*="navbar"]', '[class*="nav-bar"]', '[class*="site-nav"]',
    '[class*="site-header"]', '[class*="site-footer"]',
    '[class*="page-header"]', '[class*="page-footer"]',
    '[id*="navbar"]', '[id*="site-nav"]', '[id*="site-header"]', '[id*="site-footer"]',
    // Ads and tracking. `[class*=" ad-"]` only matches when "ad-" follows
    // another class token, so `[class^="ad-"]` covers the first-token case.
    '[class*="advertisement"]', '[class*="advert"]', '[class*=" ad-"]', '[class^="ad-"]',
    '[class*="google-ad"]', '[class*="sponsored"]',
    '[id*="advertisement"]', '[id*="google_ad"]',
    // Cookie banners, popups, overlays
    '[class*="cookie"]', '[id*="cookie"]',
    '[class*="consent"]', '[id*="consent"]',
    '[class*="gdpr"]', '[id*="gdpr"]',
    '[class*="popup"]', '[class*="modal"]', '[class*="overlay"]',
    '[class*="banner"]', '[id*="banner"]',
    // Social / share widgets
    '[class*="share-bar"]', '[class*="social-bar"]', '[class*="share-buttons"]',
    '[class*="sharing"]',
    // Subscription / newsletter prompts
    '[class*="newsletter"]', '[class*="subscribe"]',
    // Comments sections
    '[id="comments"]', '[class*="comments-section"]', '[id*="disqus"]',
    // Related / recommended articles
    '[class*="related-posts"]', '[class*="recommended"]', '[class*="more-articles"]',
    // Sidebar
    '[class*="sidebar"]', '[id*="sidebar"]',
    // Print/hidden
    '[hidden]', '[aria-hidden="true"]',
  ].join(', ');
  clone.querySelectorAll(removeSelectors).forEach((el) => el.remove());

  // Also remove elements that are visually hidden via inline style. Both the
  // compact ("display:none") and spaced ("display: none") spellings are
  // matched, for `display` and for `visibility` alike.
  const hiddenSelectors = [
    '[style*="display:none"]', '[style*="display: none"]',
    '[style*="visibility:hidden"]', '[style*="visibility: hidden"]',
  ].join(',');
  clone.querySelectorAll(hiddenSelectors).forEach((el) => el.remove());

  return nodeToMd(clone).replace(/\n{3,}/g, "\n\n").trim();
}
/**
 * Returns the length of the longest run of consecutive backticks in `text`
 * (0 when there are none). Used to pick a code fence that cannot be closed
 * early by the content itself.
 */
function longestBacktickRun(text) {
  let longest = 0;
  for (const run of text.match(/`+/g) || []) {
    longest = Math.max(longest, run.length);
  }
  return longest;
}

/**
 * Wraps inline text in an emphasis marker, keeping leading/trailing
 * whitespace outside the markers (`** x **` is not emphasis in CommonMark).
 * Whitespace-only or empty text is returned unchanged, without markers.
 */
function wrapInline(marker, text) {
  const core = text.trim();
  if (!core) return text;
  const lead = text.slice(0, text.length - text.trimStart().length);
  const trail = text.slice(text.trimEnd().length);
  return `${lead}${marker}${core}${marker}${trail}`;
}

/**
 * Renders the element children of a list node as Markdown list items.
 * Each nesting level is indented by four spaces so nested lists sit inside
 * the parent item's content for both "- " and "1. " markers.
 */
function listToMd(listNode, ctx, markerFor) {
  const indent = "    ".repeat(ctx.listDepth);
  const itemCtx = { ...ctx, listDepth: ctx.listDepth + 1 };
  const lines = Array.from(listNode.children).map(
    (item, i) => `${indent}${markerFor(i)} ${nodeToMd(item, itemCtx).trim()}`
  );
  return `\n\n${lines.join("\n")}\n\n`;
}

/**
 * Recursively converts a DOM node to Markdown.
 *
 * Text nodes have whitespace runs collapsed to a single space; nodes that
 * are neither text nor element yield "". Block-level elements are surrounded
 * by blank lines; the caller is expected to collapse the surplus newlines.
 * Relative link and image URLs are resolved against `location.href`.
 *
 * @param {Node} node - Node to convert.
 * @param {{listDepth: number, ordered: boolean, index: number}} [ctx] -
 *   Traversal context; `listDepth` is the current list nesting level.
 * @returns {string} Markdown fragment for the node and its descendants.
 */
function nodeToMd(node, ctx = { listDepth: 0, ordered: false, index: 0 }) {
  if (node.nodeType === Node.TEXT_NODE) {
    return node.textContent.replace(/\s+/g, " ");
  }
  if (node.nodeType !== Node.ELEMENT_NODE) return "";

  const tag = node.tagName.toLowerCase();
  const children = () =>
    Array.from(node.childNodes)
      .map((c) => nodeToMd(c, ctx))
      .join("");

  switch (tag) {
    case "h1":
    case "h2":
    case "h3":
    case "h4":
    case "h5":
    case "h6":
      // The digit in the tag name is the heading level.
      return `\n\n${"#".repeat(Number(tag[1]))} ${children().trim()}\n\n`;
    case "p": return `\n\n${children().trim()}\n\n`;
    // Two trailing spaces are required for a Markdown hard line break.
    case "br": return "  \n";
    case "hr": return "\n\n---\n\n";
    case "strong":
    case "b": return wrapInline("**", children());
    case "em":
    case "i": return wrapInline("_", children());
    case "s":
    case "del": return wrapInline("~~", children());
    case "code": {
      const text = node.textContent;
      if (!text) return "";
      // The delimiter must be longer than any backtick run in the content;
      // pad with a space when the content touches the delimiter.
      const fence = "`".repeat(longestBacktickRun(text) + 1);
      const pad = text.startsWith("`") || text.endsWith("`") ? " " : "";
      return `${fence}${pad}${text}${pad}${fence}`;
    }
    case "pre": {
      const codeEl = node.querySelector("code");
      const lang = codeEl
        ? (codeEl.className.match(/language-(\S+)/) || [])[1] || ""
        : "";
      // Drop one trailing newline so the block does not end in a blank line.
      const text = (codeEl || node).textContent.replace(/\n$/, "");
      // At least three backticks, and more than any run inside the code.
      const fence = "`".repeat(Math.max(3, longestBacktickRun(text) + 1));
      return `\n\n${fence}${lang}\n${text}\n${fence}\n\n`;
    }
    case "blockquote": return `\n\n${children()
      .trim()
      .split("\n")
      .map((l) => `> ${l}`)
      .join("\n")}\n\n`;
    case "a": {
      const href = node.getAttribute("href") || "";
      const text = children().trim();
      // An anchor without href is not a link; an empty href would otherwise
      // resolve to the page URL and produce a self-link.
      if (!href) return text;
      if (!text) return href;
      try {
        const abs = new URL(href, location.href).href;
        return `[${text}](${abs})`;
      } catch {
        return `[${text}](${href})`;
      }
    }
    case "img": {
      const src = node.getAttribute("src") || "";
      // An empty src resolves to the page URL without throwing, so it has to
      // be rejected before URL resolution.
      if (!src) return "";
      const alt = node.getAttribute("alt") || "";
      try {
        const abs = new URL(src, location.href).href;
        return `![${alt}](${abs})`;
      } catch {
        return `![${alt}](${src})`;
      }
    }
    case "ul": return listToMd(node, ctx, () => "-");
    case "ol": return listToMd(node, ctx, (i) => `${i + 1}.`);
    case "li": return children();
    case "table": return convertTable(node);
    case "figure": {
      const img = node.querySelector("img");
      const caption = node.querySelector("figcaption");
      let md = img ? nodeToMd(img, ctx) : children();
      if (caption) md += `\n*${caption.textContent.trim()}*`;
      return `\n\n${md}\n\n`;
    }
    // skip presentational / hidden
    case "svg":
    case "canvas":
    case "video":
    case "audio":
    case "iframe":
    case "button":
    case "input":
    case "select":
    case "textarea":
    case "form":
    case "nav":
    case "header":
    case "footer":
    case "aside":
      return "";
    default:
      return children();
  }
}
/**
 * Converts an HTML table to a pipe-delimited Markdown table.
 *
 * The first `<tr>` is used as the header row and a `---` separator with the
 * same number of columns is emitted after it. Cell text is flattened to a
 * single line (whitespace runs, including newlines, become one space) because
 * a line break inside a cell would split the Markdown row; literal `|`
 * characters are backslash-escaped.
 *
 * @param {Element} table - The `<table>` element to convert.
 * @returns {string} The table surrounded by blank lines, or "" when the
 *   table contains no rows.
 */
function convertTable(table) {
  const rows = Array.from(table.querySelectorAll("tr"));
  if (!rows.length) return "";

  const cellsOf = (tr) => Array.from(tr.querySelectorAll("th,td"));
  const cellText = (cell) =>
    cell.textContent.replace(/\s+/g, " ").trim().replace(/\|/g, "\\|");
  const toRow = (cells) => `| ${cells.join(" | ")} |`;

  const headerCells = cellsOf(rows[0]);
  const lines = [
    toRow(headerCells.map(cellText)),
    toRow(headerCells.map(() => "---")),
    // Body rows are appended only when present, so a header-only table does
    // not end with a dangling empty line.
    ...rows.slice(1).map((tr) => toRow(cellsOf(tr).map(cellText))),
  ];
  return `\n\n${lines.join("\n")}\n\n`;
}
// ── Extract images from HTML ────────────────────────────────────────────────
/**
 * Collects the distinct images found under `element`.
 *
 * An `<img>` is skipped when it has no `src`, when both of its known
 * dimensions are below 32px (icons / tracking pixels), or when its `src` is
 * a `data:image/gif` URI. Remaining sources are resolved against
 * `location.href` (the raw value is kept if resolution throws) and
 * de-duplicated by resolved URL, keeping the first occurrence.
 *
 * @param {Element} element - Root to search for `<img>` elements.
 * @returns {{src: string, alt: string}[]} Unique images in document order.
 */
function extractImages(element) {
  // Map keeps insertion order, so it doubles as the "seen" set and the result.
  const byUrl = new Map();

  for (const node of element.querySelectorAll("img")) {
    const rawSrc = node.getAttribute("src") || "";
    if (rawSrc === "") continue;

    // A dimension of 0 means "unknown" and never counts as tiny.
    const width = node.naturalWidth || node.width || 0;
    const height = node.naturalHeight || node.height || 0;
    const isTiny = width > 0 && width < 32 && height > 0 && height < 32;
    if (isTiny) continue;

    // skip 1x1 gif trackers
    if (rawSrc.startsWith("data:image/gif")) continue;

    let resolved;
    try {
      resolved = new URL(rawSrc, location.href).href;
    } catch {
      resolved = rawSrc;
    }

    if (byUrl.has(resolved)) continue;
    byUrl.set(resolved, { src: resolved, alt: node.getAttribute("alt") || "" });
  }

  return [...byUrl.values()];
}
// ── Message handler ─────────────────────────────────────────────────────────
// Responds to { action: "getContent", mode } requests with the page (or the
// current selection) converted to Markdown, plus the images found in it.
// Reply shape: { ok: true, markdown, images, title, url } on success,
// { ok: false, error } when extraction throws.
chrome.runtime.onMessage.addListener((msg, _sender, sendResponse) => {
  // Any other action is ignored by this listener.
  if (msg.action !== "getContent") return;

  try {
    const title = document.title || location.href;
    const url = location.href;
    let body = "";
    let images = [];

    if (msg.mode === "selection") {
      const selection = window.getSelection();
      const hasRange = Boolean(selection) && selection.rangeCount > 0;
      if (hasRange) {
        // Copy the first selected range into a detached container so it can
        // be converted like any other subtree. With no range, the body stays
        // empty and only the source line is sent.
        const holder = document.createElement("div");
        holder.appendChild(selection.getRangeAt(0).cloneContents());
        body = htmlToMarkdown(holder);
        images = extractImages(holder);
      }
    } else {
      // full page — prefer article/main, fall back to body
      const preferred = ["article", "main", '[role="main"]'].reduce(
        (found, selector) => found || document.querySelector(selector),
        null
      );
      const root = preferred || document.body;
      body = htmlToMarkdown(root);
      images = extractImages(root);
    }

    // Prepend source line
    const markdown = `> Source: [${title}](${url})\n\n${body}`;
    sendResponse({ ok: true, markdown, images, title, url });
  } catch (err) {
    sendResponse({ ok: false, error: err.message });
  }

  // Same return value as before (marked "async" in the original); the
  // response itself has already been sent synchronously above.
  return true;
});
})();
} // end guard