// content.js - Extracts page content and converts to Markdown

// Guard against double-injection (MV3 scripting.executeScript can fire multiple times)
if (window.__memosClipperLoaded) { /* skip */ } else {
window.__memosClipperLoaded = true;

(function () {

// ── Turndown-lite: a minimal but solid HTML→Markdown converter ──────────────
/**
 * Convert a DOM subtree to Markdown.
 *
 * The conversion runs on a deep clone of `element`, so removals performed
 * here never touch the node that was passed in.
 *
 * @param {Element} element - Root of the subtree to convert.
 * @param {boolean} [isSelection=false] - When true the content was explicitly
 *   picked by the user, so only script/style-like tags are stripped and the
 *   "site chrome" heuristics are skipped.
 * @returns {string} Markdown with runs of 3+ newlines collapsed to a single
 *   blank line and surrounding whitespace trimmed.
 */
function htmlToMarkdown(element, isSelection = false) {
  const clone = element.cloneNode(true);

  if (isSelection) {
    // In selection mode, we still want to remove script/style tags if any
    clone.querySelectorAll('script, style, noscript, template').forEach((el) => el.remove());
  } else {
    // Remove unwanted elements — comprehensive list covering real-world sites
    const removeSelectors = [
      // Semantic structural chrome
      'script', 'style', 'noscript', 'template',
      'nav', 'header', 'footer', 'aside',
      // ARIA roles for chrome
      '[role="navigation"]', '[role="banner"]', '[role="complementary"]',
      '[role="contentinfo"]', '[role="search"]', '[role="toolbar"]',
      '[role="menubar"]', '[role="menu"]', '[role="dialog"]',
      // Common class/id patterns for site chrome
      '[class*="navbar"]', '[class*="nav-bar"]', '[class*="site-nav"]',
      '[class*="site-header"]', '[class*="site-footer"]',
      '[class*="page-header"]', '[class*="page-footer"]',
      '[id*="navbar"]', '[id*="site-nav"]', '[id*="site-header"]', '[id*="site-footer"]',
      // Ads and tracking
      '[class*="advertisement"]', '[class*="advert"]', '[class*=" ad-"]',
      '[class*="google-ad"]', '[class*="sponsored"]',
      '[id*="advertisement"]', '[id*="google_ad"]',
      // Cookie banners, popups, overlays
      '[class*="cookie"]', '[id*="cookie"]',
      '[class*="consent"]', '[id*="consent"]',
      '[class*="gdpr"]', '[id*="gdpr"]',
      '[class*="popup"]', '[class*="modal"]', '[class*="overlay"]',
      '[class*="banner"]', '[id*="banner"]',
      // Social / share widgets
      '[class*="share-bar"]', '[class*="social-bar"]', '[class*="share-buttons"]',
      '[class*="sharing"]',
      // Subscription / newsletter prompts
      '[class*="newsletter"]', '[class*="subscribe"]',
      // Comments sections
      '[id="comments"]', '[class*="comments-section"]', '[id*="disqus"]',
      // Related / recommended articles
      '[class*="related-posts"]', '[class*="recommended"]', '[class*="more-articles"]',
      // Sidebar
      '[class*="sidebar"]', '[id*="sidebar"]',
      // Print/hidden
      '[hidden]', '[aria-hidden="true"]',
    ].join(', ');

    clone.querySelectorAll(removeSelectors).forEach((el) => el.remove());

    // Also remove elements that are visually hidden via inline style.
    // Both "prop:value" and "prop: value" spellings are matched for each property.
    const inlineHiddenSelectors = [
      '[style*="display:none"]', '[style*="display: none"]',
      '[style*="visibility:hidden"]', '[style*="visibility: hidden"]',
    ].join(',');

    clone.querySelectorAll(inlineHiddenSelectors).forEach((el) => el.remove());
  }

  return nodeToMd(clone).replace(/\n{3,}/g, "\n\n").trim();
}

/**
 * Recursively render a DOM node as Markdown.
 *
 * Text nodes have whitespace runs collapsed to a single space; nodes that are
 * neither text nor elements render as "". Block-level output is padded with
 * blank lines and the caller is expected to collapse the excess.
 *
 * @param {Node} node - Node to render.
 * @param {{listDepth: number, ordered: boolean, index: number}} [ctx] -
 *   Rendering context; only `listDepth` (current list nesting level) is read.
 * @returns {string} Markdown fragment for `node`.
 */
function nodeToMd(node, ctx = { listDepth: 0, ordered: false, index: 0 }) {
  if (node.nodeType === Node.TEXT_NODE) {
    return node.textContent.replace(/\s+/g, " ");
  }
  if (node.nodeType !== Node.ELEMENT_NODE) return "";

  // Four spaces per level is enough to nest under both "- " and "1. " markers.
  const LIST_INDENT = "    ";
  const FENCE = "```";

  const tag = node.tagName.toLowerCase();
  const renderNodes = (nodes) =>
    Array.from(nodes)
      .map((c) => nodeToMd(c, ctx))
      .join("");
  const children = () => renderNodes(node.childNodes);

  // Wrap inline content in an emphasis marker. Edge whitespace is moved
  // outside the markers (e.g. "**foo **" does not render as bold) and empty
  // content is emitted as-is (a bare "****" would render as a rule).
  const wrapInline = (marker) => {
    const inner = children();
    const core = inner.trim();
    if (!core) return inner;
    const lead = inner.slice(0, inner.length - inner.trimStart().length);
    const trail = inner.slice(inner.trimEnd().length);
    return `${lead}${marker}${core}${marker}${trail}`;
  };

  // Render each child element of a list as one item at the current depth.
  const renderList = (markerFor) => {
    const indent = LIST_INDENT.repeat(ctx.listDepth);
    const itemCtx = { ...ctx, listDepth: ctx.listDepth + 1 };
    const lines = Array.from(node.children)
      .map((li, i) => `${indent}${markerFor(i)} ${nodeToMd(li, itemCtx).trim()}`)
      .join("\n");
    return `\n\n${lines}\n\n`;
  };

  switch (tag) {
    case "h1":
    case "h2":
    case "h3":
    case "h4":
    case "h5":
    case "h6": {
      const level = Number(tag[1]);
      return `\n\n${"#".repeat(level)} ${children().trim()}\n\n`;
    }
    case "p": return `\n\n${children().trim()}\n\n`;
    // Two trailing spaces are required for a Markdown hard line break.
    case "br": return "  \n";
    case "hr": return "\n\n---\n\n";

    case "strong":
    case "b": return wrapInline("**");
    case "em":
    case "i": return wrapInline("_");
    case "s":
    case "del": return wrapInline("~~");

    case "code": {
      const text = node.textContent;
      // The delimiter must be longer than any backtick run inside the code.
      const longestRun = (text.match(/`+/g) || [])
        .reduce((max, run) => Math.max(max, run.length), 0);
      const delim = "`".repeat(longestRun + 1);
      // Pad when the code itself starts/ends with a backtick so it does not
      // merge with the delimiter.
      const pad = text.startsWith("`") || text.endsWith("`") ? " " : "";
      return `${delim}${pad}${text}${pad}${delim}`;
    }
    case "pre": {
      const codeEl = node.querySelector("code");
      const lang = codeEl
        ? (codeEl.className.match(/language-(\S+)/) || [])[1] || ""
        : "";
      // Drop one trailing newline so the block does not end in an empty line.
      const text = (codeEl || node).textContent.replace(/\n$/, "");
      return `\n\n${FENCE}${lang}\n${text}\n${FENCE}\n\n`;
    }
    case "blockquote": {
      const quoted = children()
        .trim()
        .split("\n")
        .map((l) => `> ${l}`)
        .join("\n");
      return `\n\n${quoted}\n\n`;
    }

    case "a": {
      const href = node.getAttribute("href") || "";
      const text = children().trim();
      if (!text) return href;
      // An anchor without href (e.g. a named anchor) is just text; resolving
      // "" against the page URL would fabricate a link to the current page.
      if (!href) return text;
      try {
        const abs = new URL(href, location.href).href;
        return `[${text}](${abs})`;
      } catch {
        // Unparseable URL: keep the raw href rather than dropping the link.
        return `[${text}](${href})`;
      }
    }

    case "img": {
      const src = node.getAttribute("src") || "";
      const alt = node.getAttribute("alt") || "";
      if (!src) return "";
      try {
        const abs = new URL(src, location.href).href;
        return `![${alt}](${abs})`;
      } catch {
        // Unparseable URL: fall back to the raw src attribute.
        return `![${alt}](${src})`;
      }
    }

    case "ul": return renderList(() => "-");
    case "ol": return renderList((i) => `${i + 1}.`);
    case "li": return children();

    case "table": return convertTable(node);

    case "figure": {
      const img = node.querySelector("img");
      const caption = node.querySelector("figcaption");
      // Without an image, render the children but leave out a direct-child
      // figcaption: it is appended below and would otherwise appear twice.
      let md = img
        ? nodeToMd(img, ctx)
        : renderNodes(Array.from(node.childNodes).filter((c) => c !== caption));
      const captionText = caption ? caption.textContent.trim() : "";
      if (captionText) md += `\n*${captionText}*`;
      return `\n\n${md}\n\n`;
    }

    // skip presentational / hidden
    case "svg":
    case "canvas":
    case "video":
    case "audio":
    case "iframe":
    case "button":
    case "input":
    case "select":
    case "textarea":
    case "form":
    case "nav":
    case "header":
    case "footer":
    case "aside":
      return "";

    default:
      return children();
  }
}

/**
 * Render an HTML table as a Markdown pipe table.
 *
 * The first `tr` becomes the header row. Cell text has whitespace runs
 * (including newlines, which would break a row) collapsed to one space and
 * literal `|` characters escaped. Rows shorter than the widest row are padded
 * with empty cells so every line has the same number of columns.
 *
 * @param {Element} table - The table element to convert.
 * @returns {string} Markdown table padded with blank lines, or "" when the
 *   table has no rows or no cells.
 */
function convertTable(table) {
  const rows = Array.from(table.querySelectorAll("tr"));
  if (!rows.length) return "";

  const cellTexts = rows.map((tr) =>
    Array.from(tr.querySelectorAll("th,td")).map((cell) =>
      cell.textContent.replace(/\s+/g, " ").trim().replace(/\|/g, "\\|")
    )
  );

  const width = cellTexts.reduce((max, cells) => Math.max(max, cells.length), 0);
  if (width === 0) return "";

  const toRow = (cells) => {
    const padded = Array.from({ length: width }, (_, i) => (i < cells.length ? cells[i] : ""));
    return `| ${padded.join(" | ")} |`;
  };

  const [headerCells, ...bodyCells] = cellTexts;
  const lines = [
    toRow(headerCells),
    toRow(Array.from({ length: width }, () => "---")),
    ...bodyCells.map(toRow),
  ];
  return `\n\n${lines.join("\n")}\n\n`;
}

// ── Extract images from HTML ────────────────────────────────────────────────
/**
 * Collect the distinct, meaningful images under `element`.
 *
 * Skips images with no `src`, GIF data URIs (typically tracking pixels), and
 * images whose width and height are both known and both below 32px (icons).
 * Each remaining `src` is resolved against the current page URL and the list
 * is de-duplicated on that resolved URL, keeping the first occurrence.
 *
 * @param {Element} element - Root of the subtree to scan.
 * @returns {{src: string, alt: string}[]} Images in document order.
 */
function extractImages(element) {
  const MIN_DIMENSION = 32;
  const seen = new Set();
  const results = [];

  for (const img of Array.from(element.querySelectorAll("img"))) {
    const src = img.getAttribute("src") || "";
    if (!src) continue;

    // skip tiny icons / tracking pixels by rendered size; a size of 0 means
    // "unknown", so such images are kept
    const w = img.naturalWidth || img.width || 0;
    const h = img.naturalHeight || img.height || 0;
    if (w > 0 && w < MIN_DIMENSION && h > 0 && h < MIN_DIMENSION) continue;

    // skip 1x1 gif trackers
    if (src.startsWith("data:image/gif")) continue;

    let abs = src;
    try {
      abs = new URL(src, location.href).href;
    } catch {
      // Best effort: an unparseable src is reported as written.
    }

    if (seen.has(abs)) continue;
    seen.add(abs);
    results.push({ src: abs, alt: img.getAttribute("alt") || "" });
  }

  return results;
}

// ── Message handler ─────────────────────────────────────────────────────────
// Responds to { action: "getContent", mode?: "selection" } messages with
// { ok: true, markdown, images, title, url } on success or
// { ok: false, error } on failure. Messages with any other action are ignored.
chrome.runtime.onMessage.addListener((msg, _sender, sendResponse) => {
  if (msg && msg.action === "getContent") {
    try {
      let markdown = "";
      let images = [];
      const title = document.title || location.href;

      if (msg.mode === "selection") {
        const sel = window.getSelection();
        // With no selection range, markdown/images keep their empty defaults.
        if (sel && sel.rangeCount > 0) {
          // Copy the selected nodes into a detached container so they can be
          // converted without touching the page.
          const frag = sel.getRangeAt(0).cloneContents();
          const div = document.createElement("div");
          div.appendChild(frag);
          markdown = htmlToMarkdown(div, true);
          images = extractImages(div);
        }
      } else {
        // full page — prefer article/main, fall back to body
        const root =
          document.querySelector("article") ||
          document.querySelector("main") ||
          document.querySelector('[role="main"]') ||
          document.body;
        markdown = htmlToMarkdown(root);
        images = extractImages(root);
      }

      // Prepend source line. Square brackets in the title are escaped so they
      // cannot terminate the link text early; the `title` field of the
      // response stays unescaped.
      const linkTitle = title.replace(/[\[\]]/g, "\\$&");
      const sourceNote = `> Source: [${linkTitle}](${location.href})\n\n`;
      markdown = sourceNote + markdown;

      sendResponse({ ok: true, markdown, images, title, url: location.href });
    } catch (e) {
      // Non-Error throwables have no `.message`; stringify them instead.
      sendResponse({ ok: false, error: e instanceof Error ? e.message : String(e) });
    }
    // sendResponse has already been called synchronously on both paths above;
    // the original `true` return value is kept unchanged.
    return true;
  }
});
})();
} // end guard