Add Memos Clipper extension with Markdown content extraction and browser integration.
This commit is contained in:
272
src/content.js
Normal file
272
src/content.js
Normal file
@@ -0,0 +1,272 @@
|
||||
// content.js - Extracts page content and converts to Markdown
|
||||
// Guard against double-injection (MV3 scripting.executeScript can fire multiple times)
|
||||
if (window.__memosClipperLoaded) { /* skip */ } else {
|
||||
window.__memosClipperLoaded = true;
|
||||
|
||||
(function () {
|
||||
// ── Turndown-lite: a minimal but solid HTML→Markdown converter ──────────────
|
||||
/**
 * Converts a DOM subtree into Markdown.
 *
 * The element is deep-cloned first, so the live page is never modified.
 * In full-page mode, site chrome (navigation, ads, cookie banners, sidebars,
 * hidden elements, ...) is stripped from the clone before conversion. In
 * selection mode only non-content tags (script/style/noscript/template) are
 * stripped, because the user explicitly picked that content.
 *
 * @param {Element} element - Root of the subtree to convert.
 * @param {boolean} [isSelection=false] - True when `element` wraps a user selection.
 * @returns {string} Trimmed Markdown with runs of 3+ newlines collapsed to one blank line.
 */
function htmlToMarkdown(element, isSelection = false) {
  const clone = element.cloneNode(true);

  if (!isSelection) {
    const removeSelectors = [
      // Semantic structural chrome
      'script', 'style', 'noscript', 'template',
      'nav', 'header', 'footer', 'aside',
      // ARIA roles for chrome
      '[role="navigation"]', '[role="banner"]', '[role="complementary"]',
      '[role="contentinfo"]', '[role="search"]', '[role="toolbar"]',
      '[role="menubar"]', '[role="menu"]', '[role="dialog"]',
      // Common class/id patterns for site chrome
      '[class*="navbar"]', '[class*="nav-bar"]', '[class*="site-nav"]',
      '[class*="site-header"]', '[class*="site-footer"]',
      '[class*="page-header"]', '[class*="page-footer"]',
      '[id*="navbar"]', '[id*="site-nav"]', '[id*="site-header"]', '[id*="site-footer"]',
      // Ads and tracking. `[class*=" ad-"]` only matches an "ad-" token that
      // follows another class, so `[class^="ad-"]` covers the first token too.
      '[class*="advertisement"]', '[class*="advert"]', '[class*=" ad-"]', '[class^="ad-"]',
      '[class*="google-ad"]', '[class*="sponsored"]',
      '[id*="advertisement"]', '[id*="google_ad"]',
      // Cookie banners, popups, overlays
      '[class*="cookie"]', '[id*="cookie"]',
      '[class*="consent"]', '[id*="consent"]',
      '[class*="gdpr"]', '[id*="gdpr"]',
      '[class*="popup"]', '[class*="modal"]', '[class*="overlay"]',
      '[class*="banner"]', '[id*="banner"]',
      // Social / share widgets
      '[class*="share-bar"]', '[class*="social-bar"]', '[class*="share-buttons"]',
      '[class*="sharing"]',
      // Subscription / newsletter prompts
      '[class*="newsletter"]', '[class*="subscribe"]',
      // Comments sections
      '[id="comments"]', '[class*="comments-section"]', '[id*="disqus"]',
      // Related / recommended articles
      '[class*="related-posts"]', '[class*="recommended"]', '[class*="more-articles"]',
      // Sidebar
      '[class*="sidebar"]', '[id*="sidebar"]',
      // Print/hidden
      '[hidden]', '[aria-hidden="true"]',
    ].join(', ');

    clone.querySelectorAll(removeSelectors).forEach((el) => el.remove());

    // Also remove elements hidden via inline style. Both the compact and the
    // spaced spelling are matched for each property.
    const inlineHiddenSelectors = [
      '[style*="display:none"]',
      '[style*="display: none"]',
      '[style*="visibility:hidden"]',
      '[style*="visibility: hidden"]',
    ].join(',');
    clone.querySelectorAll(inlineHiddenSelectors).forEach((el) => el.remove());
  } else {
    // In selection mode, still drop tags that never hold readable content.
    clone.querySelectorAll('script, style, noscript, template').forEach((el) => el.remove());
  }

  // NOTE: the newline collapse runs over the whole output, so consecutive
  // blank lines inside fenced code blocks are collapsed as well.
  return nodeToMd(clone).replace(/\n{3,}/g, "\n\n").trim();
}
|
||||
|
||||
/**
 * Recursively converts a DOM node (and its descendants) into Markdown.
 *
 * Text nodes have whitespace runs collapsed to a single space; node types
 * other than text and element produce an empty string. Unknown elements are
 * transparent: their children are converted and concatenated.
 *
 * @param {Node} node - DOM node to convert.
 * @param {{listDepth: number, ordered: boolean, index: number}} [ctx]
 *   Conversion context. `listDepth` is the nesting level used to indent list
 *   items; `ordered` and `index` are carried along but not read here.
 * @returns {string} Markdown for the node.
 */
function nodeToMd(node, ctx = { listDepth: 0, ordered: false, index: 0 }) {
  if (node.nodeType === Node.TEXT_NODE) {
    return node.textContent.replace(/\s+/g, " ");
  }
  if (node.nodeType !== Node.ELEMENT_NODE) return "";

  const tag = node.tagName.toLowerCase();
  const children = () =>
    Array.from(node.childNodes)
      .map((c) => nodeToMd(c, ctx))
      .join("");

  // Four spaces per level nest correctly under both "- " and "1. " markers;
  // a single space would leave nested items as siblings of their parent.
  const listLines = (markerFor) => {
    const indent = "    ".repeat(ctx.listDepth);
    const itemCtx = { ...ctx, listDepth: ctx.listDepth + 1 };
    const lines = Array.from(node.children)
      .map((li, i) => `${indent}${markerFor(i)} ${nodeToMd(li, itemCtx).trim()}`)
      .join("\n");
    return `\n\n${lines}\n\n`;
  };

  switch (tag) {
    case "h1":
    case "h2":
    case "h3":
    case "h4":
    case "h5":
    case "h6":
      return `\n\n${"#".repeat(Number(tag[1]))} ${children().trim()}\n\n`;
    case "p": return `\n\n${children().trim()}\n\n`;
    // A Markdown hard line break needs two trailing spaces before the newline.
    case "br": return "  \n";
    case "hr": return "\n\n---\n\n";

    case "strong":
    case "b": return wrapInline("**", children());
    case "em":
    case "i": return wrapInline("_", children());
    case "s":
    case "del": return wrapInline("~~", children());

    case "code": {
      const text = node.textContent;
      const fence = backtickFence(text, 1);
      // A code span that starts or ends with a backtick needs space padding
      // so the delimiter run stays distinguishable from the content.
      const pad = /^`|`$/.test(text) ? " " : "";
      return `${fence}${pad}${text}${pad}${fence}`;
    }
    case "pre": {
      const codeEl = node.querySelector("code");
      const lang = codeEl
        ? (codeEl.className.match(/language-(\S+)/) || [])[1] || ""
        : "";
      const text = (codeEl || node).textContent;
      // Use a fence longer than any backtick run in the code so that code
      // containing ``` cannot terminate the block early.
      const fence = backtickFence(text, 3);
      return `\n\n${fence}${lang}\n${text}\n${fence}\n\n`;
    }
    case "blockquote":
      return `\n\n${children()
        .trim()
        .split("\n")
        .map((l) => `> ${l}`)
        .join("\n")}\n\n`;

    case "a": {
      const href = node.getAttribute("href") || "";
      const text = children().trim();
      if (!text) return href;
      // An anchor without href would otherwise resolve to the current page
      // URL; emit its text only.
      if (!href) return text;
      return `[${text}](${resolveUrl(href)})`;
    }

    case "img": {
      const src = node.getAttribute("src") || "";
      if (!src) return "";
      const alt = (node.getAttribute("alt") || "")
        .replace(/\s+/g, " ")
        .trim()
        .replace(/[\[\]]/g, "\\$&");
      return `![${alt}](${resolveUrl(src)})`;
    }

    case "ul": return listLines(() => "-");
    case "ol": return listLines((i) => `${i + 1}.`);
    case "li": return children();

    case "table": return convertTable(node);

    case "figure": {
      const img = node.querySelector("img");
      const caption = node.querySelector("figcaption");
      let md = img ? nodeToMd(img, ctx) : children();
      if (caption) md += `\n*${caption.textContent.trim()}*`;
      return `\n\n${md}\n\n`;
    }

    // skip presentational / interactive / chrome elements
    case "svg":
    case "canvas":
    case "video":
    case "audio":
    case "iframe":
    case "button":
    case "input":
    case "select":
    case "textarea":
    case "form":
    case "nav":
    case "header":
    case "footer":
    case "aside":
      return "";

    default:
      return children();
  }
}

/**
 * Resolves a possibly relative URL against the current page location.
 * Falls back to the raw value when it cannot be parsed.
 */
function resolveUrl(raw) {
  try {
    return new URL(raw, location.href).href;
  } catch {
    return raw;
  }
}

/**
 * Builds a backtick delimiter that is at least `minLength` long and strictly
 * longer than the longest backtick run inside `text`.
 */
function backtickFence(text, minLength) {
  const runs = text.match(/`+/g) || [];
  const longest = runs.reduce((max, run) => Math.max(max, run.length), 0);
  return "`".repeat(Math.max(minLength, longest + 1));
}

/**
 * Wraps inline content in an emphasis marker. Surrounding whitespace is kept
 * outside the markers (markers adjacent to whitespace do not open or close
 * emphasis), and empty content is returned unchanged so that no bare marker
 * runs such as "****" or "~~~~" are emitted.
 */
function wrapInline(marker, inner) {
  const core = inner.trim();
  if (!core) return inner;
  const lead = inner.slice(0, inner.length - inner.trimStart().length);
  const trail = inner.slice(inner.trimEnd().length);
  return `${lead}${marker}${core}${marker}${trail}`;
}
|
||||
|
||||
/**
 * Converts an HTML table into a pipe-delimited Markdown table.
 *
 * The first row is used as the header. Cell content is taken from
 * `textContent`, so inline formatting inside cells is not preserved.
 *
 * @param {Element} table - The `<table>` element to convert.
 * @returns {string} Markdown table surrounded by blank lines, or "" when the
 *   table has no rows.
 */
function convertTable(table) {
  const rows = Array.from(table.querySelectorAll("tr"));
  if (!rows.length) return "";

  // A Markdown table row must stay on one line: collapse all whitespace
  // (including newlines) inside a cell, then escape literal pipes.
  const cellText = (cell) =>
    cell.textContent.replace(/\s+/g, " ").trim().replace(/\|/g, "\\|");
  const grid = rows.map((tr) => Array.from(tr.querySelectorAll("th,td")).map(cellText));

  // Cells beyond the header's width would not belong to any column, so widen
  // the header to the widest row (e.g. when the header uses colspan).
  const columnCount = grid.reduce((max, cells) => Math.max(max, cells.length), 0);
  const headerCells = grid[0].concat(Array(columnCount - grid[0].length).fill(""));

  const toRow = (cells) => `| ${cells.join(" | ")} |`;
  const lines = [
    toRow(headerCells),
    toRow(headerCells.map(() => "---")),
    ...grid.slice(1).map(toRow),
  ];
  return `\n\n${lines.join("\n")}\n\n`;
}
|
||||
|
||||
// ── Extract images from HTML ────────────────────────────────────────────────
|
||||
/**
 * Collects the images found under `element`.
 *
 * Images are skipped when they have no `src`, when both dimensions are known
 * and smaller than 32px (icons / tracking pixels), or when the source is a
 * `data:image/gif` URI. Remaining sources are resolved against the page
 * location and de-duplicated by resolved URL, keeping the first occurrence.
 *
 * @param {Element} element - Root element to search for `<img>` descendants.
 * @returns {Array<{src: string, alt: string}>} Unique images in document order.
 */
function extractImages(element) {
  const collected = [];
  const emittedUrls = new Set();

  for (const node of Array.from(element.querySelectorAll("img"))) {
    const rawSrc = node.getAttribute("src") || "";
    if (!rawSrc) continue;

    // Prefer the intrinsic size; fall back to the width/height properties.
    const width = node.naturalWidth || node.width || 0;
    const height = node.naturalHeight || node.height || 0;
    const isTiny = width > 0 && width < 32 && height > 0 && height < 32;
    if (isTiny) continue;

    // skip 1x1 gif trackers
    if (rawSrc.startsWith("data:image/gif")) continue;

    let resolved = rawSrc;
    try {
      resolved = new URL(rawSrc, location.href).href;
    } catch {}

    if (emittedUrls.has(resolved)) continue;
    emittedUrls.add(resolved);
    collected.push({ src: resolved, alt: node.getAttribute("alt") || "" });
  }

  return collected;
}
|
||||
|
||||
// ── Message handler ─────────────────────────────────────────────────────────
|
||||
// Handles "getContent" requests: converts either the current selection
// (msg.mode === "selection") or the main page content to Markdown and replies
// with { ok: true, markdown, images, title, url }, or { ok: false, error }
// when extraction throws. Messages with any other action are ignored.
chrome.runtime.onMessage.addListener((msg, _sender, sendResponse) => {
  if (msg.action === "getContent") {
    try {
      let markdown = "";
      let images = [];
      const title = document.title || location.href;

      if (msg.mode === "selection") {
        const sel = window.getSelection();
        if (sel && sel.rangeCount > 0) {
          // Copy the selected range into a detached container so it can be
          // converted without touching the live page.
          const frag = sel.getRangeAt(0).cloneContents();
          const div = document.createElement("div");
          div.appendChild(frag);
          markdown = htmlToMarkdown(div, true);
          images = extractImages(div);
        }
      } else {
        // full page — prefer article/main, fall back to body
        const root =
          document.querySelector("article") ||
          document.querySelector("main") ||
          document.querySelector('[role="main"]') ||
          document.body;
        markdown = htmlToMarkdown(root);
        images = extractImages(root);
      }

      // Prepend source line. Brackets and backslashes in the page title are
      // escaped so a title such as "[Solved] foo" cannot break the link
      // syntax; the raw title is still returned in the response.
      const linkText = title.replace(/[\\\[\]]/g, "\\$&");
      const sourceNote = `> Source: [${linkText}](${location.href})\n\n`;
      markdown = sourceNote + markdown;

      sendResponse({ ok: true, markdown, images, title, url: location.href });
    } catch (e) {
      sendResponse({ ok: false, error: e.message });
    }
    return true; // async
  }
});
|
||||
})();
|
||||
} // end guard
|
||||
Reference in New Issue
Block a user