Auto-committed changes

This commit is contained in:
2026-03-18 19:58:05 +01:00
parent 4355fa78fa
commit 2b8659499a
4 changed files with 115 additions and 30 deletions

View File

@@ -5,7 +5,7 @@ window.__memosClipperLoaded = true;
(function () {
// ── Turndown-lite: a minimal but solid HTML→Markdown converter ──────────────
function htmlToMarkdown(element, isSelection = false) {
function htmlToMarkdown(element, isSelection = false, stripLinks = false) {
const clone = element.cloneNode(true);
// Remove unwanted elements — comprehensive list covering real-world sites
@@ -54,15 +54,40 @@ window.__memosClipperLoaded = true;
// Also remove elements that are visually hidden via inline style
clone.querySelectorAll('[style*="display:none"],[style*="display: none"],[style*="visibility:hidden"]')
.forEach((el) => el.remove());
// Remove link-dense blocks (navigation menus, ad link lists, etc.).
// Candidates are collected up front so that removing one node cannot disturb
// the traversal of the others.
// Only outer chrome elements (nav, aside, header, footer, div, section) are
// considered, and anything inside article/main/[role="main"] is skipped, to
// avoid stripping TOCs in prose.
const linkDenseCandidates = Array.from(
  clone.querySelectorAll('nav, aside, header, footer, div, section')
).filter((el) => {
  // Skip if inside the primary content container
  if (el.closest('article, main, [role="main"]')) return false;
  const totalText = (el.textContent || '').trim().length;
  if (totalText < 30) return false; // too short to judge
  const linkText = Array.from(el.querySelectorAll('a'))
    .reduce((sum, a) => sum + (a.textContent || '').trim().length, 0);
  // Keep only elements where more than 65% of the text sits inside links
  if (linkText / totalText <= 0.65) return false;
  // Require that the element has little direct (non-link) text of its own:
  // text nodes that are immediate children must be under 25% of all text
  const directText = Array.from(el.childNodes)
    .filter((n) => n.nodeType === Node.TEXT_NODE)
    .reduce((sum, n) => sum + n.textContent.trim().length, 0);
  return directText < totalText * 0.25;
});
// Remove outermost candidates only (skip those already inside a removed ancestor).
// NOTE: `clone` is produced by cloneNode(true) and, as far as this function shows,
// is never attached to a document, so Node.isConnected would be false for every
// candidate and nothing would be removed. clone.contains(el) instead reports
// whether el is still part of the clone's subtree, which stops being true once
// an ancestor candidate has been removed.
linkDenseCandidates.forEach((el) => {
  if (clone.contains(el)) el.remove();
});
} else {
// In selection mode, we still want to remove script/style tags if any
clone.querySelectorAll('script, style, noscript, template').forEach((el) => el.remove());
}
return nodeToMd(clone).replace(/\n{3,}/g, "\n\n").trim();
return nodeToMd(clone, { listDepth: 0, ordered: false, index: 0 }, stripLinks).replace(/\n{3,}/g, "\n\n").trim();
}
function nodeToMd(node, ctx = { listDepth: 0, ordered: false, index: 0 }) {
function nodeToMd(node, ctx = { listDepth: 0, ordered: false, index: 0 }, stripLinks = false) {
if (node.nodeType === Node.TEXT_NODE) {
return node.textContent.replace(/\s+/g, " ");
}
@@ -71,7 +96,7 @@ window.__memosClipperLoaded = true;
const tag = node.tagName.toLowerCase();
const children = () =>
Array.from(node.childNodes)
.map((c) => nodeToMd(c, ctx))
.map((c) => nodeToMd(c, ctx, stripLinks))
.join("");
switch (tag) {
@@ -93,7 +118,8 @@ window.__memosClipperLoaded = true;
case "del": return `~~${children()}~~`;
case "code": {
const text = node.textContent;
return text.includes("`") ? `\`\`${text}\`\`` : `\`${text}\``;
if (text.includes("`")) return `\`\` ${text} \`\``;
return `\`${text}\``;
}
case "pre": {
const codeEl = node.querySelector("code");
@@ -110,8 +136,9 @@ window.__memosClipperLoaded = true;
.join("\n")}\n\n`;
case "a": {
const href = node.getAttribute("href") || "";
const text = children().trim();
if (stripLinks) return text; // just the anchor text, no URL
const href = node.getAttribute("href") || "";
if (!text) return href;
try {
const abs = new URL(href, location.href).href;
@@ -134,24 +161,24 @@ window.__memosClipperLoaded = true;
case "ul": {
const lines = Array.from(node.children)
.map((li) => `${" ".repeat(ctx.listDepth)}- ${nodeToMd(li, { ...ctx, listDepth: ctx.listDepth + 1 }).trim()}`)
.map((li) => `${" ".repeat(ctx.listDepth)}- ${nodeToMd(li, { ...ctx, listDepth: ctx.listDepth + 1 }, stripLinks).trim()}`)
.join("\n");
return `\n\n${lines}\n\n`;
}
case "ol": {
const lines = Array.from(node.children)
.map((li, i) => `${" ".repeat(ctx.listDepth)}${i + 1}. ${nodeToMd(li, { ...ctx, listDepth: ctx.listDepth + 1 }).trim()}`)
.map((li, i) => `${" ".repeat(ctx.listDepth)}${i + 1}. ${nodeToMd(li, { ...ctx, listDepth: ctx.listDepth + 1 }, stripLinks).trim()}`)
.join("\n");
return `\n\n${lines}\n\n`;
}
case "li": return children();
case "table": return convertTable(node);
case "table": return convertTable(node, stripLinks);
case "figure": {
const img = node.querySelector("img");
const caption = node.querySelector("figcaption");
let md = img ? nodeToMd(img, ctx) : children();
let md = img ? nodeToMd(img, ctx, stripLinks) : children();
if (caption) md += `\n*${caption.textContent.trim()}*`;
return `\n\n${md}\n\n`;
}
@@ -178,13 +205,16 @@ window.__memosClipperLoaded = true;
}
}
function convertTable(table) {
function convertTable(table, stripLinks = false) {
const rows = Array.from(table.querySelectorAll("tr"));
if (!rows.length) return "";
const toRow = (tr) =>
"| " +
Array.from(tr.querySelectorAll("th,td"))
.map((c) => c.textContent.trim().replace(/\|/g, "\\|"))
.map((c) => {
const text = stripLinks ? c.textContent.trim() : nodeToMd(c).trim();
return text.replace(/\|/g, "\\|");
})
.join(" | ") +
" |";
const header = toRow(rows[0]);
@@ -234,6 +264,7 @@ window.__memosClipperLoaded = true;
let markdown = "";
let images = [];
let title = document.title || location.href;
const stripLinks = !!msg.stripLinks;
if (msg.mode === "selection") {
const sel = window.getSelection();
@@ -241,7 +272,7 @@ window.__memosClipperLoaded = true;
const frag = sel.getRangeAt(0).cloneContents();
const div = document.createElement("div");
div.appendChild(frag);
markdown = htmlToMarkdown(div, true);
markdown = htmlToMarkdown(div, true, stripLinks);
images = extractImages(div);
} else {
markdown = "";
@@ -253,7 +284,7 @@ window.__memosClipperLoaded = true;
document.querySelector("main") ||
document.querySelector('[role="main"]') ||
document.body;
markdown = htmlToMarkdown(root);
markdown = htmlToMarkdown(root, false, stripLinks);
images = extractImages(root);
}