Auto-committed changes
This commit is contained in:
@@ -5,7 +5,7 @@ window.__memosClipperLoaded = true;
|
||||
|
||||
(function () {
|
||||
// ── Turndown-lite: a minimal but solid HTML→Markdown converter ──────────────
|
||||
function htmlToMarkdown(element, isSelection = false) {
|
||||
function htmlToMarkdown(element, isSelection = false, stripLinks = false) {
|
||||
const clone = element.cloneNode(true);
|
||||
|
||||
// Remove unwanted elements — comprehensive list covering real-world sites
|
||||
@@ -54,15 +54,40 @@ window.__memosClipperLoaded = true;
|
||||
// Also remove elements that are visually hidden via inline style
|
||||
clone.querySelectorAll('[style*="display:none"],[style*="display: none"],[style*="visibility:hidden"]')
|
||||
.forEach((el) => el.remove());
|
||||
|
||||
// Remove link-dense blocks (navigation menus, ad link lists, etc.)
|
||||
// Collect candidates first to avoid mid-iteration detached-node issues.
|
||||
// Only target outer chrome elements (nav, aside, header, footer, div, section)
|
||||
// not content containers like article/main, to avoid stripping TOCs in prose.
|
||||
const linkDenseCandidates = Array.from(
|
||||
clone.querySelectorAll('nav, aside, header, footer, div, section')
|
||||
).filter((el) => {
|
||||
// Skip if inside the primary content container
|
||||
if (el.closest('article, main, [role="main"]')) return false;
|
||||
const totalText = (el.textContent || '').trim().length;
|
||||
if (totalText < 30) return false; // too short to judge
|
||||
const linkText = Array.from(el.querySelectorAll('a'))
|
||||
.reduce((sum, a) => sum + (a.textContent || '').trim().length, 0);
|
||||
if (linkText / totalText <= 0.65) return false;
|
||||
// Require that the element has little direct (non-link) text of its own
|
||||
const directText = Array.from(el.childNodes)
|
||||
.filter((n) => n.nodeType === Node.TEXT_NODE)
|
||||
.reduce((sum, n) => sum + n.textContent.trim().length, 0);
|
||||
return directText < totalText * 0.25;
|
||||
});
|
||||
// Remove outermost candidates only (skip those already inside a removed ancestor)
|
||||
linkDenseCandidates.forEach((el) => {
|
||||
if (el.isConnected) el.remove();
|
||||
});
|
||||
} else {
|
||||
// In selection mode, we still want to remove script/style tags if any
|
||||
clone.querySelectorAll('script, style, noscript, template').forEach((el) => el.remove());
|
||||
}
|
||||
|
||||
return nodeToMd(clone).replace(/\n{3,}/g, "\n\n").trim();
|
||||
return nodeToMd(clone, { listDepth: 0, ordered: false, index: 0 }, stripLinks).replace(/\n{3,}/g, "\n\n").trim();
|
||||
}
|
||||
|
||||
function nodeToMd(node, ctx = { listDepth: 0, ordered: false, index: 0 }) {
|
||||
function nodeToMd(node, ctx = { listDepth: 0, ordered: false, index: 0 }, stripLinks = false) {
|
||||
if (node.nodeType === Node.TEXT_NODE) {
|
||||
return node.textContent.replace(/\s+/g, " ");
|
||||
}
|
||||
@@ -71,7 +96,7 @@ window.__memosClipperLoaded = true;
|
||||
const tag = node.tagName.toLowerCase();
|
||||
const children = () =>
|
||||
Array.from(node.childNodes)
|
||||
.map((c) => nodeToMd(c, ctx))
|
||||
.map((c) => nodeToMd(c, ctx, stripLinks))
|
||||
.join("");
|
||||
|
||||
switch (tag) {
|
||||
@@ -93,7 +118,8 @@ window.__memosClipperLoaded = true;
|
||||
case "del": return `~~${children()}~~`;
|
||||
case "code": {
|
||||
const text = node.textContent;
|
||||
return text.includes("`") ? `\`\`${text}\`\`` : `\`${text}\``;
|
||||
if (text.includes("`")) return `\`\` ${text} \`\``;
|
||||
return `\`${text}\``;
|
||||
}
|
||||
case "pre": {
|
||||
const codeEl = node.querySelector("code");
|
||||
@@ -110,8 +136,9 @@ window.__memosClipperLoaded = true;
|
||||
.join("\n")}\n\n`;
|
||||
|
||||
case "a": {
|
||||
const href = node.getAttribute("href") || "";
|
||||
const text = children().trim();
|
||||
if (stripLinks) return text; // just the anchor text, no URL
|
||||
const href = node.getAttribute("href") || "";
|
||||
if (!text) return href;
|
||||
try {
|
||||
const abs = new URL(href, location.href).href;
|
||||
@@ -134,24 +161,24 @@ window.__memosClipperLoaded = true;
|
||||
|
||||
case "ul": {
|
||||
const lines = Array.from(node.children)
|
||||
.map((li) => `${" ".repeat(ctx.listDepth)}- ${nodeToMd(li, { ...ctx, listDepth: ctx.listDepth + 1 }).trim()}`)
|
||||
.map((li) => `${" ".repeat(ctx.listDepth)}- ${nodeToMd(li, { ...ctx, listDepth: ctx.listDepth + 1 }, stripLinks).trim()}`)
|
||||
.join("\n");
|
||||
return `\n\n${lines}\n\n`;
|
||||
}
|
||||
case "ol": {
|
||||
const lines = Array.from(node.children)
|
||||
.map((li, i) => `${" ".repeat(ctx.listDepth)}${i + 1}. ${nodeToMd(li, { ...ctx, listDepth: ctx.listDepth + 1 }).trim()}`)
|
||||
.map((li, i) => `${" ".repeat(ctx.listDepth)}${i + 1}. ${nodeToMd(li, { ...ctx, listDepth: ctx.listDepth + 1 }, stripLinks).trim()}`)
|
||||
.join("\n");
|
||||
return `\n\n${lines}\n\n`;
|
||||
}
|
||||
case "li": return children();
|
||||
|
||||
case "table": return convertTable(node);
|
||||
case "table": return convertTable(node, stripLinks);
|
||||
|
||||
case "figure": {
|
||||
const img = node.querySelector("img");
|
||||
const caption = node.querySelector("figcaption");
|
||||
let md = img ? nodeToMd(img, ctx) : children();
|
||||
let md = img ? nodeToMd(img, ctx, stripLinks) : children();
|
||||
if (caption) md += `\n*${caption.textContent.trim()}*`;
|
||||
return `\n\n${md}\n\n`;
|
||||
}
|
||||
@@ -178,13 +205,16 @@ window.__memosClipperLoaded = true;
|
||||
}
|
||||
}
|
||||
|
||||
function convertTable(table) {
|
||||
function convertTable(table, stripLinks = false) {
|
||||
const rows = Array.from(table.querySelectorAll("tr"));
|
||||
if (!rows.length) return "";
|
||||
const toRow = (tr) =>
|
||||
"| " +
|
||||
Array.from(tr.querySelectorAll("th,td"))
|
||||
.map((c) => c.textContent.trim().replace(/\|/g, "\\|"))
|
||||
.map((c) => {
|
||||
const text = stripLinks ? c.textContent.trim() : nodeToMd(c).trim();
|
||||
return text.replace(/\|/g, "\\|");
|
||||
})
|
||||
.join(" | ") +
|
||||
" |";
|
||||
const header = toRow(rows[0]);
|
||||
@@ -234,6 +264,7 @@ window.__memosClipperLoaded = true;
|
||||
let markdown = "";
|
||||
let images = [];
|
||||
let title = document.title || location.href;
|
||||
const stripLinks = !!msg.stripLinks;
|
||||
|
||||
if (msg.mode === "selection") {
|
||||
const sel = window.getSelection();
|
||||
@@ -241,7 +272,7 @@ window.__memosClipperLoaded = true;
|
||||
const frag = sel.getRangeAt(0).cloneContents();
|
||||
const div = document.createElement("div");
|
||||
div.appendChild(frag);
|
||||
markdown = htmlToMarkdown(div, true);
|
||||
markdown = htmlToMarkdown(div, true, stripLinks);
|
||||
images = extractImages(div);
|
||||
} else {
|
||||
markdown = "";
|
||||
@@ -253,7 +284,7 @@ window.__memosClipperLoaded = true;
|
||||
document.querySelector("main") ||
|
||||
document.querySelector('[role="main"]') ||
|
||||
document.body;
|
||||
markdown = htmlToMarkdown(root);
|
||||
markdown = htmlToMarkdown(root, false, stripLinks);
|
||||
images = extractImages(root);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user