add: export docx with math

This commit is contained in:
windingwind 2023-12-03 16:12:19 +08:00
parent 6e3094c69c
commit 622cef7cf3
9 changed files with 28790 additions and 46 deletions

View File

@ -6,6 +6,10 @@
src="chrome://__addonRef__/content/scripts/docxWorker.js"
type="application/javascript"
></script>
<script
src="chrome://__addonRef__/content/lib/js/SaxonJS2.rt.js"
type="application/javascript"
></script>
</head>
<body></body>
</html>

24837
addon/chrome/content/lib/js/SaxonJS2.rt.js vendored Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -98,6 +98,7 @@
"release-it": "^16.1.5",
"replace-in-file": "^7.0.1",
"typescript": "^5.2.2",
"xslt3": "^2.6.0",
"zotero-types": "^1.3.5"
}
}

View File

@ -0,0 +1 @@
npx xslt3 -t -xsl:scripts/docx/mml2omml.xsl -export:addon/chrome/content/lib/js/mml2omml.sef.json -nogo -relocate:on -ns:##html5

3822
scripts/docx/mml2omml.xsl Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,13 +1,32 @@
// @ts-ignore
import { config } from "../../package.json";
// @ts-ignore defined by html-docx-js
import htmlDocx from "html-docx-js/dist/html-docx";
const XSL_PATH = `chrome://${config.addonRef}/content/lib/js/mml2omml.sef.json`;
// This runs in an iframe: it accepts an input message
// and posts back an output message.
//
// The handler dispatches on the `type` field of the incoming message data:
// - "parseDocx": passes `message` to htmlDocx.asBlob and replies with
//   type "parseDocxReturn" carrying the resulting blob.
// - "parseMML": runs SaxonJS.transform on `message` (given as XML source
//   text) with the stylesheet at XSL_PATH and replies with type
//   "parseMMLReturn" carrying the transform's `principalResult`.
// The job identifier received with the request is echoed in the reply.
// Any other `type` is ignored and no reply is posted.
onmessage = ({ data: { type, jobId, message } }) => {
onmessage = async ({ data: { type, jobID, message } }) => {
if (type === "parseDocx") {
console.log("DOCX Worker", type, jobId, message);
console.log("DOCX Worker", type, jobID, message);
const blob = htmlDocx.asBlob(message);
console.log("DOCX Worker", blob);
postMessage({ type: "parseDocxReturn", jobId, message: blob }, "*");
postMessage({ type: "parseDocxReturn", jobID, message: blob }, "*");
} else if (type === "parseMML") {
console.log("MML Worker", type, jobID, message);
// NOTE(review): there is no try/catch around this await. If the transform
// rejects, no "parseMMLReturn" message is posted; presumably the requester
// then keeps waiting for a reply — confirm against the caller.
// @ts-ignore defined by SaxonJS
const result = await SaxonJS.transform(
{
stylesheetLocation: XSL_PATH,
sourceType: "xml",
sourceText: message,
destination: "serialized",
},
"async",
);
postMessage(
{ type: "parseMMLReturn", jobID, message: result.principalResult },
"*",
);
}
};

View File

@ -13,8 +13,11 @@ export async function saveDocx(filename: string, noteId: number) {
}
async function note2docx(noteItem: Zotero.Item) {
const renderedContent = parseDocxCitationFields(
const worker = await getWorker();
const renderedContent = await parseDocxFields(
await renderNoteHTML(noteItem),
worker,
);
let htmlDoc =
'<!DOCTYPE html>\n<html lang="en"><head><meta charset="UTF-8"></head>\n';
@ -23,36 +26,46 @@ async function note2docx(noteItem: Zotero.Item) {
ztoolkit.log(`[Note2DOCX] ${htmlDoc}`);
let blob: ArrayBufferLike;
const lock = Zotero.Promise.defer();
const jobId = randomString(6, new Date().toUTCString());
const listener = (ev: MessageEvent) => {
if (ev.data.type === "parseDocxReturn" && ev.data.jobId === jobId) {
blob = ev.data.message;
lock.resolve();
}
};
const worker = await getWorker();
worker.contentWindow?.addEventListener("message", listener);
worker.contentWindow?.postMessage(
{
type: "parseDocx",
jobId,
message: htmlDoc,
},
"*",
);
await lock.promise;
worker.contentWindow?.removeEventListener("message", listener);
const blob = await sendWorkerTask(worker, "parseDocx", htmlDoc);
destroyWorker(worker);
return blob!;
}
type CitationCache = Record<string, { field: string; text: string }>;
function parseDocxCitationFields(html: string) {
async function parseDocxFields(html: string, worker: HTMLIFrameElement) {
const parser = new DOMParser();
const doc = parser.parseFromString(html, "text/html");
// Remove katex html elements to prevent duplicate rendering
doc.querySelectorAll(".katex-html").forEach((elem) => {
elem.remove();
});
const mathCache = {} as MathCache;
for (const elem of Array.from(doc.querySelectorAll("math"))) {
let str = (await sendWorkerTask(
worker,
"parseMML",
elem.outerHTML,
)) as string;
if (!str) {
continue;
}
str = str.replaceAll('<?xml version="1.0" encoding="UTF-8"?>', "");
if (elem.getAttribute("display") === "block") {
str = `<m:oMathPara xmlns:m="http://schemas.openxmlformats.org/officeDocument/2006/math">${str}</m:oMathPara>`;
}
const newElem = doc.createElement("span");
const mathID = getCacheID(mathCache, {
math: "",
});
mathCache[mathID].math = str;
newElem.setAttribute("data-bn-math-index", mathID);
elem.parentNode!.replaceChild(newElem, elem);
}
const citationCache = {} as CitationCache;
/*
[
@ -110,7 +123,10 @@ function parseDocxCitationFields(html: string) {
properties.formattedCitation = formattedCitation;
properties.plainCitation = formattedCitation + " ";
properties.noteIndex = 0;
const citationID = getCitationID(citationCache);
const citationID = getCacheID(citationCache, {
field: "",
text: "",
});
const csl = {
citationID,
@ -171,11 +187,22 @@ function parseDocxCitationFields(html: string) {
*/
}
const str = doc.body.innerHTML;
let str = doc.body.innerHTML;
// Replace all <span data-bn-math-index="T21wEH05"></span> with <!--[if gte msEquation 12]><m:oMath...</m:oMath><![endif]-->
const mathRegexp = /<span data-bn-math-index="([^"]+)"><\/span>/g;
str = str.replace(mathRegexp, (match, p1) => {
return `<!--[if gte msEquation 12]>${mathCache[p1].math}<![endif]-->`;
});
str = str.replaceAll(
"http://schemas.openxmlformats.org/officeDocument/2006/math",
"http://schemas.microsoft.com/office/2004/12/omml",
);
// Replace all <span data-bn-citation-index="T21wEH05"></span> with ADDIN ZOTERO_ITEM CSL_CITATION {...}
const re = /<span data-bn-citation-index="([^"]+)"><\/span>/g;
let parsed = str.replace(re, (match, p1) => {
const citationRegexp = /<span data-bn-citation-index="([^"]+)"><\/span>/g;
str = str.replace(citationRegexp, (match, p1) => {
return generateDocxField(
`ADDIN ZOTERO_ITEM CSL_CITATION ${htmlEscape(
doc,
@ -185,24 +212,23 @@ function parseDocxCitationFields(html: string) {
);
});
parsed += generateDocxField(
`ADDIN ZOTERO_BIBL {"uncited":[],"omitted":[],"custom":[]} CSL_BIBLIOGRAPHY`,
"[BIBLIOGRAPHY] Please click Zotero - Refresh in Word/LibreOffice to update all fields",
);
if (Object.keys(citationCache).length > 0) {
str += generateDocxField(
`ADDIN ZOTERO_BIBL {"uncited":[],"omitted":[],"custom":[]} CSL_BIBLIOGRAPHY`,
"[BIBLIOGRAPHY] Please click Zotero - Refresh in Word/LibreOffice to update all fields",
);
}
return parsed;
return str;
}
function getCitationID(citationCache: CitationCache) {
let citationID = Zotero.Utilities.randomString();
while (citationID in citationCache) {
citationID = Zotero.Utilities.randomString();
/**
 * Reserves a new entry in `cache` and returns its key.
 *
 * A candidate ID is drawn from Zotero.Utilities.randomString() and redrawn
 * for as long as it is already a key of `cache`. `defaultValue` is then
 * stored under the ID by reference (it is not copied), so callers that
 * mutate the stored entry also mutate the object they passed in.
 *
 * @param cache - Object used as a string-keyed store; it is mutated.
 * @param defaultValue - Initial value stored under the new ID.
 * @returns The newly reserved ID.
 */
function getCacheID(cache: Record<string, any>, defaultValue: any) {
let id = Zotero.Utilities.randomString();
while (id in cache) {
id = Zotero.Utilities.randomString();
}
citationCache[citationID] = {
field: "",
text: "",
};
return citationID;
cache[id] = defaultValue;
return id;
}
function generateDocxField(fieldCode: string, text: string) {
@ -218,6 +244,8 @@ ${text}
<![endif]-->`;
}
type MathCache = Record<string, { math: string }>;
async function getWorker(): Promise<HTMLIFrameElement> {
const worker = ztoolkit.UI.createElement(document, "iframe", {
properties: {
@ -235,6 +263,34 @@ async function getWorker(): Promise<HTMLIFrameElement> {
return worker;
}
/**
 * Sends one task to the worker iframe and waits for the matching reply.
 *
 * The request is posted to `worker.contentWindow` as
 * `{ type, jobID, message }`. A reply is accepted only when its `type` is
 * `${type}Return` and its `jobID` equals the one sent with the request;
 * every other message event is ignored.
 *
 * There is no timeout: if no matching reply is ever posted, the returned
 * promise stays pending.
 *
 * @param worker - The iframe whose content window receives the task.
 * @param type - Task name understood by the worker (e.g. "parseDocx").
 * @param message - Payload forwarded to the worker unchanged.
 * @returns The `message` field of the matching reply.
 * @throws Error when the iframe has no content window, since the request
 *   could not be delivered and no reply could ever arrive.
 */
async function sendWorkerTask(
  worker: HTMLIFrameElement,
  type: string,
  message: any,
): Promise<any> {
  const target = worker.contentWindow;
  if (!target) {
    // Without this guard the optional-chained calls would be skipped silently
    // and the caller would wait forever.
    throw new Error(
      `Cannot send "${type}" task: worker iframe has no contentWindow`,
    );
  }

  // NOTE(review): the second argument has one-second resolution. If
  // randomString treats it as a seed, tasks started within the same second
  // would share a jobID — confirm against randomString's implementation.
  const jobID = randomString(6, new Date().toUTCString());

  // The executor runs synchronously, so `settle` is assigned before any
  // message event can be delivered to the listener.
  let settle: (value: unknown) => void = () => undefined;
  const reply = new Promise<unknown>((resolve) => {
    settle = resolve;
  });

  const listener = (ev: MessageEvent) => {
    // `data` may be null/undefined for unrelated messages; do not throw on them.
    if (ev.data?.type === `${type}Return` && ev.data.jobID === jobID) {
      settle(ev.data.message);
    }
  };

  target.addEventListener("message", listener);
  try {
    target.postMessage(
      {
        type,
        jobID,
        message,
      },
      "*",
    );
    return await reply;
  } finally {
    // Always detach, including when postMessage itself throws.
    target.removeEventListener("message", listener);
  }
}
function destroyWorker(worker: any) {
worker.parentNode.removeChild(worker);
worker = null;

View File

@ -256,10 +256,13 @@ async function renderNoteHTML(
const mathDelimiterRegex = /^\$+|\$+$/g;
doc.querySelectorAll(".math").forEach((node) => {
const displayMode = node.innerHTML.startsWith("$$");
node.innerHTML = katex.renderToString(
node.innerHTML.replace(mathDelimiterRegex, ""),
{
throwOnError: false,
// output: "mathml",
displayMode,
},
);
});