Mammoth.js 是一个用于将 Word 文档(.docx)转换为 HTML 或 Markdown 的 JavaScript 库,支持浏览器和 Node.js 环境。
<script src="https://unpkg.com/mammoth/mammoth.browser.min.js"></script>
npm install mammoth
# 或
yarn add mammoth
// In the browser: convert the .docx picked through an <input type="file"> element.
document.getElementById("docx-file").addEventListener("change", function(event) {
    const file = event.target.files[0];
    if (!file) {
        // The change event also fires when the selection is cleared.
        return;
    }
    // Mammoth's `arrayBuffer` input is documented as an ArrayBuffer, not a
    // File object, so read the file's bytes first.
    file.arrayBuffer()
        .then(function(arrayBuffer) {
            return mammoth.convertToHtml({arrayBuffer: arrayBuffer});
        })
        .then(function(result) {
            // result.value is the generated HTML. Mammoth does not sanitise the
            // source document, so sanitise this before inserting it when the
            // .docx comes from an untrusted source.
            document.getElementById("output").innerHTML = result.value;
            console.log(result.messages); // messages produced during conversion
        })
        .catch(function(error) {
            console.error(error);
        });
});
// Node.js usage: convert document.docx to output.html.
const mammoth = require("mammoth");
const fs = require("fs");
fs.readFile("document.docx", function(err, data) {
    if (err) {
        // Without this guard `data` is undefined and the conversion fails
        // with a misleading error.
        console.error(err);
        return;
    }
    // In Node.js the input is given as {path: ...} or {buffer: ...};
    // {arrayBuffer: ...} is the browser form.
    mammoth.convertToHtml({buffer: data})
        .then(function(result) {
            fs.writeFileSync("output.html", result.value);
        })
        .catch(function(error) {
            console.error(error);
        });
});
// `arrayBuffer` holds the bytes of the .docx file (in the browser, e.g. the
// result of `file.arrayBuffer()`).
mammoth.convertToMarkdown({arrayBuffer: arrayBuffer})
    .then(function(result) {
        console.log(result.value); // the Markdown output
    })
    .catch(function(error) {
        // Report conversion failures instead of leaving the rejection unhandled.
        console.error(error);
    });
mammoth.convertToHtml(input, options)
- 转换为 HTML
mammoth.convertToMarkdown(input, options)
- 转换为 Markdown
mammoth.extractRawText(input)
- 提取纯文本

输入可以是以下形式之一:
{arrayBuffer: arrayBuffer}
- ArrayBuffer 对象
{buffer: buffer}
- Node.js Buffer 对象const options = {
styleMap: [
"p[style-name='Heading 1'] => h1:fresh",
"p[style-name='Heading 2'] => h2:fresh",
"r[style-name='Strong'] => strong"
]
};
mammoth.convertToHtml({arrayBuffer: file}, options)
.then(function(result) {
// 使用自定义样式映射的 HTML
});
// Embed every image in the generated HTML as a base64 data URI.
const options = {
    convertImage: mammoth.images.imgElement(function(image) {
        // image.read("base64") resolves to the image contents as a base64 string.
        return image.read("base64").then(function(imageBuffer) {
            return {
                src: "data:" + image.contentType + ";base64," + imageBuffer
            };
        });
    })
};
// `arrayBuffer` holds the bytes of the .docx file (in the browser, e.g. the
// result of `file.arrayBuffer()`).
mammoth.convertToHtml({arrayBuffer: arrayBuffer}, options)
    .catch(function(error) {
        // Report conversion failures instead of leaving the rejection unhandled.
        console.error(error);
    });
/**
 * Paragraph transform, applied to every paragraph of the parsed document
 * before HTML is generated.
 *
 * Mammoth's documented options do not include `converters`; the per-paragraph
 * hook is `transformDocument` combined with `mammoth.transforms.paragraph`.
 * A transform receives and returns document elements (not HTML nodes) and
 * must return an element for every paragraph, so paragraphs that do not match
 * are passed through unchanged. Mammoth's README marks the document-transform
 * API as unstable, so pin the library version when relying on it.
 *
 * @param {object} element - paragraph element from Mammoth's document model
 * @returns {object} the paragraph, given style ID "Heading1" when it is
 *     centred and has no style of its own
 */
function convertParagraph(element) {
    if (element.alignment === "center" && !element.styleId) {
        // Copy rather than mutate the element handed in by Mammoth.
        return Object.assign({}, element, {styleId: "Heading1"});
    }
    return element;
}
const options = {
    transformDocument: mammoth.transforms.paragraph(convertParagraph),
    styleMap: [
        // `p.Heading1` matches paragraphs by style ID and renders them as <h1>.
        "p.Heading1 => h1:fresh"
    ]
};
// Options object with the `ignoreEmptyParagraphs` flag switched on
// (presumably meant to be passed as the second argument of mammoth.convertToHtml).
// NOTE(review): Mammoth's README describes empty paragraphs as ignored by
// default, which would make `true` a no-op; `false` is the value that keeps
// them in the output. Confirm against the installed version.
const options = {
ignoreEmptyParagraphs: true
};
<input type="file" id="docx-input">
<div id="preview"></div>
<script>
// Convert the selected .docx to HTML and show it in the #preview element.
// Requires the Mammoth browser build to be loaded on the page beforehand.
document.getElementById("docx-input").addEventListener("change", function(e) {
    const file = e.target.files[0];
    if (!file) {
        // The change event also fires when the selection is cleared.
        return;
    }
    // Mammoth's `arrayBuffer` input is documented as an ArrayBuffer, not a
    // File object, so read the file's bytes first.
    file.arrayBuffer()
        .then(function(arrayBuffer) {
            return mammoth.convertToHtml({arrayBuffer: arrayBuffer});
        })
        .then(function(result) {
            // Mammoth does not sanitise the source document; sanitise
            // result.value before inserting it when the .docx is untrusted.
            document.getElementById("preview").innerHTML = result.value;
        })
        .catch(function(error) {
            console.error(error);
        });
});
</script>
// Batch conversion: turn every .docx file in inputDir into an .html file in outputDir.
const mammoth = require("mammoth");
const fs = require("fs");
const path = require("path");
const inputDir = "./docx-files";
const outputDir = "./html-files";

// writeFileSync does not create missing directories, so make sure the target exists.
fs.mkdirSync(outputDir, {recursive: true});

fs.readdir(inputDir, (err, files) => {
    if (err) {
        // `files` is undefined when the directory cannot be read.
        console.error(`Error reading ${inputDir}:`, err);
        return;
    }
    files
        .filter(file => path.extname(file) === ".docx")
        .forEach(file => {
            const inputPath = path.join(inputDir, file);
            const outputPath = path.join(outputDir, path.basename(file, ".docx") + ".html");
            fs.readFile(inputPath, (readErr, data) => {
                if (readErr) {
                    console.error(`Error reading ${file}:`, readErr);
                    return;
                }
                mammoth.convertToHtml({buffer: data})
                    .then(result => {
                        fs.writeFileSync(outputPath, result.value);
                        console.log(`Converted ${file} to HTML`);
                    })
                    .catch(error => {
                        // Also reached when writing the output file throws.
                        console.error(`Error converting ${file}:`, error);
                    });
            });
        });
});
// Write each embedded image to the images/ directory and reference it from
// the generated HTML by relative path. Relies on `fs`, `path` and `data`
// (the .docx contents as a Buffer) being in scope.
let imageCount = 0;
const options = {
    convertImage: mammoth.images.imgElement(function(image) {
        const extension = image.contentType.split("/")[1];
        // Date.now() alone repeats for images handled within the same
        // millisecond, which made later images overwrite earlier ones; the
        // counter keeps every filename unique.
        imageCount += 1;
        const filename = `image-${Date.now()}-${imageCount}.${extension}`;
        // image.read() without an encoding resolves to a Buffer.
        return image.read().then(function(imageBuffer) {
            // writeFileSync does not create missing directories.
            fs.mkdirSync("images", {recursive: true});
            fs.writeFileSync(path.join("images", filename), imageBuffer);
            return { src: `images/${filename}` };
        });
    })
};
mammoth.convertToHtml({buffer: data}, options)
    .catch(function(error) {
        // Report conversion or image-write failures instead of leaving the
        // rejection unhandled.
        console.error(error);
    });
Mammoth.js 是一个强大的工具,特别适合需要将 Word 文档内容集成到 Web 应用中的场景,如 CMS 系统、文档管理系统等。