package com.example.demo;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.jacob.activeX.ActiveXComponent;
import com.jacob.com.Dispatch;
import com.jacob.com.Variant;
import sun.misc.BASE64Encoder;
@SuppressWarnings("restriction")
public class Word2HtmlPDFUpdate {
// 8 代表word保存成html
public static final int WORD_HTML = 8;
public static final String BASE64_PREFFIX = "data:image/jpeg;base64,";
public static String wordToHtml(String word, String html) {
ActiveXComponent app = new ActiveXComponent("Word.Application");
String title = null;
try {
// 设置word应用程序不可见
app.setProperty("Visible", new Variant(false));
// documents表示word程序的所有文档窗口,(word是多文档应用程序)
Dispatch docs = app.getProperty("Documents").toDispatch();
// 打开要转换的word文件
Dispatch doc = Dispatch.invoke(docs, "Open", Dispatch.Method,
new Object[] { word, new Variant(false), new Variant(true) }, new int[1]).toDispatch();
Dispatch wordContent = Dispatch.get(doc, "Content").toDispatch();
Dispatch paragraphs = Dispatch.get(wordContent, "Paragraphs").toDispatch();
// int paragraphCount = Dispatch.get(paragraphs, "Count").getInt();// 总行数
title = getContentByRow(paragraphs, 1);
if (null != title) {
title = title.trim();
}
// 作为html格式保存到临时文件
Dispatch.invoke(doc, "SaveAs", Dispatch.Method, new Object[] { html, new Variant(WORD_HTML) }, new int[1]);
// 关闭word文件
Dispatch.call(doc, "Close", new Variant(false));
} catch (Exception e) {
e.printStackTrace();
return null;
} finally {
// 关闭word应用程序
app.invoke("Quit", new Variant[] {});
}
return title;
}
private static String getContentByRow(Dispatch paragraphs, int i) {
Dispatch paragraph = Dispatch.call(paragraphs, "Item", new Variant(i)).toDispatch();
Dispatch paragraphRange = Dispatch.get(paragraph, "Range").toDispatch();
String paragraphContent = Dispatch.get(paragraphRange, "Text").toString();
return paragraphContent;
}
public static synchronized String convertHtml(String word, String html) {
return wordToHtml(word, html);
}
public static String replaceSrc2Base64(String htmlStr, String parentPath) {
String img = "";
Pattern p_image;
Matcher m_image;
String regEx_img = "<img[^>]*>";
p_image = Pattern.compile(regEx_img, Pattern.CASE_INSENSITIVE);
m_image = p_image.matcher(htmlStr);
while (m_image.find()) {
// 得到<img />数据
img = m_image.group();
// 匹配<img>中的src数据
Matcher m = Pattern.compile("src\\s*=\\s*\"?(.*?)(\"|>|\\s+)").matcher(img);
while (m.find()) {
String str = m.group(1);
String path = parentPath + str;
String base64Encode = base64Encode(path);
htmlStr = htmlStr.replace(str, base64Encode);
}
}
return htmlStr;
}
public static String base64Encode(String imgSrc) {
InputStream is = null;
byte[] data = null;
try {
is = new FileInputStream(new File(imgSrc));
data = new byte[is.available()];
is.read(data);
} catch (Exception e) {
} finally {
if (is != null) {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
BASE64Encoder encoder = new BASE64Encoder();
return BASE64_PREFFIX + encoder.encode(data);
}
public static void write(String path, String content) {
OutputStream out = null;
try {
out = new FileOutputStream(new File(path));
out.write(content.getBytes("GBK"));
} catch (Exception e) {
e.printStackTrace();
} finally {
if (null != out) {
try {
out.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
public static boolean deleteFile(File dirFile) {
if (!dirFile.exists()) {
return false;
}
if (dirFile.isFile()) {
return dirFile.delete();
} else {
for (File file : dirFile.listFiles()) {
deleteFile(file);
}
}
return dirFile.delete();
}
public static void main(String[] args) {
convertWord2Html("c://", "bbb.docx");
}
public static void convertWord2Html(String parentPath, String fileName) {
String name = fileName.split("\\.")[0];
String srcPath = parentPath + fileName;
String destPath = parentPath + name + ".html";
String tempDir = parentPath + name + ".files";
// 第一步 word 转 html 获取文章的标题, html 会生成到一个对应的目录
// 第二部 读html,找到img标签的src,找到对应的图片的路径,转为base64并且替换
// 第三部 将修改后的内容写到html
String title = Word2HtmlPDFUpdate.convertHtml(srcPath, destPath); // --> 1
if (null != title) {
FileInputStream is = null;
BufferedInputStream bis = null;
try {
is = new FileInputStream(new File(destPath));
bis = new BufferedInputStream(is);
byte[] buffer = new byte[2048];
int len = 0;
StringBuffer content = new StringBuffer();
while ((len = bis.read(buffer)) != -1) {
content.append(new String(buffer, 0, len, "GBK"));
}
String text = content.toString();
text = text.replaceFirst("charset=gb2312", text);
text = replaceSrc2Base64(text, parentPath); // --> 2
write(destPath, text); // --> 3
System.out.println("html输出路径:" + destPath);
System.out.println(title.trim());
System.out.println(text);
File tempFile = new File(tempDir);
if (tempFile.exists()) {
deleteFile(tempFile);
}
} catch (Exception e) {
e.printStackTrace();
} finally {
if (is != null) {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
if (bis != null) {
try {
bis.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
}
}