jacob word 转html

jacob word转html

步骤其实很简单,word 可用通过jacob直接转为html为了方便, 我们要将图片转为base64而已

需要注意的是 linux 系统好像不支持jacob ,所以只有windows 才支持
1.在项目中build path 加入jacob.jar
2.在jdk的bin目录加入对应系统的dll 文件 , 例如jacob-1.18-x64.dll 
<dependency>
    <groupId>com.hynnet</groupId>
    <artifactId>jacob</artifactId>
    <version>1.18</version>
</dependency>
package com.example.demo;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.jacob.activeX.ActiveXComponent;
import com.jacob.com.Dispatch;
import com.jacob.com.Variant;

import sun.misc.BASE64Encoder;

@SuppressWarnings("restriction")
public class Word2HtmlPDFUpdate {
	// 8 代表word保存成html
	public static final int WORD_HTML = 8;
	public static final String BASE64_PREFFIX = "data:image/jpeg;base64,";

	public static String wordToHtml(String word, String html) {
		ActiveXComponent app = new ActiveXComponent("Word.Application");
		String title = null;
		try {
			// 设置word应用程序不可见
			app.setProperty("Visible", new Variant(false));
			// documents表示word程序的所有文档窗口,(word是多文档应用程序)
			Dispatch docs = app.getProperty("Documents").toDispatch();
			// 打开要转换的word文件
			Dispatch doc = Dispatch.invoke(docs, "Open", Dispatch.Method,
					new Object[] { word, new Variant(false), new Variant(true) }, new int[1]).toDispatch();

			Dispatch wordContent = Dispatch.get(doc, "Content").toDispatch();
			Dispatch paragraphs = Dispatch.get(wordContent, "Paragraphs").toDispatch();
			// int paragraphCount = Dispatch.get(paragraphs, "Count").getInt();// 总行数
			title = getContentByRow(paragraphs, 1);
			if (null != title) {
				title = title.trim();
			}

			// 作为html格式保存到临时文件
			Dispatch.invoke(doc, "SaveAs", Dispatch.Method, new Object[] { html, new Variant(WORD_HTML) }, new int[1]);
			// 关闭word文件
			Dispatch.call(doc, "Close", new Variant(false));
		} catch (Exception e) {

			e.printStackTrace();
			return null;
		} finally {
			// 关闭word应用程序
			app.invoke("Quit", new Variant[] {});
		}
		return title;
	}

	private static String getContentByRow(Dispatch paragraphs, int i) {
		Dispatch paragraph = Dispatch.call(paragraphs, "Item", new Variant(i)).toDispatch();
		Dispatch paragraphRange = Dispatch.get(paragraph, "Range").toDispatch();
		String paragraphContent = Dispatch.get(paragraphRange, "Text").toString();
		return paragraphContent;
	}

	public static synchronized String convertHtml(String word, String html) {
		return wordToHtml(word, html);
	}

	public static String replaceSrc2Base64(String htmlStr, String parentPath) {
		String img = "";
		Pattern p_image;
		Matcher m_image;
		String regEx_img = "<img[^>]*>";
		p_image = Pattern.compile(regEx_img, Pattern.CASE_INSENSITIVE);
		m_image = p_image.matcher(htmlStr);
		while (m_image.find()) {
			// 得到<img />数据
			img = m_image.group();
			// 匹配<img>中的src数据
			Matcher m = Pattern.compile("src\\s*=\\s*\"?(.*?)(\"|>|\\s+)").matcher(img);
			while (m.find()) {
				String str = m.group(1);
				String path = parentPath + str;
				String base64Encode = base64Encode(path);
				htmlStr = htmlStr.replace(str, base64Encode);
			}
		}
		return htmlStr;
	}

	public static String base64Encode(String imgSrc) {
		InputStream is = null;
		byte[] data = null;
		try {
			is = new FileInputStream(new File(imgSrc));
			data = new byte[is.available()];
			is.read(data);
		} catch (Exception e) {
		} finally {
			if (is != null) {
				try {
					is.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
		BASE64Encoder encoder = new BASE64Encoder();
		return BASE64_PREFFIX + encoder.encode(data);
	}

	public static void write(String path, String content) {
		OutputStream out = null;
		try {
			out = new FileOutputStream(new File(path));
			out.write(content.getBytes("GBK"));
		} catch (Exception e) {
			e.printStackTrace();
		} finally {
			if (null != out) {
				try {
					out.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	public static boolean deleteFile(File dirFile) {
		if (!dirFile.exists()) {
			return false;
		}

		if (dirFile.isFile()) {
			return dirFile.delete();
		} else {

			for (File file : dirFile.listFiles()) {
				deleteFile(file);
			}
		}

		return dirFile.delete();
	}

	public static void main(String[] args) {
		convertWord2Html("c://", "bbb.docx");
	}

	public static void convertWord2Html(String parentPath, String fileName) {
		String name = fileName.split("\\.")[0];
		String srcPath = parentPath + fileName;
		String destPath = parentPath + name + ".html";
		String tempDir = parentPath + name + ".files";
		// 第一步 word 转 html 获取文章的标题, html 会生成到一个对应的目录
		// 第二部 读html,找到img标签的src,找到对应的图片的路径,转为base64并且替换
		// 第三部 将修改后的内容写到html

		String title = Word2HtmlPDFUpdate.convertHtml(srcPath, destPath); // --> 1
		if (null != title) {
			FileInputStream is = null;
			BufferedInputStream bis = null;
			try {
				is = new FileInputStream(new File(destPath));
				bis = new BufferedInputStream(is);
				byte[] buffer = new byte[2048];
				int len = 0;
				StringBuffer content = new StringBuffer();
				while ((len = bis.read(buffer)) != -1) {
					content.append(new String(buffer, 0, len, "GBK"));
				}
				String text = content.toString();
				text = text.replaceFirst("charset=gb2312", text);

				text = replaceSrc2Base64(text, parentPath); // --> 2
				write(destPath, text); // --> 3
				System.out.println("html输出路径:" + destPath);
				System.out.println(title.trim());
				System.out.println(text);

				File tempFile = new File(tempDir);
				if (tempFile.exists()) {
					deleteFile(tempFile);
				}

			} catch (Exception e) {
				e.printStackTrace();
			} finally {
				if (is != null) {
					try {
						is.close();
					} catch (IOException e) {
						e.printStackTrace();
					}
				}
				if (bis != null) {
					try {
						bis.close();
					} catch (IOException e) {
						e.printStackTrace();
					}
				}
			}
		}
	}
}