将Word(.doc)里的图片替换成文本

目标：将word里的图片替换成文本

流程、思想：

读取出Word中的图片 - 存储读取到的图片（提供图片识别源） - 删除Word中的该图片 - 在Word中该图片位置插入替换文字 - 删除存储的图片

Maven依赖

<!-- poi-实现word文件的读取和修改等操作 -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>4.0.0</version>
</dependency>
<!-- 针对于2007版（.docx） -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>4.0.0</version>
</dependency>
<!-- 针对于2003版（.doc） -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>4.0.0</version>
</dependency>
<!-- poi-end -->
<!-- Jsoup-用以解析HTML -->
<dependency>
	<groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>
<!-- Jsoup-end -->

代码实现

/**
 * Demo entry point: converts a .doc file to HTML, replaces every extracted
 * picture's {@code <img>} tag with a text placeholder, writes the HTML to a
 * temporary file next to the source and wraps that HTML back into a .doc file.
 *
 * <p>Any exception is printed to stderr; nothing is rethrown.
 *
 * @param argv command-line arguments (unused; the paths are hard-coded below)
 */
public static void main(String[] argv) {
    try {
        // Full path of the source .doc file.
        String oldFileName = "D:/Test/ml/test.doc";
        // Full path of the generated .doc file.
        String newFileName = "D:/Test/ml/html_to_word.doc";
        // Full path of the intermediate HTML file (same directory as the source).
        String temporary =
            new File(oldFileName).getParent() + File.separator + "temporary.html";

        // Convert the DOC to HTML and collect the paths of the extracted pictures.
        Map<String, Object> map = convert2Html(oldFileName);
        // convert2Html stores a List<String> under this key.
        @SuppressWarnings("unchecked")
        List<String> listPics = (List<String>) map.get("listPics");
        String htmlContent = (String) map.get("htmlContent");

        for (String pic : listPics) {
            // Locate the <img ...> tag that references this picture.
            String imgStart = "<img src=\"" + pic.replace("\\", "/") + "\"";
            int tagStart = htmlContent.indexOf(imgStart);
            if (tagStart < 0) {
                // No tag for this picture in the HTML: nothing to replace.
                continue;
            }
            int tagEnd = htmlContent.indexOf(">", tagStart);
            if (tagEnd < 0) {
                // Unterminated tag: skip it rather than replacing an empty string.
                continue;
            }
            String imgTag = htmlContent.substring(tagStart, tagEnd + 1);

            // Only pictures whose file name has an extension get the text marker.
            // Pictures without an extension display incorrectly when put back into
            // the doc, so they are replaced by an empty <span></span> placeholder.
            // The check uses the file name only, so a dot in a directory name is
            // not mistaken for an extension.
            boolean hasExtension = new File(pic).getName().lastIndexOf('.') != -1;
            String replacement = hasExtension ? "<span>{{title}}</span>" : "<span></span>";

            // Literal replacement: the tag text must not be interpreted as a regex.
            htmlContent = htmlContent.replace(imgTag, replacement);
        }

        // Write the modified HTML to the temporary file.
        writeFile(htmlContent, temporary);
        // Wrap the HTML file into the target .doc file.
        html2Doc(temporary, newFileName);
    } catch (Exception e) {
        e.printStackTrace();
    }
}

/**
 * 输出HTML文件
 * @param content 从doc读取生成的html文本
 * @param path 目标文件全路径
 */
public static void writeFile(String content, String path) {

    FileOutputStream fos = null;
    BufferedWriter bw = null;
    org.jsoup.nodes.Document doc = Jsoup.parse(content);
    String styleOld=doc.getElementsByTag("style").html();
    //统一字体格式为宋体
    styleOld=styleOld.replaceAll("font-family:.+(?=;\\b)", "font-family:SimSun");

    doc.getElementsByTag("head").empty();
    doc.getElementsByTag("head")
        .append("<meta http-equiv=\"Content-Type\" content=\"text/html;
                charset=UTF-8\"></meta>");
    doc.getElementsByTag("head")
        .append(" <style type=\"text/css\"></style>");
    doc.getElementsByTag("style")
        .append(styleOld);
    /*正则表达式查询字体内容：font-family:.+(?=;\b)*/

    content=doc.html();
    content=content
        .replace("<meta http-equiv=\"Content-Type\" content=\"text/html; 
                 charset=UTF-8\">", "<meta http-equiv=\"Content-Type\" 
                 content=\"text/html; charset=UTF-8\"></meta>");
    try {
        File file = new File(path);
        fos = new FileOutputStream(file);
        bw = new BufferedWriter(new OutputStreamWriter(fos,"UTF-8"));
        bw.write(content);
    } catch (FileNotFoundException fnfe) {
        fnfe.printStackTrace();
    } catch (IOException ioe) {
        ioe.printStackTrace();
    } finally {
        try {
            if (bw != null)
                bw.close();
            if (fos != null)
                fos.close();
        } catch (IOException ie) {
        }
    }
}

/**
 * Reads a .doc file, converts it to HTML and saves its pictures to disk.
 *
 * <p>Pictures are written to an {@code img} directory next to the source
 * file; the directory is created when it does not exist. The {@code <img>}
 * sources in the generated HTML point into that same directory, using
 * forward slashes.
 *
 * @param fileName full path of the source .doc file
 * @return a map with two entries: {@code "htmlContent"} (the HTML as a
 *         {@code String}) and {@code "listPics"} (a {@code List<String>} of the
 *         full paths of the extracted pictures)
 * @throws Exception if reading, converting or serializing the document fails
 */
public static Map<String,Object> convert2Html(String fileName) throws Exception {

    // Holds the HTML content and the picture list.
    Map<String,Object> map = new HashMap<>();
    // Full paths of the extracted pictures.
    List<String> listPics = new ArrayList<>();
    // Directory where pictures are saved; also the base of the <img> src paths.
    String imgPath = new File(fileName).getParent() +
        File.separator + "img" + File.separator;
    File imgDir = new File(imgPath);
    if (!imgDir.exists()) {
        imgDir.mkdirs();
    }

    HWPFDocument wordDocument;
    // Close the source stream as soon as the document object has been built.
    // NOTE(review): assumes HWPFDocument does not read from the stream after
    // construction - confirm against the POI version in use.
    try (FileInputStream docIn = new FileInputStream(fileName)) {
        wordDocument = new HWPFDocument(docIn);
    }

    WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
        DocumentBuilderFactory.newInstance().newDocumentBuilder()
        .newDocument());
    // Only supplies the src path for each <img>; the picture bytes are written below.
    wordToHtmlConverter.setPicturesManager(new PicturesManager() {
        @Override
        public String savePicture(byte[] content, PictureType pictureType,
               String suggestedName, float widthInches, float heightInches) {
            return imgPath.replace("\\", "/") + suggestedName;
        }
    });
    wordToHtmlConverter.processDocument(wordDocument);

    // Save every picture of the document to the img directory.
    List<?> pics = wordDocument.getPicturesTable().getAllPictures();
    if (pics != null) {
        for (Object item : pics) {
            Picture pic = (Picture) item;
            String picAbsolutePath = imgPath + pic.suggestFullFileName();
            // The path is recorded before writing, so it is listed even if the
            // file cannot be created.
            listPics.add(picAbsolutePath);
            try (FileOutputStream picOut = new FileOutputStream(picAbsolutePath)) {
                pic.writeImageContent(picOut);
            } catch (FileNotFoundException e) {
                e.printStackTrace();
            }
        }
    }
    Document htmlDocument = wordToHtmlConverter.getDocument();

    ByteArrayOutputStream out = new ByteArrayOutputStream();
    DOMSource domSource = new DOMSource(htmlDocument);
    StreamResult streamResult = new StreamResult(out);

    Transformer serializer = TransformerFactory.newInstance().newTransformer();
    serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
    serializer.setOutputProperty(OutputKeys.INDENT, "yes");
    serializer.setOutputProperty(OutputKeys.METHOD, "HTML");
    serializer.transform(domSource, streamResult);

    // Decode with the same charset the serializer wrote, not the platform default.
    String htmlContent = out.toString("UTF-8");
    // Replace entities that UEditor cannot recognise.
    htmlContent = htmlContent.replace("&ldquo;", "\"").replace("&rdquo;", "\"")
        .replace("&mdash;", "-");

    map.put("htmlContent", htmlContent);
    map.put("listPics", listPics);

    return map;
}

/**
 * Wraps an HTML file into a .doc file by storing the HTML bytes as the
 * {@code "WordDocument"} entry of a new POIFS file system and writing that
 * file system to the target path.
 *
 * <p>Prints {@code success} to stdout once the file system has been written.
 * An {@link IOException} raised while reading the source or writing the file
 * system is printed to stderr and not rethrown; failure to open the target
 * file propagates to the caller.
 *
 * @param source full path of the HTML file to read
 * @param target full path of the .doc file to create
 * @throws Exception if the target file cannot be opened, or if closing the
 *                   file system or the output stream fails
 */
public static void html2Doc(String source , String target) throws Exception {
    // Both resources are closed on every path, including failures.
    try (POIFSFileSystem poifs = new POIFSFileSystem();
         OutputStream out = new FileOutputStream(target)) {
        // Root directory of the new file system.
        DirectoryEntry directory = poifs.getRoot();
        try (FileInputStream htmlIn = new FileInputStream(source)) {
            // Create the entry named "WordDocument" from the HTML input stream.
            directory.createDocument("WordDocument", htmlIn);
            // Write the whole file system to the target file.
            poifs.writeFilesystem(out);
            System.out.println("success");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

将Word(.doc)里的图片替换成文本

目标：将word里的图片替换成文本

Maven依赖

代码实现

请喝咖啡，谢谢老板~