目标:将word里的图片替换成文本

流程、思想:

读取出Word中的图片 - 存储读取到的图片(提供图片识别源) - 删除Word中的该图片 - 在Word中该图片位置插入替换文字 - 删除存储的图片。实现上是先把 .doc 转成 HTML,把其中的 <img> 标签替换为文字,再把 HTML 写回 .doc。

Maven依赖

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
<!-- poi: reading and modifying Word files -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.0.0</version>
</dependency>
<!-- for the 2007 format (.docx) -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.0.0</version>
</dependency>
<!-- for the 2003 format (.doc) -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.0.0</version>
</dependency>
<!-- poi-end -->
<!-- Jsoup: used to parse HTML -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.11.3</version>
</dependency>
<!-- Jsoup-end -->

代码实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
/**
 * Demo entry point: converts a .doc file to HTML, replaces every extracted
 * picture's {@code <img>} tag with a text placeholder, writes the HTML to a
 * temporary file next to the source and converts that HTML back to a .doc.
 *
 * @param argv unused
 */
public static void main(String argv[]) {
    try {
        // Full path of the source document.
        String oldFileName = "D:/Test/ml/test.doc";
        // Full path of the document to generate.
        String newFileName = "D:/Test/ml/html_to_word.doc";
        // The intermediate HTML file is written next to the source document.
        String temporary =
                new File(oldFileName).getParent() + File.separator + "temporary.html";

        // Convert the DOC to HTML and collect the paths of the extracted pictures.
        Map<String, Object> map = convert2Html(oldFileName);
        @SuppressWarnings("unchecked") // convert2Html stores a List<String> under "listPics"
        List<String> listPics = (List<String>) map.get("listPics");
        String htmlContent = (String) map.get("htmlContent");

        for (String pic : listPics) {
            // Only the file name is inspected, so a dot in a directory name
            // is not mistaken for a picture extension.
            boolean hasExtension = new File(pic).getName().lastIndexOf('.') != -1;
            // Pictures without an extension display incorrectly when put back
            // into the doc, so they are replaced by an empty placeholder.
            String replacement = hasExtension ? "<span>{{title}}</span>" : "<span></span>";
            htmlContent = replaceImgTag(htmlContent, pic, replacement);
        }

        // Write the HTML file, then convert it to a DOC.
        writeFile(htmlContent, temporary);
        html2Doc(temporary, newFileName);
    } catch (Exception e) {
        e.printStackTrace();
    }
}

/**
 * Replaces every occurrence of the {@code <img>} tag whose {@code src} is the
 * given picture path (with backslashes turned into forward slashes).
 * The tag is matched as literal text, not as a regular expression.
 *
 * @param html        HTML text to search
 * @param pic         picture path as stored in the picture list
 * @param replacement markup that takes the place of the tag
 * @return the updated HTML, or {@code html} unchanged when no complete tag is found
 */
private static String replaceImgTag(String html, String pic, String replacement) {
    int start = html.indexOf("<img src=\"" + pic.replace("\\", "/") + "\"");
    if (start < 0) {
        return html;
    }
    int end = html.indexOf(">", start);
    if (end < 0) {
        return html;
    }
    String tag = html.substring(start, end + 1);
    return html.replace(tag, replacement);
}

/**
* 输出HTML文件
* @param content 从doc读取生成的html文本
* @param path 目标文件全路径
*/
public static void writeFile(String content, String path) {

FileOutputStream fos = null;
BufferedWriter bw = null;
org.jsoup.nodes.Document doc = Jsoup.parse(content);
String styleOld=doc.getElementsByTag("style").html();
//统一字体格式为宋体
styleOld=styleOld.replaceAll("font-family:.+(?=;\\b)", "font-family:SimSun");

doc.getElementsByTag("head").empty();
doc.getElementsByTag("head")
.append("<meta http-equiv=\"Content-Type\" content=\"text/html;
charset=UTF-8\"></meta>");
doc.getElementsByTag("head")
.append(" <style type=\"text/css\"></style>");
doc.getElementsByTag("style")
.append(styleOld);
/*正则表达式查询字体内容:font-family:.+(?=;\b)*/

content=doc.html();
content=content
.replace("<meta http-equiv=\"Content-Type\" content=\"text/html;
charset=UTF-8\">", "<meta http-equiv=\"Content-Type\"
content=\"text/html; charset=UTF-8\"></meta>");
try {
File file = new File(path);
fos = new FileOutputStream(file);
bw = new BufferedWriter(new OutputStreamWriter(fos,"UTF-8"));
bw.write(content);
} catch (FileNotFoundException fnfe) {
fnfe.printStackTrace();
} catch (IOException ioe) {
ioe.printStackTrace();
} finally {
try {
if (bw != null)
bw.close();
if (fos != null)
fos.close();
} catch (IOException ie) {
}
}
}

/**
 * Reads a DOC file, converts it to HTML and saves its pictures to an
 * {@code img} directory next to the source file.
 *
 * @param fileName full path of the source file
 * @return a map holding the HTML text under {@code "htmlContent"} and the
 *         list of saved picture paths ({@code List<String>}) under {@code "listPics"}
 * @throws Exception if reading, converting or serializing the document fails
 */
public static Map<String,Object> convert2Html(String fileName) throws Exception {

    // Holds the HTML content and the picture list.
    Map<String, Object> map = new HashMap<>();
    List<String> listPics = new ArrayList<>();
    // Directory where extracted pictures are saved; also used as the prefix
    // of the <img> source paths in the generated HTML.
    String imgPath = new File(fileName).getParent()
            + File.separator + "img" + File.separator;
    File file = new File(imgPath);
    if (!file.exists()) {
        file.mkdirs();
    }

    Document htmlDocument;
    // Both the input stream and the document are closed when conversion ends.
    try (FileInputStream in = new FileInputStream(fileName);
         HWPFDocument wordDocument = new HWPFDocument(in)) {
        WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                DocumentBuilderFactory.newInstance().newDocumentBuilder()
                        .newDocument());
        wordToHtmlConverter.setPicturesManager(new PicturesManager() {
            @Override
            public String savePicture(byte[] content, PictureType pictureType,
                    String suggestedName, float widthInches, float heightInches) {
                // Only the path is returned here; the picture bytes are written below.
                return imgPath.replace("\\", "/") + suggestedName;
            }
        });
        wordToHtmlConverter.processDocument(wordDocument);

        // Save the pictures.
        List<Picture> pics = wordDocument.getPicturesTable().getAllPictures();
        if (pics != null) {
            for (Picture pic : pics) {
                String picAbsolutePath = imgPath + pic.suggestFullFileName();
                // The path is recorded before the write is attempted.
                listPics.add(picAbsolutePath);
                try (FileOutputStream picOut = new FileOutputStream(picAbsolutePath)) {
                    pic.writeImageContent(picOut);
                } catch (FileNotFoundException e) {
                    e.printStackTrace();
                }
            }
        }
        htmlDocument = wordToHtmlConverter.getDocument();
    }

    ByteArrayOutputStream out = new ByteArrayOutputStream();
    DOMSource domSource = new DOMSource(htmlDocument);
    StreamResult streamResult = new StreamResult(out);

    TransformerFactory tf = TransformerFactory.newInstance();
    Transformer serializer = tf.newTransformer();
    serializer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
    serializer.setOutputProperty(OutputKeys.INDENT, "yes");
    serializer.setOutputProperty(OutputKeys.METHOD, "HTML");
    serializer.transform(domSource, streamResult);

    // Decode with the same charset the serializer was told to write.
    String htmlContent = out.toString("UTF-8");
    // Replace escape sequences that UEditor cannot recognize.
    htmlContent = htmlContent.replace("&ldquo;", "\"").replace("&rdquo;", "\"")
            .replace("&mdash;", "-");

    map.put("htmlContent", htmlContent);
    map.put("listPics", listPics);

    return map;
}

/**
 * Converts an HTML file to a doc by storing the HTML bytes as the
 * {@code "WordDocument"} entry of a new POIFS file system and writing that
 * file system to the target path.
 * <p>
 * An {@link IOException} raised while reading the source or writing the
 * file system is printed to standard error and not rethrown.
 *
 * @param source full path of the HTML file to read
 * @param target full path of the doc file to write
 * @throws Exception if the target file cannot be opened or a resource fails to close
 */
public static void html2Doc(String source , String target) throws Exception {
    // The file system and the output stream are closed on every exit path.
    try (POIFSFileSystem poifs = new POIFSFileSystem();
         OutputStream out = new FileOutputStream(target)) {
        DirectoryEntry directory = poifs.getRoot();
        try (FileInputStream in = new FileInputStream(source)) {
            // Create the document: 1. entry name "WordDocument", 2. the HTML input stream.
            directory.createDocument("WordDocument", in);
            poifs.writeFilesystem(out);
            System.out.println("success");
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}