在 Java 中,可以使用 Apache POI 库读取 Word 文档并提取其中的文本内容。请在 Maven 项目中添加以下依赖(读取 .docx 文件需要 poi-ooxml 模块):
<!-- Apache POI dependencies used to read Word documents -->
<!-- Core Apache POI library -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.1.2</version>
</dependency>
<!-- Apache POI OOXML module; declared with the same version as the core library above -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.2</version>
</dependency>
以下是一个简单的示例代码,展示如何使用 Apache POI 将 Word(.docx)文档转换为纯文本:
// NOTE(review): this is Jackson's exception type, not an Apache POI one. It requires
// jackson-databind on the classpath and can be dropped once nothing in the file references it.
import com.fasterxml.jackson.databind.exc.InvalidFormatException;
import java.io.FileInputStream;
import org.apache.poi.ooxml.POIXMLException;
import org.apache.poi.openxml4j.exceptions.NotOfficeXmlFileException;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
public class WordToTextConverter {
public static void main(String[] args) {
String filePath = "C:\\xxx.docx";
try (FileInputStream fis = new FileInputStream(filePath);
XWPFDocument document = new XWPFDocument(fis)) {
// 1. Check if the document is empty
if (document.getParagraphs().isEmpty()) {
System.err.println("Word文档为空");
return;
}
// 2. Create XWPFWordExtractor
XWPFWordExtractor extractor = new XWPFWordExtractor(document);
// 3. Get text from the Word document
String text = extractor.getText();
// 4. Print the extracted text
System.out.println("Text from Word document:\n" + text);
} catch (InvalidFormatException e) {
System.err.println("无效的Word文档格式");
e.printStackTrace();
} catch (Exception e) {
System.err.println("无法读取Word文档");
e.printStackTrace();
}
}
}