读取 Word 文档,并将文本转换为带 HTML 标签的字符串。后来发现文档中经常出现带上、下角标的内容,于是一并做了处理。
核心在于对XWPFRun对象中Subscript属性的处理。
/// <summary>
/// Reads the body paragraphs of a Word (.docx) document and returns them as HTML,
/// preserving superscript and subscript runs.
/// </summary>
/// <remarks>
/// Each paragraph is emitted as <c>&lt;p&gt;…&lt;/p&gt;</c> followed by a line break.
/// Runs whose <c>Subscript</c> property is <c>SUPERSCRIPT</c> or <c>SUBSCRIPT</c> are
/// wrapped in <c>&lt;sup&gt;</c> / <c>&lt;sub&gt;</c>; every other run is appended as-is.
/// Only <c>document.Paragraphs</c> is visited.
/// NOTE(review): run text is inserted verbatim and is not HTML-encoded, so characters
/// such as '&lt;' or '&amp;' in the document will pass straight into the markup —
/// confirm whether callers expect encoded output.
/// </remarks>
/// <param name="fileName">Path of the Word document to open for reading.</param>
/// <returns>The HTML fragment built from the document's paragraphs.</returns>
public static string ReadWordTextExWithSubscript(string fileName)
{
    StringBuilder sbFileText = new StringBuilder();

    // Open the document. Exceptions from opening or parsing the file are allowed to
    // propagate unchanged; the previous "catch (Exception e) { throw e; }" only reset
    // the stack trace without adding any handling.
    XWPFDocument document;
    using (FileStream file = new FileStream(fileName, FileMode.Open, FileAccess.Read))
    {
        document = new XWPFDocument(file);
    }

    // Body paragraphs.
    foreach (XWPFParagraph paragraph in document.Paragraphs)
    {
        sbFileText.Append("<p>");

        // Each run is a span of text sharing the same formatting.
        foreach (XWPFRun run in paragraph.Runs)
        {
            switch (run.Subscript)
            {
                // Superscript
                case VerticalAlign.SUPERSCRIPT:
                    sbFileText.Append("<sup>").Append(run.Text).Append("</sup>");
                    break;

                // Subscript
                case VerticalAlign.SUBSCRIPT:
                    sbFileText.Append("<sub>").Append(run.Text).Append("</sub>");
                    break;

                // Baseline (and any other value): plain text.
                default:
                    sbFileText.Append(run.Text);
                    break;
            }
        }

        sbFileText.AppendLine("</p>");
    }

    return sbFileText.ToString();
}
Word文档:
输出:
<p>测试<sup>上</sup><sub>下</sub>ok。</p>
<p>CO<sub>2</sub></p>
<p>面积约6000km<sup>2</sup></p>

Html预览: