08-03
31
java 正则表达式过滤html元素
作者:Java伴侣 日期:2008-03-31
这是我写的一个用正则表达式过滤 HTML 元素(标签)的程序,希望对大家有所帮助!
复制内容到剪贴板 程序代码
/**
* filter all html element.
* For example:<a href="www.sohu.com/test">hello!</a>
* The filter result is :hello!
* Notice:This method filter the text between "<" and ">"
* @param element
* @return
*/
public static String getTxtWithoutHTMLElement (String element)
{
// String reg="<[^<|^>]+>";
// return element.replaceAll(reg,"");
if(null==element||"".equals(element.trim()))
{
return element;
}
Pattern pattern=Pattern.compile("<[^<|^>]*>");
Matcher matcher=pattern.matcher(element);
StringBuffer txt=new StringBuffer();
while(matcher.find())
{
String group=matcher.group();
if(group.matches("<[\\s]*>"))
{
matcher.appendReplacement(txt,group);
}
else
{
matcher.appendReplacement(txt,"");
}
}
matcher.appendTail(txt);
repaceEntities(txt,"&","&");
repaceEntities(txt,"<","<");
repaceEntities(txt,">",">");
repaceEntities(txt,""","\"");
repaceEntities(txt," ","");
return txt.toString();
}
下面是测试用例:
/**
 * Exercises {@code ExcelHssfView.getTxtWithoutHTMLElement} with well-formed tags,
 * unbalanced angle brackets, empty tags and an entity.
 */
public void testGetTxtWithoutHTMLElement ()
{
    // Ordinary tags, closed or not, are removed.
    assertEquals("test",ExcelHssfView.getTxtWithoutHTMLElement("<a href='a/test'>test</a>"));
    assertEquals("test",ExcelHssfView.getTxtWithoutHTMLElement("<a href='a/test'>test"));
    assertEquals("test",ExcelHssfView.getTxtWithoutHTMLElement("<input type='text'>test</input>"));
    assertEquals("test",ExcelHssfView.getTxtWithoutHTMLElement("<p>test"));
    assertEquals("test",ExcelHssfView.getTxtWithoutHTMLElement("<table><tr><td>test</td></tr></table>"));

    // A lone '<' or '>' that does not form a tag is kept as text.
    assertEquals("te<st",ExcelHssfView.getTxtWithoutHTMLElement("<p>te<st"));
    assertEquals("te>st",ExcelHssfView.getTxtWithoutHTMLElement("<p>te>st"));
    assertEquals("tst",ExcelHssfView.getTxtWithoutHTMLElement("<p>t<e>st"));
    assertEquals("t<st",ExcelHssfView.getTxtWithoutHTMLElement("<p>t<<e>st"));

    // Empty or whitespace-only "tags" are preserved verbatim.
    assertEquals("<>test",ExcelHssfView.getTxtWithoutHTMLElement("<p><>test"));
    assertEquals("< >test",ExcelHssfView.getTxtWithoutHTMLElement("<p>< >test"));
    assertEquals("<<>test",ExcelHssfView.getTxtWithoutHTMLElement("<p><<>test"));

    // The non-breaking-space entity is removed from the text.
    assertEquals("test",ExcelHssfView.getTxtWithoutHTMLElement("<table><tr><td>&nbsp;test</td></tr></table>"));
}
* filter all html element.
* For example:<a href="www.sohu.com/test">hello!</a>
* The filter result is :hello!
* Notice:This method filter the text between "<" and ">"
* @param element
* @return
*/
public static String getTxtWithoutHTMLElement (String element)
{
// String reg="<[^<|^>]+>";
// return element.replaceAll(reg,"");
if(null==element||"".equals(element.trim()))
{
return element;
}
Pattern pattern=Pattern.compile("<[^<|^>]*>");
Matcher matcher=pattern.matcher(element);
StringBuffer txt=new StringBuffer();
while(matcher.find())
{
String group=matcher.group();
if(group.matches("<[\\s]*>"))
{
matcher.appendReplacement(txt,group);
}
else
{
matcher.appendReplacement(txt,"");
}
}
matcher.appendTail(txt);
repaceEntities(txt,"&","&");
repaceEntities(txt,"<","<");
repaceEntities(txt,">",">");
repaceEntities(txt,""","\"");
repaceEntities(txt," ","");
return txt.toString();
}
下面是测试用例:
/**
 * Exercises {@code ExcelHssfView.getTxtWithoutHTMLElement} with well-formed tags,
 * unbalanced angle brackets, empty tags and an entity.
 */
public void testGetTxtWithoutHTMLElement ()
{
    // Ordinary tags, closed or not, are removed.
    assertEquals("test",ExcelHssfView.getTxtWithoutHTMLElement("<a href='a/test'>test</a>"));
    assertEquals("test",ExcelHssfView.getTxtWithoutHTMLElement("<a href='a/test'>test"));
    assertEquals("test",ExcelHssfView.getTxtWithoutHTMLElement("<input type='text'>test</input>"));
    assertEquals("test",ExcelHssfView.getTxtWithoutHTMLElement("<p>test"));
    assertEquals("test",ExcelHssfView.getTxtWithoutHTMLElement("<table><tr><td>test</td></tr></table>"));

    // A lone '<' or '>' that does not form a tag is kept as text.
    assertEquals("te<st",ExcelHssfView.getTxtWithoutHTMLElement("<p>te<st"));
    assertEquals("te>st",ExcelHssfView.getTxtWithoutHTMLElement("<p>te>st"));
    assertEquals("tst",ExcelHssfView.getTxtWithoutHTMLElement("<p>t<e>st"));
    assertEquals("t<st",ExcelHssfView.getTxtWithoutHTMLElement("<p>t<<e>st"));

    // Empty or whitespace-only "tags" are preserved verbatim.
    assertEquals("<>test",ExcelHssfView.getTxtWithoutHTMLElement("<p><>test"));
    assertEquals("< >test",ExcelHssfView.getTxtWithoutHTMLElement("<p>< >test"));
    assertEquals("<<>test",ExcelHssfView.getTxtWithoutHTMLElement("<p><<>test"));

    // The non-breaking-space entity is removed from the text.
    assertEquals("test",ExcelHssfView.getTxtWithoutHTMLElement("<table><tr><td>&nbsp;test</td></tr></table>"));
}
评论: 0 | 引用: 0 | 查看次数: 715
发表评论