使用HtmlParser抓取网页内容

package parser;

 

import org.htmlparser.Parser;
import org.htmlparser.beans.StringBean;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.parserapplications.StringExtractor;
import org.htmlparser.tags.BodyTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

 

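/**
 * Fetching web page content with HtmlParser: the most convenient way to grab a page's
 * content is to use StringBean, which offers several parameters that control the extracted
 * text; they are explained in the code below. The HtmlParser package also ships an example
 * class, StringExtractor, with a method that returns the content directly; it uses
 * StringBean internally as well. Alternatively, the tags produced by Parser can be
 * processed directly.
 *
 * @author chenguoyong
 */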

public class GetContent {

       publicvoid getContentUsingStringBean(String url) {

              StringBeansb = new StringBean();

              sb.setLinks(true);// 是否显示web页面的连接(Links)

              //为了取得页面的整洁美观一般设置上面两项为true , 如果要保持页面的原有格式, 如代码页面的空格缩进 可以设置为false

              sb.setCollapse(true);// 如果是true的话把一系列空白字符用一个字符替代.

              sb.setReplaceNonBreakingSpaces(true);//If true regular space

              sb

                            .setURL("http://www.blogjava.net/51AOP/archive/2006/07/19/59064.html");

              System.out.println("TheContent is :\n" + sb.getStrings());

 

       }

 

       publicvoid getContentUsingStringExtractor(String url, boolean link) {

              //StringExtractor内部机制和上面的一样.做了一下包装

              StringExtractorse = new StringExtractor(url);

              Stringtext = null;

              try{

                     text= se.extractStrings(link);

                     System.out.println("Thecontent is :\n" + text);

              }catch (ParserException e) {

                     e.printStackTrace();

              }

       }

 

       publicvoid getContentUsingParser(String url) {

              NodeListnl;

              try{

                     Parserp = new Parser(url);

                     nl= p.parse(new NodeClassFilter(BodyTag.class));

                     BodyTagbt = (BodyTag) nl.elementAt(0);

                     System.out.println(bt.toPlainTextString());// 保留原来的内容格式. 包含js代码

              }catch (ParserException e) {

                     e.printStackTrace();

              }

       }

 

       /**

        * @param args

        */

       publicstatic void main(String[] args) {

              Stringurl = "http://www.blogjava.net/51AOP/archive/2006/07/19/59064.html";

              //newGetContent().getContentUsingParser(url);

              //--------------------------------------------------

              newGetContent().getContentUsingStringBean(url);

http://c.tieba.baidu.com/p/3408749050
http://c.tieba.baidu.com/p/3408749395
http://c.tieba.baidu.com/p/3408869872
http://c.tieba.baidu.com/p/3408889389
http://c.tieba.baidu.com/p/3408905730
http://c.tieba.baidu.com/p/3408983919
http://c.tieba.baidu.com/p/3408987713
http://c.tieba.baidu.com/p/3409238829
http://c.tieba.baidu.com/p/3409302576
http://c.tieba.baidu.com/p/3409324206
http://c.tieba.baidu.com/p/3409328563
http://c.tieba.baidu.com/p/3409332883
http://c.tieba.baidu.com/p/3409337269
http://c.tieba.baidu.com/p/3409341558
http://c.tieba.baidu.com/p/3409345894
http://c.tieba.baidu.com/p/3409350213
http://c.tieba.baidu.com/p/3409354458
http://c.tieba.baidu.com/p/3409358652
http://c.tieba.baidu.com/p/3409358652
http://c.tieba.baidu.com/p/3409363045
http://c.tieba.baidu.com/p/3409367533
http://c.tieba.baidu.com/p/3409371860
http://c.tieba.baidu.com/p/3409376337
http://c.tieba.baidu.com/p/3409380701
http://c.tieba.baidu.com/p/3409389603
http://c.tieba.baidu.com/p/3409394100
http://c.tieba.baidu.com/p/3409398551
http://c.tieba.baidu.com/p/3409403048
http://c.tieba.baidu.com/p/3409412676
http://c.tieba.baidu.com/p/3409407844
http://c.tieba.baidu.com/p/3409417793
http://c.tieba.baidu.com/p/3409422741
http://c.tieba.baidu.com/p/3409432831
http://c.tieba.baidu.com/p/3409437768
http://c.tieba.baidu.com/p/3409442408
http://c.tieba.baidu.com/p/3409447140
http://c.tieba.baidu.com/p/3409451830
http://c.tieba.baidu.com/p/3409456819
http://c.tieba.baidu.com/p/3409461659
http://c.tieba.baidu.com/p/3409461659
http://c.tieba.baidu.com/p/3409466665
http://c.tieba.baidu.com/p/3409471467
http://c.tieba.baidu.com/p/3409476139
http://c.tieba.baidu.com/p/3409480662
http://c.tieba.baidu.com/p/3409485140
http://c.tieba.baidu.com/p/3409490104
http://c.tieba.baidu.com/p/3409494880
http://c.tieba.baidu.com/p/3409500048
http://c.tieba.baidu.com/p/3409538997
http://c.tieba.baidu.com/p/3409543296
http://c.tieba.baidu.com/p/3409548124
http://c.tieba.baidu.com/p/3409552702
http://c.tieba.baidu.com/p/3409557518
http://c.tieba.baidu.com/p/3409562457
http://c.tieba.baidu.com/p/3409567386
http://c.tieba.baidu.com/p/3409572148
http://c.tieba.baidu.com/p/3409576791
http://c.tieba.baidu.com/p/3409581593
http://c.tieba.baidu.com/p/3409586354
http://c.tieba.baidu.com/p/3409626383
http://c.tieba.baidu.com/p/3409385259
http://c.tieba.baidu.com/p/3409767728
http://c.tieba.baidu.com/p/3409787667
http://c.tieba.baidu.com/p/3409791516
http://c.tieba.baidu.com/p/3409795327
http://c.tieba.baidu.com/p/3409866665
http://c.tieba.baidu.com/p/3409873864
http://c.tieba.baidu.com/p/3409879998
http://c.tieba.baidu.com/p/3409884553
http://c.tieba.baidu.com/p/3409895642
http://c.tieba.baidu.com/p/3409900207
http://c.tieba.baidu.com/p/3409903862
http://c.tieba.baidu.com/p/3409912381
http://c.tieba.baidu.com/p/3409908113
http://c.tieba.baidu.com/p/3409991219
http://c.tieba.baidu.com/p/3410010420
http://c.tieba.baidu.com/p/3410018434
http://c.tieba.baidu.com/p/3410178761
http://c.tieba.baidu.com/p/3410147170
http://c.tieba.baidu.com/p/3410141093
http://c.tieba.baidu.com/p/3410131727
http://c.tieba.baidu.com/p/3410122313
http://c.tieba.baidu.com/p/3410112662
http://c.tieba.baidu.com/p/3410103121
http://c.tieba.baidu.com/p/3410097950
http://c.tieba.baidu.com/p/3410093865
http://c.tieba.baidu.com/p/3410088684
http://c.tieba.baidu.com/p/3410052996
http://c.tieba.baidu.com/p/3410046741
http://c.tieba.baidu.com/p/3408925683
http://c.tieba.baidu.com/p/3410196625

郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。