htmlparser解析器 是一款非常不错的基于C#的网页分析工具,它的来源:Majestic-12,是一个分布式的搜索引擎^_^。至于这个Majestic-12是什么,我不再赘述,感兴趣的朋友可以自行在网络上搜索。html解析器的用途非常广泛,我把它用来分析百度的热门关键词,效果也很不错哦

百度风云榜上的关键词是可以订阅的,不过它貌似不符合RSS 2.0标准协议。它的内容结构是一个表格:

<description><![CDATA[
  <table>
                    <tbody>
                        <!--循环下面第一个tr结构50次-->

                        <tr>
                                <th>1</th><!--序号  改变变量-->
                            <td><a href="http://www.baidu.com/baidu?cl=3&tn=baidutop10&fr=top1000&wd=%D6%B1%CD%A8%D6%D0%C4%CF%C1%F4%D1%D4%B0%E5" target="_blank">直通中南留言板</a></td><!--标题,字数控制<28个汉字的空间-->
                            <td><a href="../detail/1_%D6%B1%CD%A8%D6%D0%C4%CF%C1%F4%D1%D4%B0%E5.html" target="_blank"></a></td><!--详 的链接-->
                            <td>5436</td><!--最近24小时-->
                            <td>543500%</td><!--本周搜索量-->
                            <td><!--下面是文字链,替换数据链接-->

                                <a href="http://news.baidu.com/ns?tn=news&from=news&cl=2&rn=20&ct=0&word=%D6%B1%CD%A8%D6%D0%C4%CF%C1%F4%D1%D4%B0%E5" target="_blank">新闻</a>

                                <a href="http://tieba.baidu.com/f?kw=%D6%B1%CD%A8%D6%D0%C4%CF%C1%F4%D1%D4%B0%E5" target="_blank">贴吧</a>

                                <a href="http://image.baidu.com/i?tn=baiduimage&lm=-1&ct=201326592&cl=2&word=%D6%B1%CD%A8%D6%D0%C4%CF%C1%F4%D1%D4%B0%E5" target="_blank">图片</a>

                            </td>

                        </tr>
...
</tbody>
</table>
]]></description>

每一行(tr)中的单元格依次代表:排名(th)、关键词、详情链接、最近24小时搜索量、变化率和相关链接(新闻、贴吧、图片)。下面就使用htmlparser来解析这些数据并保存起来(详情链接一列会被跳过)。

        public void parserTR(ref HTMLparser op,HTMLchunk oc)
        {
            HTMLchunk oChunk = oc;
            string href = "";
            string key = "";
            string hits = "";
            string changerate = "";
            string xinwen = "";
            string tieba = "";
            string tupian = "";
            bool skipdetail = false;
            bool sousuoliang24hour = false;
            bool bianhualv1week = false;
            bool _xinwen = false;
            bool _tieba = false;
            bool _tupian = false;
            bool finished = false;

            do
            {
                if (finished)
                {
                    break;
                }
                if (oChunk.sTag == "td" && oChunk.oType == HTMLchunkType.OpenTag)
                {
                    if (skipdetail)
                    {
                        skipdetail = false;
                        continue;
                    }
                    if (oChunk.GetParamValue("class") == "key")
                    {
                        oChunk = op.ParseNext();
                        if (oChunk.sTag == "a")
                        {
                            href = oChunk.GetParamValue("href");
                            oChunk = op.ParseNext();
                            key = oChunk.oHTML;
                            skipdetail = true;
                        }
                    }
                    else if (!sousuoliang24hour)
                    {
                        op.ParseNext();
                        hits = oChunk.oHTML;
                        sousuoliang24hour = true;
                    }
                    else if (!bianhualv1week)
                    {
                        op.ParseNext();
                        changerate = oChunk.oHTML;
                        bianhualv1week = true;
                    }
                    else
                    {
                        while ((oChunk = op.ParseNext()) != null)
                        {
                            if ((oChunk.sTag == "td" && oChunk.oType == HTMLchunkType.CloseTag))
                                break;
                            if (_tieba && _tupian && _xinwen)
                            {
                                finished = true;
                                break;
                            }
                            if (oChunk.sTag == "a" && oChunk.oType == HTMLchunkType.OpenTag)
                            {
                                if (!_xinwen)
                                {
                                    xinwen = oChunk.GetParamValue("href");
                                    _xinwen = true;
                                }
                                else if (!_tieba)
                                {
                                    tieba = oChunk.GetParamValue("href");
                                    _tieba = true;
                                }
                                else if (!_tupian)
                                {
                                    tupian = oChunk.GetParamValue("href");
                                    _tupian = true;
                                }
                            }
                        }
                    }
                }
            } while ((oChunk = op.ParseNext()) != null);
            keyword k = new keyword() ;
            k.key = key; //关键词
            k.href = href; //搜索链接
            k.hits = hits; //24小时搜索量
            k.changerate = changerate; //变化率
            k.xinwen = xinwen; //百度相关新闻
            k.tieba = tieba; //帖吧
            k.tupian = tupian; //百度图片
            k.date = DateTime.Now;
            k.done = false;
            if(k.key != "")
            this.addKeyword(k);
        }