htmlparser解析器的应用
htmlparser解析器 是一款非常不错的基于C#的网页分析工具,它的来源:Majestic-12,是一个分布式的搜索引擎^_^。至于这个Majestic-12是什么,我不再赘述,感兴趣的朋友可以自行在网络上搜索。html解析器的用途非常广泛,我把它用来分析百度的热门关键词,效果也很不错哦
百度风云榜上的关键词是可以订阅的,不过貌似不是RSS2.0标准协议的。它的结构是一个表格:
<description><![CDATA[ <table> <tbody> <!--循环下面第一个tr结构50次--> <tr> <th>1</th><!--序号 改变变量--> <td><a href="http://www.baidu.com/baidu?cl=3&tn=baidutop10&fr=top1000&wd=%D6%B1%CD%A8%D6%D0%C4%CF%C1%F4%D1%D4%B0%E5" target="_blank">直通中南留言板</a></td><!--标题,字数控制<28个汉字的空间--> <td><a href="../detail/1_%D6%B1%CD%A8%D6%D0%C4%CF%C1%F4%D1%D4%B0%E5.html" target="_blank"></a></td><!--详 的链接--> <td>5436</td><!--最近24小时--> <td>543500%</td><!--本周搜索量--> <td><!--下面是文字链,替换数据链接--> <a href="http://news.baidu.com/ns?tn=news&from=news&cl=2&rn=20&ct=0&word=%D6%B1%CD%A8%D6%D0%C4%CF%C1%F4%D1%D4%B0%E5" target="_blank">新闻</a> <a href="http://tieba.baidu.com/f?kw=%D6%B1%CD%A8%D6%D0%C4%CF%C1%F4%D1%D4%B0%E5" target="_blank">贴吧</a> <a href="http://image.baidu.com/i?tn=baiduimage&lm=-1&ct=201326592&cl=2&word=%D6%B1%CD%A8%D6%D0%C4%CF%C1%F4%D1%D4%B0%E5" target="_blank">图片</a> </td> </tr> ... </tbody> </table> ]]></description>
td中的数据分别代表 排名、关键词、最近24小时搜索量、变化率和相关链接,下面就使用htmlparser来解析这些数据并保存起来。
/// <summary>
/// Scans the chunks of one table row, collects the keyword data found in its
/// td cells and hands the result to <c>addKeyword</c>.
/// </summary>
/// <remarks>
/// Cells are handled in the order they are encountered: the cell whose
/// <c>class</c> attribute equals "key" supplies the search link and keyword text,
/// the td cell that follows it is skipped, the next two cells supply the hit
/// count and the change rate, and the remaining cell supplies up to three link
/// targets (news, tieba, image - in that order). Scanning stops once three links
/// have been read, at the closing tr tag, or when the parser returns null.
/// Nothing is stored when no keyword text was found.
/// </remarks>
/// <param name="op">Parser to pull subsequent chunks from; it is advanced as chunks are consumed.</param>
/// <param name="oc">Chunk at which scanning starts. A null value results in nothing being stored.</param>
public void parserTR(ref HTMLparser op, HTMLchunk oc)
{
    string href = "";
    string key = "";
    string hits = "";
    string changerate = "";
    string xinwen = "";
    string tieba = "";
    string tupian = "";

    bool skipDetailCell = false;   // true right after the keyword cell: the next td is ignored
    bool hitsRead = false;
    bool changeRateRead = false;
    int linksRead = 0;             // 0 = none yet, 1 = news, 2 = news + tieba, 3 = all three
    bool finished = false;

    HTMLchunk oChunk = oc;
    while (oChunk != null && !finished)
    {
        // A closing </tr> ends this row. Stop here without consuming anything
        // further so that a row with fewer than three links cannot pick up
        // (and be overwritten by) cells that belong to the following rows.
        if (oChunk.sTag == "tr" && oChunk.oType == HTMLchunkType.CloseTag)
        {
            break;
        }

        if (oChunk.sTag == "td" && oChunk.oType == HTMLchunkType.OpenTag)
        {
            if (skipDetailCell)
            {
                skipDetailCell = false;
            }
            else if (oChunk.GetParamValue("class") == "key")
            {
                oChunk = op.ParseNext();
                if (oChunk != null && oChunk.sTag == "a")
                {
                    href = oChunk.GetParamValue("href");
                    oChunk = op.ParseNext();
                    if (oChunk != null)
                    {
                        key = oChunk.oHTML;
                        skipDetailCell = true;
                    }
                }
            }
            else if (!hitsRead)
            {
                // Read from the chunk ParseNext returns instead of assuming the
                // parser updated the previously returned chunk in place.
                oChunk = op.ParseNext();
                if (oChunk != null)
                {
                    hits = oChunk.oHTML;
                }
                hitsRead = true;
            }
            else if (!changeRateRead)
            {
                oChunk = op.ParseNext();
                if (oChunk != null)
                {
                    changerate = oChunk.oHTML;
                }
                changeRateRead = true;
            }
            else
            {
                // Links cell: walk its content until </td>, taking the href of
                // each <a> in order.
                while ((oChunk = op.ParseNext()) != null)
                {
                    if (oChunk.sTag == "td" && oChunk.oType == HTMLchunkType.CloseTag)
                    {
                        break;
                    }

                    if (linksRead >= 3)
                    {
                        finished = true;
                        break;
                    }

                    if (oChunk.sTag == "a" && oChunk.oType == HTMLchunkType.OpenTag)
                    {
                        string target = oChunk.GetParamValue("href");
                        if (linksRead == 0)
                        {
                            xinwen = target;
                        }
                        else if (linksRead == 1)
                        {
                            tieba = target;
                        }
                        else
                        {
                            tupian = target;
                        }
                        linksRead++;
                    }
                }
            }
        }

        if (oChunk == null)
        {
            // The parser ran out of input while a cell was being read.
            break;
        }

        // Advance to the next chunk. This also runs once after `finished` is
        // set, so the parser ends up one chunk past the point where the third
        // link was confirmed.
        oChunk = op.ParseNext();
    }

    keyword k = new keyword();
    k.key = key;                 // keyword text
    k.href = href;               // search link
    k.hits = hits;               // searches in the last 24 hours
    k.changerate = changerate;   // change rate
    k.xinwen = xinwen;           // related Baidu news link
    k.tieba = tieba;             // Baidu tieba link
    k.tupian = tupian;           // Baidu image search link
    k.date = DateTime.Now;
    k.done = false;

    if (k.key != "")
    {
        this.addKeyword(k);
    }
}
似乎很不错啊,呵呵