PHP html解析器代码
虽然笔者更主张使用高级语言来编写 HTML 解析器,但今天笔者想让我们 Web 界的小强 PHP 也做一回 HTML 解析器,证明小强的功力是多么深厚^_^
算法思想的说明:
可以将整个算法理解为一个有限状态机:首先在文本中找到 “<”,然后解析小于号右边的字符串,直到遇到 “>” 为止。这样,便完成了一个 HTML 标签的解析过程;两个标签之间的内容则作为文本处理。
具体过程各位请看下面的代码:
/**
 * Debug helper: print a labelled value with HTML entities encoded so that
 * markup in the value is shown literally instead of being rendered.
 *
 * The label is printed as-is; only the value is entity-encoded.
 *
 * @param string $txt Label printed before " is now: ".
 * @param string $var Value to encode and print.
 * @return void
 */
function dp($txt, $var)
{
    // Encode with the HTML_ENTITIES translation table.
    $safeValue = strtr($var, get_html_translation_table(HTML_ENTITIES));

    echo sprintf("<br>%s is now: %s", $txt, $safeValue);
}
/* author: http://www.cnphp.info source: http://www.cnphp.info/php-html-parser-code-update.html */
/**
 * Scan an HTML string from left to right and report what is found through
 * global callback functions. The callbacks are not defined here; the calling
 * code must provide all of them before invoking parse():
 *
 *   startCallback($name, "", 0)   start tag, e.g. <b> or <a href="x">
 *   endCallback($name)            end tag, e.g. </b>
 *   declCallback($name, "", 0)    declaration, e.g. <!DOCTYPE html>
 *   startCommentCallback()        \
 *   commentCallback($text)         } comment <!-- text -->
 *   endCommentCallback()          /
 *   textStartCallback()           \
 *   textCallback($text)            } text between tags
 *   textEndCallback()             /
 *
 * Rules applied by the scanner:
 *  - A '<' that is followed by a space, or that is the last character of the
 *    input, is ordinary text and does not open a tag.
 *  - A tag or declaration name runs up to the first whitespace character or
 *    '>'. Attributes are skipped, not parsed; the second and third callback
 *    arguments are always "" and 0. A '>' inside a quoted attribute value
 *    does not end the tag.
 *  - The end-tag name is everything between "</" and the next '>'.
 *  - Comment text is reported with surrounding spaces removed.
 *  - Text runs that contain only whitespace are not reported; every other
 *    text run is reported unchanged.
 *  - Unterminated constructs (no closing '>' or '-->') extend to the end of
 *    the input, so the scan always terminates.
 *
 * @param string $html Markup to scan.
 * @return void
 */
function parse($html)
{
    $html = (string) $html;
    $len = strlen($html);
    $pos = 0;
    // Characters that terminate a tag or declaration name.
    $nameStops = " \t\r\n>";

    while ($pos < $len) {
        $next = ($pos + 1 < $len) ? $html[$pos + 1] : '';

        if ($html[$pos] === '<' && $next !== '' && $next !== ' ') {
            // ---- comment: <!-- ... --> ----
            if (substr($html, $pos, 4) === '<!--') {
                $bodyStart = $pos + 4;
                $close = strpos($html, '-->', $bodyStart);
                if ($close === false) {
                    // Unterminated comment: the rest of the input is the comment.
                    $body = (string) substr($html, $bodyStart);
                    $pos = $len;
                } else {
                    $body = (string) substr($html, $bodyStart, $close - $bodyStart);
                    $pos = $close + 3;
                }
                startCommentCallback();
                commentCallback(trim($body, ' '));
                endCommentCallback();
                continue;
            }

            // ---- end tag: </name> ----
            if ($next === '/') {
                $nameStart = $pos + 2;
                $close = strpos($html, '>', $nameStart);
                if ($close === false) {
                    $name = (string) substr($html, $nameStart);
                    $pos = $len;
                } else {
                    $name = (string) substr($html, $nameStart, $close - $nameStart);
                    $pos = $close + 1;
                }
                endCallback($name);
                continue;
            }

            // ---- start tag <name ...> or declaration <!name ...> ----
            $isDecl = ($next === '!');
            $nameStart = $pos + ($isDecl ? 2 : 1);
            $nameLen = strcspn($html, $nameStops, $nameStart);
            $name = (string) substr($html, $nameStart, $nameLen);

            // Skip the attribute list up to the closing '>', ignoring any
            // '>' that appears inside a quoted value.
            $scan = $nameStart + $nameLen;
            $quote = '';
            while ($scan < $len) {
                $ch = $html[$scan];
                if ($quote !== '') {
                    if ($ch === $quote) {
                        $quote = '';
                    }
                } elseif ($ch === '"' || $ch === '\'') {
                    $quote = $ch;
                } elseif ($ch === '>') {
                    break;
                }
                $scan++;
            }
            $pos = ($scan < $len) ? $scan + 1 : $len;

            if ($isDecl) {
                declCallback($name, "", 0);
            } else {
                startCallback($name, "", 0);
            }
            continue;
        }

        // ---- text run: up to the next '<' that really opens markup ----
        $textStart = $pos;
        $scan = $pos;
        while ($scan < $len) {
            $lt = strpos($html, '<', $scan);
            if ($lt === false) {
                $scan = $len;
                break;
            }
            $after = ($lt + 1 < $len) ? $html[$lt + 1] : '';
            if ($after !== '' && $after !== ' ') {
                $scan = $lt;
                break;
            }
            // A literal '<' (followed by a space or end of input): keep scanning.
            $scan = $lt + 1;
        }
        $text = (string) substr($html, $textStart, $scan - $textStart);
        $pos = $scan;

        // Whitespace between tags is layout, not content.
        if (trim($text) === '') {
            continue;
        }

        textStartCallback();
        textCallback($text);
        textEndCallback();
    }

    return;
}
当然,如果你觉得这个 HTML 解析器太简单了,请看笔者写的《PHP Simple HTML DOM解析器使用入门》
手颤了,飘过~