static function iterateOverNode($node)
{
if ($node instanceof \DOMText) {
// Replace whitespace characters with a space (equivilant to \s)
return preg_replace("/[\\t\\n\\f\\r ]+/im", " ", $node->wholeText);
}
if ($node instanceof \DOMDocumentType) {
// ignore
return "";
}
$nextName = static::nextChildName($node);
$prevName = static::prevChildName($node);
$name = strtolower($node->nodeName);
// start whitespace
switch ($name) {
case "hr":
return "---------------------------------------------------------------\n";
case "style":
case "head":
case "title":
case "meta":
case "script":
// ignore these tags
return "";
case "h1":
case "h2":
case "h3":
case "h4":
case "h5":
case "h6":
case "ol":
case "ul":
// add two newlines, second line is added below
$output = "\n";
break;
case "td":
case "th":
// add tab char to separate table fields
$output = "\t";
break;
case "tr":
case "p":
case "div":
// add one line
$output = "\n";
break;
case "li":
$output = "- ";
break;
default:
// print out contents of unknown tags
$output = "";
break;
}
// debug
//$output .= "[$name,$nextName]";
if (isset($node->childNodes)) {
for ($i = 0; $i < $node->childNodes->length; $i++) {
$n = $node->childNodes->item($i);
$text = static::iterateOverNode($n);
$output .= $text;
}
}
// end whitespace
switch ($name) {
case "h1":
case "h2":
case "h3":
case "h4":
case "h5":
case "h6":
$output .= "\n";
break;
case "p":
case "br":
// add one line
if ($nextName != "div") {
$output .= "\n";
}
break;
case "div":
// add one line only if the next child isn't a div
if ($nextName != "div" && $nextName != null) {
$output .= "\n";
}
break;
case "a":
// links are returned in [text](link) format
$href = $node->getAttribute("href");
$output = trim($output);
// remove double [[ ]] s from linking images
if (substr($output, 0, 1) == "[" && substr($output, -1) == "]") {
$output = substr($output, 1, strlen($output) - 2);
// for linking images, the title of the <a> overrides the title of the <img>
if ($node->getAttribute("title")) {
$output = $node->getAttribute("title");
}
}
// if there is no link text, but a title attr
if (!$output && $node->getAttribute("title")) {
$output = $node->getAttribute("title");
}
if ($href == null) {
// it doesn't link anywhere
if ($node->getAttribute("name") != null) {
$output = "[{$output}]";
}
} else {
if ($href == $output || $href == "mailto:{$output}" || $href == "http://{$output}" || $href == "https://{$output}") {
// link to the same address: just use link
$output;
} else {
// replace it
if ($output) {
$output = "[{$output}]({$href})";
} else {
// empty string
$output = $href;
}
}
}
// does the next node require additional whitespace?
switch ($nextName) {
case "h1":
case "h2":
case "h3":
case "h4":
case "h5":
case "h6":
$output .= "\n";
break;
}
break;
case "img":
if ($node->getAttribute("title")) {
$output = "[" . $node->getAttribute("title") . "]";
} elseif ($node->getAttribute("alt")) {
$output = "[" . $node->getAttribute("alt") . "]";
} else {
$output = "";
}
break;
case "li":
$output .= "\n";
break;
default:
// do nothing
}
return $output;
}