/**
 * Normalise an HTML snippet for inclusion in the EPUB output.
 *
 * The snippet is parsed into a DOMDocument, passed through the image, media
 * and href "kneading" helpers, cleaned up (empty elements are protected from
 * being serialised as self-closing tags, srcset attributes are removed),
 * serialised back to markup and finally run through the mobi-hacks XSL
 * stylesheet.
 *
 * Side effects: libxml internal error handling is switched on and is left on
 * when the method returns. The libxml error buffer is cleared on every exit
 * path, including when one of the helpers throws.
 *
 * @param string $html HTML snippet to process
 * @param mixed $type Forwarded unchanged to kneadHref()
 * @param int $pos Forwarded unchanged to kneadHref(); defaults to 0
 *
 * @return mixed Whatever transformXML() returns (presumably the transformed markup as a string; confirm against transformXML())
 */
protected function kneadHtml($html, $type, $pos = 0)
{
    // Collect libxml parse errors internally instead of emitting PHP warnings.
    libxml_use_internal_errors(true);

    try {
        // Load HTML snippet into DOMDocument using UTF-8 hack: the XML
        // declaration prefix makes loadHTML() treat the input as UTF-8.
        $utf8_hack = '<?xml version="1.0" encoding="UTF-8"?>';
        $doc = new \DOMDocument();
        $doc->loadHTML($utf8_hack . $html);

        // Download images, change to relative paths
        $doc = $this->scrapeAndKneadImages($doc);

        // Download audio files, change to relative paths
        $doc = $this->scrapeAndKneadMedia($doc);

        // Deal with <a href="">, <a href=''>, and other mutations
        $doc = $this->kneadHref($doc, $type, $pos);

        // Make sure empty tags (e.g. <b></b>) don't get turned into
        // self-closing versions by adding an empty text node to them.
        // The query is re-run until it matches nothing, so every childless
        // element (other than br, hr and img) ends up with a child node.
        $xpath = new \DOMXPath($doc);
        $emptyElementQuery = '//*[not(text() or node() or self::br or self::hr or self::img)]';
        while (($nodes = $xpath->query($emptyElementQuery)) && $nodes->length > 0) {
            foreach ($nodes as $node) {
                $node->appendChild(new \DOMText(''));
            }
        }

        // Remove srcset attributes because responsive images aren't a thing
        // in the EPUB world.
        foreach ($xpath->query('//img[@srcset]') as $img) {
            $img->removeAttribute('srcset');
        }

        // Serialise starting from the document element rather than calling
        // saveXML() on the whole document: per the original author's note,
        // the latter emits multi-byte characters in encoded (entity) form.
        $markup = $doc->saveXML($doc->documentElement);

        // Remove auto-created <html> <body> and <!DOCTYPE> tags.
        $markup = str_replace(array('<html>', '</html>', '<body>', '</body>'), '', $markup);
        $markup = preg_replace('/^<!DOCTYPE.+?>/', '', $markup);

        // Mobi7 hacks
        return $this->transformXML(
            $utf8_hack . "<html>{$markup}</html>",
            $this->dir . '/templates/epub201/mobi-hacks.xsl'
        );
    } finally {
        // TODO: Handle errors gracefully (inspect libxml_get_errors() here
        // before the buffer is discarded).
        // Always empty the libxml error buffer so collected errors do not
        // accumulate across calls, even when a step above throws.
        libxml_clear_errors();
    }
}