/**
 * Convert an HTML document to UTF-8.
 *
 * The source encoding is taken from the charset parameter of the given
 * header(s); when that is missing (or is the bogus value "none") it is
 * searched for in the first 50000 bytes of the document itself (XML
 * declaration, then <meta http-equiv="Content-Type">, then any <meta>
 * carrying a charset). If the resulting encoding is already UTF-8 the
 * document is returned as-is, otherwise it is handed to
 * \SimplePie_Misc::change_encoding(); when that yields a falsy result the
 * unconverted document is returned instead.
 *
 * @param string            $html   Raw document body
 * @param string|array|null $header Content-Type header value, or a list of them
 *                                  (joined with newlines; the last one wins)
 *
 * @return string The converted document, or $html untouched when either
 *                argument is empty
 */
private function convert2Utf8($html, $header = null)
{
    if (empty($html) || empty($header)) {
        return $html;
    }

    // remove strange things
    $html = str_replace('</[>', '', $html);

    if (is_array($header)) {
        $header = implode("\n", $header);
    }

    $encoding = null;
    // characters stripped around a charset value, and considered "residue" between matches
    $junk = "\"' \r\n\v\t";

    // $header can be empty again here (e.g. an array of empty strings)
    if (empty($header) || !preg_match_all('/([^;]+)(?:;\\s*charset=["\']?([^;"\'\\n]*))?/im', $header, $matches, PREG_SET_ORDER)) {
        // error parsing the response
        $this->logger->log('debug', 'Could not find Content-Type header in HTTP response', ['header' => $header]);
    } else {
        // Use the last matched element (in case of redirects). The pattern stops
        // before a closing quote or a newline, so those leftovers show up as an
        // extra trailing match without a charset group; skip them, otherwise a
        // header such as `text/html; charset="utf-8"` would lose its charset.
        foreach (array_reverse($matches) as $candidate) {
            if ('' === trim($candidate[0], $junk)) {
                continue;
            }

            if (isset($candidate[2])) {
                $encoding = trim($candidate[2], $junk);
            }

            break;
        }
    }

    // TODO: check to see if encoding is supported (can we convert it?)
    // If it's not, result will be empty string.
    // For now we'll check for invalid encoding types returned by some sites, e.g. 'none'
    // Problem URL: http://facta.co.jp/blog/archives/20111026001026.html
    if (empty($encoding) || 'none' === strtolower($encoding)) {
        // search for encoding in HTML - only look at the first 50000 characters
        // Why 50000? See, for example, http://www.lemonde.fr/festival-de-cannes/article/2012/05/23/deux-cretes-en-goguette-sur-la-croisette_1705732_766360.html
        // TODO: improve this so it looks at smaller chunks first
        $html_head = substr($html, 0, 50000);

        if (preg_match('/^<\\?xml\\s+version=(?:"[^"]*"|\'[^\']*\')\\s+encoding=("[^"]*"|\'[^\']*\')/s', $html_head, $match)) {
            $encoding = trim($match[1], '"\'');
        } elseif (preg_match('/<meta\\s+http-equiv=["\']?Content-Type["\']? content=["\'][^;]+;\\s*charset=["\']?([^;"\'>]+)/i', $html_head, $match)) {
            $encoding = trim($match[1]);
        } elseif (preg_match_all('/<meta\\s+([^>]+)>/i', $html_head, $match)) {
            foreach ($match[1] as $_test) {
                if (preg_match('/charset=["\']?([^"\']+)/i', $_test, $_m)) {
                    $encoding = trim($_m[1]);

                    break;
                }
            }
        }
    }

    // $encoding is still null when nothing was detected; cast so trim() never
    // receives null (deprecated since PHP 8.1)
    $encoding = strtolower(trim((string) $encoding));

    // fix bad encoding values
    if ('iso-8850-1' === $encoding) {
        $encoding = 'iso-8859-1';
    }

    if (empty($encoding) || 'iso-8859-1' === $encoding) {
        // Replace MS Word smart quotes (Windows-1252 bytes 0x82-0x9F).
        // The replacements are ASCII-only HTML entities on purpose: the document
        // is converted from a single-byte encoding right after this, so literal
        // multi-byte UTF-8 characters inserted here would be re-encoded byte by
        // byte and end up as mojibake.
        // NOTE(review): when no encoding was detected and the document is in fact
        // UTF-8, this byte-wise replacement also hits UTF-8 continuation bytes;
        // consider guarding that case.
        $trans = [
            chr(130) => '&sbquo;',  // Single Low-9 Quotation Mark
            chr(131) => '&fnof;',   // Latin Small Letter F With Hook
            chr(132) => '&bdquo;',  // Double Low-9 Quotation Mark
            chr(133) => '&hellip;', // Horizontal Ellipsis
            chr(134) => '&dagger;', // Dagger
            chr(135) => '&Dagger;', // Double Dagger
            chr(136) => '&circ;',   // Modifier Letter Circumflex Accent
            chr(137) => '&permil;', // Per Mille Sign
            chr(138) => '&Scaron;', // Latin Capital Letter S With Caron
            chr(139) => '&lsaquo;', // Single Left-Pointing Angle Quotation Mark
            chr(140) => '&OElig;',  // Latin Capital Ligature OE
            chr(145) => '&lsquo;',  // Left Single Quotation Mark
            chr(146) => '&rsquo;',  // Right Single Quotation Mark
            chr(147) => '&ldquo;',  // Left Double Quotation Mark
            chr(148) => '&rdquo;',  // Right Double Quotation Mark
            chr(149) => '&bull;',   // Bullet
            chr(150) => '&ndash;',  // En Dash
            chr(151) => '&mdash;',  // Em Dash
            chr(152) => '&tilde;',  // Small Tilde
            chr(153) => '&trade;',  // Trade Mark Sign
            chr(154) => '&scaron;', // Latin Small Letter S With Caron
            chr(155) => '&rsaquo;', // Single Right-Pointing Angle Quotation Mark
            chr(156) => '&oelig;',  // Latin Small Ligature OE
            chr(159) => '&Yuml;',   // Latin Capital Letter Y With Diaeresis
        ];
        $html = strtr($html, $trans);
    }

    if ('utf-8' !== $encoding) {
        $this->logger->log('debug', 'Converting to UTF-8', ['encoding' => $encoding]);

        // fall back to the unconverted document when the conversion yields a falsy result
        return \SimplePie_Misc::change_encoding($html, $encoding, 'utf-8') ?: $html;
    }

    $this->logger->log('debug', 'Treating as UTF-8', ['encoding' => $encoding]);

    return $html;
}