/**
 * Converts HTML source to a plaintext representation.
 *
 * Block-level tags are turned into line breaks, <h1>/<h2>/<strong>/<th> contents
 * are upper-cased, <hr> becomes a line of 50 dashes, table cells are separated
 * by tabs, selected typographic entities are replaced by ASCII look-alikes,
 * all remaining tags are stripped, remaining entities are decoded and
 * whitespace is normalized.
 *
 * @param string $html HTML source
 * @return string Plaintext version of the input
 */
public static function toText(string $html) : string
{
    $text = $html;

    // Remove styles and scripts
    $text = self::removeTags($text, ['style', 'script']);

    // Re-format lines
    // <pre>: line breaks are converted to <br />, which are turned back into line breaks in the next step
    $text = preg_replace_callback('~<pre[^>]*>(.+?)</pre>~is', static function (array $matches) : string {
        return nl2br($matches[1]);
    }, $text);

    // \r, redundant line breaks, tabs and <br />
    $text = preg_replace(["~\r~", "~[\n\t]+~", '~<br[^>]*>~i'], ['', ' ', "\n"], $text);

    // Processing of most tags and entities.
    // Kept as one ordered pattern => replacement map so that a pattern can never
    // get out of sync with its replacement.
    static $replacements = [
        // Tags
        '~<h[3-6][^>]*>(.+?)</h[3-6]>~is' => "\n\n\\1\n\n",
        '~(<div[^>]*>)|(</div>)~i' => "\n\n",
        '~(<p(?:\\s+[^>]+)?>)|(</p>)~i' => "\n\n",
        '~(<table[^>]*>)|(</table>)~i' => "\n\n",
        // The closing ">" is required; a bare "</tr" prefix must not match
        '~</tr\\s*>~i' => "\n",
        '~<td[^>]*>(.+?)</td>~is' => "\\1\t",
        // Typographic entities (named and decimal forms) replaced by ASCII look-alikes.
        // They have to be matched as entity references; the patterns are case-insensitive
        // because entity names may have been upper-cased by surrounding markup processing.
        '~&hellip;|&#8230;~i' => '...',
        '~&ldquo;|&rdquo;|&#8220;|&#8221;~i' => '"',
        // NOTE(review): &apos; is handled explicitly; confirm whether html_entity_decode()
        // below would decode it with the flags used on the target PHP version
        '~&apos;|&#0*39;~i' => '\'',
        '~&copy;|&#169;~i' => '(c)',
        '~&trade;|&#8482;~i' => '(tm)',
        '~&reg;|&#174;~i' => '(R)',
        '~&mdash;|&ndash;|&#8212;|&#8211;~i' => '-',
    ];
    $text = preg_replace(array_keys($replacements), array_values($replacements), $text);

    // <h1> and <h2>
    $text = preg_replace_callback('~<h[12][^>]*>(.+?)</h[12]>~is', static function (array $matches) : string {
        return "\n\n\n" . mb_strtoupper($matches[1], 'utf-8') . "\n\n";
    }, $text);

    // <strong>
    $text = preg_replace_callback('~<strong[^>]*>(.+?)</strong>~is', static function (array $matches) : string {
        return mb_strtoupper($matches[1], 'utf-8');
    }, $text);

    // <hr /> (the replacement is constant and contains no "$" or "\", so no callback is needed)
    $text = preg_replace('~<hr[^>]*>~i', "\n" . str_repeat('-', 50) . "\n", $text);

    // <th>
    $text = preg_replace_callback('~<th[^>]*>(.+?)</th>~is', static function (array $matches) : string {
        return mb_strtoupper($matches[1], 'utf-8') . "\t";
    }, $text);

    // <a>
    $text = self::linkToText($text);

    // <ul> and <ol>
    $text = self::listToText($text);

    // Two empty lines at most
    $text = trim($text, "\n ");
    $text = preg_replace("~\n\\s+\n~", "\n\n", $text);

    // Process <blockquote> (empty lines are removed before <blockquote> processing on purpose)
    $text = self::blockquoteToText($text);

    // Remove all left tags
    $text = strip_tags($text);

    // Replacing [textlink] for <> (must be done after strip_tags, otherwise
    // the angle brackets would be stripped as tags).
    // NOTE(review): the [textlink] markers are presumably produced by linkToText() - confirm
    $text = preg_replace('~\\[textlink\\]\\s*~s', '<', $text);
    $text = preg_replace('~\\s*\\[/textlink\\]~s', '>', $text);

    // Replaces non-breaking spaces: a run of four becomes a tab, any other one a plain space.
    // NOTE(review): the run length of four is an assumption - confirm against the intended behavior
    $text = preg_replace(['~(?:&nbsp;){4}~i', '~&nbsp;~i'], ["\t", ' '], $text);

    // Remove other entities (must not be performed before)
    // After previous processing some entities are upper case, that is why we have to use strtolower
    $text = preg_replace_callback('~(&#?[a-z0-9]+;)~i', static function (array $matches) : string {
        return html_entity_decode(strtolower($matches[1]), ENT_QUOTES, 'utf-8');
    }, $text);

    // Two empty lines at most (performed for the second time on purpose)
    $text = trim($text, "\n ");
    $text = preg_replace("~\n\\s+\n~", "\n\n", $text);

    // Because of <blockquote> converting: collapse consecutive lines holding only ">"
    $text = preg_replace("~(\n>\\s*)+\n~", "\n>\n", $text);

    // One space at most
    $text = preg_replace("~(\n|\t)( )+~", '\\1', $text);
    $text = preg_replace('~( ){2,}~', ' ', $text);

    // No space at line ends
    $text = preg_replace("~[ \t]+\n~", "\n", $text);

    return $text;
}