DaveChild\TextStatistics\Text::cleanText PHP Method

cleanText() public static method

Trims, removes line breaks, multiple spaces and generally cleans text before processing.
public static cleanText ( string | boolean $strText ) : string
$strText string | boolean Text to be transformed
return string
    /**
     * Trims, removes line breaks and multiple spaces, and generally
     * normalises text into plain, period-terminated sentences before
     * processing.
     *
     * The result is cached in self::$clean, keyed by the SHA-1 hash of the
     * raw input, so the same text is only cleaned once.
     *
     * @param  string|boolean $strText Text to be transformed
     * @return string Cleaned text; an empty string when a boolean is given
     */
    public static function cleanText($strText)
    {
        // Check for boolean before processing as string
        if (is_bool($strText)) {
            return '';
        }

        // Check to see if we already processed this text. If we did, don't
        // re-process it.
        $key = sha1($strText);
        if (isset(self::$clean[$key])) {
            return self::$clean[$key];
        }

        // Curly quotes, dashes and ellipsis -> ASCII. This has to happen
        // while the text is still UTF-8: utf8_decode() below turns every
        // character that has no ISO-8859-1 equivalent into "?", which would
        // later be counted as a sentence terminator. The needles are written
        // as byte escapes so they do not depend on this file's encoding.
        $strText = str_replace(
            array(
                "\xe2\x80\x98", // U+2018 left single quotation mark
                "\xe2\x80\x99", // U+2019 right single quotation mark
                "\xe2\x80\x9c", // U+201C left double quotation mark
                "\xe2\x80\x9d", // U+201D right double quotation mark
                "\xe2\x80\x93", // U+2013 en dash
                "\xe2\x80\x94", // U+2014 em dash
                "\xe2\x80\xa6"  // U+2026 horizontal ellipsis
            ),
            array("'", "'", '"', '"', '-', '--', '...'),
            $strText
        );

        // NOTE(review): utf8_decode() is deprecated as of PHP 8.2; a
        // replacement needs the mbstring extension, so it is kept for now.
        $strText = utf8_decode($strText);

        // The same punctuation as single bytes (the Windows-1252 positions
        // of the curly quotes, dashes and ellipsis)
        $strText = str_replace(
            array(chr(145), chr(146), chr(147), chr(148), chr(150), chr(151), chr(133)),
            array("'", "'", '"', '"', '-', '--', '...'),
            $strText
        );

        // Replace periods within numbers so that a decimal point is not
        // mistaken for a sentence terminator. Only digits are matched (no
        // surrounding characters are consumed), so decimals at the very
        // start or end of the text and decimals separated by a single
        // character are all handled.
        $strText = preg_replace('`([0-9]+)\\.([0-9]+)`', '${1}0$2', $strText);

        // Handle HTML. Treat block level elements as sentence terminators and
        // remove all other tags.
        $strText = preg_replace('`<script(.*?)>(.*?)</script>`is', '', $strText);
        $strText = preg_replace('`\\</?(address|blockquote|center|dir|div|dl|dd|dt|fieldset|form|h1|h2|h3|h4|h5|h6|menu|noscript|ol|p|pre|table|ul|li)[^>]*>`is', '.', $strText);
        $strText = html_entity_decode($strText);
        $strText = strip_tags($strText);

        // Assume blank lines (i.e., paragraph breaks) end sentences (useful
        // for titles in plain text documents) and replace remaining new
        // lines with spaces
        $strText = preg_replace('`(\\r\\n|\\n\\r)`is', "\n", $strText);
        $strText = preg_replace('`(\\r|\\n){2,}`is', ".\n\n", $strText);
        $strText = preg_replace('`[ ]*(\\n|\\r\\n|\\r)[ ]*`', ' ', $strText);

        // Replace commas, hyphens, quotes etc (count as spaces)
        $strText = preg_replace('`[",:;()/\\`-]`', ' ', $strText);

        // Unify terminators and spaces
        $strText = trim($strText, '. ') . '.'; // Add final terminator.
        $strText = preg_replace('`[\\.!?]`', '.', $strText); // Unify terminators
        $strText = preg_replace('`([\\.\\s]*\\.[\\.\\s]*)`mis', '. ', $strText); // Merge terminators separated by whitespace.
        $strText = preg_replace('`[ ]+`', ' ', $strText); // Remove multiple spaces
        $strText = preg_replace('`([\\.])[\\. ]+`', '$1', $strText); // Check for duplicated terminators
        $strText = trim(preg_replace('`[ ]*([\\.])`', '$1 ', $strText)); // Pad sentence terminators

        // Lower case the first character following each terminator (for
        // gunning fog score). An anonymous function is used because
        // create_function() was removed in PHP 8.0.
        $strText = preg_replace_callback(
            '`\\. [^\\. ]`',
            function ($matches) {
                return strtolower($matches[0]);
            },
            $strText
        );
        $strText = trim($strText);

        // Cache it and return
        self::$clean[$key] = $strText;
        return $strText;
    }