PKPString::diff PHP Method

diff() static public method

The calculation is optimized to identify the largest common substring. The return value is an array of the following format: array( array( diff-type => substring ), array(...) ), where diff-type is one of: -1 = deletion, 0 = common substring, 1 = addition.
static public diff ( $originalString, $editedString ) : array
$originalString string
$editedString string
return array
    /**
     * Calculate the differences between two strings.
     *
     * The largest common substring of both strings is located first; the
     * parts before and after it are then diffed recursively and merged
     * around it.
     *
     * @param $originalString string the string before editing
     * @param $editedString string the string after editing
     * @return array a list of single-entry arrays of the form
     *  array(diff-type => substring), where diff-type is
     *  -1 (deletion), 0 (common substring) or 1 (addition).
     */
    static function diff($originalString, $editedString)
    {
        // Split both strings into character arrays. Multi-byte handling is
        // delegated to self::regexp_match_all() (assumed, as in the original
        // implementation - confirm against that helper).
        // The "s" (dotall) modifier is essential: without it "." does not
        // match newlines, the character arrays get shorter than the strings
        // and the offsets passed to self::substr() below point to the wrong
        // characters.
        $originalStringCharacters = array();
        self::regexp_match_all('/./s', $originalString, $originalStringCharacters);
        if (isset($originalStringCharacters[0])) {
            $originalStringCharacters = $originalStringCharacters[0];
        }
        $editedStringCharacters = array();
        self::regexp_match_all('/./s', $editedString, $editedStringCharacters);
        if (isset($editedStringCharacters[0])) {
            $editedStringCharacters = $editedStringCharacters[0];
        }

        // Determine the length of the strings (in characters).
        $originalStringLength = count($originalStringCharacters);
        $editedStringLength = count($editedStringCharacters);

        // Is there anything to compare?
        if ($originalStringLength == 0 && $editedStringLength == 0) {
            return array();
        }

        // Is the original string empty?
        if ($originalStringLength == 0) {
            // Return the edited string as addition.
            return array(array(1 => $editedString));
        }

        // Is the edited string empty?
        if ($editedStringLength == 0) {
            // Return the original string as deletion.
            return array(array(-1 => $originalString));
        }

        // Initialize the local indices:
        // 1) Create a character index for the edited string that maps
        //    every character to the list of positions where it occurs.
        $characterIndex = array();
        for ($characterPosition = 0; $characterPosition < $editedStringLength; $characterPosition++) {
            $characterIndex[$editedStringCharacters[$characterPosition]][] = $characterPosition;
        }

        // 2) Initialize the substring and the length index.
        //    - $substringIndex: "origEnd-editedEnd" => common substring
        //      ending at these two positions.
        //    - $lengthIndex: substring length => (position => substring).
        $substringIndex = $lengthIndex = array();

        // Iterate over the original string to identify
        // the largest common string.
        for ($originalPosition = 0; $originalPosition < $originalStringLength; $originalPosition++) {
            // Find all occurrences of the original character
            // in the target string.
            $comparedCharacter = $originalStringCharacters[$originalPosition];

            // Do we have a commonality between the original string
            // and the edited string?
            if (isset($characterIndex[$comparedCharacter])) {
                // Loop over all commonalities.
                foreach ($characterIndex[$comparedCharacter] as $editedPosition) {
                    // Calculate the current and the preceding position
                    // ids for indexation. The arithmetic is parenthesized
                    // explicitly so that it does not depend on the relative
                    // precedence of "-" and ".".
                    $currentPosition = $originalPosition . '-' . $editedPosition;
                    $previousPosition = ($originalPosition - 1) . '-' . ($editedPosition - 1);

                    // Does the occurrence in the target string continue
                    // an existing common substring or does it start
                    // a new one?
                    if (isset($substringIndex[$previousPosition])) {
                        // This is a continuation of an existing common
                        // substring...
                        $newSubstring = $substringIndex[$previousPosition] . $comparedCharacter;
                        $newSubstringLength = self::strlen($newSubstring);

                        // Move the substring in the substring index.
                        $substringIndex[$currentPosition] = $newSubstring;
                        unset($substringIndex[$previousPosition]);

                        // Move the substring in the length index.
                        $lengthIndex[$newSubstringLength][$currentPosition] = $newSubstring;
                        unset($lengthIndex[$newSubstringLength - 1][$previousPosition]);
                    } else {
                        // Start a new common substring...
                        // Add the substring to the substring index.
                        $substringIndex[$currentPosition] = $comparedCharacter;

                        // Add the substring to the length index.
                        $lengthIndex[1][$currentPosition] = $comparedCharacter;
                    }
                }
            }
        }

        // If we have no commonalities at all then mark the original
        // string as deleted and the edited string as added and
        // return.
        if (empty($lengthIndex)) {
            return array(array(-1 => $originalString), array(1 => $editedString));
        }

        // Pop the largest common substrings from the length index.
        // Length keys are created in ascending order (a substring of
        // length n can only be created by extending one of length n - 1),
        // so the last key is the largest length.
        end($lengthIndex);
        $largestSubstringLength = key($lengthIndex);

        // Take the first common substring if we have more than
        // one substring with the same length.
        // FIXME: Find a better heuristic for this decision.
        reset($lengthIndex[$largestSubstringLength]);
        $largestSubstringPosition = key($lengthIndex[$largestSubstringLength]);
        list($largestSubstringEndOriginal, $largestSubstringEndEdited) = explode('-', $largestSubstringPosition);
        $largestSubstring = $lengthIndex[$largestSubstringLength][$largestSubstringPosition];

        // Add the largest common substring to the result set
        $diffResult = array(array(0 => $largestSubstring));

        // Prepend the diff of the substrings before the common substring
        // to the result diff (by recursion).
        $precedingSubstringOriginal = self::substr($originalString, 0, $largestSubstringEndOriginal - $largestSubstringLength + 1);
        $precedingSubstringEdited = self::substr($editedString, 0, $largestSubstringEndEdited - $largestSubstringLength + 1);
        $diffResult = array_merge(self::diff($precedingSubstringOriginal, $precedingSubstringEdited), $diffResult);

        // Append the diff of the substrings after the common substring
        // to the result diff (by recursion).
        $succeedingSubstringOriginal = self::substr($originalString, $largestSubstringEndOriginal + 1);
        $succeedingSubstringEdited = self::substr($editedString, $largestSubstringEndEdited + 1);
        $diffResult = array_merge($diffResult, self::diff($succeedingSubstringOriginal, $succeedingSubstringEdited));

        // Return the array representing the diff.
        return $diffResult;
    }

Usage Example

 /**
  * Runs PKPString::diff() against a table of reference cases and checks
  * that each produces the expected list of diff segments.
  *
  * @covers PKPString::diff
  */
 public function testDiff()
 {
     // Each scenario is: original string, edited string, expected diff.
     $scenarios = array(
         // Two strings that have common substrings.
         array(
             'The original string.',
             'The edited original.',
             array(array(0 => 'The'), array(1 => ' edited'), array(0 => ' original'), array(-1 => ' string'), array(0 => '.')),
         ),
         // Two completely different strings.
         array(
             'abc',
             'def',
             array(array(-1 => 'abc'), array(1 => 'def')),
         ),
         // A more realistic example from the citation editor use case.
         array(
             'Willinsky, B. (2006). The access principle: The case for open acces to research and scholarship. Cambridge, MA: MIT Press.',
             'Willinsky, J. (2006). The access principle: The case for open access to research and scholarship. Cambridge, MA: MIT Press.',
             array(array(0 => 'Willinsky, '), array(-1 => 'B'), array(1 => 'J'), array(0 => '. (2006). The access principle: The case for open acce'), array(1 => 's'), array(0 => 's to research and scholarship. Cambridge, MA: MIT Press.')),
         ),
     );

     foreach ($scenarios as $scenario) {
         list($before, $after, $expected) = $scenario;
         self::assertEquals($expected, PKPString::diff($before, $after));
     }
 }
All Usage Examples Of PKPString::diff