File Horde/Text/Diff/Engine/Native.php

  1: <?php
  2: /**
  3:  * Class used internally by Horde_Text_Diff to actually compute the diffs.
  4:  *
  5:  * This class is implemented using native PHP code.
  6:  *
  7:  * The algorithm used here is mostly lifted from the perl module
  8:  * Algorithm::Diff (version 1.06) by Ned Konz, which is available at:
  9:  * http://www.perl.com/CPAN/authors/id/N/NE/NEDKONZ/Algorithm-Diff-1.06.zip
 10:  *
 11:  * More ideas are taken from: http://www.ics.uci.edu/~eppstein/161/960229.html
 12:  *
 13:  * Some ideas (and a bit of code) are taken from analyze.c, of GNU
 14:  * diffutils-2.7, which can be found at:
 15:  * ftp://gnudist.gnu.org/pub/gnu/diffutils/diffutils-2.7.tar.gz
 16:  *
 17:  * Some ideas (subdivision by NCHUNKS > 2, and some optimizations) are from
 18:  * Geoffrey T. Dairiki <dairiki@dairiki.org>. The original PHP version of this
 19:  * code was written by him, and is used/adapted with his permission.
 20:  *
 21:  * Copyright 2004-2012 Horde LLC (http://www.horde.org/)
 22:  *
 23:  * See the enclosed file COPYING for license information (LGPL). If you did
 24:  * not receive this file, see http://www.horde.org/licenses/lgpl21.
 25:  *
 26:  * @author  Geoffrey T. Dairiki <dairiki@dairiki.org>
 27:  * @package Text_Diff
 28:  */
 29: class Horde_Text_Diff_Engine_Native
 30: {
 31:     public function diff($from_lines, $to_lines)
 32:     {
 33:         array_walk($from_lines, array('Horde_Text_Diff', 'trimNewlines'));
 34:         array_walk($to_lines, array('Horde_Text_Diff', 'trimNewlines'));
 35: 
 36:         $n_from = count($from_lines);
 37:         $n_to = count($to_lines);
 38: 
 39:         $this->xchanged = $this->ychanged = array();
 40:         $this->xv = $this->yv = array();
 41:         $this->xind = $this->yind = array();
 42:         unset($this->seq);
 43:         unset($this->in_seq);
 44:         unset($this->lcs);
 45: 
 46:         // Skip leading common lines.
 47:         for ($skip = 0; $skip < $n_from && $skip < $n_to; $skip++) {
 48:             if ($from_lines[$skip] !== $to_lines[$skip]) {
 49:                 break;
 50:             }
 51:             $this->xchanged[$skip] = $this->ychanged[$skip] = false;
 52:         }
 53: 
 54:         // Skip trailing common lines.
 55:         $xi = $n_from; $yi = $n_to;
 56:         for ($endskip = 0; --$xi > $skip && --$yi > $skip; $endskip++) {
 57:             if ($from_lines[$xi] !== $to_lines[$yi]) {
 58:                 break;
 59:             }
 60:             $this->xchanged[$xi] = $this->ychanged[$yi] = false;
 61:         }
 62: 
 63:         // Ignore lines which do not exist in both files.
 64:         for ($xi = $skip; $xi < $n_from - $endskip; $xi++) {
 65:             $xhash[$from_lines[$xi]] = 1;
 66:         }
 67:         for ($yi = $skip; $yi < $n_to - $endskip; $yi++) {
 68:             $line = $to_lines[$yi];
 69:             if (($this->ychanged[$yi] = empty($xhash[$line]))) {
 70:                 continue;
 71:             }
 72:             $yhash[$line] = 1;
 73:             $this->yv[] = $line;
 74:             $this->yind[] = $yi;
 75:         }
 76:         for ($xi = $skip; $xi < $n_from - $endskip; $xi++) {
 77:             $line = $from_lines[$xi];
 78:             if (($this->xchanged[$xi] = empty($yhash[$line]))) {
 79:                 continue;
 80:             }
 81:             $this->xv[] = $line;
 82:             $this->xind[] = $xi;
 83:         }
 84: 
 85:         // Find the LCS.
 86:         $this->_compareseq(0, count($this->xv), 0, count($this->yv));
 87: 
 88:         // Merge edits when possible.
 89:         $this->_shiftBoundaries($from_lines, $this->xchanged, $this->ychanged);
 90:         $this->_shiftBoundaries($to_lines, $this->ychanged, $this->xchanged);
 91: 
 92:         // Compute the edit operations.
 93:         $edits = array();
 94:         $xi = $yi = 0;
 95:         while ($xi < $n_from || $yi < $n_to) {
 96:             assert($yi < $n_to || $this->xchanged[$xi]);
 97:             assert($xi < $n_from || $this->ychanged[$yi]);
 98: 
 99:             // Skip matching "snake".
100:             $copy = array();
101:             while ($xi < $n_from && $yi < $n_to
102:                    && !$this->xchanged[$xi] && !$this->ychanged[$yi]) {
103:                 $copy[] = $from_lines[$xi++];
104:                 ++$yi;
105:             }
106:             if ($copy) {
107:                 $edits[] = new Horde_Text_Diff_Op_Copy($copy);
108:             }
109: 
110:             // Find deletes & adds.
111:             $delete = array();
112:             while ($xi < $n_from && $this->xchanged[$xi]) {
113:                 $delete[] = $from_lines[$xi++];
114:             }
115: 
116:             $add = array();
117:             while ($yi < $n_to && $this->ychanged[$yi]) {
118:                 $add[] = $to_lines[$yi++];
119:             }
120: 
121:             if ($delete && $add) {
122:                 $edits[] = new Horde_Text_Diff_Op_Change($delete, $add);
123:             } elseif ($delete) {
124:                 $edits[] = new Horde_Text_Diff_Op_Delete($delete);
125:             } elseif ($add) {
126:                 $edits[] = new Horde_Text_Diff_Op_Add($add);
127:             }
128:         }
129: 
130:         return $edits;
131:     }
132: 
133:     /**
134:      * Divides the Largest Common Subsequence (LCS) of the sequences (XOFF,
135:      * XLIM) and (YOFF, YLIM) into NCHUNKS approximately equally sized
136:      * segments.
137:      *
138:      * Returns (LCS, PTS).  LCS is the length of the LCS. PTS is an array of
139:      * NCHUNKS+1 (X, Y) indexes giving the diving points between sub
140:      * sequences.  The first sub-sequence is contained in (X0, X1), (Y0, Y1),
141:      * the second in (X1, X2), (Y1, Y2) and so on.  Note that (X0, Y0) ==
142:      * (XOFF, YOFF) and (X[NCHUNKS], Y[NCHUNKS]) == (XLIM, YLIM).
143:      *
144:      * This public function assumes that the first lines of the specified portions of
145:      * the two files do not match, and likewise that the last lines do not
146:      * match.  The caller must trim matching lines from the beginning and end
147:      * of the portions it is going to specify.
148:      */
149:     protected function _diag ($xoff, $xlim, $yoff, $ylim, $nchunks)
150:     {
151:         $flip = false;
152: 
153:         if ($xlim - $xoff > $ylim - $yoff) {
154:             /* Things seems faster (I'm not sure I understand why) when the
155:              * shortest sequence is in X. */
156:             $flip = true;
157:             list ($xoff, $xlim, $yoff, $ylim)
158:                 = array($yoff, $ylim, $xoff, $xlim);
159:         }
160: 
161:         if ($flip) {
162:             for ($i = $ylim - 1; $i >= $yoff; $i--) {
163:                 $ymatches[$this->xv[$i]][] = $i;
164:             }
165:         } else {
166:             for ($i = $ylim - 1; $i >= $yoff; $i--) {
167:                 $ymatches[$this->yv[$i]][] = $i;
168:             }
169:         }
170: 
171:         $this->lcs = 0;
172:         $this->seq[0]= $yoff - 1;
173:         $this->in_seq = array();
174:         $ymids[0] = array();
175: 
176:         $numer = $xlim - $xoff + $nchunks - 1;
177:         $x = $xoff;
178:         for ($chunk = 0; $chunk < $nchunks; $chunk++) {
179:             if ($chunk > 0) {
180:                 for ($i = 0; $i <= $this->lcs; $i++) {
181:                     $ymids[$i][$chunk - 1] = $this->seq[$i];
182:                 }
183:             }
184: 
185:             $x1 = $xoff + (int)(($numer + ($xlim - $xoff) * $chunk) / $nchunks);
186:             for (; $x < $x1; $x++) {
187:                 $line = $flip ? $this->yv[$x] : $this->xv[$x];
188:                 if (empty($ymatches[$line])) {
189:                     continue;
190:                 }
191:                 $matches = $ymatches[$line];
192:                 reset($matches);
193:                 while (list(, $y) = each($matches)) {
194:                     if (empty($this->in_seq[$y])) {
195:                         $k = $this->_lcsPos($y);
196:                         assert($k > 0);
197:                         $ymids[$k] = $ymids[$k - 1];
198:                         break;
199:                     }
200:                 }
201:                 while (list(, $y) = each($matches)) {
202:                     if ($y > $this->seq[$k - 1]) {
203:                         assert($y <= $this->seq[$k]);
204:                         /* Optimization: this is a common case: next match is
205:                          * just replacing previous match. */
206:                         $this->in_seq[$this->seq[$k]] = false;
207:                         $this->seq[$k] = $y;
208:                         $this->in_seq[$y] = 1;
209:                     } elseif (empty($this->in_seq[$y])) {
210:                         $k = $this->_lcsPos($y);
211:                         assert($k > 0);
212:                         $ymids[$k] = $ymids[$k - 1];
213:                     }
214:                 }
215:             }
216:         }
217: 
218:         $seps[] = $flip ? array($yoff, $xoff) : array($xoff, $yoff);
219:         $ymid = $ymids[$this->lcs];
220:         for ($n = 0; $n < $nchunks - 1; $n++) {
221:             $x1 = $xoff + (int)(($numer + ($xlim - $xoff) * $n) / $nchunks);
222:             $y1 = $ymid[$n] + 1;
223:             $seps[] = $flip ? array($y1, $x1) : array($x1, $y1);
224:         }
225:         $seps[] = $flip ? array($ylim, $xlim) : array($xlim, $ylim);
226: 
227:         return array($this->lcs, $seps);
228:     }
229: 
230:     protected function _lcsPos($ypos)
231:     {
232:         $end = $this->lcs;
233:         if ($end == 0 || $ypos > $this->seq[$end]) {
234:             $this->seq[++$this->lcs] = $ypos;
235:             $this->in_seq[$ypos] = 1;
236:             return $this->lcs;
237:         }
238: 
239:         $beg = 1;
240:         while ($beg < $end) {
241:             $mid = (int)(($beg + $end) / 2);
242:             if ($ypos > $this->seq[$mid]) {
243:                 $beg = $mid + 1;
244:             } else {
245:                 $end = $mid;
246:             }
247:         }
248: 
249:         assert($ypos != $this->seq[$end]);
250: 
251:         $this->in_seq[$this->seq[$end]] = false;
252:         $this->seq[$end] = $ypos;
253:         $this->in_seq[$ypos] = 1;
254:         return $end;
255:     }
256: 
257:     /**
258:      * Finds LCS of two sequences.
259:      *
260:      * The results are recorded in the vectors $this->{x,y}changed[], by
261:      * storing a 1 in the element for each line that is an insertion or
262:      * deletion (ie. is not in the LCS).
263:      *
264:      * The subsequence of file 0 is (XOFF, XLIM) and likewise for file 1.
265:      *
266:      * Note that XLIM, YLIM are exclusive bounds.  All line numbers are
267:      * origin-0 and discarded lines are not counted.
268:      */
269:     protected function _compareseq ($xoff, $xlim, $yoff, $ylim)
270:     {
271:         /* Slide down the bottom initial diagonal. */
272:         while ($xoff < $xlim && $yoff < $ylim
273:                && $this->xv[$xoff] == $this->yv[$yoff]) {
274:             ++$xoff;
275:             ++$yoff;
276:         }
277: 
278:         /* Slide up the top initial diagonal. */
279:         while ($xlim > $xoff && $ylim > $yoff
280:                && $this->xv[$xlim - 1] == $this->yv[$ylim - 1]) {
281:             --$xlim;
282:             --$ylim;
283:         }
284: 
285:         if ($xoff == $xlim || $yoff == $ylim) {
286:             $lcs = 0;
287:         } else {
288:             /* This is ad hoc but seems to work well.  $nchunks =
289:              * sqrt(min($xlim - $xoff, $ylim - $yoff) / 2.5); $nchunks =
290:              * max(2,min(8,(int)$nchunks)); */
291:             $nchunks = min(7, $xlim - $xoff, $ylim - $yoff) + 1;
292:             list($lcs, $seps)
293:                 = $this->_diag($xoff, $xlim, $yoff, $ylim, $nchunks);
294:         }
295: 
296:         if ($lcs == 0) {
297:             /* X and Y sequences have no common subsequence: mark all
298:              * changed. */
299:             while ($yoff < $ylim) {
300:                 $this->ychanged[$this->yind[$yoff++]] = 1;
301:             }
302:             while ($xoff < $xlim) {
303:                 $this->xchanged[$this->xind[$xoff++]] = 1;
304:             }
305:         } else {
306:             /* Use the partitions to split this problem into subproblems. */
307:             reset($seps);
308:             $pt1 = $seps[0];
309:             while ($pt2 = next($seps)) {
310:                 $this->_compareseq ($pt1[0], $pt2[0], $pt1[1], $pt2[1]);
311:                 $pt1 = $pt2;
312:             }
313:         }
314:     }
315: 
316:     /**
317:      * Adjusts inserts/deletes of identical lines to join changes as much as
318:      * possible.
319:      *
320:      * We do something when a run of changed lines include a line at one end
321:      * and has an excluded, identical line at the other.  We are free to
322:      * choose which identical line is included.  `compareseq' usually chooses
323:      * the one at the beginning, but usually it is cleaner to consider the
324:      * following identical line to be the "change".
325:      *
326:      * This is extracted verbatim from analyze.c (GNU diffutils-2.7).
327:      */
328:     protected function _shiftBoundaries($lines, &$changed, $other_changed)
329:     {
330:         $i = 0;
331:         $j = 0;
332: 
333:         assert('count($lines) == count($changed)');
334:         $len = count($lines);
335:         $other_len = count($other_changed);
336: 
337:         while (1) {
338:             /* Scan forward to find the beginning of another run of
339:              * changes. Also keep track of the corresponding point in the
340:              * other file.
341:              *
342:              * Throughout this code, $i and $j are adjusted together so that
343:              * the first $i elements of $changed and the first $j elements of
344:              * $other_changed both contain the same number of zeros (unchanged
345:              * lines).
346:              *
347:              * Furthermore, $j is always kept so that $j == $other_len or
348:              * $other_changed[$j] == false. */
349:             while ($j < $other_len && $other_changed[$j]) {
350:                 $j++;
351:             }
352: 
353:             while ($i < $len && ! $changed[$i]) {
354:                 assert('$j < $other_len && ! $other_changed[$j]');
355:                 $i++; $j++;
356:                 while ($j < $other_len && $other_changed[$j]) {
357:                     $j++;
358:                 }
359:             }
360: 
361:             if ($i == $len) {
362:                 break;
363:             }
364: 
365:             $start = $i;
366: 
367:             /* Find the end of this run of changes. */
368:             while (++$i < $len && $changed[$i]) {
369:                 continue;
370:             }
371: 
372:             do {
373:                 /* Record the length of this run of changes, so that we can
374:                  * later determine whether the run has grown. */
375:                 $runlength = $i - $start;
376: 
377:                 /* Move the changed region back, so long as the previous
378:                  * unchanged line matches the last changed one.  This merges
379:                  * with previous changed regions. */
380:                 while ($start > 0 && $lines[$start - 1] == $lines[$i - 1]) {
381:                     $changed[--$start] = 1;
382:                     $changed[--$i] = false;
383:                     while ($start > 0 && $changed[$start - 1]) {
384:                         $start--;
385:                     }
386:                     assert('$j > 0');
387:                     while ($other_changed[--$j]) {
388:                         continue;
389:                     }
390:                     assert('$j >= 0 && !$other_changed[$j]');
391:                 }
392: 
393:                 /* Set CORRESPONDING to the end of the changed run, at the
394:                  * last point where it corresponds to a changed run in the
395:                  * other file. CORRESPONDING == LEN means no such point has
396:                  * been found. */
397:                 $corresponding = $j < $other_len ? $i : $len;
398: 
399:                 /* Move the changed region forward, so long as the first
400:                  * changed line matches the following unchanged one.  This
401:                  * merges with following changed regions.  Do this second, so
402:                  * that if there are no merges, the changed region is moved
403:                  * forward as far as possible. */
404:                 while ($i < $len && $lines[$start] == $lines[$i]) {
405:                     $changed[$start++] = false;
406:                     $changed[$i++] = 1;
407:                     while ($i < $len && $changed[$i]) {
408:                         $i++;
409:                     }
410: 
411:                     assert('$j < $other_len && ! $other_changed[$j]');
412:                     $j++;
413:                     if ($j < $other_len && $other_changed[$j]) {
414:                         $corresponding = $i;
415:                         while ($j < $other_len && $other_changed[$j]) {
416:                             $j++;
417:                         }
418:                     }
419:                 }
420:             } while ($runlength != $i - $start);
421: 
422:             /* If possible, move the fully-merged run of changes back to a
423:              * corresponding run in the other file. */
424:             while ($corresponding < $i) {
425:                 $changed[--$start] = 1;
426:                 $changed[--$i] = 0;
427:                 assert('$j > 0');
428:                 while ($other_changed[--$j]) {
429:                     continue;
430:                 }
431:                 assert('$j >= 0 && !$other_changed[$j]');
432:             }
433:         }
434:     }
435: }
436:
Packages

Classes