ggcov - source src/xdiff/xdiffi.c

Line	Flow	Count	Block(s)	Source
1		-		/*
2		-		* LibXDiff by Davide Libenzi ( File Differential Library )
3		-		* Copyright (C) 2003 Davide Libenzi
4		-		*
5		-		* This library is free software; you can redistribute it and/or
6		-		* modify it under the terms of the GNU Lesser General Public
7		-		* License as published by the Free Software Foundation; either
8		-		* version 2.1 of the License, or (at your option) any later version.
9		-		*
10		-		* This library is distributed in the hope that it will be useful,
11		-		* but WITHOUT ANY WARRANTY; without even the implied warranty of
12		-		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13		-		* Lesser General Public License for more details.
14		-		*
15		-		* You should have received a copy of the GNU Lesser General Public
16		-		* License along with this library; if not, see
17		-		* <http://www.gnu.org/licenses/>.
18		-		*
19		-		* Davide Libenzi <davidel@xmailserver.org>
20		-		*
21		-		*/
22		-
23		-		#include "xinclude.h"
24		-		#include "integer.h"
25		-
26		-
27		-		#define XDL_MAX_COST_MIN 256
28		-		#define XDL_HEUR_MIN_COST 256
29		-		#define XDL_LINE_MAX (long)((1UL << (CHAR_BIT * sizeof(long) - 1)) - 1)
30		-		#define XDL_SNAKE_CNT 20
31		-		#define XDL_K_HEUR 4
32		-
33		-		/** Declare a function as always inlined. */
34		-		#if defined(_MSC_VER)
35		-		# define XDL_INLINE(type) static __inline type
36		-		#elif defined(__GNUC__)
37		-		# define XDL_INLINE(type) static __inline__ type
38		-		#else
39		-		# define XDL_INLINE(type) static type
40		-		#endif
41		-
42		-		typedef struct s_xdpsplit {
43		-		long i1, i2;
44		-		int min_lo, min_hi;
45		-		} xdpsplit_t;
46		-
47		-
48		-
49		-
50		-		static long xdl_split(unsigned long const *ha1, long off1, long lim1,
51		-		unsigned long const *ha2, long off2, long lim2,
52		-		long kvdf, long kvdb, int need_min, xdpsplit_t *spl,
53		-		xdalgoenv_t *xenv);
54		-		static xdchange_t xdl_add_change(xdchange_t xscr, long i1, long i2, long chg1, long chg2);
55		-
56		-
57		-
58		-
59		-
60		-		/*
61		-		* See "An O(ND) Difference Algorithm and its Variations", by Eugene Myers.
62		-		* Basically considers a "box" (off1, off2, lim1, lim2) and scan from both
63		-		* the forward diagonal starting from (off1, off2) and the backward diagonal
64		-		* starting from (lim1, lim2). If the K values on the same diagonal crosses
65		-		* returns the furthest point of reach. We might end up having to expensive
66		-		* cases using this algorithm is full, so a little bit of heuristic is needed
67		-		* to cut the search and to return a suboptimal point.
68		-		*/
69		177	2	static long xdl_split(unsigned long const *ha1, long off1, long lim1,
70		-		unsigned long const *ha2, long off2, long lim2,
71		-		long kvdf, long kvdb, int need_min, xdpsplit_t *spl,
72		-		xdalgoenv_t *xenv) {
73		177	2	long dmin = off1 - lim2, dmax = lim1 - off2;
74		177	2	long fmid = off1 - off2, bmid = lim1 - lim2;
75		177	2	long odd = (fmid - bmid) & 1;
76		177	2	long fmin = fmid, fmax = fmid;
77		177	2	long bmin = bmid, bmax = bmid;
78		-		long ec, d, i1, i2, prev1, best, dd, v, k;
79		-
80		-		/*
81		-		* Set initial diagonal values for both forward and backward path.
82		-		*/
83		177	2	kvdf[fmid] = off1;
84		177	2	kvdb[bmid] = lim1;
85		-
86		377	2,112	for (ec = 1;; ec++) {
87		554	3	int got_snake = 0;
88		-
89		-		/*
90		-		* We need to extent the diagonal "domain" by one. If the next
91		-		* values exits the box boundaries we need to change it in the
92		-		* opposite direction because (max - min) must be a power of two.
93		-		* Also we initialize the external K value to -1 so that we can
94		-		* avoid extra conditions check inside the core loop.
95		-		*/
96		554	3	if (fmin > dmin)
97		547	4	kvdf[--fmin - 1] = -1;
98		-		else
99		7	5	++fmin;
100		554	6	if (fmax < dmax)
101		526	7	kvdf[++fmax + 1] = -1;
102		-		else
103		28	8	--fmax;
104		-
105		2637	9,25,26	for (d = fmax; d >= fmin; d -= 2) {
106		2157	10	if (kvdf[d - 1] >= kvdf[d + 1])
107		565	11	i1 = kvdf[d - 1] + 1;
108		-		else
109		1592	12	i1 = kvdf[d + 1];
110		2157	13	prev1 = i1;
111		2157	13	i2 = i1 - d;
112		2892	13-17	for (; i1 < lim1 && i2 < lim2 && ha1[i1] == ha2[i2]; i1++, i2++);
113		2157	18	if (i1 - prev1 > xenv->snake_cnt)
114		4	19	got_snake = 1;
115		2157	20	kvdf[d] = i1;
116		2157	20-23	if (odd && bmin <= d && d <= bmax && kvdb[d] <= i1) {
117		74	24	spl->i1 = i1;
118		74	24	spl->i2 = i2;
119		74	24	spl->min_lo = spl->min_hi = 1;
120		74	24	return ec;
121		-		}
122		-		}
123		-
124		-		/*
125		-		* We need to extent the diagonal "domain" by one. If the next
126		-		* values exits the box boundaries we need to change it in the
127		-		* opposite direction because (max - min) must be a power of two.
128		-		* Also we initialize the external K value to -1 so that we can
129		-		* avoid extra conditions check inside the core loop.
130		-		*/
131		480	27	if (bmin > dmin)
132		473	28	kvdb[--bmin - 1] = XDL_LINE_MAX;
133	-		else
134	7	29	++bmin;
135	480	30	if (bmax < dmax)
136	475	31	kvdb[++bmax + 1] = XDL_LINE_MAX;
137	-		else
138	5	32	--bmax;
139	-
140	2155	33,49,50	for (d = bmax; d >= bmin; d -= 2) {
141	1778	34	if (kvdb[d - 1] < kvdb[d + 1])
142	1333	35	i1 = kvdb[d - 1];
143	-		else
144	445	36	i1 = kvdb[d + 1] - 1;
145	1778	37	prev1 = i1;
146	1778	37	i2 = i1 - d;
147	2446	37-41	for (; i1 > off1 && i2 > off2 && ha1[i1 - 1] == ha2[i2 - 1]; i1--, i2--);
148	1778	42	if (prev1 - i1 > xenv->snake_cnt)
149	2	43	got_snake = 1;
150	1778	44	kvdb[d] = i1;
151	1778	44-47	if (!odd && fmin <= d && d <= fmax && i1 <= kvdf[d]) {
152	103	48	spl->i1 = i1;
153	103	48	spl->i2 = i2;
154	103	48	spl->min_lo = spl->min_hi = 1;
155	103	48	return ec;
156	-		}
157	-		}
158	-
159	377	51	if (need_min)
160	173	52	continue;
161	-
162	-		/*
163	-		* If the edit cost is above the heuristic trigger and if
164	-		* we got a good snake, we sample current diagonals to see
165	-		* if some of the, have reached an "interesting" path. Our
166	-		* measure is a function of the distance from the diagonal
167	-		* corner (i1 + i2) penalized with the distance from the
168	-		* mid diagonal itself. If this value is above the current
169	-		* edit cost times a magic factor (XDL_K_HEUR) we consider
170	-		* it interesting.
171	-		*/
172	204	53,54	if (got_snake && ec > xenv->heur_min) {
173	#####	55,70,71	for (best = 0, d = fmax; d >= fmin; d -= 2) {
174	#####	56-58	dd = d > fmid ? d - fmid: fmid - d;
175	#####	59	i1 = kvdf[d];
176	#####	59	i2 = i1 - d;
177	#####	59	v = (i1 - off1) + (i2 - off2) - dd;
178	-
179	#####	59-61	if (v > XDL_K_HEUR * ec && v > best &&
180	#####	61-63	off1 + xenv->snake_cnt <= i1 && i1 < lim1 &&
181	#####	63,64	off2 + xenv->snake_cnt <= i2 && i2 < lim2) {
182	#####	65,68,69	for (k = 1; ha1[i1 - k] == ha2[i2 - k]; k++)
183	#####	66	if (k == xenv->snake_cnt) {
184	#####	67	best = v;
185	#####	67	spl->i1 = i1;
186	#####	67	spl->i2 = i2;
187	#####	67	break;
188	-		}
189	-		}
190	-		}
191	#####	72	if (best > 0) {
192	#####	73	spl->min_lo = 1;
193	#####	73	spl->min_hi = 0;
194	#####	73	return ec;
195	-		}
196	-
197	#####	74,89,90	for (best = 0, d = bmax; d >= bmin; d -= 2) {
198	#####	75-77	dd = d > bmid ? d - bmid: bmid - d;
199	#####	78	i1 = kvdb[d];
200	#####	78	i2 = i1 - d;
201	#####	78	v = (lim1 - i1) + (lim2 - i2) - dd;
202	-
203	#####	78-80	if (v > XDL_K_HEUR * ec && v > best &&
204	#####	81,82	off1 < i1 && i1 <= lim1 - xenv->snake_cnt &&
205	#####	83	off2 < i2 && i2 <= lim2 - xenv->snake_cnt) {
206	#####	84,87,88	for (k = 0; ha1[i1 + k] == ha2[i2 + k]; k++)
207	#####	85	if (k == xenv->snake_cnt - 1) {
208	#####	86	best = v;
209	#####	86	spl->i1 = i1;
210	#####	86	spl->i2 = i2;
211	#####	86	break;
212	-		}
213	-		}
214	-		}
215	#####	91	if (best > 0) {
216	#####	92	spl->min_lo = 0;
217	#####	92	spl->min_hi = 1;
218	#####	92	return ec;
219	-		}
220	-		}
221	-
222	-		/*
223	-		* Enough is enough. We spent too much time here and now we collect
224	-		* the furthest reaching path using the (i1 + i2) measure.
225	-		*/
226	204	93	if (ec >= xenv->mxcost) {
227	-		long fbest, fbest1, bbest, bbest1;
228	-
229	#####	94	fbest = fbest1 = -1;
230	#####	94,99,100	for (d = fmax; d >= fmin; d -= 2) {
231	#####	95	i1 = XDL_MIN(kvdf[d], lim1);
232	#####	95	i2 = i1 - d;
233	#####	95	if (lim2 < i2)
234	#####	96	i1 = lim2 + d, i2 = lim2;
235	#####	97	if (fbest < i1 + i2) {
236	#####	98	fbest = i1 + i2;
237	#####	98	fbest1 = i1;
238	-		}
239	-		}
240	-
241	#####	101	bbest = bbest1 = XDL_LINE_MAX;
242	#####	101,106,107	for (d = bmax; d >= bmin; d -= 2) {
243	#####	102	i1 = XDL_MAX(off1, kvdb[d]);
244	#####	102	i2 = i1 - d;
245	#####	102	if (i2 < off2)
246	#####	103	i1 = off2 + d, i2 = off2;
247	#####	104	if (i1 + i2 < bbest) {
248	#####	105	bbest = i1 + i2;
249	#####	105	bbest1 = i1;
250	-		}
251	-		}
252	-
253	#####	108	if ((lim1 + lim2) - bbest < fbest - (off1 + off2)) {
254	#####	109	spl->i1 = fbest1;
255	#####	109	spl->i2 = fbest - fbest1;
256	#####	109	spl->min_lo = 1;
257	#####	109	spl->min_hi = 0;
258	-		} else {
259	#####	110	spl->i1 = bbest1;
260	#####	110	spl->i2 = bbest - bbest1;
261	#####	110	spl->min_lo = 0;
262	#####	110	spl->min_hi = 1;
263	-		}
264	#####	111	return ec;
265	-		}
266	377	112	}
267		-		}
268		-
269		-
270		-		/*
271		-		* Rule: "Divide et Impera". Recursively split the box in sub-boxes by calling
272		-		* the box splitting function. Note that the real job (marking changed lines)
273		-		* is done in the two boundary reaching checks.
274		-		*/
275		2476	2	int xdl_recs_cmp(diffdata_t *dd1, long off1, long lim1,
276		-		diffdata_t *dd2, long off2, long lim2,
277		-		long kvdf, long kvdb, int need_min, xdalgoenv_t *xenv) {
278		2476	2	unsigned long const ha1 = dd1->ha, ha2 = dd2->ha;
279		-
280		-		/*
281		-		* Shrink the box by walking through each diagonal snake (SW and NE).
282		-		*/
283		4413	2-6	for (; off1 < lim1 && off2 < lim2 && ha1[off1] == ha2[off2]; off1++, off2++);
284		3003	7-11	for (; off1 < lim1 && off2 < lim2 && ha1[lim1 - 1] == ha2[lim2 - 1]; lim1--, lim2--);
285		-
286		-		/*
287		-		* If one dimension is empty, then all records on the other one must
288		-		* be obviously changed.
289		-		*/
290		2476	12	if (off1 == lim1) {
291		1882	13	char *rchg2 = dd2->rchg;
292		1882	13	long *rindex2 = dd2->rindex;
293		-
294		2485	13-16	for (; off2 < lim2; off2++)
295		603	14	rchg2[rindex2[off2]] = 1;
296		594	17	} else if (off2 == lim2) {
297		417	18	char *rchg1 = dd1->rchg;
298		417	18	long *rindex1 = dd1->rindex;
299		-
300		1082	18-21	for (; off1 < lim1; off1++)
301		665	19	rchg1[rindex1[off1]] = 1;
302		-		} else {
303		-		xdpsplit_t spl;
304		177	22	spl.i1 = spl.i2 = 0;
305		-
306		-		/*
307		-		* Divide ...
308		-		*/
309		177	22,23	if (xdl_split(ha1, off1, lim1, ha2, off2, lim2, kvdf, kvdb,
310		-		need_min, &spl, xenv) < 0) {
311		-
312		#####	24,31	return -1;
313		-		}
314		-
315		-		/*
316		-		* ... et Impera.
317		-		*/
318		177	25,26	if (xdl_recs_cmp(dd1, off1, spl.i1, dd2, off2, spl.i2,
319		177	28	kvdf, kvdb, spl.min_lo, xenv) < 0 \|\|
320		177	27	xdl_recs_cmp(dd1, spl.i1, lim1, dd2, spl.i2, lim2,
321		-		kvdf, kvdb, spl.min_hi, xenv) < 0) {
322		-
323		177	29,30	return -1;
324		-		}
325		-		}
326		-
327		2476	32	return 0;
328		-		}
329		-
330		-
331		2123	2	int xdl_do_diff(mmfile_t mf1, mmfile_t mf2, xpparam_t const *xpp,
332		-		xdfenv_t *xe) {
333		-		size_t ndiags, allocsize;
334		-		long kvd, kvdf, *kvdb;
335		-		xdalgoenv_t xenv;
336		-		diffdata_t dd1, dd2;
337		-
338		2123	2	if (XDF_DIFF_ALG(xpp->flags) == XDF_PATIENCE_DIFF)
339		1	3	return xdl_do_patience_diff(mf1, mf2, xpp, xe);
340		-
341		2122	4	if (XDF_DIFF_ALG(xpp->flags) == XDF_HISTOGRAM_DIFF)
342		#####	5	return xdl_do_histogram_diff(mf1, mf2, xpp, xe);
343		-
344		2122	6,7	if (xdl_prepare_env(mf1, mf2, xpp, xe) < 0) {
345		-
346		#####	8	return -1;
347		-		}
348		-
349		-		/*
350		-		* Allocate and setup K vectors to be used by the differential algorithm.
351		-		* One is to store the forward path and one to store the backward path.
352		-		*/
353		2122	9-15	GIT_ERROR_CHECK_ALLOC_ADD3(&ndiags, xe->xdf1.nreff, xe->xdf2.nreff, 3);
354		2122	16-22	GIT_ERROR_CHECK_ALLOC_MULTIPLY(&allocsize, ndiags, 2);
355		2122	23-29	GIT_ERROR_CHECK_ALLOC_ADD(&allocsize, allocsize, 2);
356		2122	30-36	GIT_ERROR_CHECK_ALLOC_MULTIPLY(&allocsize, allocsize, sizeof(long));
357		-
358		2122	37,38	if (!(kvd = (long *) xdl_malloc(allocsize))) {
359		#####	39	xdl_free_env(xe);
360		#####	40	return -1;
361		-		}
362		2122	41	kvdf = kvd;
363		2122	41	kvdb = kvdf + ndiags;
364		2122	41	kvdf += xe->xdf2.nreff + 1;
365		2122	41	kvdb += xe->xdf2.nreff + 1;
366		-
367		2122	41	xenv.mxcost = xdl_bogosqrt(ndiags);
368		2122	42	if (xenv.mxcost < XDL_MAX_COST_MIN)
369		2122	43	xenv.mxcost = XDL_MAX_COST_MIN;
370		2122	44	xenv.snake_cnt = XDL_SNAKE_CNT;
371		2122	44	xenv.heur_min = XDL_HEUR_MIN_COST;
372		-
373		2122	44	dd1.nrec = xe->xdf1.nreff;
374		2122	44	dd1.ha = xe->xdf1.ha;
375		2122	44	dd1.rchg = xe->xdf1.rchg;
376		2122	44	dd1.rindex = xe->xdf1.rindex;
377		2122	44	dd2.nrec = xe->xdf2.nreff;
378		2122	44	dd2.ha = xe->xdf2.ha;
379		2122	44	dd2.rchg = xe->xdf2.rchg;
380		2122	44	dd2.rindex = xe->xdf2.rindex;
381		-
382		2122	44,45	if (xdl_recs_cmp(&dd1, 0, dd1.nrec, &dd2, 0, dd2.nrec,
383		2122	44	kvdf, kvdb, (xpp->flags & XDF_NEED_MINIMAL) != 0, &xenv) < 0) {
384		-
385		#####	46	xdl_free(kvd);
386		#####	47	xdl_free_env(xe);
387		#####	48	return -1;
388		-		}
389		-
390		2122	49	xdl_free(kvd);
391		-
392		2122	50	return 0;
393		-		}
394		-
395		-
396		2570	2	static xdchange_t xdl_add_change(xdchange_t xscr, long i1, long i2, long chg1, long chg2) {
397		-		xdchange_t *xch;
398		-
399		2570	2,3	if (!(xch = (xdchange_t *) xdl_malloc(sizeof(xdchange_t))))
400		#####	4	return NULL;
401		-
402		2570	5	xch->next = xscr;
403		2570	5	xch->i1 = i1;
404		2570	5	xch->i2 = i2;
405		2570	5	xch->chg1 = chg1;
406		2570	5	xch->chg2 = chg2;
407		2570	5	xch->ignore = 0;
408		-
409		2570	5	return xch;
410		-		}
411		-
412		-
413		8817	2	static int recs_match(xrecord_t rec1, xrecord_t rec2, long flags)
414		-		{
415		8817	2,4	return (rec1->ha == rec2->ha &&
416		5678	3	xdl_recmatch(rec1->ptr, rec1->size,
417		-		rec2->ptr, rec2->size,
418		-		flags));
419		-		}
420		-
421		-		/*
422		-		* If a line is indented more than this, get_indent() just returns this value.
423		-		* This avoids having to do absurd amounts of work for data that are not
424		-		* human-readable text, and also ensures that the output of get_indent fits within
425		-		* an int.
426		-		*/
427		-		#define MAX_INDENT 200
428		-
429		-		/*
430		-		* Return the amount of indentation of the specified line, treating TAB as 8
431		-		* columns. Return -1 if line is empty or contains only whitespace. Clamp the
432		-		* output value at MAX_INDENT.
433		-		*/
434		#####	2	static int get_indent(xrecord_t *rec)
435		-		{
436		-		long i;
437		#####	2	int ret = 0;
438		-
439		#####	2,12,13	for (i = 0; i < rec->size; i++) {
440		#####	3	char c = rec->ptr[i];
441		-
442		#####	3,4	if (!XDL_ISSPACE(c))
443		#####	5	return ret;
444		#####	6	else if (c == ' ')
445		#####	7	ret += 1;
446		#####	8	else if (c == '\t')
447		#####	9	ret += 8 - ret % 8;
448		-		/* ignore other whitespace characters */
449		-
450		#####	10	if (ret >= MAX_INDENT)
451		#####	11	return MAX_INDENT;
452		-		}
453		-
454		-		/* The line contains only whitespace. */
455		#####	14	return -1;
456		-		}
457		-
458		-		/*
459		-		* If more than this number of consecutive blank rows are found, just return this
460		-		* value. This avoids requiring O(N^2) work for pathological cases, and also
461		-		* ensures that the output of score_split fits in an int.
462		-		*/
463		-		#define MAX_BLANKS 20
464		-
465		-		/* Characteristics measured about a hypothetical split position. */
466		-		struct split_measurement {
467		-		/*
468		-		* Is the split at the end of the file (aside from any blank lines)?
469		-		*/
470		-		int end_of_file;
471		-
472		-		/*
473		-		* How much is the line immediately following the split indented (or -1 if
474		-		* the line is blank):
475		-		*/
476		-		int indent;
477		-
478		-		/*
479		-		* How many consecutive lines above the split are blank?
480		-		*/
481		-		int pre_blank;
482		-
483		-		/*
484		-		* How much is the nearest non-blank line above the split indented (or -1
485		-		* if there is no such line)?
486		-		*/
487		-		int pre_indent;
488		-
489		-		/*
490		-		* How many lines after the line following the split are blank?
491		-		*/
492		-		int post_blank;
493		-
494		-		/*
495		-		* How much is the nearest non-blank line after the line following the
496		-		* split indented (or -1 if there is no such line)?
497		-		*/
498		-		int post_indent;
499		-		};
500		-
501		-		struct split_score {
502		-		/* The effective indent of this split (smaller is preferred). */
503		-		int effective_indent;
504		-
505		-		/* Penalty for this split (smaller is preferred). */
506		-		int penalty;
507		-		};
508		-
509		-		/*
510		-		* Fill m with information about a hypothetical split of xdf above line split.
511		-		*/
512		#####	2	static void measure_split(const xdfile_t *xdf, long split,
513		-		struct split_measurement *m)
514		-		{
515		-		long i;
516		-
517		#####	2	if (split >= xdf->nrec) {
518		#####	3	m->end_of_file = 1;
519		#####	3	m->indent = -1;
520		-		} else {
521		#####	4	m->end_of_file = 0;
522		#####	4,5	m->indent = get_indent(xdf->recs[split]);
523		-		}
524		-
525		#####	6	m->pre_blank = 0;
526		#####	6	m->pre_indent = -1;
527		#####	6,12,13	for (i = split - 1; i >= 0; i--) {
528		#####	7	m->pre_indent = get_indent(xdf->recs[i]);
529		#####	8	if (m->pre_indent != -1)
530		#####	9	break;
531		#####	10	m->pre_blank += 1;
532		#####	10	if (m->pre_blank == MAX_BLANKS) {
533		#####	11	m->pre_indent = 0;
534		#####	11	break;
535		-		}
536		-		}
537		-
538		#####	14	m->post_blank = 0;
539		#####	14	m->post_indent = -1;
540		#####	14,20,21	for (i = split + 1; i < xdf->nrec; i++) {
541		#####	15	m->post_indent = get_indent(xdf->recs[i]);
542		#####	16	if (m->post_indent != -1)
543		#####	17	break;
544		#####	18	m->post_blank += 1;
545		#####	18	if (m->post_blank == MAX_BLANKS) {
546		#####	19	m->post_indent = 0;
547		#####	19	break;
548		-		}
549		-		}
550		#####	22	}
551		-
552		-		/*
553		-		* The empirically-determined weight factors used by score_split() below.
554		-		* Larger values means that the position is a less favorable place to split.
555		-		*
556		-		* Note that scores are only ever compared against each other, so multiplying
557		-		* all of these weight/penalty values by the same factor wouldn't change the
558		-		* heuristic's behavior. Still, we need to set that arbitrary scale somehow.
559		-		* In practice, these numbers are chosen to be large enough that they can be
560		-		* adjusted relative to each other with sufficient precision despite using
561		-		* integer math.
562		-		*/
563		-
564		-		/* Penalty if there are no non-blank lines before the split */
565		-		#define START_OF_FILE_PENALTY 1
566		-
567		-		/* Penalty if there are no non-blank lines after the split */
568		-		#define END_OF_FILE_PENALTY 21
569		-
570		-		/* Multiplier for the number of blank lines around the split */
571		-		#define TOTAL_BLANK_WEIGHT (-30)
572		-
573		-		/* Multiplier for the number of blank lines after the split */
574		-		#define POST_BLANK_WEIGHT 6
575		-
576		-		/*
577		-		* Penalties applied if the line is indented more than its predecessor
578		-		*/
579		-		#define RELATIVE_INDENT_PENALTY (-4)
580		-		#define RELATIVE_INDENT_WITH_BLANK_PENALTY 10
581		-
582		-		/*
583		-		* Penalties applied if the line is indented less than both its predecessor and
584		-		* its successor
585		-		*/
586		-		#define RELATIVE_OUTDENT_PENALTY 24
587		-		#define RELATIVE_OUTDENT_WITH_BLANK_PENALTY 17
588		-
589		-		/*
590		-		* Penalties applied if the line is indented less than its predecessor but not
591		-		* less than its successor
592		-		*/
593		-		#define RELATIVE_DEDENT_PENALTY 23
594		-		#define RELATIVE_DEDENT_WITH_BLANK_PENALTY 17
595		-
596		-		/*
597		-		* We only consider whether the sum of the effective indents for splits are
598		-		* less than (-1), equal to (0), or greater than (+1) each other. The resulting
599		-		* value is multiplied by the following weight and combined with the penalty to
600		-		* determine the better of two scores.
601		-		*/
602		-		#define INDENT_WEIGHT 60
603		-
604		-		/*
605		-		* Compute a badness score for the hypothetical split whose measurements are
606		-		* stored in m. The weight factors were determined empirically using the tools and
607		-		* corpus described in
608		-		*
609		-		* https://github.com/mhagger/diff-slider-tools
610		-		*
611		-		* Also see that project if you want to improve the weights based on, for example,
612		-		* a larger or more diverse corpus.
613		-		*/
614		#####	2	static void score_add_split(const struct split_measurement m, struct split_score s)
615		-		{
616		-		/*
617		-		* A place to accumulate penalty factors (positive makes this index more
618		-		* favored):
619		-		*/
620		-		int post_blank, total_blank, indent, any_blanks;
621		-
622		#####	2,3	if (m->pre_indent == -1 && m->pre_blank == 0)
623		#####	4	s->penalty += START_OF_FILE_PENALTY;
624		-
625		#####	5	if (m->end_of_file)
626		#####	6	s->penalty += END_OF_FILE_PENALTY;
627		-
628		-		/*
629		-		* Set post_blank to the number of blank lines following the split,
630		-		* including the line immediately after the split:
631		-		*/
632		#####	7-9	post_blank = (m->indent == -1) ? 1 + m->post_blank : 0;
633		#####	10	total_blank = m->pre_blank + post_blank;
634		-
635		-		/* Penalties based on nearby blank lines: */
636		#####	10	s->penalty += TOTAL_BLANK_WEIGHT * total_blank;
637		#####	10	s->penalty += POST_BLANK_WEIGHT * post_blank;
638		-
639		#####	10	if (m->indent != -1)
640		#####	11	indent = m->indent;
641		-		else
642		#####	12	indent = m->post_indent;
643		-
644		#####	13	any_blanks = (total_blank != 0);
645		-
646		-		/* Note that the effective indent is -1 at the end of the file: */
647		#####	13	s->effective_indent += indent;
648		-
649		#####	13	if (indent == -1) {
650		-		/* No additional adjustments needed. */
651		#####	14	} else if (m->pre_indent == -1) {
652		-		/* No additional adjustments needed. */
653		#####	15	} else if (indent > m->pre_indent) {
654		-		/*
655		-		* The line is indented more than its predecessor.
656		-		*/
657		#####	16,19	s->penalty += any_blanks ?
658		#####	16-18	RELATIVE_INDENT_WITH_BLANK_PENALTY :
659		-		RELATIVE_INDENT_PENALTY;
660		#####	20	} else if (indent == m->pre_indent) {
661		-		/*
662		-		* The line has the same indentation level as its predecessor.
663		-		* No additional adjustments needed.
664		-		*/
665		-		} else {
666		-		/*
667		-		* The line is indented less than its predecessor. It could be
668		-		* the block terminator of the previous block, but it could
669		-		* also be the start of a new block (e.g., an "else" block, or
670		-		* maybe the previous block didn't have a block terminator).
671		-		* Try to distinguish those cases based on what comes next:
672		-		*/
673		#####	21,22	if (m->post_indent != -1 && m->post_indent > indent) {
674		-		/*
675		-		* The following line is indented more. So it is likely
676		-		* that this line is the start of a block.
677		-		*/
678	#####	23,26	s->penalty += any_blanks ?
679	#####	23-25	RELATIVE_OUTDENT_WITH_BLANK_PENALTY :
680	-		RELATIVE_OUTDENT_PENALTY;
681	-		} else {
682	-		/*
683	-		* That was probably the end of a block.
684	-		*/
685	#####	27	s->penalty += any_blanks ?
686	#####	27-29	RELATIVE_DEDENT_WITH_BLANK_PENALTY :
687	-		RELATIVE_DEDENT_PENALTY;
688	-		}
689	-		}
690	#####	31	}
691		-
692		#####	2	static int score_cmp(struct split_score s1, struct split_score s2)
693		-		{
694		-		/* -1 if s1.effective_indent < s2->effective_indent, etc. */
695		#####	2	int cmp_indents = ((s1->effective_indent > s2->effective_indent) -
696		#####	2	(s1->effective_indent < s2->effective_indent));
697		-
698		#####	2	return INDENT_WEIGHT * cmp_indents + (s1->penalty - s2->penalty);
699		-		}
700		-
701		-		/*
702		-		* Represent a group of changed lines in an xdfile_t (i.e., a contiguous group
703		-		* of lines that was inserted or deleted from the corresponding version of the
704		-		* file). We consider there to be such a group at the beginning of the file, at
705		-		* the end of the file, and between any two unchanged lines, though most such
706		-		* groups will usually be empty.
707		-		*
708		-		* If the first line in a group is equal to the line following the group, then
709		-		* the group can be slid down. Similarly, if the last line in a group is equal
710		-		* to the line preceding the group, then the group can be slid up. See
711		-		* group_slide_down() and group_slide_up().
712		-		*
713		-		* Note that loops that are testing for changed lines in xdf->rchg do not need
714		-		* index bounding since the array is prepared with a zero at position -1 and N.
715		-		*/
716		-		struct xdlgroup {
717		-		/*
718		-		* The index of the first changed line in the group, or the index of
719		-		* the unchanged line above which the (empty) group is located.
720		-		*/
721		-		long start;
722		-
723		-		/*
724		-		* The index of the first unchanged line after the group. For an empty
725		-		* group, end is equal to start.
726		-		*/
727		-		long end;
728		-		};
729		-
730		-		/*
731		-		* Initialize g to point at the first group in xdf.
732		-		*/
733		8492	2	static void group_init(xdfile_t xdf, struct xdlgroup g)
734		-		{
735		8492	2	g->start = g->end = 0;
736		23515	2,4	while (xdf->rchg[g->end])
737		15023	3	g->end++;
738		8492	5	}
739		-
740		-		/*
741		-		* Move g to describe the next (possibly empty) group in xdf and return 0. If g
742		-		* is already at the end of the file, do nothing and return -1.
743		-		*/
744		332691	2	XDL_INLINE(int) group_next(xdfile_t xdf, struct xdlgroup g)
745		-		{
746		332691	2	if (g->end == xdf->nrec)
747		8492	3	return -1;
748		-
749		324199	4	g->start = g->end + 1;
750		332648	4-6	for (g->end = g->start; xdf->rchg[g->end]; g->end++)
751		-		;
752		-
753		324199	7	return 0;
754		-		}
755		-
756		-		/*
757		-		* Move g to describe the previous (possibly empty) group in xdf and return 0.
758		-		* If g is already at the beginning of the file, do nothing and return -1.
759		-		*/
760		2823	2	XDL_INLINE(int) group_previous(xdfile_t xdf, struct xdlgroup g)
761		-		{
762		2823	2	if (g->start == 0)
763		#####	3	return -1;
764		-
765		2823	4	g->end = g->start - 1;
766		3201	4-6	for (g->start = g->end; xdf->rchg[g->start - 1]; g->start--)
767		-		;
768		-
769		2823	7	return 0;
770		-		}
771		-
772		-		/*
773		-		* If g can be slid toward the end of the file, do so, and if it bumps into a
774		-		* following group, expand this group to include it. Return 0 on success or -1
775		-		* if g cannot be slid down.
776		-		*/
777		7000	2	static int group_slide_down(xdfile_t xdf, struct xdlgroup g, long flags)
778		-		{
779		7000	2,4	if (g->end < xdf->nrec &&
780		4353	3	recs_match(xdf->recs[g->start], xdf->recs[g->end], flags)) {
781		2855	5	xdf->rchg[g->start++] = 0;
782		2855	5	xdf->rchg[g->end++] = 1;
783		-
784		2864	5,7	while (xdf->rchg[g->end])
785		9	6	g->end++;
786		-
787		2855	8	return 0;
788		-		} else {
789		4145	9	return -1;
790		-		}
791		-		}
792		-
793		-		/*
794		-		* If g can be slid toward the beginning of the file, do so, and if it bumps
795		-		* into a previous group, expand this group to include it. Return 0 on success
796		-		* or -1 if g cannot be slid up.
797		-		*/
798		6968	2	static int group_slide_up(xdfile_t xdf, struct xdlgroup g, long flags)
799		-		{
800		6968	2,4	if (g->start > 0 &&
801		4464	3	recs_match(xdf->recs[g->start - 1], xdf->recs[g->end - 1], flags)) {
802		2823	5	xdf->rchg[--g->start] = 1;
803		2823	5	xdf->rchg[--g->end] = 0;
804		-
805		2961	5,7	while (xdf->rchg[g->start - 1])
806		138	6	g->start--;
807		-
808		2823	8	return 0;
809		-		} else {
810		4145	9	return -1;
811		-		}
812		-		}
813		-
814		#####	2	static void xdl_bug(const char *msg)
815		-		{
816		#####	2	fprintf(stderr, "BUG: %s\n", msg);
817		#####	3	exit(1);
818		-		}
819		-
820		-		/*
821		-		* Move back and forward change groups for a consistent and pretty diff output.
822		-		* This also helps in finding joinable change groups and reducing the diff
823		-		* size.
824		-		*/
825		4246	2	int xdl_change_compact(xdfile_t xdf, xdfile_t xdfo, long flags) {
826		-		struct xdlgroup g, go;
827		-		long earliest_end, end_matching_other;
828		-		long groupsize;
829		-
830		4246	2	group_init(xdf, &g);
831		4246	3	group_init(xdfo, &go);
832		-
833		-		while (1) {
834		-		/* If the group is empty in the to-be-compacted file, skip it: */
835		164918	4	if (g.end == g.start)
836		160804	5	goto next;
837		-
838		-		/*
839		-		* Now shift the change up and then down as far as possible in
840		-		* each direction. If it bumps into any other changes, merge them.
841		-		*/
842		-		do {
843		4145	6	groupsize = g.end - g.start;
844		-
845		-		/*
846		-		* Keep track of the last "end" index that causes this
847		-		* group to align with a group of changed lines in the
848		-		* other file. -1 indicates that we haven't found such
849		-		* a match yet:
850		-		*/
851		4145	6	end_matching_other = -1;
852		-
853		-		/* Shift the group backward as much as possible: */
854		5918	6,10,11	while (!group_slide_up(xdf, &g, flags))
855		1773	7,8	if (group_previous(xdfo, &go))
856		#####	9	xdl_bug("group sync broken sliding up");
857		-
858		-		/*
859		-		* This is this highest that this group can be shifted.
860		-		* Record its end index:
861		-		*/
862		4145	12	earliest_end = g.end;
863		-
864		4145	12	if (go.end > go.start)
865		2910	13	end_matching_other = g.end;
866		-
867		-		/* Now shift the group forward as far as possible: */
868		-		while (1) {
869		7000	14,15	if (group_slide_down(xdf, &g, flags))
870		4145	16	break;
871		2855	17,18	if (group_next(xdfo, &go))
872		#####	19	xdl_bug("group sync broken sliding down");
873		-
874		2855	20	if (go.end > go.start)
875		232	21	end_matching_other = g.end;
876		2855	22	}
877		4145	23	} while (groupsize != g.end - g.start);
878		-
879		-		/*
880		-		* If the group can be shifted, then we can possibly use this
881		-		* freedom to produce a more intuitive diff.
882		-		*
883		-		* The group is currently shifted as far down as possible, so the
884		-		* heuristics below only have to handle upwards shifts.
885		-		*/
886		-
887		4114	24	if (g.end == earliest_end) {
888		-		/* no shifting was possible */
889	606	25	} else if (end_matching_other != -1) {
890	-		/*
891	-		* Move the possibly merged group of changes back to line
892	-		* up with the last group of changes from the other file
893	-		* that it can align with.
894	-		*/
895	1363	26,33	while (go.end == go.start) {
896	1050	27,28	if (group_slide_up(xdf, &g, flags))
897	#####	29	xdl_bug("match disappeared");
898	1050	30,31	if (group_previous(xdfo, &go))
899	#####	32	xdl_bug("group sync broken sliding to match");
900	-		}
901	293	34	} else if (flags & XDF_INDENT_HEURISTIC) {
902	-		/*
903	-		* Indent heuristic: a group of pure add/delete lines
904	-		* implies two splits, one between the end of the "before"
905	-		* context and the start of the group, and another between
906	-		* the end of the group and the beginning of the "after"
907	-		* context. Some splits are aesthetically better and some
908	-		* are worse. We compute a badness "score" for each split,
909	-		* and add the scores for the two splits to define a
910	-		* "score" for each position that the group can be shifted
911	-		* to. Then we pick the shift with the lowest score.
912	-		*/
913	#####	35	long shift, best_shift = -1;
914	-		struct split_score best_score;
915	-
916	#####	35,44,45	for (shift = earliest_end; shift <= g.end; shift++) {
917	-		struct split_measurement m;
918	#####	36	struct split_score score = {0, 0};
919	-
920	#####	36	measure_split(xdf, shift, &m);
921	#####	37	score_add_split(&m, &score);
922	#####	38	measure_split(xdf, shift - groupsize, &m);
923	#####	39	score_add_split(&m, &score);
924	#####	40,42	if (best_shift == -1 \|\|
925	#####	41	score_cmp(&score, &best_score) <= 0) {
926	#####	43	best_score.effective_indent = score.effective_indent;
927	#####	43	best_score.penalty = score.penalty;
928	#####	43	best_shift = shift;
929	-		}
930	-		}
931	-
932	#####	46,53,54	while (g.end > best_shift) {
933	#####	47,48	if (group_slide_up(xdf, &g, flags))
934	#####	49	xdl_bug("best shift unreached");
935	#####	50,51	if (group_previous(xdfo, &go))
936	#####	52	xdl_bug("group sync broken sliding to blank line");
937	-		}
938	-		}
939	-
940	-		next:
941	-		/* Move past the just-processed group: */
942	164918	55,56	if (group_next(xdf, &g))
943	4246	57	break;
944	160672	58,59	if (group_next(xdfo, &go))
945	#####	60	xdl_bug("group sync broken moving to next group");
946	160672	61	}
947	-
948	4246	62,63	if (!group_next(xdfo, &go))
949	#####	64	xdl_bug("group sync broken at end of file");
950	-
951	4246	65	return 0;
952		-		}
953		-
954		-
955		2123	2	int xdl_build_script(xdfenv_t xe, xdchange_t *xscr) {
956		2123	2	xdchange_t cscr = NULL, xch;
957		2123	2	char rchg1 = xe->xdf1.rchg, rchg2 = xe->xdf2.rchg;
958		-		long i1, i2, l1, l2;
959		-
960		-		/*
961		-		* Trivial. Collects "groups" of changes and creates an edit script.
962		-		*/
963		84598	2,16-18	for (i1 = xe->xdf1.nrec, i2 = xe->xdf2.nrec; i1 >= 0 \|\| i2 >= 0; i1--, i2--)
964		82475	3,4	if (rchg1[i1 - 1] \|\| rchg2[i2 - 1]) {
965		7938	5-7	for (l1 = i1; rchg1[i1 - 1]; i1--);
966		8784	8-10	for (l2 = i2; rchg2[i2 - 1]; i2--);
967		-
968		2570	11,12	if (!(xch = xdl_add_change(cscr, i1, i2, l1 - i1, l2 - i2))) {
969		#####	13	xdl_free_script(cscr);
970		#####	14	return -1;
971		-		}
972		2570	15	cscr = xch;
973		-		}
974		-
975		2123	19	*xscr = cscr;
976		-
977		2123	19	return 0;
978		-		}
979		-
980		-
981		2120	2	void xdl_free_script(xdchange_t *xscr) {
982		-		xdchange_t *xch;
983		-
984		4690	2,4	while ((xch = xscr) != NULL) {
985		2570	3	xscr = xscr->next;
986		2570	3	xdl_free(xch);
987		-		}
988		2120	5	}
989		-
990		188	2	static int xdl_call_hunk_func(xdfenv_t xe, xdchange_t xscr, xdemitcb_t *ecb,
991		-		xdemitconf_t const *xecfg)
992		-		{
993		-		xdchange_t xch, xche;
994		-
995		-		(void)xe;
996		-
997		404	2,9,10	for (xch = xscr; xch; xch = xche->next) {
998		216	3	xche = xdl_get_hunk(&xch, xecfg);
999		216	4	if (!xch)
1000		#####	5	break;
1001		216	6,6,6,7	if (xecfg->hunk_func(xch->i1, xche->i1 + xche->chg1 - xch->i1,
1002		216	6,6	xch->i2, xche->i2 + xche->chg2 - xch->i2,
1003		-		ecb->priv) < 0)
1004		#####	8	return -1;
1005		-		}
1006		188	11	return 0;
1007		-		}
1008		-
1009		#####	2	static void xdl_mark_ignorable(xdchange_t xscr, xdfenv_t xe, long flags)
1010		-		{
1011		-		xdchange_t *xch;
1012		-
1013		#####	2,13,14	for (xch = xscr; xch; xch = xch->next) {
1014		#####	3	int ignore = 1;
1015		-		xrecord_t **rec;
1016		-		long i;
1017		-
1018		#####	3	rec = &xe->xdf1.recs[xch->i1];
1019		#####	3,5-7	for (i = 0; i < xch->chg1 && ignore; i++)
1020		#####	4	ignore = xdl_blankline(rec[i]->ptr, rec[i]->size, flags);
1021		-
1022		#####	8	rec = &xe->xdf2.recs[xch->i2];
1023		#####	8,10-12	for (i = 0; i < xch->chg2 && ignore; i++)
1024		#####	9	ignore = xdl_blankline(rec[i]->ptr, rec[i]->size, flags);
1025		-
1026		#####	13	xch->ignore = ignore;
1027		-		}
1028		#####	15	}
1029		-
1030		1020	2	int xdl_diff(mmfile_t mf1, mmfile_t mf2, xpparam_t const *xpp,
1031		-		xdemitconf_t const xecfg, xdemitcb_t ecb) {
1032		-		xdchange_t *xscr;
1033		-		xdfenv_t xe;
1034		1020	2-4	emit_func_t ef = xecfg->hunk_func ? xdl_call_hunk_func : xdl_emit_diff;
1035		-
1036		1020	5,6	if (xdl_do_diff(mf1, mf2, xpp, &xe) < 0) {
1037		-
1038		#####	7	return -1;
1039		-		}
1040		1020	8,9,11	if (xdl_change_compact(&xe.xdf1, &xe.xdf2, xpp->flags) < 0 \|\|
1041		1020	10,13	xdl_change_compact(&xe.xdf2, &xe.xdf1, xpp->flags) < 0 \|\|
1042		1020	12	xdl_build_script(&xe, &xscr) < 0) {
1043		-
1044		#####	14	xdl_free_env(&xe);
1045		#####	15	return -1;
1046		-		}
1047		1020	16	if (xscr) {
1048		1017	17	if (xpp->flags & XDF_IGNORE_BLANK_LINES)
1049		#####	18	xdl_mark_ignorable(xscr, &xe, xpp->flags);
1050		-
1051		1017	19,20	if (ef(&xe, xscr, ecb, xecfg) < 0) {
1052		-
1053		#####	21	xdl_free_script(xscr);
1054		#####	22	xdl_free_env(&xe);
1055		#####	23	return -1;
1056		-		}
1057		1017	24	xdl_free_script(xscr);
1058		-		}
1059		1020	25	xdl_free_env(&xe);
1060		-
1061		1020	26	return 0;
1062		-		}