Skip to content

Commit 643300a

Browse files
hotourxin
authored andcommittedMar 1, 2015
SPARK-5984: Fix TimSort bug causes ArrayOutOfBoundsException
Fix TimSort bug which causes a ArrayOutOfBoundsException. Using the proposed fix here http://envisage-project.eu/proving-android-java-and-python-sorting-algorithm-is-broken-and-how-to-fix-it/ Author: Evan Yu <ehotou@gmail.com> Closes #4804 from hotou/SPARK-5984 and squashes the following commits: 3421b6c [Evan Yu] SPARK-5984: Add info to LICENSE e61c6b8 [Evan Yu] SPARK-5984: Fix license and document 6ccc280 [Evan Yu] SPARK-5984: Add License header to file e06c0d2 [Evan Yu] SPARK-5984: Add License header to file 4d95f75 [Evan Yu] SPARK-5984: Fix TimSort bug causes ArrayOutOfBoundsException 479a106 [Evan Yu] SPARK-5984: Fix TimSort bug causes ArrayOutOfBoundsException
1 parent 86fcdae commit 643300a

File tree

4 files changed

+161
-5
lines changed

4 files changed

+161
-5
lines changed
 

‎LICENSE

+16
Original file line numberDiff line numberDiff line change
@@ -771,6 +771,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
771771
See the License for the specific language governing permissions and
772772
limitations under the License.
773773

774+
========================================================================
775+
For TestTimSort (core/src/test/java/org/apache/spark/util/collection/TestTimSort.java):
776+
========================================================================
777+
Copyright (C) 2015 Stijn de Gouw
778+
779+
Licensed under the Apache License, Version 2.0 (the "License");
780+
you may not use this file except in compliance with the License.
781+
You may obtain a copy of the License at
782+
783+
http://www.apache.org/licenses/LICENSE-2.0
784+
785+
Unless required by applicable law or agreed to in writing, software
786+
distributed under the License is distributed on an "AS IS" BASIS,
787+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
788+
See the License for the specific language governing permissions and
789+
limitations under the License.
774790

775791
========================================================================
776792
For LimitedInputStream

‎core/src/main/java/org/apache/spark/util/collection/TimSort.java

+4-5
Original file line numberDiff line numberDiff line change
@@ -425,15 +425,14 @@ private void pushRun(int runBase, int runLen) {
425425
private void mergeCollapse() {
426426
while (stackSize > 1) {
427427
int n = stackSize - 2;
428-
if (n > 0 && runLen[n-1] <= runLen[n] + runLen[n+1]) {
428+
if ( (n >= 1 && runLen[n-1] <= runLen[n] + runLen[n+1])
429+
|| (n >= 2 && runLen[n-2] <= runLen[n] + runLen[n-1])) {
429430
if (runLen[n - 1] < runLen[n + 1])
430431
n--;
431-
mergeAt(n);
432-
} else if (runLen[n] <= runLen[n + 1]) {
433-
mergeAt(n);
434-
} else {
432+
} else if (runLen[n] > runLen[n + 1]) {
435433
break; // Invariant is established
436434
}
435+
mergeAt(n);
437436
}
438437
}
439438

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
/**
2+
* Copyright 2015 Stijn de Gouw
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*
16+
*/
17+
package org.apache.spark.util.collection;
18+
19+
import java.util.*;
20+
21+
/**
22+
* This codes generates a int array which fails the standard TimSort.
23+
*
24+
* The blog that reported the bug
25+
* http://www.envisage-project.eu/timsort-specification-and-verification/
26+
*
27+
* This codes was originally wrote by Stijn de Gouw, modified by Evan Yu to adapt to
28+
* our test suite.
29+
*
30+
* https://github.com/abstools/java-timsort-bug
31+
* https://github.com/abstools/java-timsort-bug/blob/master/LICENSE
32+
*/
33+
public class TestTimSort {
34+
35+
private static final int MIN_MERGE = 32;
36+
37+
/**
38+
* Returns an array of integers that demonstrate the bug in TimSort
39+
*/
40+
public static int[] getTimSortBugTestSet(int length) {
41+
int minRun = minRunLength(length);
42+
List<Long> runs = runsJDKWorstCase(minRun, length);
43+
return createArray(runs, length);
44+
}
45+
46+
private static int minRunLength(int n) {
47+
int r = 0; // Becomes 1 if any 1 bits are shifted off
48+
while (n >= MIN_MERGE) {
49+
r |= (n & 1);
50+
n >>= 1;
51+
}
52+
return n + r;
53+
}
54+
55+
private static int[] createArray(List<Long> runs, int length) {
56+
int[] a = new int[length];
57+
Arrays.fill(a, 0);
58+
int endRun = -1;
59+
for (long len : runs) {
60+
a[endRun += len] = 1;
61+
}
62+
a[length - 1] = 0;
63+
return a;
64+
}
65+
66+
/**
67+
* Fills <code>runs</code> with a sequence of run lengths of the form<br>
68+
* Y_n x_{n,1} x_{n,2} ... x_{n,l_n} <br>
69+
* Y_{n-1} x_{n-1,1} x_{n-1,2} ... x_{n-1,l_{n-1}} <br>
70+
* ... <br>
71+
* Y_1 x_{1,1} x_{1,2} ... x_{1,l_1}<br>
72+
* The Y_i's are chosen to satisfy the invariant throughout execution,
73+
* but the x_{i,j}'s are merged (by <code>TimSort.mergeCollapse</code>)
74+
* into an X_i that violates the invariant.
75+
*
76+
* @param length The sum of all run lengths that will be added to <code>runs</code>.
77+
*/
78+
private static List<Long> runsJDKWorstCase(int minRun, int length) {
79+
List<Long> runs = new ArrayList<Long>();
80+
81+
long runningTotal = 0, Y = minRun + 4, X = minRun;
82+
83+
while (runningTotal + Y + X <= length) {
84+
runningTotal += X + Y;
85+
generateJDKWrongElem(runs, minRun, X);
86+
runs.add(0, Y);
87+
// X_{i+1} = Y_i + x_{i,1} + 1, since runs.get(1) = x_{i,1}
88+
X = Y + runs.get(1) + 1;
89+
// Y_{i+1} = X_{i+1} + Y_i + 1
90+
Y += X + 1;
91+
}
92+
93+
if (runningTotal + X <= length) {
94+
runningTotal += X;
95+
generateJDKWrongElem(runs, minRun, X);
96+
}
97+
98+
runs.add(length - runningTotal);
99+
return runs;
100+
}
101+
102+
/**
103+
* Adds a sequence x_1, ..., x_n of run lengths to <code>runs</code> such that:<br>
104+
* 1. X = x_1 + ... + x_n <br>
105+
* 2. x_j >= minRun for all j <br>
106+
* 3. x_1 + ... + x_{j-2} < x_j < x_1 + ... + x_{j-1} for all j <br>
107+
* These conditions guarantee that TimSort merges all x_j's one by one
108+
* (resulting in X) using only merges on the second-to-last element.
109+
*
110+
* @param X The sum of the sequence that should be added to runs.
111+
*/
112+
private static void generateJDKWrongElem(List<Long> runs, int minRun, long X) {
113+
for (long newTotal; X >= 2 * minRun + 1; X = newTotal) {
114+
//Default strategy
115+
newTotal = X / 2 + 1;
116+
//Specialized strategies
117+
if (3 * minRun + 3 <= X && X <= 4 * minRun + 1) {
118+
// add x_1=MIN+1, x_2=MIN, x_3=X-newTotal to runs
119+
newTotal = 2 * minRun + 1;
120+
} else if (5 * minRun + 5 <= X && X <= 6 * minRun + 5) {
121+
// add x_1=MIN+1, x_2=MIN, x_3=MIN+2, x_4=X-newTotal to runs
122+
newTotal = 3 * minRun + 3;
123+
} else if (8 * minRun + 9 <= X && X <= 10 * minRun + 9) {
124+
// add x_1=MIN+1, x_2=MIN, x_3=MIN+2, x_4=2MIN+2, x_5=X-newTotal to runs
125+
newTotal = 5 * minRun + 5;
126+
} else if (13 * minRun + 15 <= X && X <= 16 * minRun + 17) {
127+
// add x_1=MIN+1, x_2=MIN, x_3=MIN+2, x_4=2MIN+2, x_5=3MIN+4, x_6=X-newTotal to runs
128+
newTotal = 8 * minRun + 9;
129+
}
130+
runs.add(0, X - newTotal);
131+
}
132+
runs.add(0, X);
133+
}
134+
}

‎core/src/test/scala/org/apache/spark/util/collection/SorterSuite.scala

+7
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,13 @@ class SorterSuite extends FunSuite {
6565
}
6666
}
6767

68+
// http://www.envisage-project.eu/timsort-specification-and-verification/
69+
test("SPARK-5984 TimSort bug") {
70+
val data = TestTimSort.getTimSortBugTestSet(67108864)
71+
new Sorter(new IntArraySortDataFormat).sort(data, 0, data.length, Ordering.Int)
72+
(0 to data.length - 2).foreach(i => assert(data(i) <= data(i + 1)))
73+
}
74+
6875
/** Runs an experiment several times. */
6976
def runExperiment(name: String, skip: Boolean = false)(f: => Unit, prepare: () => Unit): Unit = {
7077
if (skip) {

0 commit comments

Comments
 (0)
Please sign in to comment.