6 년 전 · bacd9fca06
--- a/.travis.yml
+++ b/.travis.yml
@@ -48,7 +48,7 @@ before_install:
 
				   - unzip -d ~/.local protoc.zip && rm protoc.zip
			
 
				   - go get -v golang.org/x/lint/golint
			
 
				   - go get -v github.com/golang/dep/cmd/dep
			
 
				-  - (wget https://bootstrap.pypa.io/get-pip.py || wget https://raw.githubusercontent.com/pypa/get-pip/master/get-pip.py) && python3 get-pip.py --user && rm get-pip.py
			
 
				+  - (wget -O - https://bootstrap.pypa.io/get-pip.py || wget -O - https://raw.githubusercontent.com/pypa/get-pip/master/get-pip.py) | python3 - --user pip==18.1
			
 
				   - export PATH=~/usr/bin:$GOPATH/bin:$PATH
			
 
				   - make --version
			
 
				   - pip3 --version
			
--- a/Dockerfile
+++ b/Dockerfile
@@ -28,7 +28,7 @@ echo\n\
 
				 echo "	$@"\n\
			
 
				 echo\n\' > /browser && \
			
 
				     chmod +x /browser && \
			
 
				-    curl https://bootstrap.pypa.io/get-pip.py | python3 && \
			
 
				+    curl https://bootstrap.pypa.io/get-pip.py | python3 - pip==18.1 && \
			
 
				     pip3 install --no-cache-dir --no-build-isolation cython && \
			
 
				     pip3 install --no-cache-dir --no-build-isolation -r /root/src/gopkg.in/src-d/hercules.v6/requirements.txt https://github.com/mind/wheels/releases/download/tf1.7-cpu/tensorflow-1.7.0-cp36-cp36m-linux_x86_64.whl && \
			
 
				     rm -rf /root/* && \
			
--- a/internal/plumbing/bsdiff.go
+++ b/internal/plumbing/bsdiff.go
@@ -0,0 +1,302 @@
 
				+package plumbing
			
 
				+
			
 
				+// Adapted from https://github.com/kr/binarydist
			
 
				+// Original license:
			
 
				+//
			
 
				+// Copyright 2012 Keith Rarick
			
 
				+//
			
 
				+// Permission is hereby granted, free of charge, to any person
			
 
				+// obtaining a copy of this software and associated documentation
			
 
				+// files (the "Software"), to deal in the Software without
			
 
				+// restriction, including without limitation the rights to use,
			
 
				+// copy, modify, merge, publish, distribute, sublicense, and/or sell
			
 
				+// copies of the Software, and to permit persons to whom the
			
 
				+// Software is furnished to do so, subject to the following
			
 
				+// conditions:
			
 
				+//
			
 
				+// The above copyright notice and this permission notice shall be
			
 
				+// included in all copies or substantial portions of the Software.
			
 
				+//
			
 
				+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
			
 
				+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
			
 
				+// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
			
 
				+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
			
 
				+// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
			
 
				+// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
			
 
				+// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
			
 
				+// OTHER DEALINGS IN THE SOFTWARE.
			
 
				+
			
 
				+import (
			
 
				+	"bytes"
			
 
				+)
			
 
				+
			
 
				+func swap(a []int, i, j int) { a[i], a[j] = a[j], a[i] }
			
 
				+
			
 
				+func split(I, V []int, start, length, h int) {
			
 
				+	var i, j, k, x, jj, kk int
			
 
				+
			
 
				+	if length < 16 {
			
 
				+		for k = start; k < start+length; k += j {
			
 
				+			j = 1
			
 
				+			x = V[I[k]+h]
			
 
				+			for i = 1; k+i < start+length; i++ {
			
 
				+				if V[I[k+i]+h] < x {
			
 
				+					x = V[I[k+i]+h]
			
 
				+					j = 0
			
 
				+				}
			
 
				+				if V[I[k+i]+h] == x {
			
 
				+					swap(I, k+i, k+j)
			
 
				+					j++
			
 
				+				}
			
 
				+			}
			
 
				+			for i = 0; i < j; i++ {
			
 
				+				V[I[k+i]] = k + j - 1
			
 
				+			}
			
 
				+			if j == 1 {
			
 
				+				I[k] = -1
			
 
				+			}
			
 
				+		}
			
 
				+		return
			
 
				+	}
			
 
				+
			
 
				+	x = V[I[start+length/2]+h]
			
 
				+	jj = 0
			
 
				+	kk = 0
			
 
				+	for i = start; i < start+length; i++ {
			
 
				+		if V[I[i]+h] < x {
			
 
				+			jj++
			
 
				+		}
			
 
				+		if V[I[i]+h] == x {
			
 
				+			kk++
			
 
				+		}
			
 
				+	}
			
 
				+	jj += start
			
 
				+	kk += jj
			
 
				+
			
 
				+	i = start
			
 
				+	j = 0
			
 
				+	k = 0
			
 
				+	for i < jj {
			
 
				+		if V[I[i]+h] < x {
			
 
				+			i++
			
 
				+		} else if V[I[i]+h] == x {
			
 
				+			swap(I, i, jj+j)
			
 
				+			j++
			
 
				+		} else {
			
 
				+			swap(I, i, kk+k)
			
 
				+			k++
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	for jj+j < kk {
			
 
				+		if V[I[jj+j]+h] == x {
			
 
				+			j++
			
 
				+		} else {
			
 
				+			swap(I, jj+j, kk+k)
			
 
				+			k++
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	if jj > start {
			
 
				+		split(I, V, start, jj-start, h)
			
 
				+	}
			
 
				+
			
 
				+	for i = 0; i < kk-jj; i++ {
			
 
				+		V[I[jj+i]] = kk - 1
			
 
				+	}
			
 
				+	if jj == kk-1 {
			
 
				+		I[jj] = -1
			
 
				+	}
			
 
				+
			
 
				+	if start+length > kk {
			
 
				+		split(I, V, kk, start+length-kk, h)
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+func qsufsort(obuf []byte) []int {
			
 
				+	var buckets [256]int
			
 
				+	var i, h int
			
 
				+	I := make([]int, len(obuf)+1)
			
 
				+	V := make([]int, len(obuf)+1)
			
 
				+
			
 
				+	for _, c := range obuf {
			
 
				+		buckets[c]++
			
 
				+	}
			
 
				+	for i = 1; i < 256; i++ {
			
 
				+		buckets[i] += buckets[i-1]
			
 
				+	}
			
 
				+	copy(buckets[1:], buckets[:])
			
 
				+	buckets[0] = 0
			
 
				+
			
 
				+	for i, c := range obuf {
			
 
				+		buckets[c]++
			
 
				+		I[buckets[c]] = i
			
 
				+	}
			
 
				+
			
 
				+	I[0] = len(obuf)
			
 
				+	for i, c := range obuf {
			
 
				+		V[i] = buckets[c]
			
 
				+	}
			
 
				+
			
 
				+	V[len(obuf)] = 0
			
 
				+	for i = 1; i < 256; i++ {
			
 
				+		if buckets[i] == buckets[i-1]+1 {
			
 
				+			I[buckets[i]] = -1
			
 
				+		}
			
 
				+	}
			
 
				+	I[0] = -1
			
 
				+
			
 
				+	for h = 1; I[0] != -(len(obuf) + 1); h += h {
			
 
				+		var n int
			
 
				+		for i = 0; i < len(obuf)+1; {
			
 
				+			if I[i] < 0 {
			
 
				+				n -= I[i]
			
 
				+				i -= I[i]
			
 
				+			} else {
			
 
				+				if n != 0 {
			
 
				+					I[i-n] = -n
			
 
				+				}
			
 
				+				n = V[I[i]] + 1 - i
			
 
				+				split(I, V, i, n, h)
			
 
				+				i += n
			
 
				+				n = 0
			
 
				+			}
			
 
				+		}
			
 
				+		if n != 0 {
			
 
				+			I[i-n] = -n
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	for i = 0; i < len(obuf)+1; i++ {
			
 
				+		I[V[i]] = i
			
 
				+	}
			
 
				+	return I
			
 
				+}
			
 
				+
			
 
				+func matchlen(a, b []byte) (i int) {
			
 
				+	for i < len(a) && i < len(b) && a[i] == b[i] {
			
 
				+		i++
			
 
				+	}
			
 
				+	return i
			
 
				+}
			
 
				+
			
 
				+func search(I []int, obuf, nbuf []byte, st, en int) (pos, n int) {
			
 
				+	if en-st < 2 {
			
 
				+		x := matchlen(obuf[I[st]:], nbuf)
			
 
				+		y := matchlen(obuf[I[en]:], nbuf)
			
 
				+
			
 
				+		if x > y {
			
 
				+			return I[st], x
			
 
				+		}
			
 
				+		return I[en], y
			
 
				+	}
			
 
				+
			
 
				+	x := st + (en-st)/2
			
 
				+	if bytes.Compare(obuf[I[x]:], nbuf) < 0 {
			
 
				+		return search(I, obuf, nbuf, x, en)
			
 
				+	}
			
 
				+	return search(I, obuf, nbuf, st, x)
			
 
				+}
			
 
				+
			
 
				+// DiffBytes calculates the approximated number of different bytes between two binary buffers.
			
 
				+// We are not interested in the diff script itself. Instead, we track the sizes of `db` and `eb`
			
 
				+// from the original implementation.
			
 
				+func DiffBytes(obuf, nbuf []byte) int {
			
 
				+	if len(nbuf) < len(obuf) {
			
 
				+		obuf, nbuf = nbuf, obuf
			
 
				+	}
			
 
				+	var lenf int
			
 
				+	I := qsufsort(obuf)
			
 
				+	var dblen, eblen int
			
 
				+
			
 
				+	// Compute the differences, writing ctrl as we go
			
 
				+	var scan, pos, length int
			
 
				+	var lastscan, lastpos, lastoffset int
			
 
				+	for scan < len(nbuf) {
			
 
				+		var oldscore int
			
 
				+		scan += length
			
 
				+		for scsc := scan; scan < len(nbuf); scan++ {
			
 
				+			pos, length = search(I, obuf, nbuf[scan:], 0, len(obuf))
			
 
				+
			
 
				+			for ; scsc < scan+length; scsc++ {
			
 
				+				if scsc+lastoffset < len(obuf) &&
			
 
				+					obuf[scsc+lastoffset] == nbuf[scsc] {
			
 
				+					oldscore++
			
 
				+				}
			
 
				+			}
			
 
				+
			
 
				+			if (length == oldscore && length != 0) || length > oldscore+8 {
			
 
				+				break
			
 
				+			}
			
 
				+
			
 
				+			if scan+lastoffset < len(obuf) && obuf[scan+lastoffset] == nbuf[scan] {
			
 
				+				oldscore--
			
 
				+			}
			
 
				+		}
			
 
				+
			
 
				+		if length != oldscore || scan == len(nbuf) {
			
 
				+			var s, Sf int
			
 
				+			lenf = 0
			
 
				+			for i := 0; lastscan+i < scan && lastpos+i < len(obuf); {
			
 
				+				if obuf[lastpos+i] == nbuf[lastscan+i] {
			
 
				+					s++
			
 
				+				}
			
 
				+				i++
			
 
				+				if s*2-i > Sf*2-lenf {
			
 
				+					Sf = s
			
 
				+					lenf = i
			
 
				+				}
			
 
				+			}
			
 
				+
			
 
				+			lenb := 0
			
 
				+			if scan < len(nbuf) {
			
 
				+				var s, Sb int
			
 
				+				for i := 1; (scan >= lastscan+i) && (pos >= i); i++ {
			
 
				+					if obuf[pos-i] == nbuf[scan-i] {
			
 
				+						s++
			
 
				+					}
			
 
				+					if s*2-i > Sb*2-lenb {
			
 
				+						Sb = s
			
 
				+						lenb = i
			
 
				+					}
			
 
				+				}
			
 
				+			}
			
 
				+
			
 
				+			if lastscan+lenf > scan-lenb {
			
 
				+				overlap := (lastscan + lenf) - (scan - lenb)
			
 
				+				s := 0
			
 
				+				Ss := 0
			
 
				+				lens := 0
			
 
				+				for i := 0; i < overlap; i++ {
			
 
				+					if nbuf[lastscan+lenf-overlap+i] == obuf[lastpos+lenf-overlap+i] {
			
 
				+						s++
			
 
				+					}
			
 
				+					if nbuf[scan-lenb+i] == obuf[pos-lenb+i] {
			
 
				+						s--
			
 
				+					}
			
 
				+					if s > Ss {
			
 
				+						Ss = s
			
 
				+						lens = i + 1
			
 
				+					}
			
 
				+				}
			
 
				+
			
 
				+				lenf += lens - overlap
			
 
				+				lenb -= lens
			
 
				+			}
			
 
				+
			
 
				+			var nonzero int
			
 
				+			for i := 0; i < lenf; i++ {
			
 
				+				if nbuf[lastscan+i]-obuf[lastpos+i] != 0 {
			
 
				+					nonzero++
			
 
				+				}
			
 
				+			}
			
 
				+
			
 
				+			dblen += nonzero
			
 
				+			eblen += (scan - lenb) - (lastscan + lenf)
			
 
				+			lastscan = scan - lenb
			
 
				+			lastpos = pos - lenb
			
 
				+			lastoffset = pos - scan
			
 
				+		}
			
 
				+	}
			
 
				+	return dblen + eblen
			
 
				+}
			
--- a/internal/plumbing/renames.go
+++ b/internal/plumbing/renames.go
@@ -367,6 +367,15 @@ func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (b
 
				 			panic(err)
			
 
				 		}
			
 
				 	}()
			
 
				+	_, err1 := blob1.CountLines()
			
 
				+	_, err2 := blob2.CountLines()
			
 
				+	if err1 == ErrorBinary || err2 == ErrorBinary {
			
 
				+		// binary mode
			
 
				+		bsdifflen := DiffBytes(blob1.Data, blob2.Data)
			
 
				+		delta := int((int64(bsdifflen) * 100) / internal.Max64(
			
 
				+			internal.Min64(blob1.Size, blob2.Size), 1))
			
 
				+		return 100-delta >= ra.SimilarityThreshold, nil
			
 
				+	}
			
 
				 	src, dst := string(blob1.Data), string(blob2.Data)
			
 
				 	maxSize := internal.Max(1, internal.Max(utf8.RuneCountInString(src), utf8.RuneCountInString(dst)))
			
 
				 
			
@@ -412,6 +421,9 @@ func (ra *RenameAnalysis) blobsAreClose(blob1 *CachedBlob, blob2 *CachedBlob) (b
 
				 				posDst += step
			
 
				 			}
			
 
				 		}
			
 
				+		if possibleDelInsBlock {
			
 
				+			continue
			
 
				+		}
			
 
				 		// supposing that the rest of the lines are the same (they are not - too optimistic),
			
 
				 		// estimate the maximum similarity and exit the loop if it lower than our threshold
			
 
				 		maxCommon := common + internal.Min(
			
--- a/internal/plumbing/renames_test.go
+++ b/internal/plumbing/renames_test.go
@@ -1,6 +1,7 @@
 
				 package plumbing
			
 
				 
			
 
				 import (
			
 
				+	"bytes"
			
 
				 	"testing"
			
 
				 
			
 
				 	"github.com/stretchr/testify/assert"
			
@@ -186,3 +187,75 @@ func TestRenameAnalysisSortRenameCandidates(t *testing.T) {
 
				 	assert.Equal(t, candidates[0], 3)
			
 
				 	assert.Equal(t, candidates[1], 1)
			
 
				 }
			
 
				+
			
 
				+func TestBlobsAreCloseText(t *testing.T) {
			
 
				+	blob1 := &CachedBlob{Data: []byte("hello, world!")}
			
 
				+	blob2 := &CachedBlob{Data: []byte("hello, world?")}
			
 
				+	blob1.Size = int64(len(blob1.Data))
			
 
				+	blob2.Size = int64(len(blob2.Data))
			
 
				+	ra := fixtureRenameAnalysis()
			
 
				+	result, err := ra.blobsAreClose(blob1, blob2)
			
 
				+	assert.Nil(t, err)
			
 
				+	assert.True(t, result)
			
 
				+
			
 
				+	blob1.Data = []byte("hello, mloncode")
			
 
				+	blob1.Size = int64(len(blob1.Data))
			
 
				+	result, err = ra.blobsAreClose(blob1, blob2)
			
 
				+	assert.Nil(t, err)
			
 
				+	assert.False(t, result)
			
 
				+}
			
 
				+
			
 
				+func TestBlobsAreCloseBinary(t *testing.T) {
			
 
				+	blob1 := &CachedBlob{}
			
 
				+	blob2 := &CachedBlob{}
			
 
				+	ra := fixtureRenameAnalysis()
			
 
				+	result, err := ra.blobsAreClose(blob1, blob2)
			
 
				+	assert.Nil(t, err)
			
 
				+	assert.True(t, result)
			
 
				+
			
 
				+	blob1.Data = make([]byte, 100)
			
 
				+	blob2.Data = blob1.Data
			
 
				+	blob1.Size = 100
			
 
				+	blob2.Size = 100
			
 
				+	result, err = ra.blobsAreClose(blob1, blob2)
			
 
				+	assert.Nil(t, err)
			
 
				+	assert.True(t, result)
			
 
				+
			
 
				+	blob1.Data = bytes.Repeat([]byte{1}, 100)
			
 
				+	blob2.Data = blob1.Data
			
 
				+	result, err = ra.blobsAreClose(blob1, blob2)
			
 
				+	assert.Nil(t, err)
			
 
				+	assert.True(t, result)
			
 
				+
			
 
				+	for i := 0; i < 100; i++ {
			
 
				+		blob1.Data[i] = byte(i)
			
 
				+	}
			
 
				+	result, err = ra.blobsAreClose(blob1, blob2)
			
 
				+	assert.Nil(t, err)
			
 
				+	assert.True(t, result)
			
 
				+
			
 
				+	blob2.Data = make([]byte, 100)
			
 
				+	for i := 0; i < 80; i++ {
			
 
				+		blob2.Data[i] = byte(i)
			
 
				+	}
			
 
				+	result, err = ra.blobsAreClose(blob1, blob2)
			
 
				+	assert.Nil(t, err)
			
 
				+	assert.True(t, result)
			
 
				+
			
 
				+	blob2.Data[79] = 0
			
 
				+	result, err = ra.blobsAreClose(blob1, blob2)
			
 
				+	assert.Nil(t, err)
			
 
				+	assert.False(t, result)
			
 
				+
			
 
				+	blob1.Data = []byte("hello, world!")
			
 
				+	blob1.Size = int64(len(blob2.Data))
			
 
				+	result, err = ra.blobsAreClose(blob1, blob2)
			
 
				+	assert.Nil(t, err)
			
 
				+	assert.False(t, result)
			
 
				+
			
 
				+	blob1.Data, blob2.Data = blob2.Data, blob1.Data
			
 
				+	blob1.Size, blob2.Size = blob2.Size, blob1.Size
			
 
				+	result, err = ra.blobsAreClose(blob1, blob2)
			
 
				+	assert.Nil(t, err)
			
 
				+	assert.False(t, result)
			
 
				+}