Explorar o código

Add TyposDatasetBuilder

Signed-off-by: Vadim Markovtsev <vadim@sourced.tech>
Vadim Markovtsev hai 6 anos
pai
achega
0678e062ac
Modificáronse 6 ficheiros con 571 adicións e 107 borrados
  1. 2 6
      core.go
  2. 168 96
      internal/pb/pb.pb.go
  3. 12 0
      internal/pb/pb.proto
  4. 112 5
      internal/pb/pb_pb2.py
  5. 276 0
      leaves/research/typos.go
  6. 1 0
      leaves/research/typos_test.go

+ 2 - 6
core.go

@@ -9,7 +9,8 @@ import (
 	"gopkg.in/src-d/hercules.v9/internal/plumbing/identity"
 	"gopkg.in/src-d/hercules.v9/internal/plumbing/uast"
 	"gopkg.in/src-d/hercules.v9/internal/yaml"
-	"gopkg.in/src-d/hercules.v9/leaves"
+	_ "gopkg.in/src-d/hercules.v9/leaves"  // add burndown and other analyses
+	_ "gopkg.in/src-d/hercules.v9/leaves/research"  // add "research" analyses
 )
 
 // ConfigurationOptionType represents the possible types of a ConfigurationOption's value.
@@ -171,8 +172,3 @@ func PathifyFlagValue(flag *pflag.Flag) {
 func EnablePathFlagTypeMasquerade() {
 	core.EnablePathFlagTypeMasquerade()
 }
-
-func init() {
-	// hack to link with .leaves
-	_ = leaves.BurndownAnalysis{}
-}

+ 168 - 96
internal/pb/pb.pb.go

@@ -32,6 +32,8 @@ It has these top-level messages:
 	CommitFile
 	Commit
 	CommitsAnalysisResults
+	Typo
+	TyposDataset
 	AnalysisResults
 */
 package pb
@@ -819,6 +821,70 @@ func (m *CommitsAnalysisResults) GetAuthorIndex() []string {
 	return nil
 }
 
+type Typo struct {
+	Wrong   string `protobuf:"bytes,1,opt,name=wrong,proto3" json:"wrong,omitempty"`
+	Correct string `protobuf:"bytes,2,opt,name=correct,proto3" json:"correct,omitempty"`
+	Commit  string `protobuf:"bytes,3,opt,name=commit,proto3" json:"commit,omitempty"`
+	File    string `protobuf:"bytes,4,opt,name=file,proto3" json:"file,omitempty"`
+	Line    int32  `protobuf:"varint,5,opt,name=line,proto3" json:"line,omitempty"`
+}
+
+func (m *Typo) Reset()                    { *m = Typo{} }
+func (m *Typo) String() string            { return proto.CompactTextString(m) }
+func (*Typo) ProtoMessage()               {}
+func (*Typo) Descriptor() ([]byte, []int) { return fileDescriptorPb, []int{24} }
+
+func (m *Typo) GetWrong() string {
+	if m != nil {
+		return m.Wrong
+	}
+	return ""
+}
+
+func (m *Typo) GetCorrect() string {
+	if m != nil {
+		return m.Correct
+	}
+	return ""
+}
+
+func (m *Typo) GetCommit() string {
+	if m != nil {
+		return m.Commit
+	}
+	return ""
+}
+
+func (m *Typo) GetFile() string {
+	if m != nil {
+		return m.File
+	}
+	return ""
+}
+
+func (m *Typo) GetLine() int32 {
+	if m != nil {
+		return m.Line
+	}
+	return 0
+}
+
+type TyposDataset struct {
+	Typos []*Typo `protobuf:"bytes,1,rep,name=typos" json:"typos,omitempty"`
+}
+
+func (m *TyposDataset) Reset()                    { *m = TyposDataset{} }
+func (m *TyposDataset) String() string            { return proto.CompactTextString(m) }
+func (*TyposDataset) ProtoMessage()               {}
+func (*TyposDataset) Descriptor() ([]byte, []int) { return fileDescriptorPb, []int{25} }
+
+func (m *TyposDataset) GetTypos() []*Typo {
+	if m != nil {
+		return m.Typos
+	}
+	return nil
+}
+
 type AnalysisResults struct {
 	Header *Metadata `protobuf:"bytes,1,opt,name=header" json:"header,omitempty"`
 	// the mapped values are dynamic messages which require the second parsing pass.
@@ -828,7 +894,7 @@ type AnalysisResults struct {
 func (m *AnalysisResults) Reset()                    { *m = AnalysisResults{} }
 func (m *AnalysisResults) String() string            { return proto.CompactTextString(m) }
 func (*AnalysisResults) ProtoMessage()               {}
-func (*AnalysisResults) Descriptor() ([]byte, []int) { return fileDescriptorPb, []int{24} }
+func (*AnalysisResults) Descriptor() ([]byte, []int) { return fileDescriptorPb, []int{26} }
 
 func (m *AnalysisResults) GetHeader() *Metadata {
 	if m != nil {
@@ -869,105 +935,111 @@ func init() {
 	proto.RegisterType((*CommitFile)(nil), "CommitFile")
 	proto.RegisterType((*Commit)(nil), "Commit")
 	proto.RegisterType((*CommitsAnalysisResults)(nil), "CommitsAnalysisResults")
+	proto.RegisterType((*Typo)(nil), "Typo")
+	proto.RegisterType((*TyposDataset)(nil), "TyposDataset")
 	proto.RegisterType((*AnalysisResults)(nil), "AnalysisResults")
 }
 
 func init() { proto.RegisterFile("pb.proto", fileDescriptorPb) }
 
 var fileDescriptorPb = []byte{
-	// 1495 bytes of a gzipped FileDescriptorProto
-	0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x94, 0x57, 0xcb, 0x6e, 0xdb, 0x46,
-	0x17, 0x06, 0x75, 0xd7, 0x91, 0x2c, 0x27, 0x13, 0xff, 0x36, 0xa3, 0x1f, 0xce, 0xaf, 0x10, 0xfe,
-	0x03, 0xb7, 0x49, 0x99, 0xc0, 0xe9, 0x22, 0x4d, 0x37, 0xb1, 0xe5, 0x06, 0x31, 0x10, 0x37, 0x2d,
-	0x95, 0xa4, 0xbb, 0x08, 0x63, 0x71, 0x6c, 0xb1, 0x95, 0x86, 0xc4, 0x0c, 0x29, 0x59, 0x40, 0xfb,
-	0x2a, 0xdd, 0x75, 0xd1, 0x02, 0x5d, 0xf5, 0x05, 0xba, 0xe8, 0xa6, 0xdb, 0x3e, 0x44, 0x81, 0xbe,
-	0x45, 0x31, 0x37, 0x5e, 0x14, 0x3a, 0x4d, 0x77, 0x3c, 0xe7, 0x7c, 0x67, 0xe6, 0xcc, 0x77, 0x2e,
-	0x33, 0x84, 0x56, 0x74, 0xe6, 0x46, 0x2c, 0x8c, 0x43, 0xe7, 0xcf, 0x0a, 0xb4, 0x4e, 0x49, 0x8c,
-	0x7d, 0x1c, 0x63, 0x64, 0x43, 0x73, 0x41, 0x18, 0x0f, 0x42, 0x6a, 0x5b, 0x03, 0x6b, 0xbf, 0xee,
-	0x19, 0x11, 0x21, 0xa8, 0x4d, 0x31, 0x9f, 0xda, 0x95, 0x81, 0xb5, 0xdf, 0xf6, 0xe4, 0x37, 0xba,
-	0x05, 0xc0, 0x48, 0x14, 0xf2, 0x20, 0x0e, 0xd9, 0xca, 0xae, 0x4a, 0x4b, 0x4e, 0x83, 0xee, 0xc0,
-	0xe6, 0x19, 0xb9, 0x08, 0xe8, 0x38, 0xa1, 0xc1, 0xe5, 0x38, 0x0e, 0xe6, 0xc4, 0xae, 0x0d, 0xac,
-	0xfd, 0xaa, 0xb7, 0x21, 0xd5, 0xaf, 0x68, 0x70, 0xf9, 0x32, 0x98, 0x13, 0xe4, 0xc0, 0x06, 0xa1,
-	0x7e, 0x0e, 0x55, 0x97, 0xa8, 0x0e, 0xa1, 0x7e, 0x8a, 0xb1, 0xa1, 0x39, 0x09, 0xe7, 0xf3, 0x20,
-	0xe6, 0x76, 0x43, 0x45, 0xa6, 0x45, 0x74, 0x13, 0x5a, 0x2c, 0xa1, 0xca, 0xb1, 0x29, 0x1d, 0x9b,
-	0x2c, 0xa1, 0xd2, 0xe9, 0x19, 0x5c, 0x37, 0xa6, 0x71, 0x44, 0xd8, 0x38, 0x88, 0xc9, 0xdc, 0x6e,
-	0x0d, 0xaa, 0xfb, 0x9d, 0x83, 0x5d, 0xd7, 0x1c, 0xda, 0xf5, 0x14, 0xfa, 0x0b, 0xc2, 0x4e, 0x62,
-	0x32, 0xff, 0x8c, 0xc6, 0x6c, 0xe5, 0xf5, 0x58, 0x41, 0xd9, 0x3f, 0x84, 0x1b, 0x25, 0x30, 0x74,
-	0x0d, 0xaa, 0xdf, 0x90, 0x95, 0xe4, 0xaa, 0xed, 0x89, 0x4f, 0xb4, 0x05, 0xf5, 0x05, 0x9e, 0x25,
-	0x44, 0x12, 0x65, 0x79, 0x4a, 0x78, 0x5c, 0x79, 0x64, 0x39, 0x0f, 0x61, 0xe7, 0x28, 0x61, 0xd4,
-	0x0f, 0x97, 0x74, 0x14, 0x61, 0xc6, 0xc9, 0x29, 0x8e, 0x59, 0x70, 0xe9, 0x85, 0x4b, 0x75, 0xb8,
-	0x59, 0x32, 0xa7, 0xdc, 0xb6, 0x06, 0xd5, 0xfd, 0x0d, 0xcf, 0x88, 0xce, 0x4f, 0x16, 0x6c, 0x95,
-	0x79, 0x89, 0x7c, 0x50, 0x3c, 0x27, 0x7a, 0x6b, 0xf9, 0x8d, 0xf6, 0xa0, 0x47, 0x93, 0xf9, 0x19,
-	0x61, 0xe3, 0xf0, 0x7c, 0xcc, 0xc2, 0x25, 0x97, 0x41, 0xd4, 0xbd, 0xae, 0xd2, 0xbe, 0x38, 0xf7,
-	0xc2, 0x25, 0x47, 0x1f, 0xc2, 0xf5, 0x0c, 0x65, 0xb6, 0xad, 0x4a, 0xe0, 0xa6, 0x01, 0x0e, 0x95,
-	0x1a, 0xdd, 0x83, 0x9a, 0x5c, 0xa7, 0x26, 0x39, 0xb3, 0xdd, 0x2b, 0x0e, 0xe0, 0x49, 0x94, 0xf3,
-	0x2d, 0xf4, 0x9e, 0x06, 0x33, 0xc2, 0x5f, 0x2c, 0x29, 0x61, 0x7c, 0x1a, 0x44, 0xe8, 0x81, 0x61,
-	0xc3, 0x92, 0x0b, 0xf4, 0xdd, 0xa2, 0xdd, 0x7d, 0x2d, 0x8c, 0x8a, 0x71, 0x05, 0xec, 0x3f, 0x02,
-	0xc8, 0x94, 0x79, 0x7e, 0xeb, 0x25, 0xfc, 0xd6, 0xf3, 0xfc, 0xfe, 0x55, 0xc9, 0x08, 0x3e, 0xa4,
-	0x78, 0xb6, 0xe2, 0x01, 0xf7, 0x08, 0x4f, 0x66, 0x31, 0x47, 0x03, 0xe8, 0x5c, 0x30, 0x4c, 0x93,
-	0x19, 0x66, 0x41, 0x6c, 0xd6, 0xcb, 0xab, 0x50, 0x1f, 0x5a, 0x1c, 0xcf, 0xa3, 0x59, 0x40, 0x2f,
-	0xf4, 0xd2, 0xa9, 0x8c, 0xee, 0x43, 0x33, 0x62, 0xe1, 0xd7, 0x64, 0x12, 0x4b, 0x9e, 0x3a, 0x07,
-	0xff, 0x29, 0x27, 0xc2, 0xa0, 0xd0, 0x5d, 0xa8, 0x9f, 0x8b, 0x83, 0x6a, 0xde, 0xae, 0x80, 0x2b,
-	0x0c, 0xfa, 0x08, 0x1a, 0x11, 0x09, 0xa3, 0x99, 0x28, 0xfb, 0x77, 0xa0, 0x35, 0x08, 0x9d, 0x00,
-	0x52, 0x5f, 0xe3, 0x80, 0xc6, 0x84, 0xe1, 0x49, 0x2c, 0xba, 0xb5, 0x21, 0xe3, 0xea, 0xbb, 0xc3,
-	0x70, 0x1e, 0x31, 0xc2, 0x39, 0xf1, 0x95, 0xb3, 0x17, 0x2e, 0xb5, 0xff, 0x75, 0xe5, 0x75, 0x92,
-	0x39, 0xa1, 0x47, 0xb0, 0x29, 0x43, 0x18, 0x87, 0x26, 0x21, 0x76, 0x53, 0x86, 0xb0, 0xb9, 0x96,
-	0x27, 0xaf, 0x77, 0x5e, 0x90, 0x9d, 0x5f, 0x2c, 0xb8, 0x79, 0xe5, 0x56, 0x25, 0x75, 0x68, 0xbd,
-	0x6f, 0x1d, 0x56, 0xca, 0xeb, 0x10, 0x41, 0x4d, 0xb4, 0xaa, 0x5d, 0x1d, 0x54, 0xf7, 0xab, 0x5e,
-	0xcd, 0xcc, 0xaa, 0x80, 0xfa, 0xc1, 0x44, 0xd3, 0x5c, 0xf7, 0x8c, 0x88, 0xb6, 0xa1, 0x11, 0x50,
-	0x3f, 0x8a, 0x99, 0x64, 0xb4, 0xea, 0x69, 0xc9, 0x19, 0x41, 0x73, 0x18, 0x26, 0x91, 0x20, 0x7d,
-	0x0b, 0xea, 0x01, 0xf5, 0xc9, 0xa5, 0x2c, 0xcc, 0xb6, 0xa7, 0x04, 0x74, 0x00, 0x8d, 0xb9, 0x3c,
-	0x82, 0x8c, 0xe3, 0xdd, 0x7c, 0x6a, 0xa4, 0xb3, 0x07, 0xdd, 0x97, 0x61, 0x32, 0x99, 0x12, 0x5f,
-	0x72, 0x26, 0x56, 0x56, 0xb9, 0xb7, 0x64, 0x50, 0x4a, 0x70, 0x7e, 0xb7, 0x60, 0x5b, 0xef, 0xbd,
-	0x5e, 0x9b, 0x77, 0xa1, 0x2b, 0x30, 0xe3, 0x89, 0x32, 0xeb, 0x54, 0xb6, 0x5c, 0x0d, 0xf7, 0x3a,
-	0xc2, 0x6a, 0xe2, 0xbe, 0x0f, 0x3d, 0x9d, 0x7d, 0x03, 0x6f, 0xae, 0xc1, 0x37, 0x94, 0xdd, 0x38,
-	0x3c, 0x80, 0xae, 0x76, 0x50, 0x51, 0xa9, 0xe9, 0xb7, 0xe1, 0xe6, 0x63, 0xf6, 0x3a, 0x0a, 0xa2,
-	0x0e, 0xf0, 0x3f, 0xe8, 0xa8, 0xaa, 0x98, 0x05, 0x94, 0x70, 0xbb, 0x2d, 0x8f, 0x01, 0x52, 0xf5,
-	0x5c, 0x68, 0x9c, 0x1f, 0x2c, 0x80, 0x57, 0x87, 0xa3, 0x97, 0xc3, 0x29, 0xa6, 0x17, 0x04, 0xfd,
-	0x17, 0xda, 0x32, 0xfe, 0xdc, 0x38, 0x6a, 0x09, 0xc5, 0xe7, 0x62, 0x24, 0xed, 0x02, 0x70, 0x36,
-	0x19, 0x9f, 0x91, 0xf3, 0x90, 0x11, 0x7d, 0x79, 0xb4, 0x39, 0x9b, 0x1c, 0x49, 0x85, 0xf0, 0x15,
-	0x66, 0x7c, 0x1e, 0x13, 0xa6, 0x2f, 0x90, 0x16, 0x67, 0x93, 0x43, 0x21, 0x8b, 0x40, 0x12, 0xcc,
-	0x63, 0xe3, 0x5c, 0x53, 0xf7, 0x8b, 0x50, 0x69, 0xef, 0x5d, 0x90, 0x92, 0x76, 0xaf, 0xab, 0xc5,
-	0x85, 0x46, 0xfa, 0x3b, 0x4f, 0x60, 0x27, 0x0b, 0x93, 0x8f, 0xf0, 0x82, 0x30, 0xc3, 0xf9, 0xff,
-	0xa1, 0x39, 0x51, 0x6a, 0x3d, 0x99, 0x3a, 0x6e, 0x06, 0xf5, 0x8c, 0xcd, 0xf9, 0xcd, 0x82, 0xde,
-	0x68, 0x1a, 0xc6, 0x94, 0x70, 0xee, 0x91, 0x49, 0xc8, 0x7c, 0x51, 0x89, 0xf1, 0x2a, 0x4a, 0xe7,
-	0xae, 0xf8, 0x4e, 0x67, 0x71, 0x25, 0x37, 0x8b, 0x11, 0xd4, 0x04, 0x09, 0xfa, 0x50, 0xf2, 0x1b,
-	0x7d, 0x02, 0xad, 0x49, 0x98, 0x88, 0x06, 0x34, 0x93, 0x61, 0xd7, 0x2d, 0x2e, 0x2f, 0xb2, 0x28,
-	0xed, 0x6a, 0x26, 0xa6, 0xf0, 0xfe, 0xa7, 0xb0, 0x51, 0x30, 0xfd, 0xab, 0xc9, 0x78, 0x0c, 0x3b,
-	0x66, 0x9b, 0xf5, 0xe2, 0xfb, 0x00, 0x9a, 0x4c, 0xee, 0x6c, 0x88, 0xd8, 0x5c, 0x8b, 0xc8, 0x33,
-	0x76, 0xe7, 0x0f, 0x0b, 0x3a, 0xa2, 0x42, 0x9e, 0x05, 0x5c, 0xde, 0xee, 0xb9, 0x1b, 0x59, 0x35,
-	0x51, 0x7a, 0x23, 0xbf, 0x86, 0x2d, 0xcd, 0xe0, 0xf8, 0x6c, 0x35, 0xf6, 0xc9, 0x82, 0xcc, 0xc2,
-	0x88, 0x30, 0xbb, 0x22, 0x77, 0xd8, 0x73, 0x73, 0xab, 0xb8, 0x3a, 0x3b, 0x47, 0xab, 0x63, 0x03,
-	0x53, 0x47, 0x47, 0x93, 0xb7, 0x0c, 0xfd, 0x2f, 0x61, 0xe7, 0x0a, 0x78, 0x09, 0x1d, 0x83, 0x3c,
-	0x1d, 0x9d, 0x03, 0x70, 0x45, 0xf1, 0x8e, 0x62, 0x1c, 0xf3, 0x3c, 0x35, 0xdf, 0x5b, 0x60, 0xe7,
-	0xc2, 0x51, 0xb4, 0x9c, 0x12, 0xce, 0xf1, 0x05, 0x41, 0x8f, 0xf3, 0xad, 0xbc, 0x16, 0x78, 0x01,
-	0xa9, 0xc6, 0xa5, 0xbe, 0xc7, 0xa4, 0x4b, 0xff, 0x29, 0x40, 0xa6, 0x2c, 0x79, 0x27, 0x38, 0xc5,
-	0xf0, 0xba, 0x85, 0xb5, 0x73, 0x01, 0xbe, 0x82, 0x76, 0x1a, 0xb8, 0x48, 0x31, 0xf6, 0x7d, 0xe2,
-	0xeb, 0x73, 0x2a, 0x41, 0x24, 0x82, 0x91, 0x79, 0xb8, 0x20, 0xbe, 0x4e, 0xbd, 0x11, 0x65, 0x8a,
-	0x24, 0x61, 0xbe, 0xbe, 0xe0, 0x8d, 0x28, 0x2a, 0xbb, 0x71, 0x4c, 0x16, 0xc7, 0x78, 0x2d, 0x8f,
-	0x85, 0x97, 0xd5, 0x00, 0xea, 0x5c, 0xec, 0x5b, 0x46, 0xa1, 0x34, 0xa0, 0x8f, 0xa1, 0x3d, 0xc3,
-	0xf4, 0x22, 0xc1, 0xa2, 0x93, 0xaa, 0x92, 0xa5, 0x6d, 0x57, 0xad, 0xeb, 0x3e, 0x37, 0x06, 0xc5,
-	0x4b, 0x06, 0xec, 0x3f, 0x83, 0x5e, 0xd1, 0x58, 0xc2, 0xcf, 0xfb, 0xa5, 0x8f, 0x43, 0xf3, 0x18,
-	0x8b, 0x5a, 0xe0, 0xe8, 0x0e, 0xd4, 0x7c, 0xb2, 0x30, 0xb9, 0x42, 0xae, 0xd6, 0x8b, 0x68, 0x74,
-	0x04, 0xd2, 0xde, 0x7f, 0x02, 0xed, 0x54, 0x55, 0x52, 0x36, 0xbb, 0xc5, 0x7d, 0x9b, 0xfa, 0x34,
-	0xf9, 0x4d, 0x7f, 0xb4, 0xe0, 0x86, 0x58, 0x62, 0xbd, 0x97, 0x0e, 0xc4, 0x25, 0xb5, 0x32, 0x11,
-	0xdc, 0x72, 0x4b, 0x30, 0x22, 0xaa, 0x34, 0x1a, 0xbc, 0xe2, 0x62, 0x00, 0xfa, 0x64, 0x31, 0x56,
-	0x77, 0x51, 0x45, 0xb6, 0x51, 0xcb, 0x27, 0x8b, 0x13, 0x21, 0xf7, 0x0f, 0xa1, 0x9d, 0xe2, 0x4b,
-	0x42, 0xbd, 0x55, 0x0c, 0xb5, 0x65, 0x8e, 0x9c, 0x8f, 0xf5, 0x2b, 0x68, 0x8f, 0x08, 0x15, 0x0f,
-	0x60, 0x1a, 0x67, 0x13, 0x42, 0x2c, 0x52, 0xd1, 0x30, 0xf1, 0xf2, 0x11, 0x09, 0x27, 0x54, 0x26,
-	0x5a, 0x46, 0x60, 0xe4, 0x7c, 0x6d, 0x54, 0x0b, 0x3d, 0xee, 0xfc, 0x6a, 0xc1, 0xce, 0x50, 0xc1,
-	0xd2, 0x0d, 0x0c, 0x11, 0xaf, 0xe1, 0x1a, 0x37, 0x3a, 0x39, 0x01, 0xf0, 0x4a, 0x93, 0x72, 0xcf,
-	0xbd, 0xc2, 0xc7, 0x4d, 0x15, 0x47, 0xab, 0x63, 0xbc, 0xd2, 0x8f, 0x70, 0x5e, 0x50, 0xf6, 0x4f,
-	0xe1, 0x46, 0x09, 0xec, 0x7d, 0x7a, 0x3f, 0xdb, 0x2e, 0xc7, 0xcd, 0x1b, 0x80, 0xa1, 0x3c, 0x8d,
-	0x68, 0xbd, 0xd2, 0x07, 0x75, 0x1f, 0x5a, 0xa6, 0x6a, 0xcd, 0xed, 0x64, 0xe4, 0xac, 0x39, 0x6a,
-	0x57, 0x34, 0x87, 0xf3, 0x1d, 0x34, 0xd4, 0xfa, 0xe9, 0xcf, 0x93, 0x95, 0xfb, 0x79, 0xda, 0x83,
-	0xde, 0x72, 0x4a, 0xf2, 0xff, 0x46, 0x15, 0xf9, 0xf3, 0xd2, 0x15, 0xda, 0xf4, 0xb7, 0x67, 0x1b,
-	0x1a, 0x38, 0x89, 0xa7, 0x21, 0xd3, 0x0d, 0xac, 0x25, 0x74, 0xbb, 0xf8, 0xc2, 0xec, 0xb8, 0xd9,
-	0x49, 0xcc, 0x93, 0xe3, 0x8d, 0x78, 0x71, 0xc8, 0x64, 0xad, 0x17, 0xea, 0xed, 0xe2, 0xe4, 0x16,
-	0x55, 0xae, 0x90, 0x59, 0xeb, 0xdf, 0x86, 0xae, 0xda, 0xa9, 0x50, 0x9a, 0x1d, 0xa5, 0x93, 0xd5,
-	0xe9, 0xfc, 0x6c, 0xc1, 0xe6, 0xdb, 0x2b, 0x37, 0xa6, 0x04, 0xfb, 0x84, 0xc9, 0xa3, 0x76, 0x0e,
-	0xda, 0xe9, 0x5f, 0x96, 0xa7, 0x0d, 0xe8, 0xb1, 0x28, 0x37, 0x1a, 0xa7, 0xe5, 0x26, 0x3a, 0x65,
-	0xbd, 0x4b, 0x86, 0x1a, 0x90, 0xde, 0x82, 0x4a, 0x54, 0xb7, 0x60, 0xce, 0xf4, 0x4f, 0xff, 0x5f,
-	0xdd, 0x5c, 0xba, 0xcf, 0x1a, 0xf2, 0x7f, 0xf7, 0xe1, 0xdf, 0x01, 0x00, 0x00, 0xff, 0xff, 0x2e,
-	0x80, 0xba, 0x8e, 0xfb, 0x0e, 0x00, 0x00,
+	// 1568 bytes of a gzipped FileDescriptorProto
+	0x1f, 0x8b, 0x08, 0x00, 0x00, 0x00, 0x00, 0x00, 0x02, 0xff, 0x94, 0x57, 0xcd, 0x6e, 0xdb, 0xc6,
+	0x13, 0x07, 0x25, 0x51, 0x1f, 0x23, 0x59, 0x4e, 0x36, 0xfe, 0xdb, 0x8c, 0x02, 0xe7, 0xaf, 0x10,
+	0x6e, 0xe0, 0x36, 0x29, 0x13, 0x38, 0x3d, 0xa4, 0xe9, 0x25, 0xfe, 0x68, 0x10, 0x03, 0x71, 0xd3,
+	0xd2, 0x4e, 0x7a, 0x8b, 0xb0, 0x16, 0xd7, 0x12, 0x5b, 0x69, 0x49, 0xec, 0x92, 0x92, 0x05, 0xb4,
+	0xaf, 0xd2, 0x5b, 0x0f, 0x2d, 0xd0, 0x53, 0x5f, 0xa0, 0x87, 0x5e, 0x7a, 0xed, 0x43, 0x14, 0xe8,
+	0x5b, 0x14, 0xfb, 0x45, 0x91, 0x0a, 0x9d, 0xa6, 0x37, 0xce, 0xcc, 0x6f, 0x77, 0x67, 0x7e, 0x33,
+	0xb3, 0xb3, 0x84, 0x66, 0x7c, 0xee, 0xc5, 0x2c, 0x4a, 0x22, 0xf7, 0xaf, 0x0a, 0x34, 0x4f, 0x48,
+	0x82, 0x03, 0x9c, 0x60, 0xe4, 0x40, 0x63, 0x46, 0x18, 0x0f, 0x23, 0xea, 0x58, 0x7d, 0x6b, 0xd7,
+	0xf6, 0x8d, 0x88, 0x10, 0xd4, 0xc6, 0x98, 0x8f, 0x9d, 0x4a, 0xdf, 0xda, 0x6d, 0xf9, 0xf2, 0x1b,
+	0xdd, 0x06, 0x60, 0x24, 0x8e, 0x78, 0x98, 0x44, 0x6c, 0xe1, 0x54, 0xa5, 0x25, 0xa7, 0x41, 0x77,
+	0x61, 0xfd, 0x9c, 0x8c, 0x42, 0x3a, 0x48, 0x69, 0x78, 0x39, 0x48, 0xc2, 0x29, 0x71, 0x6a, 0x7d,
+	0x6b, 0xb7, 0xea, 0xaf, 0x49, 0xf5, 0x2b, 0x1a, 0x5e, 0x9e, 0x85, 0x53, 0x82, 0x5c, 0x58, 0x23,
+	0x34, 0xc8, 0xa1, 0x6c, 0x89, 0x6a, 0x13, 0x1a, 0x64, 0x18, 0x07, 0x1a, 0xc3, 0x68, 0x3a, 0x0d,
+	0x13, 0xee, 0xd4, 0x95, 0x67, 0x5a, 0x44, 0x37, 0xa1, 0xc9, 0x52, 0xaa, 0x16, 0x36, 0xe4, 0xc2,
+	0x06, 0x4b, 0xa9, 0x5c, 0xf4, 0x1c, 0xae, 0x1b, 0xd3, 0x20, 0x26, 0x6c, 0x10, 0x26, 0x64, 0xea,
+	0x34, 0xfb, 0xd5, 0xdd, 0xf6, 0xde, 0xb6, 0x67, 0x82, 0xf6, 0x7c, 0x85, 0xfe, 0x92, 0xb0, 0xe3,
+	0x84, 0x4c, 0x3f, 0xa7, 0x09, 0x5b, 0xf8, 0x5d, 0x56, 0x50, 0xf6, 0xf6, 0xe1, 0x46, 0x09, 0x0c,
+	0x5d, 0x83, 0xea, 0xb7, 0x64, 0x21, 0xb9, 0x6a, 0xf9, 0xe2, 0x13, 0x6d, 0x80, 0x3d, 0xc3, 0x93,
+	0x94, 0x48, 0xa2, 0x2c, 0x5f, 0x09, 0x4f, 0x2a, 0x8f, 0x2d, 0xf7, 0x11, 0x6c, 0x1d, 0xa4, 0x8c,
+	0x06, 0xd1, 0x9c, 0x9e, 0xc6, 0x98, 0x71, 0x72, 0x82, 0x13, 0x16, 0x5e, 0xfa, 0xd1, 0x5c, 0x05,
+	0x37, 0x49, 0xa7, 0x94, 0x3b, 0x56, 0xbf, 0xba, 0xbb, 0xe6, 0x1b, 0xd1, 0xfd, 0xd9, 0x82, 0x8d,
+	0xb2, 0x55, 0x22, 0x1f, 0x14, 0x4f, 0x89, 0x3e, 0x5a, 0x7e, 0xa3, 0x1d, 0xe8, 0xd2, 0x74, 0x7a,
+	0x4e, 0xd8, 0x20, 0xba, 0x18, 0xb0, 0x68, 0xce, 0xa5, 0x13, 0xb6, 0xdf, 0x51, 0xda, 0x97, 0x17,
+	0x7e, 0x34, 0xe7, 0xe8, 0x23, 0xb8, 0xbe, 0x44, 0x99, 0x63, 0xab, 0x12, 0xb8, 0x6e, 0x80, 0x87,
+	0x4a, 0x8d, 0xee, 0x43, 0x4d, 0xee, 0x53, 0x93, 0x9c, 0x39, 0xde, 0x15, 0x01, 0xf8, 0x12, 0xe5,
+	0x7e, 0x07, 0xdd, 0x67, 0xe1, 0x84, 0xf0, 0x97, 0x73, 0x4a, 0x18, 0x1f, 0x87, 0x31, 0x7a, 0x68,
+	0xd8, 0xb0, 0xe4, 0x06, 0x3d, 0xaf, 0x68, 0xf7, 0x5e, 0x0b, 0xa3, 0x62, 0x5c, 0x01, 0x7b, 0x8f,
+	0x01, 0x96, 0xca, 0x3c, 0xbf, 0x76, 0x09, 0xbf, 0x76, 0x9e, 0xdf, 0xbf, 0x2b, 0x4b, 0x82, 0xf7,
+	0x29, 0x9e, 0x2c, 0x78, 0xc8, 0x7d, 0xc2, 0xd3, 0x49, 0xc2, 0x51, 0x1f, 0xda, 0x23, 0x86, 0x69,
+	0x3a, 0xc1, 0x2c, 0x4c, 0xcc, 0x7e, 0x79, 0x15, 0xea, 0x41, 0x93, 0xe3, 0x69, 0x3c, 0x09, 0xe9,
+	0x48, 0x6f, 0x9d, 0xc9, 0xe8, 0x01, 0x34, 0x62, 0x16, 0x7d, 0x43, 0x86, 0x89, 0xe4, 0xa9, 0xbd,
+	0xf7, 0xbf, 0x72, 0x22, 0x0c, 0x0a, 0xdd, 0x03, 0xfb, 0x42, 0x04, 0xaa, 0x79, 0xbb, 0x02, 0xae,
+	0x30, 0xe8, 0x63, 0xa8, 0xc7, 0x24, 0x8a, 0x27, 0xa2, 0xec, 0xdf, 0x81, 0xd6, 0x20, 0x74, 0x0c,
+	0x48, 0x7d, 0x0d, 0x42, 0x9a, 0x10, 0x86, 0x87, 0x89, 0xe8, 0xd6, 0xba, 0xf4, 0xab, 0xe7, 0x1d,
+	0x46, 0xd3, 0x98, 0x11, 0xce, 0x49, 0xa0, 0x16, 0xfb, 0xd1, 0x5c, 0xaf, 0xbf, 0xae, 0x56, 0x1d,
+	0x2f, 0x17, 0xa1, 0xc7, 0xb0, 0x2e, 0x5d, 0x18, 0x44, 0x26, 0x21, 0x4e, 0x43, 0xba, 0xb0, 0xbe,
+	0x92, 0x27, 0xbf, 0x7b, 0x51, 0x90, 0xdd, 0x5f, 0x2d, 0xb8, 0x79, 0xe5, 0x51, 0x25, 0x75, 0x68,
+	0xbd, 0x6f, 0x1d, 0x56, 0xca, 0xeb, 0x10, 0x41, 0x4d, 0xb4, 0xaa, 0x53, 0xed, 0x57, 0x77, 0xab,
+	0x7e, 0xcd, 0xdc, 0x55, 0x21, 0x0d, 0xc2, 0xa1, 0xa6, 0xd9, 0xf6, 0x8d, 0x88, 0x36, 0xa1, 0x1e,
+	0xd2, 0x20, 0x4e, 0x98, 0x64, 0xb4, 0xea, 0x6b, 0xc9, 0x3d, 0x85, 0xc6, 0x61, 0x94, 0xc6, 0x82,
+	0xf4, 0x0d, 0xb0, 0x43, 0x1a, 0x90, 0x4b, 0x59, 0x98, 0x2d, 0x5f, 0x09, 0x68, 0x0f, 0xea, 0x53,
+	0x19, 0x82, 0xf4, 0xe3, 0xdd, 0x7c, 0x6a, 0xa4, 0xbb, 0x03, 0x9d, 0xb3, 0x28, 0x1d, 0x8e, 0x49,
+	0x20, 0x39, 0x13, 0x3b, 0xab, 0xdc, 0x5b, 0xd2, 0x29, 0x25, 0xb8, 0x7f, 0x58, 0xb0, 0xa9, 0xcf,
+	0x5e, 0xad, 0xcd, 0x7b, 0xd0, 0x11, 0x98, 0xc1, 0x50, 0x99, 0x75, 0x2a, 0x9b, 0x9e, 0x86, 0xfb,
+	0x6d, 0x61, 0x35, 0x7e, 0x3f, 0x80, 0xae, 0xce, 0xbe, 0x81, 0x37, 0x56, 0xe0, 0x6b, 0xca, 0x6e,
+	0x16, 0x3c, 0x84, 0x8e, 0x5e, 0xa0, 0xbc, 0x52, 0xb7, 0xdf, 0x9a, 0x97, 0xf7, 0xd9, 0x6f, 0x2b,
+	0x88, 0x0a, 0xe0, 0xff, 0xd0, 0x56, 0x55, 0x31, 0x09, 0x29, 0xe1, 0x4e, 0x4b, 0x86, 0x01, 0x52,
+	0xf5, 0x42, 0x68, 0xdc, 0x1f, 0x2d, 0x80, 0x57, 0xfb, 0xa7, 0x67, 0x87, 0x63, 0x4c, 0x47, 0x04,
+	0xdd, 0x82, 0x96, 0xf4, 0x3f, 0x77, 0x1d, 0x35, 0x85, 0xe2, 0x0b, 0x71, 0x25, 0x6d, 0x03, 0x70,
+	0x36, 0x1c, 0x9c, 0x93, 0x8b, 0x88, 0x11, 0x3d, 0x3c, 0x5a, 0x9c, 0x0d, 0x0f, 0xa4, 0x42, 0xac,
+	0x15, 0x66, 0x7c, 0x91, 0x10, 0xa6, 0x07, 0x48, 0x93, 0xb3, 0xe1, 0xbe, 0x90, 0x85, 0x23, 0x29,
+	0xe6, 0x89, 0x59, 0x5c, 0x53, 0xf3, 0x45, 0xa8, 0xf4, 0xea, 0x6d, 0x90, 0x92, 0x5e, 0x6e, 0xab,
+	0xcd, 0x85, 0x46, 0xae, 0x77, 0x9f, 0xc2, 0xd6, 0xd2, 0x4d, 0x7e, 0x8a, 0x67, 0x84, 0x19, 0xce,
+	0x3f, 0x80, 0xc6, 0x50, 0xa9, 0xf5, 0xcd, 0xd4, 0xf6, 0x96, 0x50, 0xdf, 0xd8, 0xdc, 0xdf, 0x2d,
+	0xe8, 0x9e, 0x8e, 0xa3, 0x84, 0x12, 0xce, 0x7d, 0x32, 0x8c, 0x58, 0x20, 0x2a, 0x31, 0x59, 0xc4,
+	0xd9, 0xbd, 0x2b, 0xbe, 0xb3, 0xbb, 0xb8, 0x92, 0xbb, 0x8b, 0x11, 0xd4, 0x04, 0x09, 0x3a, 0x28,
+	0xf9, 0x8d, 0x3e, 0x85, 0xe6, 0x30, 0x4a, 0x45, 0x03, 0x9a, 0x9b, 0x61, 0xdb, 0x2b, 0x6e, 0x2f,
+	0xb2, 0x28, 0xed, 0xea, 0x4e, 0xcc, 0xe0, 0xbd, 0xcf, 0x60, 0xad, 0x60, 0xfa, 0x4f, 0x37, 0xe3,
+	0x11, 0x6c, 0x99, 0x63, 0x56, 0x8b, 0xef, 0x43, 0x68, 0x30, 0x79, 0xb2, 0x21, 0x62, 0x7d, 0xc5,
+	0x23, 0xdf, 0xd8, 0xdd, 0x3f, 0x2d, 0x68, 0x8b, 0x0a, 0x79, 0x1e, 0x72, 0x39, 0xdd, 0x73, 0x13,
+	0x59, 0x35, 0x51, 0x36, 0x91, 0x5f, 0xc3, 0x86, 0x66, 0x70, 0x70, 0xbe, 0x18, 0x04, 0x64, 0x46,
+	0x26, 0x51, 0x4c, 0x98, 0x53, 0x91, 0x27, 0xec, 0x78, 0xb9, 0x5d, 0x3c, 0x9d, 0x9d, 0x83, 0xc5,
+	0x91, 0x81, 0xa9, 0xd0, 0xd1, 0xf0, 0x2d, 0x43, 0xef, 0x2b, 0xd8, 0xba, 0x02, 0x5e, 0x42, 0x47,
+	0x3f, 0x4f, 0x47, 0x7b, 0x0f, 0x3c, 0x51, 0xbc, 0xa7, 0x09, 0x4e, 0x78, 0x9e, 0x9a, 0x1f, 0x2c,
+	0x70, 0x72, 0xee, 0x28, 0x5a, 0x4e, 0x08, 0xe7, 0x78, 0x44, 0xd0, 0x93, 0x7c, 0x2b, 0xaf, 0x38,
+	0x5e, 0x40, 0xaa, 0xeb, 0x52, 0xcf, 0x31, 0xb9, 0xa4, 0xf7, 0x0c, 0x60, 0xa9, 0x2c, 0x79, 0x27,
+	0xb8, 0x45, 0xf7, 0x3a, 0x85, 0xbd, 0x73, 0x0e, 0xbe, 0x82, 0x56, 0xe6, 0xb8, 0x48, 0x31, 0x0e,
+	0x02, 0x12, 0xe8, 0x38, 0x95, 0x20, 0x12, 0xc1, 0xc8, 0x34, 0x9a, 0x91, 0x40, 0xa7, 0xde, 0x88,
+	0x32, 0x45, 0x92, 0xb0, 0x40, 0x0f, 0x78, 0x23, 0x8a, 0xca, 0xae, 0x1f, 0x91, 0xd9, 0x11, 0x5e,
+	0xc9, 0x63, 0xe1, 0x65, 0xd5, 0x07, 0x9b, 0x8b, 0x73, 0xcb, 0x28, 0x94, 0x06, 0xf4, 0x09, 0xb4,
+	0x26, 0x98, 0x8e, 0x52, 0x2c, 0x3a, 0xa9, 0x2a, 0x59, 0xda, 0xf4, 0xd4, 0xbe, 0xde, 0x0b, 0x63,
+	0x50, 0xbc, 0x2c, 0x81, 0xbd, 0xe7, 0xd0, 0x2d, 0x1a, 0x4b, 0xf8, 0x79, 0xbf, 0xf4, 0x71, 0x68,
+	0x1c, 0x61, 0x51, 0x0b, 0x1c, 0xdd, 0x85, 0x5a, 0x40, 0x66, 0x26, 0x57, 0xc8, 0xd3, 0x7a, 0xe1,
+	0x8d, 0xf6, 0x40, 0xda, 0x7b, 0x4f, 0xa1, 0x95, 0xa9, 0x4a, 0xca, 0x66, 0xbb, 0x78, 0x6e, 0x43,
+	0x47, 0x93, 0x3f, 0xf4, 0x27, 0x0b, 0x6e, 0x88, 0x2d, 0x56, 0x7b, 0x69, 0x4f, 0x0c, 0xa9, 0x85,
+	0xf1, 0xe0, 0xb6, 0x57, 0x82, 0x11, 0x5e, 0x65, 0xde, 0xe0, 0x05, 0x17, 0x17, 0x60, 0x40, 0x66,
+	0x03, 0x35, 0x8b, 0x2a, 0xb2, 0x8d, 0x9a, 0x01, 0x99, 0x1d, 0x0b, 0xb9, 0xb7, 0x0f, 0xad, 0x0c,
+	0x5f, 0xe2, 0xea, 0xed, 0xa2, 0xab, 0x4d, 0x13, 0x72, 0xde, 0xd7, 0xaf, 0xa1, 0x75, 0x4a, 0xa8,
+	0x78, 0x00, 0xd3, 0x64, 0x79, 0x43, 0x88, 0x4d, 0x2a, 0x1a, 0x26, 0x5e, 0x3e, 0x22, 0xe1, 0x84,
+	0xca, 0x44, 0x4b, 0x0f, 0x8c, 0x9c, 0xaf, 0x8d, 0x6a, 0xa1, 0xc7, 0xdd, 0xdf, 0x2c, 0xd8, 0x3a,
+	0x54, 0xb0, 0xec, 0x00, 0x43, 0xc4, 0x6b, 0xb8, 0xc6, 0x8d, 0x4e, 0xde, 0x00, 0x78, 0xa1, 0x49,
+	0xb9, 0xef, 0x5d, 0xb1, 0xc6, 0xcb, 0x14, 0x07, 0x8b, 0x23, 0xbc, 0xd0, 0x8f, 0x70, 0x5e, 0x50,
+	0xf6, 0x4e, 0xe0, 0x46, 0x09, 0xec, 0x7d, 0x7a, 0x7f, 0x79, 0x5c, 0x8e, 0x9b, 0x37, 0x00, 0x87,
+	0x32, 0x1a, 0xd1, 0x7a, 0xa5, 0x0f, 0xea, 0x1e, 0x34, 0x4d, 0xd5, 0x9a, 0xe9, 0x64, 0xe4, 0x65,
+	0x73, 0xd4, 0xae, 0x68, 0x0e, 0xf7, 0x7b, 0xa8, 0xab, 0xfd, 0xb3, 0x9f, 0x27, 0x2b, 0xf7, 0xf3,
+	0xb4, 0x03, 0xdd, 0xf9, 0x98, 0xe4, 0xff, 0x8d, 0x2a, 0xf2, 0xe7, 0xa5, 0x23, 0xb4, 0xd9, 0x6f,
+	0xcf, 0x26, 0xd4, 0x71, 0x9a, 0x8c, 0x23, 0xa6, 0x1b, 0x58, 0x4b, 0xe8, 0x4e, 0xf1, 0x85, 0xd9,
+	0xf6, 0x96, 0x91, 0x98, 0x27, 0xc7, 0x1b, 0xf1, 0xe2, 0x90, 0xc9, 0x5a, 0x2d, 0xd4, 0x3b, 0xc5,
+	0x9b, 0x5b, 0x54, 0xb9, 0x42, 0x2e, 0x5b, 0xff, 0x0e, 0x74, 0xd4, 0x49, 0x85, 0xd2, 0x6c, 0x2b,
+	0x9d, 0xac, 0x4e, 0x77, 0x06, 0xb5, 0xb3, 0x45, 0x1c, 0x89, 0xaa, 0x9a, 0xb3, 0x88, 0x8e, 0x74,
+	0x74, 0x4a, 0x50, 0x95, 0xc3, 0x98, 0x78, 0x33, 0xab, 0xb1, 0x68, 0x44, 0x11, 0x92, 0x3a, 0x45,
+	0x53, 0xaa, 0xa5, 0x6c, 0x62, 0xd6, 0x72, 0x13, 0x13, 0x41, 0x4d, 0xbc, 0x42, 0xe4, 0x6c, 0xb7,
+	0x7d, 0xf9, 0xed, 0xde, 0x83, 0x8e, 0x38, 0x97, 0x1f, 0xe1, 0x04, 0x73, 0x92, 0xa0, 0x5b, 0x60,
+	0x27, 0x42, 0xd6, 0xb1, 0xd8, 0x9e, 0xb0, 0xfa, 0x4a, 0xe7, 0xfe, 0x62, 0xc1, 0xfa, 0xdb, 0xe1,
+	0xd7, 0xc7, 0x04, 0x07, 0x84, 0x49, 0x8f, 0xdb, 0x7b, 0xad, 0xec, 0x57, 0xd0, 0xd7, 0x06, 0xf4,
+	0x44, 0xf4, 0x04, 0x4d, 0xb2, 0x9e, 0x10, 0xed, 0xbc, 0xda, 0xca, 0x87, 0x1a, 0x90, 0x8d, 0x6a,
+	0x25, 0xaa, 0x51, 0x9d, 0x33, 0xfd, 0xdb, 0x4f, 0x62, 0x27, 0x57, 0x93, 0xe7, 0x75, 0xf9, 0x53,
+	0xfe, 0xe8, 0x9f, 0x00, 0x00, 0x00, 0xff, 0xff, 0x1e, 0xab, 0x56, 0x6d, 0xa0, 0x0f, 0x00, 0x00,
 }

+ 12 - 0
internal/pb/pb.proto

@@ -165,6 +165,18 @@ message CommitsAnalysisResults {
     repeated string author_index = 2;
 }
 
+// Typo describes one typo-fix pair of identifiers found in a commit diff.
+message Typo {
+    // identifier as it was spelled before the change
+    string wrong = 1;
+    // identifier as it is spelled after the change
+    string correct = 2;
+    // hash of the commit which contains the fix, in string form
+    string commit = 3;
+    // name of the changed file (its name after the change)
+    string file = 4;
+    // line index of the fix in the new version of the file
+    // NOTE(review): the Go analysis appears to record a zero-based index - confirm
+    int32 line = 5;
+}
+
+// TyposDataset is the list of collected typo-fix pairs.
+message TyposDataset {
+    // all the collected typo-fix pairs
+    repeated Typo typos = 1;
+}
+
 message AnalysisResults {
     Metadata header = 1;
     // the mapped values are dynamic messages which require the second parsing pass.

A diferenza do arquivo foi suprimida porque é demasiado grande
+ 112 - 5
internal/pb/pb_pb2.py


+ 276 - 0
leaves/research/typos.go

@@ -0,0 +1,276 @@
+package research
+
+import (
+	"bytes"
+	"fmt"
+	"io"
+	"unicode/utf8"
+
+	"github.com/gogo/protobuf/proto"
+	"github.com/sergi/go-diff/diffmatchpatch"
+	"gopkg.in/bblfsh/sdk.v2/uast"
+	"gopkg.in/bblfsh/sdk.v2/uast/nodes"
+	"gopkg.in/src-d/go-git.v4"
+	"gopkg.in/src-d/go-git.v4/plumbing"
+	"gopkg.in/src-d/go-git.v4/plumbing/object"
+	"gopkg.in/src-d/hercules.v9/internal/core"
+	"gopkg.in/src-d/hercules.v9/internal/levenshtein"
+	"gopkg.in/src-d/hercules.v9/internal/pb"
+	items "gopkg.in/src-d/hercules.v9/internal/plumbing"
+	uast_items "gopkg.in/src-d/hercules.v9/internal/plumbing/uast"
+)
+
+// TyposDatasetBuilder collects pairs of typo-fix in source code identifiers.
+// It looks for single-line replacements in file diffs where the old and the new line
+// are close in terms of the Levenshtein distance, and records the identifier pair
+// when exactly one identifier was removed and exactly one was added on those lines.
+type TyposDatasetBuilder struct {
+	core.NoopMerger
+
+	// MaximumAllowedDistance is the maximum Levenshtein distance between the old and
+	// the new version of a replaced line to consider the identifiers on it
+	// a typo-fix pair. Initialize() replaces the zero value with
+	// DefaultMaximumAllowedTypoDistance.
+	MaximumAllowedDistance int
+
+	// typos stores the found typo-fix pairs. Consume() appends to it and
+	// Finalize() deduplicates it.
+	typos []Typo
+	// lcontext is the Context for measuring Levenshtein distance between lines.
+	lcontext *levenshtein.Context
+	// xpather filters identifiers. Initialize() sets its XPath query to
+	// "//uast:Identifier".
+	xpather uast_items.ChangesXPather
+}
+
+// TyposResult is returned by TyposDatasetBuilder.Finalize() and carries the found typo-fix
+// pairs of identifiers. It is the value which TyposDatasetBuilder.Serialize() expects.
+type TyposResult struct {
+	// Typos lists the typo-fix pairs; Finalize() keeps only the first occurrence
+	// of each (Wrong, Correct) combination.
+	Typos []Typo
+}
+
+// Typo carries the information about a typo-fix pair.
+type Typo struct {
+	// Wrong is the identifier before the fix.
+	Wrong string
+	// Correct is the identifier after the fix.
+	Correct string
+	// Commit is the hash of the commit in which the fix was found.
+	Commit plumbing.Hash
+	// File is the name of the changed file after the change.
+	File string
+	// Line is the zero-based index of the fixed line in the new version of the file.
+	Line int
+}
+
+const (
+	// DefaultMaximumAllowedTypoDistance is the default value of the maximum Levenshtein distance
+	// between two identifiers to consider them a typo-fix pair.
+	// TyposDatasetBuilder.Initialize() assigns it to MaximumAllowedDistance when that
+	// field was left at zero.
+	DefaultMaximumAllowedTypoDistance = 4
+)
+
+// Name returns the unique name of this PipelineItem. It identifies the type and
+// is used for mapping keys, etc.
+func (tdb *TyposDatasetBuilder) Name() string {
+	const itemName = "TyposDataset"
+	return itemName
+}
+
+// Provides lists the names of the entities produced by this PipelineItem.
+// Every produced entity is inserted into `deps` of the dependent Consume()-s according
+// to this list; core.Registry also uses it to build the global map of providers.
+// This item produces nothing for downstream items, so the list is empty.
+func (tdb *TyposDatasetBuilder) Provides() []string {
+	provided := make([]string, 0)
+	return provided
+}
+
+// Requires lists the names of the entities which this PipelineItem needs.
+// Every requested entity is inserted into `deps` of Consume(); those entities
+// are in turn listed in Provides() of the upstream items.
+func (tdb *TyposDatasetBuilder) Requires() []string {
+	return []string{
+		uast_items.DependencyUastChanges,
+		items.DependencyFileDiff,
+		items.DependencyBlobCache,
+	}
+}
+
+// ListConfigurationOptions returns the changeable public properties of this PipelineItem.
+// There are none exposed at the moment, so the returned slice is nil.
+func (tdb *TyposDatasetBuilder) ListConfigurationOptions() []core.ConfigurationOption {
+	var options []core.ConfigurationOption
+	return options
+}
+
+// Configure applies the properties previously published by ListConfigurationOptions().
+// No options are published, hence `facts` is not inspected and the call always succeeds.
+func (tdb *TyposDatasetBuilder) Configure(facts map[string]interface{}) error {
+	var err error
+	return err
+}
+
+// Flag returns the name of the command line switch which enables this analysis.
+func (tdb *TyposDatasetBuilder) Flag() string {
+	const flagName = "typos-dataset"
+	return flagName
+}
+
+// Description returns the human-readable explanation of what this analysis does.
+func (tdb *TyposDatasetBuilder) Description() string {
+	const text = "Extracts typo-fix identifier pairs from source code in commit diffs."
+	return text
+}
+
+// Initialize resets the temporary caches and prepares this PipelineItem for a series of Consume()
+// calls. The repository which is going to be analysed is supplied as an argument.
+//
+// A non-positive MaximumAllowedDistance is treated as "not set" and replaced with
+// DefaultMaximumAllowedTypoDistance: a negative threshold could never be satisfied by
+// a Levenshtein distance and would silently disable the analysis.
+func (tdb *TyposDatasetBuilder) Initialize(repository *git.Repository) error {
+	if tdb.MaximumAllowedDistance <= 0 {
+		tdb.MaximumAllowedDistance = DefaultMaximumAllowedTypoDistance
+	}
+	tdb.lcontext = &levenshtein.Context{}
+	// only identifiers are of interest when pairing the changed UAST nodes
+	tdb.xpather.XPath = "//uast:Identifier"
+	return nil
+}
+
+// candidate is a pair of similar lines: one deleted line which was replaced with
+// one inserted line. Both fields are zero-based line indexes.
+type candidate struct {
+	// Before is the index of the deleted line in the old version of the file.
+	Before int
+	// After is the index of the inserted line in the new version of the file.
+	After int
+}
+
+// Consume runs this PipelineItem on the next commit data.
+// `deps` contain all the results from upstream PipelineItem-s as requested by Requires().
+// Additionally, DependencyCommit is always present there and represents the analysed *object.Commit.
+// This function returns the mapping with analysis results. The keys must be the same as
+// in Provides(). If there was an error, nil is returned.
+//
+// Merge commits are skipped. For every modified file (both UAST versions present) the diff
+// is scanned for a single deleted line immediately followed by a single inserted line.
+// Such a pair becomes a candidate if the Levenshtein distance between the two lines does
+// not exceed MaximumAllowedDistance. A candidate is recorded as a typo when exactly one
+// identifier was removed on the old line and exactly one identifier was added on the new line.
+func (tdb *TyposDatasetBuilder) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
+	if deps[core.DependencyIsMerge].(bool) {
+		return nil, nil
+	}
+	commit := deps[core.DependencyCommit].(*object.Commit).Hash
+	cache := deps[items.DependencyBlobCache].(map[plumbing.Hash]*items.CachedBlob)
+	diffs := deps[items.DependencyFileDiff].(map[string]items.FileDiffData)
+	changes := deps[uast_items.DependencyUastChanges].([]uast_items.Change)
+	for _, change := range changes {
+		if change.Before == nil || change.After == nil {
+			// the file was added or removed, there is nothing to compare
+			continue
+		}
+		linesBefore := bytes.Split(cache[change.Change.From.TreeEntry.Hash].Data, []byte{'\n'})
+		linesAfter := bytes.Split(cache[change.Change.To.TreeEntry.Hash].Data, []byte{'\n'})
+		diff := diffs[change.Change.To.Name]
+		var lineNumBefore, lineNumAfter int
+		// singleLineDeleted is true only right after a deletion of exactly one line
+		singleLineDeleted := false
+		var candidates []candidate
+		focusedLinesBefore := map[int]bool{}
+		focusedLinesAfter := map[int]bool{}
+		for _, edit := range diff.Diffs {
+			// every rune of the edit text is counted as one line
+			size := utf8.RuneCountInString(edit.Text)
+			switch edit.Type {
+			case diffmatchpatch.DiffDelete:
+				lineNumBefore += size
+				singleLineDeleted = size == 1
+			case diffmatchpatch.DiffInsert:
+				if size == 1 && singleLineDeleted {
+					// lineNumBefore was already advanced past the deleted line
+					dist := tdb.lcontext.Distance(
+						string(linesBefore[lineNumBefore-1]),
+						string(linesAfter[lineNumAfter]))
+					if dist <= tdb.MaximumAllowedDistance {
+						candidates = append(candidates, candidate{lineNumBefore - 1, lineNumAfter})
+						focusedLinesBefore[lineNumBefore-1] = true
+						focusedLinesAfter[lineNumAfter] = true
+					}
+				}
+				lineNumAfter += size
+				singleLineDeleted = false
+			case diffmatchpatch.DiffEqual:
+				lineNumBefore += size
+				lineNumAfter += size
+				singleLineDeleted = false
+			}
+		}
+		if len(candidates) == 0 {
+			continue
+		}
+		// at this point we have pairs of very similar lines
+		// we need to build the line mappings of the identifiers before/after the change
+		// we should keep only those which are present on those focused lines
+		nodesAdded, nodesRemoved := tdb.xpather.Extract([]uast_items.Change{change})
+		// addedIdentifiers is keyed by the line index in the new file version,
+		// removedIdentifiers is keyed by the line index in the old file version
+		addedIdentifiers := map[int][]nodes.Node{}
+		removedIdentifiers := map[int][]nodes.Node{}
+		for _, n := range nodesAdded {
+			start := uast.PositionsOf(n.(nodes.Object)).Start()
+			if start == nil {
+				continue
+			}
+			// UAST lines are converted to the zero-based indexes used above
+			line := int(start.Line) - 1
+			if focusedLinesAfter[line] {
+				addedIdentifiers[line] = append(addedIdentifiers[line], n)
+			}
+		}
+		for _, n := range nodesRemoved {
+			start := uast.PositionsOf(n.(nodes.Object)).Start()
+			// the position must be checked before it is dereferenced
+			if start == nil {
+				continue
+			}
+			line := int(start.Line) - 1
+			if focusedLinesBefore[line] {
+				removedIdentifiers[line] = append(removedIdentifiers[line], n)
+			}
+		}
+		for _, c := range candidates {
+			// the old spelling comes from the removed nodes on the old line,
+			// the new spelling comes from the added nodes on the new line
+			nodesBefore := removedIdentifiers[c.Before]
+			nodesAfter := addedIdentifiers[c.After]
+			if len(nodesBefore) == 1 && len(nodesAfter) == 1 {
+				idBefore := string(nodesBefore[0].(nodes.Object)["Name"].(nodes.String))
+				idAfter := string(nodesAfter[0].(nodes.Object)["Name"].(nodes.String))
+				tdb.typos = append(tdb.typos, Typo{
+					Wrong:   idBefore,
+					Correct: idAfter,
+					Commit:  commit,
+					File:    change.Change.To.Name,
+					Line:    c.After,
+				})
+			}
+		}
+	}
+	return nil, nil
+}
+
+// Finalize returns the result of the analysis. Further Consume() calls are not expected.
+// The collected typos are deduplicated by the (Wrong, Correct) combination; the first
+// occurrence of each combination is kept and the original order is preserved.
+func (tdb *TyposDatasetBuilder) Finalize() interface{} {
+	seen := map[string]bool{}
+	unique := make([]Typo, 0, len(tdb.typos))
+	for _, typo := range tdb.typos {
+		key := typo.Wrong + "|" + typo.Correct
+		if seen[key] {
+			continue
+		}
+		seen[key] = true
+		unique = append(unique, typo)
+	}
+	return TyposResult{Typos: unique}
+}
+
+// Fork clones this pipeline item `n` times by delegating to core.ForkSamePipelineItem().
+func (tdb *TyposDatasetBuilder) Fork(n int) []core.PipelineItem {
+	clones := core.ForkSamePipelineItem(tdb, n)
+	return clones
+}
+
+// Serialize converts the analysis result as returned by Finalize() to text or bytes.
+// The text format is YAML and the bytes format is Protocol Buffers.
+// `result` must be a TyposResult. Any error reported by `writer` is returned to the caller
+// in both formats.
+func (tdb *TyposDatasetBuilder) Serialize(result interface{}, binary bool, writer io.Writer) error {
+	typosResult := result.(TyposResult)
+	if binary {
+		return tdb.serializeBinary(&typosResult, writer)
+	}
+	return tdb.serializeText(&typosResult, writer)
+}
+
+// serializeText writes every typo as one item of a YAML list with the keys
+// wrong, correct, commit, file and line. It stops at the first write error and returns it.
+func (tdb *TyposDatasetBuilder) serializeText(result *TyposResult, writer io.Writer) error {
+	for _, t := range result.Typos {
+		// a single formatted write per typo emits exactly the same bytes as five separate ones
+		_, err := fmt.Fprintf(writer,
+			"  - wrong: %s\n    correct: %s\n    commit: %s\n    file: %s\n    line: %d\n",
+			t.Wrong, t.Correct, t.Commit.String(), t.File, t.Line)
+		if err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// serializeBinary converts the typos to a pb.TyposDataset message, marshals it with
+// Protocol Buffers and writes the bytes to `writer`. The commit hash is stored in its
+// string form and the line is narrowed to int32. Marshalling and write errors are returned.
+func (tdb *TyposDatasetBuilder) serializeBinary(result *TyposResult, writer io.Writer) error {
+	packed := make([]*pb.Typo, 0, len(result.Typos))
+	for _, typo := range result.Typos {
+		packed = append(packed, &pb.Typo{
+			Wrong:   typo.Wrong,
+			Correct: typo.Correct,
+			Commit:  typo.Commit.String(),
+			File:    typo.File,
+			Line:    int32(typo.Line),
+		})
+	}
+	payload, err := proto.Marshal(&pb.TyposDataset{Typos: packed})
+	if err != nil {
+		return err
+	}
+	_, err = writer.Write(payload)
+	return err
+}
+
+// init registers TyposDatasetBuilder in the global pipeline item registry so that
+// importing this package is enough to make the analysis available.
+func init() {
+	item := new(TyposDatasetBuilder)
+	core.Registry.Register(item)
+}

+ 1 - 0
leaves/research/typos_test.go

@@ -0,0 +1 @@
+package research