Forráskód Böngészése

Better input format detection in labours

Fixes #333

Signed-off-by: Vadim Markovtsev <vadim@athenian.co>
Vadim Markovtsev 5 éve
szülő
commit
5ce5467503

+ 2 - 2
internal/plumbing/identity/identity_test.go

@@ -238,7 +238,7 @@ func TestIdentityDetectorGeneratePeopleDict(t *testing.T) {
 	assert.Equal(t, id.PeopleDict["bzz@apache.org"], 1)
 	assert.Equal(t, id.PeopleDict["máximo cuadros"], 2)
 	assert.Equal(t, id.PeopleDict["mcuadros@gmail.com"], 2)
-	assert.Equal(t, id.ReversedPeopleDict[0], "vadim markovtsev|gmarkhor@gmail.com|vadim@sourced.tech")
+	assert.Equal(t, id.ReversedPeopleDict[0], "vadim markovtsev|gmarkhor@gmail.com|vadim@athenian.co|vadim@sourced.tech")
 	assert.Equal(t, id.ReversedPeopleDict[1], "alexander bezzubov|bzz@apache.org")
 	assert.Equal(t, id.ReversedPeopleDict[2], "máximo cuadros|mcuadros@gmail.com")
 	assert.NotEqual(t, id.ReversedPeopleDict[len(id.ReversedPeopleDict)-1], AuthorMissingName)
@@ -383,7 +383,7 @@ func TestIdentityDetectorGeneratePeopleDictMailmap(t *testing.T) {
 	commits = append(commits, fake)
 	id.GeneratePeopleDict(commits)
 	assert.Contains(t, id.ReversedPeopleDict,
-		"strange guy|vadim markovtsev|gmarkhor@gmail.com|vadim@sourced.tech")
+		"strange guy|vadim markovtsev|gmarkhor@gmail.com|vadim@athenian.co|vadim@sourced.tech")
 }
 
 func TestIdentityDetectorMergeReversedDictsLiteral(t *testing.T) {

+ 8 - 1
leaves/imports_printer.go

@@ -5,6 +5,7 @@ import (
 	"fmt"
 	"io"
 	"log"
+	"sort"
 
 	"github.com/gogo/protobuf/proto"
 	imports2 "github.com/src-d/imports"
@@ -143,7 +144,13 @@ func (ipd *ImportsPerDeveloper) Serialize(result interface{}, binary bool, write
 }
 
 func (ipd *ImportsPerDeveloper) serializeText(result *ImportsPerDeveloperResult, writer io.Writer) {
-	for dev, imps := range result.Imports {
+	devs := make([]int, 0, len(result.Imports))
+	for dev := range result.Imports {
+		devs = append(devs, dev)
+	}
+	sort.Ints(devs)
+	for _, dev := range devs {
+		imps := result.Imports[dev]
 		obj, err := json.Marshal(imps)
 		if err != nil {
 			log.Panicf("Could not serialize %v: %v", imps, err)

+ 81 - 29
python/labours/readers.py

@@ -1,8 +1,9 @@
 from argparse import Namespace
 from importlib import import_module
+import io
 import re
 import sys
-from typing import Any, Dict, List, Tuple, TYPE_CHECKING
+from typing import Any, BinaryIO, Dict, List, Tuple, TYPE_CHECKING
 
 import numpy
 import yaml
@@ -14,7 +15,7 @@ if TYPE_CHECKING:
 
 
 class Reader(object):
-    def read(self, file):
+    def read(self, fileobj: BinaryIO):
         raise NotImplementedError
 
     def get_name(self):
@@ -61,7 +62,7 @@ class Reader(object):
 
 
 class YamlReader(Reader):
-    def read(self, file: str):
+    def read(self, fileobj: BinaryIO):
         yaml.reader.Reader.NON_PRINTABLE = re.compile(r"(?!x)x")
         try:
             loader = yaml.CLoader
@@ -71,12 +72,9 @@ class YamlReader(Reader):
             )
             loader = yaml.Loader
         try:
-            if file != "-":
-                with open(file) as fin:
-                    data = yaml.load(fin, Loader=loader)
-            else:
-                data = yaml.load(sys.stdin, Loader=loader)
-        except (UnicodeEncodeError, yaml.reader.ReaderError) as e:
+            wrapper = io.TextIOWrapper(fileobj, encoding="utf-8")
+            data = yaml.load(wrapper, Loader=loader)
+        except (UnicodeEncodeError, UnicodeDecodeError, yaml.reader.ReaderError) as e:
             print(
                 "\nInvalid unicode in the input: %s\nPlease filter it through "
                 "fix_yaml_unicode.py" % e
@@ -217,7 +215,7 @@ class YamlReader(Reader):
 
 
 class ProtobufReader(Reader):
-    def read(self, file: str) -> None:
+    def read(self, fileobj: BinaryIO) -> None:
         try:
             from labours.pb_pb2 import AnalysisResults
         except ImportError as e:
@@ -227,14 +225,10 @@ class ProtobufReader(Reader):
             )
             raise e from None
         self.data = AnalysisResults()
-        if file != "-":
-            with open(file, "rb") as fin:
-                bytes = fin.read()
-        else:
-            bytes = sys.stdin.buffer.read()
-        if not bytes:
+        all_bytes = fileobj.read()
+        if not all_bytes:
             raise ValueError("empty input")
-        self.data.ParseFromString(bytes)
+        self.data.ParseFromString(all_bytes)
         self.contents = {}
         for key, val in self.data.contents.items():
             try:
@@ -370,23 +364,81 @@ PB_MESSAGES = {
 }
 
 
+def chain_streams(streams, buffer_size=io.DEFAULT_BUFFER_SIZE):
+    """
+    Chain an iterable of streams together into a single buffered stream.
+    Source: https://stackoverflow.com/a/50770511
+
+    Usage:
+        stream = chain_streams(open(name, "rb") for name in filenames)
+        stream.read()
+    """
+
+    class ChainStream(io.RawIOBase):
+        def __init__(self):
+            self.leftover = b""
+            self.stream_iter = iter(streams)
+            try:
+                self.stream = next(self.stream_iter)
+            except StopIteration:
+                self.stream = None
+
+        def readable(self):
+            return True
+
+        def _read_next_chunk(self, max_length):
+            # Return 0 or more bytes: any leftover bytes first, otherwise a read
+            # from the current stream; b"" when there is no current stream left.
+            if self.leftover:
+                return self.leftover
+            elif self.stream is not None:
+                return self.stream.read(max_length)
+            else:
+                return b""
+
+        def readinto(self, b):
+            buffer_length = len(b)
+            chunk = self._read_next_chunk(buffer_length)
+            while len(chunk) == 0:
+                # move to next stream
+                if self.stream is not None:
+                    self.stream.close()
+                try:
+                    self.stream = next(self.stream_iter)
+                    chunk = self._read_next_chunk(buffer_length)
+                except StopIteration:
+                    # No more streams to chain together
+                    self.stream = None
+                    return 0  # indicate EOF
+            output, self.leftover = chunk[:buffer_length], chunk[buffer_length:]
+            b[:len(output)] = output
+            return len(output)
+
+    return io.BufferedReader(ChainStream(), buffer_size=buffer_size)
+
+
 def read_input(args: Namespace) -> ProtobufReader:
     sys.stdout.write("Reading the input... ")
     sys.stdout.flush()
     if args.input != "-":
+        stream = open(args.input, "rb")
+    else:
+        stream = sys.stdin.buffer
+    try:
         if args.input_format == "auto":
+            buffer = stream.read(1 << 16)
             try:
-                args.input_format = args.input.rsplit(".", 1)[1]
-            except IndexError:
-                try:
-                    with open(args.input) as f:
-                        f.read(1 << 16)
-                    args.input_format = "yaml"
-                except UnicodeDecodeError:
-                    args.input_format = "pb"
-    elif args.input_format == "auto":
-        args.input_format = "yaml"
-    reader = READERS[args.input_format]()
-    reader.read(args.input)
+                buffer.decode("utf-8")
+                args.input_format = "yaml"
+            except UnicodeDecodeError:
+                args.input_format = "pb"
+            ins = chain_streams((io.BytesIO(buffer), stream), len(buffer))
+        else:
+            ins = stream
+        reader = READERS[args.input_format]()
+        reader.read(ins)
+    finally:
+        if args.input != "-":
+            stream.close()
     print("done")
     return reader

+ 1 - 1
python/requirements.in

@@ -1,7 +1,7 @@
 matplotlib>=2.0,<4.0
 numpy>=1.12.0,<2.0
 pandas>=0.20.0,<1.0
-PyYAML>=3.0,<5.0
+PyYAML>=3.0,<6.0
 scipy>=0.19.0,<1.2.2
 protobuf>=3.5.0,<4.0
 munch>=2.0,<3.0