Przeglądaj źródła

Extract labours package

Signed-off-by: Vadim Markovtsev <vadim@sourced.tech>
Vadim Markovtsev 6 lat temu
rodzic
commit
f425151728
15 zmienionych plików z 2091 dodań i 103 usunięć
  1. 2 0
      .gitignore
  2. 26 14
      .travis.yml
  3. 3 3
      Dockerfile
  4. 3 3
      Makefile
  5. 34 33
      README.md
  6. 1 1
      .flake8
  7. 1 0
      python/LICENSE.md
  8. 1 0
      python/README.md
  9. 1 0
      python/labours/__init__.py
  10. 7 0
      python/labours/__main__.py
  11. 11 46
      labours.py
  12. 1944 0
      python/labours/pb_pb2.py
  13. 0 0
      python/labours/swivel.py
  14. 3 3
      requirements.txt
  15. 54 0
      python/setup.py

+ 2 - 0
.gitignore

@@ -5,6 +5,8 @@ pb/pb.pb.go
 pb/pb_pb2.py
 coverage.txt
 
+**/*.egg-info
+**/__pycache__
 **/.DS_Store
 .idea
 

+ 26 - 14
.travis.yml

@@ -43,12 +43,12 @@ before_install:
   - unzip -d ~/.local protoc.zip && rm protoc.zip
   - go get -v golang.org/x/lint/golint
   - go get -v github.com/golang/dep/cmd/dep
-  - (wget -O - https://bootstrap.pypa.io/get-pip.py || wget -O - https://raw.githubusercontent.com/pypa/get-pip/master/get-pip.py) | python3 - --user pip==18.1
+  - (wget -O - https://bootstrap.pypa.io/get-pip.py || wget -O - https://raw.githubusercontent.com/pypa/get-pip/master/get-pip.py) | sudo python3 - pip==18.1
   - export PATH=~/usr/bin:$GOPATH/bin:$PATH
   - make --version
   - pip3 --version
-  - pip3 install --user cython
-  - pip3 install --user --no-build-isolation -r requirements.txt tensorflow flake8
+  - sudo pip3 install cython
+  - sudo pip3 install tensorflow flake8 ./python
   - docker run -d --privileged -p 9432:9432 --name bblfshd bblfsh/bblfshd
   - docker exec -it bblfshd bblfshctl driver install python bblfsh/python-driver:latest
   - docker exec -it bblfshd bblfshctl driver install go bblfsh/go-driver:latest
@@ -63,7 +63,7 @@ script:
   - if [ $TRAVIS_GO_VERSION == "1.11.x" ]; then test -z "$(gofmt -s -l . | grep -v vendor/)"; fi
   - go vet -tags tensorflow ./...
   - golint -set_exit_status $(go list ./... | grep -v /vendor/)
-  - flake8
+  - cd python && flake8 && cd ..
   - go test -coverpkg=all -v -coverprofile=coverage.txt -covermode=count gopkg.in/src-d/hercules.v10/... && sed -i '/cmd\/hercules\|core.go/d' coverage.txt
   - # race checks increase the elapsed time by 10 minutes, we run them only in AppVeyor
   - $GOPATH/bin/hercules version
@@ -72,8 +72,8 @@ script:
   - $GOPATH/bin/hercules combine 1.pb 2.pb > 12.pb
   - ($GOPATH/bin/hercules generate-plugin -n MyPlug -o myplug && cd myplug && make)
   - (cd contrib/_plugin_example && make)
-  - $GOPATH/bin/hercules --burndown --burndown-files --burndown-people --couples --devs --quiet https://github.com/src-d/hercules | python3 labours.py -m all -o out --backend Agg --disable-projector
-  - $GOPATH/bin/hercules --burndown --burndown-files --burndown-people --couples --devs --quiet --pb https://github.com/src-d/hercules | python3 labours.py -f pb -m all -o out --backend Agg --disable-projector
+  - $GOPATH/bin/hercules --burndown --burndown-files --burndown-people --couples --devs --quiet https://github.com/src-d/hercules | labours -m all -o out --backend Agg --disable-projector
+  - $GOPATH/bin/hercules --burndown --burndown-files --burndown-people --couples --devs --quiet --pb https://github.com/src-d/hercules | labours -f pb -m all -o out --backend Agg --disable-projector
   - # $GOPATH/bin/hercules --sentiment --quiet --languages Python https://github.com/src-d/hercules > /dev/null
   - set +e
   - if [ $TRAVIS_GO_VERSION == "1.11.x" ]; then bash <(curl -s https://codecov.io/bash); fi
@@ -119,19 +119,31 @@ jobs:
         - unzip -d ~/.local protoc.zip && rm protoc.zip
         - go get -v github.com/golang/dep/cmd/dep
         - export PATH=$GOPATH/bin:$PATH
-      script: skip
+        - pip install twine pyopenssl
+      script:
+        - test $(python3 python/setup.py --version) == $TRAVIS_TAG
       install:
+        - set -e
         - travis_retry make
+        - cd python
+        - python3 setup.py bdist_wheel
+        - cd ..
       after_success:
         - gzip -S .linux_amd64.gz $GOPATH/bin/hercules
       deploy:
-        provider: releases
-        api_key:
-          secure: $GITHUB_TOKEN
-        file: "$GOPATH/bin/hercules.linux_amd64.gz"
-        skip_cleanup: true
-        on:
-          tags: true
+        - provider: releases
+          api_key:
+            secure: $GITHUB_TOKEN
+          file: "$GOPATH/bin/hercules.linux_amd64.gz"
+          skip_cleanup: true
+          on:
+            tags: true
+        - provider: script
+          script: twine upload python/dist/*py3-none-any* -u $PYPI_LOGIN -p $PYPI_PASS
+          skip_cleanup: true
+          on:
+            tags: true
+
 
 notifications:
   email: false

+ 3 - 3
Dockerfile

@@ -20,8 +20,6 @@ RUN apt-get update && \
     make && \
     rm /usr/local/bin/protoc && rm /usr/local/readme.txt && rm -rf /usr/local/include/google && \
     cp /root/bin/hercules /usr/local/bin && \
-    cp -r /root/src/gopkg.in/src-d/hercules.v10/*.py /root/src/gopkg.in/src-d/hercules.v10/internal /usr/local/bin && \
-    sed -i 's/parser.add_argument("--backend",/parser.add_argument("--backend", default="Agg",/' /usr/local/bin/labours.py && \
     echo '#!/bin/bash\n\
 \n\
 echo\n\
@@ -30,7 +28,9 @@ echo\n\' > /browser && \
     chmod +x /browser && \
     curl https://bootstrap.pypa.io/get-pip.py | python3 - pip==18.1 && \
     pip3 install --no-cache-dir --no-build-isolation cython && \
-    pip3 install --no-cache-dir --no-build-isolation -r /root/src/gopkg.in/src-d/hercules.v10/requirements.txt https://github.com/mind/wheels/releases/download/tf1.7-cpu/tensorflow-1.7.0-cp36-cp36m-linux_x86_64.whl && \
+    sed -i 's/parser.add_argument("--backend",/parser.add_argument("--backend", default="Agg",/' /root/src/gopkg.in/src-d/hercules.v10/python/labours/labours.py && \
+    pip3 install --no-cache-dir /root/src/gopkg.in/src-d/hercules.v10/python && \
+    pip3 install --no-cache-dir "tensorflow<2.0" && \
     rm -rf /root/* && \
     apt-get remove -y software-properties-common golang-1.10-go python3-dev libyaml-dev libxml2-dev curl git make unzip g++ && \
     apt-get remove -qy *-doc *-man && \

+ 3 - 3
Makefile

@@ -24,8 +24,8 @@ internal/pb/pb.pb.go: internal/pb/pb.proto ${GOPATH}/bin/protoc-gen-gogo.exe
 	protoc --gogo_out=internal/pb --proto_path=internal/pb internal/pb/pb.proto
 endif
 
-internal/pb/pb_pb2.py: internal/pb/pb.proto
-	protoc --python_out internal/pb --proto_path=internal/pb internal/pb/pb.proto
+python/labours/pb_pb2.py: internal/pb/pb.proto
+	protoc --python_out python/labours --proto_path=internal/pb internal/pb/pb.proto
 
 cmd/hercules/plugin_template_source.go: cmd/hercules/plugin.template
 	cd cmd/hercules && go generate
@@ -33,5 +33,5 @@ cmd/hercules/plugin_template_source.go: cmd/hercules/plugin.template
 vendor:
 	dep ensure -v
 
-${GOPATH}/bin/hercules${EXE}: vendor *.go */*.go */*/*.go */*/*/*.go internal/pb/pb.pb.go internal/pb/pb_pb2.py cmd/hercules/plugin_template_source.go
+${GOPATH}/bin/hercules${EXE}: vendor *.go */*.go */*/*.go */*/*/*.go internal/pb/pb.pb.go python/labours/pb_pb2.py cmd/hercules/plugin_template_source.go
 	go get -tags "$(TAGS)" -ldflags "-X gopkg.in/src-d/hercules.v10.BinaryGitHash=$(shell git rev-parse HEAD)" gopkg.in/src-d/hercules.v10/cmd/hercules

+ 34 - 33
README.md

@@ -27,7 +27,7 @@
 Hercules is an amazingly fast and highly customizable Git repository analysis engine written in Go. Batteries are included.
 It is powered by [go-git](https://github.com/src-d/go-git) and [Babelfish](https://doc.bblf.sh).
 
-There are two command-line tools: `hercules` and `labours.py`. The first is the program
+There are two command-line tools: `hercules` and `labours`. The first is the program
 written in Go which takes a Git repository and runs a Directed Acyclic Graph (DAG) of [analysis tasks](doc/PIPELINE_ITEMS.md) over the full commit history.
 The second is the Python script which draws some predefined plots. These two tools are normally used together through
 a pipe. It is possible to write custom analyses using the plugin system. It is also possible
@@ -40,15 +40,15 @@ Blog posts: [1](https://blog.sourced.tech/post/hercules.v10), [2](https://blog.s
 <p align="center">The DAG of burndown and couples analyses with UAST diff refining. Generated with <code>hercules --burndown --burndown-people --couples --feature=uast --dry-run --dump-dag doc/dag.dot https://github.com/src-d/hercules</code></p>
 
 ![git/git image](doc/linux.png)
-<p align="center">torvalds/linux line burndown (granularity 30, sampling 30, resampled by year). Generated with <code>hercules --burndown --first-parent --pb https://github.com/torvalds/linux | python3 labours.py -f pb -m burndown-project</code> in 1h 40min.</p>
+<p align="center">torvalds/linux line burndown (granularity 30, sampling 30, resampled by year). Generated with <code>hercules --burndown --first-parent --pb https://github.com/torvalds/linux | labours -f pb -m burndown-project</code> in 1h 40min.</p>
 
 ## Installation
 
 Grab `hercules` binary from the [Releases page](https://github.com/src-d/hercules/releases).
-`labours.py` requires the Python packages listed in [requirements.txt](requirements.txt):
+`labours` is installable from [PyPi](https://pypi.org/):
 
 ```
-pip3 install -r requirements.txt
+pip3 install labours
 ```
 
 [`pip3`](https://pip.pypa.io/en/stable/installing/) is the Python package manager.
@@ -62,6 +62,7 @@ and [`dep`](https://github.com/golang/dep).
 go get -d gopkg.in/src-d/hercules.v10/cmd/hercules
 cd $GOPATH/src/gopkg.in/src-d/hercules.v10
 make
+pip3 install -e ./python
 ```
 
 Replace `$GOPATH` with `%GOPATH%` on Windows.
@@ -85,21 +86,21 @@ Some examples:
 
 ```
 # Use "memory" go-git backend and display the burndown plot. "memory" is the fastest but the repository's git data must fit into RAM.
-hercules --burndown https://github.com/src-d/go-git | python3 labours.py -m burndown-project --resample month
+hercules --burndown https://github.com/src-d/go-git | labours -m burndown-project --resample month
 # Use "file system" go-git backend and print some basic information about the repository.
 hercules /path/to/cloned/go-git
 # Use "file system" go-git backend, cache the cloned repository to /tmp/repo-cache, use Protocol Buffers and display the burndown plot without resampling.
-hercules --burndown --pb https://github.com/git/git /tmp/repo-cache | python3 labours.py -m burndown-project -f pb --resample raw
+hercules --burndown --pb https://github.com/git/git /tmp/repo-cache | labours -m burndown-project -f pb --resample raw
 
 # Now something fun
 # Get the linear history from git rev-list, reverse it
 # Pipe to hercules, produce burndown snapshots for every 30 days grouped by 30 days
-# Save the raw data to cache.yaml, so that later is possible to python3 labours.py -i cache.yaml
-# Pipe the raw data to labours.py, set text font size to 16pt, use Agg matplotlib backend and save the plot to output.png
-git rev-list HEAD | tac | hercules --commits - --burndown https://github.com/git/git | tee cache.yaml | python3 labours.py -m burndown-project --font-size 16 --backend Agg --output git.png
+# Save the raw data to cache.yaml, so that it is later possible to run labours -i cache.yaml
+# Pipe the raw data to labours, set text font size to 16pt, use Agg matplotlib backend and save the plot to output.png
+git rev-list HEAD | tac | hercules --commits - --burndown https://github.com/git/git | tee cache.yaml | labours -m burndown-project --font-size 16 --backend Agg --output git.png
 ```
 
-`labours.py -i /path/to/yaml` allows to read the output from `hercules` which was saved on disk.
+`labours -i /path/to/yaml` allows reading the output from `hercules` which was saved on disk.
 
 #### Caching
 
@@ -117,7 +118,7 @@ hercules --some-analysis /tmp/repo-cache
 #### Docker image
 
 ```
-docker run --rm srcd/hercules hercules --burndown --pb https://github.com/git/git | docker run --rm -i -v $(pwd):/io srcd/hercules labours.py -f pb -m burndown-project -o /io/git_git.png
+docker run --rm srcd/hercules hercules --burndown --pb https://github.com/git/git | docker run --rm -i -v $(pwd):/io srcd/hercules labours -f pb -m burndown-project -o /io/git_git.png
 ```
 
 ### Built-in analyses
@@ -126,7 +127,7 @@ docker run --rm srcd/hercules hercules --burndown --pb https://github.com/git/gi
 
 ```
 hercules --burndown
-python3 labours.py -m burndown-project
+labours -m burndown-project
 ```
 
 Line burndown statistics for the whole repository.
@@ -139,7 +140,7 @@ Granularity is the number of days each band in the stack consists of. Sampling
 is the frequency with which the burnout state is snapshotted. The smaller the
 value, the more smooth is the plot but the more work is done.
 
-There is an option to resample the bands inside `labours.py`, so that you can
+There is an option to resample the bands inside `labours`, so that you can
 define a very precise distribution and visualize it different ways. Besides,
 resampling aligns the bands across periodic boundaries, e.g. months or years.
 Unresampled bands are apparently not aligned and start from the project's birth date.
@@ -148,7 +149,7 @@ Unresampled bands are apparently not aligned and start from the project's birth
 
 ```
 hercules --burndown --burndown-files
-python3 labours.py -m burndown-file
+labours -m burndown-file
 ```
 
 Burndown statistics for every file in the repository which is alive in the latest revision.
@@ -159,7 +160,7 @@ Note: it will generate separate graph for every file. You might don't want to ru
 
 ```
 hercules --burndown --burndown-people [--people-dict=/path/to/identities]
-python3 labours.py -m burndown-person
+labours -m burndown-person
 ```
 
 Burndown statistics for the repository's contributors. If `--people-dict` is not specified, the identities are
@@ -183,7 +184,7 @@ by `|`. The case is ignored.
 
 ```
 hercules --burndown --burndown-people [--people-dict=/path/to/identities]
-python3 labours.py -m churn-matrix
+labours -m churn-matrix
 ```
 
 Beside the burndown information, `--burndown-people` collects the added and deleted line statistics per
@@ -207,7 +208,7 @@ The sequence of developers is stored in `people_sequence` YAML node.
 
 ```
 hercules --burndown --burndown-people [--people-dict=/path/to/identities]
-python3 labours.py -m ownership
+labours -m ownership
 ```
 
 `--burndown-people` also allows to draw the code share through time stacked area plot. That is,
@@ -220,14 +221,14 @@ how many lines are alive at the sampled moments in time for each identified deve
 
 ```
 hercules --couples [--people-dict=/path/to/identities]
-python3 labours.py -m couples -o <name> [--couples-tmp-dir=/tmp]
+labours -m couples -o <name> [--couples-tmp-dir=/tmp]
 ```
 
 **Important**: it requires Tensorflow to be installed, please follow [official instructions](https://www.tensorflow.org/install/).
 
 The files are coupled if they are changed in the same commit. The developers are coupled if they
 change the same file. `hercules` records the number of couples throughout the whole commit history
-and outputs the two corresponding co-occurrence matrices. `labours.py` then trains
+and outputs the two corresponding co-occurrence matrices. `labours` then trains
 [Swivel embeddings](https://github.com/src-d/tensorflow-swivel) - dense vectors which reflect the
 co-occurrence probability through the Euclidean distance. The training requires a working
 [Tensorflow](http://tensorflow.org) installation. The intermediate files are stored in the
@@ -257,13 +258,13 @@ manual to switch to something else.
 
 ```
 hercules --shotness [--shotness-xpath-*]
-python3 labours.py -m shotness
+labours -m shotness
 ```
 
 Couples analysis automatically loads "shotness" data if available.
 
 ![Jinja2 functions grouped by structural hotness](doc/jinja.png)
-<p align="center"><code>hercules --shotness --pb https://github.com/pallets/jinja | python3 labours.py -m couples -f pb</code></p>
+<p align="center"><code>hercules --shotness --pb https://github.com/pallets/jinja | labours -m couples -f pb</code></p>
 
 #### Aligned commit series
 
@@ -272,7 +273,7 @@ Couples analysis automatically loads "shotness" data if available.
 
 ```
 hercules --devs [--people-dict=/path/to/identities]
-python3 labours.py -m devs -o <name>
+labours -m devs -o <name>
 ```
 
 We record how many commits made, as well as lines added, removed and changed per day for each developer.
@@ -307,7 +308,7 @@ insights from the `tensorflow/tensorflow` plot above:
 
 ```
 hercules --devs [--people-dict=/path/to/identities]
-python3 labours.py -m old-vs-new -o <name>
+labours -m old-vs-new -o <name>
 ```
 
 `--devs` from the previous section allows to plot how many lines were added and how many existing changed
@@ -320,7 +321,7 @@ python3 labours.py -m old-vs-new -o <name>
 
 ```
 hercules --devs [--people-dict=/path/to/identities]
-python3 labours.py -m devs-efforts -o <name>
+labours -m devs-efforts -o <name>
 ```
 
 Besides, `--devs` allows to plot how many lines have been changed (added or removed) by each developer.
@@ -332,7 +333,7 @@ with owning lines.
 #### Sentiment (positive and negative comments)
 
 ![Django sentiment](doc/sentiment.png)
-<p align="center">It can be clearly seen that Django comments were positive/optimistic in the beginning, but later became negative/pessimistic.<br><code>hercules --sentiment --pb https://github.com/django/django | python3 labours.py -m sentiment -f pb</code></p>
+<p align="center">It can be clearly seen that Django comments were positive/optimistic in the beginning, but later became negative/pessimistic.<br><code>hercules --sentiment --pb https://github.com/django/django | labours -m sentiment -f pb</code></p>
 
 We extract new and changed comments from source code on every commit, apply [BiDiSentiment](https://github.com/vmarkovtsev/bidisentiment)
 general purpose sentiment recurrent neural network and plot the results. Requires
@@ -354,7 +355,7 @@ Such a build requires [`libtensorflow`](https://www.tensorflow.org/install/insta
 
 ```
 hercules --burndown --burndown-files --burndown-people --couples --shotness --devs [--people-dict=/path/to/identities]
-python3 labours.py -m all
+labours -m all
 ```
 
 ### Plugins
@@ -368,17 +369,17 @@ Hercules has a plugin system and allows to run custom analyses. See [PLUGINS.md]
 ```
 hercules --burndown --pb https://github.com/src-d/go-git > go-git.pb
 hercules --burndown --pb https://github.com/src-d/hercules > hercules.pb
-hercules combine go-git.pb hercules.pb | python3 labours.py -f pb -m burndown-project --resample M
+hercules combine go-git.pb hercules.pb | labours -f pb -m burndown-project --resample M
 ```
 
 ### Bad unicode errors
 
-YAML does not support the whole range of Unicode characters and the parser on `labours.py` side
+YAML does not support the whole range of Unicode characters and the parser on `labours` side
 may raise exceptions. Filter the output from `hercules` through `fix_yaml_unicode.py` to discard
 such offending characters.
 
 ```
-hercules --burndown --burndown-people https://github.com/... | python3 fix_yaml_unicode.py | python3 labours.py -m people
+hercules --burndown --burndown-people https://github.com/... | python3 fix_yaml_unicode.py | labours -m people
 ```
 
 ### Plotting
@@ -386,10 +387,10 @@ hercules --burndown --burndown-people https://github.com/... | python3 fix_yaml_
 These options affects all plots:
 
 ```
-python3 labours.py [--style=white|black] [--backend=] [--size=Y,X]
+labours [--style=white|black] [--backend=] [--size=Y,X]
 ```
 
-`--style` sets the general style of the plot (see `labours.py --help`).
+`--style` sets the general style of the plot (see `labours --help`).
 `--background` changes the plot background to be either white or black.
 `--backend` chooses the Matplotlib backend.
 `--size` sets the size of the figure in inches. The default is `12,9`.
@@ -403,7 +404,7 @@ echo "backend: TkAgg" > ~/.matplotlib/matplotlibrc
 These options are effective in burndown charts only:
 
 ```
-python3 labours.py [--text-size] [--relative]
+labours [--text-size] [--relative]
 ```
 
 `--text-size` changes the font size, `--relative` activate the stretched burndown layout.
@@ -423,7 +424,7 @@ please report there and specify `--first-parent` as a workaround.
 1. Parsing YAML in Python is slow when the number of internal objects is big. `hercules`' output
 for the Linux kernel in "couples" mode is 1.5 GB and takes more than an hour / 180GB RAM to be
 parsed. However, most of the repositories are parsed within a minute. Try using Protocol Buffers
-instead (`hercules --pb` and `labours.py -f pb`).
+instead (`hercules --pb` and `labours -f pb`).
 1. To speed up yaml parsing
    ```
    # Debian, Ubuntu

+ 1 - 1
.flake8

@@ -3,4 +3,4 @@ ignore=D,B007
 max-line-length=99
 inline-quotes="
 import-order-style=appnexus
-exclude=.git,internal/pb/pb_pb2.py,vendor
+exclude=labours/pb_pb2.py

+ 1 - 0
python/LICENSE.md

@@ -0,0 +1 @@
+../LICENSE.md

+ 1 - 0
python/README.md

@@ -0,0 +1 @@
+../README.md

+ 1 - 0
python/labours/__init__.py

@@ -0,0 +1 @@
+from labours.labours import *  # noqa:F

+ 7 - 0
python/labours/__main__.py

@@ -0,0 +1,7 @@
+import sys
+
+from labours.labours import main
+
+
+if __name__ == "__main__":
+    sys.exit(main())

+ 11 - 46
labours.py

@@ -263,9 +263,9 @@ class YamlReader(Reader):
 class ProtobufReader(Reader):
     def read(self, file):
         try:
-            from internal.pb.pb_pb2 import AnalysisResults
+            from labours.pb_pb2 import AnalysisResults
         except ImportError as e:
-            print("\n\n>>> You need to generate internal/pb/pb_pb2.py - run \"make\"\n",
+            print("\n\n>>> You need to generate python/labours/pb_pb2.py - run \"make\"\n",
                   file=sys.stderr)
             raise e from None
         self.data = AnalysisResults()
@@ -382,10 +382,10 @@ class ProtobufReader(Reader):
 
 READERS = {"yaml": YamlReader, "yml": YamlReader, "pb": ProtobufReader}
 PB_MESSAGES = {
-    "Burndown": "internal.pb.pb_pb2.BurndownAnalysisResults",
-    "Couples": "internal.pb.pb_pb2.CouplesAnalysisResults",
-    "Shotness": "internal.pb.pb_pb2.ShotnessAnalysisResults",
-    "Devs": "internal.pb.pb_pb2.DevsAnalysisResults",
+    "Burndown": "labours.pb_pb2.BurndownAnalysisResults",
+    "Couples": "labours.pb_pb2.CouplesAnalysisResults",
+    "Shotness": "labours.pb_pb2.ShotnessAnalysisResults",
+    "Devs": "labours.pb_pb2.DevsAnalysisResults",
 }
 
 
@@ -1343,6 +1343,7 @@ def show_devs(args, name, start_date, end_date, people, days):
 
 
 def order_commits(chosen_people, days, people):
+    from seriate import seriate
     try:
         from fastdtw import fastdtw
     except ImportError as e:
@@ -1379,17 +1380,13 @@ def order_commits(chosen_people, days, people):
         arr[1] = commits * 7  # 7 is a pure heuristic here and is not related to window size
         series[i] = list(arr.transpose())
     # calculate the distance matrix using dynamic time warping metric
-    dists = numpy.full((len(series) + 1, len(series) + 1), -100500, dtype=numpy.float32)
+    dists = numpy.full((len(series),) * 2, -100500, dtype=numpy.float32)
     for x in range(len(series)):
         dists[x, x] = 0
         for y in range(x + 1, len(series)):
             # L1 norm
             dist, _ = fastdtw(series[x], series[y], radius=5, dist=1)
             dists[x, y] = dists[y, x] = dist
-    # preparation for seriation ordering
-    dists[len(series), :] = 0
-    dists[:, len(series)] = 0
-    assert (dists >= 0).all()
     print("Ordering the series")
     route = seriate(dists)
     return dists, devseries, devstats, route
@@ -1409,37 +1406,6 @@ def hdbscan_cluster_routed_series(dists, route):
     return clusters
 
 
-def seriate(dists):
-    try:
-        from ortools.constraint_solver import pywrapcp, routing_enums_pb2
-    except ImportError as e:
-        print("Cannot import ortools: %s\nInstall it from "
-              "https://developers.google.com/optimization/install/python/" % e)
-        sys.exit(1)
-
-    # solve the TSP on the distance matrix
-    routing = pywrapcp.RoutingModel(dists.shape[0], 1, dists.shape[0] - 1)
-
-    def dist_callback(x, y):
-        # ortools wants integers, so we approximate here
-        return int(dists[x][y] * 1000)
-
-    routing.SetArcCostEvaluatorOfAllVehicles(dist_callback)
-    search_parameters = pywrapcp.RoutingModel.DefaultSearchParameters()
-    search_parameters.local_search_metaheuristic = (
-        routing_enums_pb2.LocalSearchMetaheuristic.GUIDED_LOCAL_SEARCH)
-    search_parameters.time_limit_ms = 2000
-    assignment = routing.SolveWithParameters(search_parameters)
-    index = routing.Start(0)
-    route = []
-    while not routing.IsEnd(index):
-        node = routing.IndexToNode(index)
-        if node < dists.shape[0] - 1:
-            route.append(node)
-        index = assignment.Value(routing.NextVar(index))
-    return route
-
-
 def show_devs_efforts(args, name, start_date, end_date, people, days, max_people):
     from scipy.signal import convolve, slepian
 
@@ -1564,6 +1530,7 @@ class ParallelDevData:
 
 
 def load_devs_parallel(ownership, couples, devs, max_people):
+    from seriate import seriate
     try:
         from hdbscan import HDBSCAN
     except ImportError as e:
@@ -1611,10 +1578,8 @@ def load_devs_parallel(ownership, couples, devs, max_people):
     embeddings /= numpy.linalg.norm(embeddings, axis=1)[:, None]
     cos = embeddings.dot(embeddings.T)
     cos[cos > 1] = 1  # tiny precision faults
-    dists = numpy.zeros((len(chosen) + 1,) * 2)
-    dists[:len(chosen), :len(chosen)] = numpy.arccos(cos)
-    clusters = HDBSCAN(min_cluster_size=2, metric="precomputed").fit_predict(
-        dists[:len(chosen), :len(chosen)])
+    dists = numpy.arccos(cos)
+    clusters = HDBSCAN(min_cluster_size=2, metric="precomputed").fit_predict(dists)
     for k, v in result.items():
         v.couples_cluster = clusters[chosen.index(k)]
 

Plik diff jest za duży
+ 1944 - 0
python/labours/pb_pb2.py


swivel.py → python/labours/swivel.py


+ 3 - 3
requirements.txt

@@ -1,12 +1,12 @@
 clint>=0.5.1,<1.0
 matplotlib>=2.0,<4.0
-numpy>=1.16.0,<2.0
+numpy==1.16.0
 pandas>=0.20.0,<1.0
-PyYAML>=4.2b1,<5.0
+PyYAML==4.2b1
 scipy>=0.19.0,<2.0
 protobuf>=3.5.0,<4.0
 munch>=2.0
 hdbscan==0.8.18
-ortools==6.9.5824
+seriate==1.0.0
 fastdtw==0.3.2
 python-dateutil==2.8.0

+ 54 - 0
python/setup.py

@@ -0,0 +1,54 @@
+import os
+
+from setuptools import setup
+
+
+try:
+    with open(os.path.join(os.path.dirname(__file__), "README.md"), encoding="utf-8") as f:
+        long_description = f.read()
+except FileNotFoundError:
+    long_description = ""
+
+
+setup(
+    name="labours",
+    description="Python companion for github.com/src-d/hercules to visualize the results.",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    version="10.0.0",
+    license="Apache-2.0",
+    author="source{d}",
+    author_email="machine-learning@sourced.tech",
+    url="https://github.com/src-d/hercules",
+    download_url="https://github.com/src-d/hercules",
+    packages=["labours"],
+    keywords=["git", "mloncode", "mining software repositories", "hercules"],
+    install_requires=[
+        "clint>=0.5.1,<1.0",
+        "matplotlib>=2.0,<4.0",
+        "numpy>=1.12.0,<2.0",
+        "pandas>=0.20.0,<1.0",
+        "PyYAML>=3.0,<5.0",
+        "scipy>=0.19.0,<2.0",
+        "protobuf>=3.5.0,<4.0",
+        "munch>=2.0,<3.0",
+        "hdbscan>=0.8.0,<2.0",
+        "seriate>=1.0,<2.0",
+        "fastdtw>=0.3.2,<2.0",
+        "python-dateutil>=2.6.0,<3.0",
+    ],
+    package_data={"labours": ["../LICENSE.md", "../README.md", "../requirements.txt"]},
+    entry_points={
+        "console_scripts": ["labours=labours.__main__:main"],
+    },
+    classifiers=[
+        "Development Status :: 5 - Production/Stable",
+        "Intended Audience :: Developers",
+        "Environment :: Console",
+        "License :: OSI Approved :: Apache Software License",
+        "Programming Language :: Python :: 3.5",
+        "Programming Language :: Python :: 3.6",
+        "Programming Language :: Python :: 3.7",
+        "Programming Language :: Python :: 3.8",
+    ],
+)