Add Tensorflow Projector visualisation of couples

Vadim Markovtsev, 7 years ago
parent commit 763ead8089
3 changed files with 630 additions and 4 deletions
  1. couples.go (+5 -0)
  2. labours.py (+141 -4)
  3. swivel.py (+484 -0)

+ 5 - 0
couples.go

@@ -13,6 +13,8 @@ type Couples struct {
 
 	// people store how many times every developer committed to every file.
 	people []map[string]int
+	// people_commits stores the number of commits each author made.
+	people_commits []int
 	// files store every file occurred in the same commit with every other file.
 	files map[string]map[string]int
 }
@@ -41,11 +43,13 @@ func (couples *Couples) Initialize(repository *git.Repository) {
 	for i := range couples.people {
 		couples.people[i] = map[string]int{}
 	}
+	couples.people_commits = make([]int, couples.PeopleNumber)
 	couples.files = map[string]map[string]int{}
 }
 
 func (couples *Couples) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
 	author := deps["author"].(int)
+	couples.people_commits[author] += 1
 	tree_diff := deps["renamed_changes"].(object.Changes)
 	context := make([]string, 0)
 	deleteFile := func(name string) {
@@ -116,6 +120,7 @@ func (couples *Couples) Finalize() interface{} {
 				}
 			}
 		}
+		peopleMatrix[i][i] = int64(couples.people_commits[i])
 	}
 	filesSequence := make([]string, len(couples.files))
 	i := 0
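
For orientation, here is a hypothetical miniature (not produced by this commit; names and counts are invented) of the co-occurrence tree that the new couples mode in labours.py, shown next, consumes. The diagonal that couples.go now fills becomes the per-author commit count in the Projector metadata, while off-diagonal entries carry the co-occurrence strength.

# Hypothetical sketch of the "people_coocc" structure that
# train_embeddings() below expects.
people_coocc = {
    "index": ["alice", "bob"],      # one label per matrix row/column
    "matrix": [
        {0: 10, 1: 3},  # row 0: diagonal 10 = alice's own commit count
        {0: 3, 1: 7},   # row 1: 3 = co-occurrence weight of alice and bob
    ],
}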

+ 141 - 4
labours.py

@@ -3,6 +3,7 @@ from datetime import datetime, timedelta
 import io
 import os
 import sys
+import tempfile
 import warnings
 
 try:
@@ -32,8 +33,10 @@ def parse_args():
                         help="Plot's general color scheme.")
     parser.add_argument("--relative", action="store_true",
                         help="Occupy 100%% height for every measurement.")
+    parser.add_argument("--couples-tmp-dir", help="Temporary directory to work with couples.")
     parser.add_argument("-m", "--mode",
-                        choices=["project", "file", "person", "matrix", "people", "all"],
+                        choices=["project", "file", "person", "matrix", "people", "couples",
+                                 "all"],
                         default="project", help="What to plot.")
     parser.add_argument(
         "--resample", default="year",
@@ -46,13 +49,17 @@ def parse_args():
 
 
 def read_input(args):
+    sys.stdout.write("Reading the input... ")
+    sys.stdout.flush()
     if args.input != "-":
         with open(args.input) as fin:
             data = yaml.load(fin)
     else:
         data = yaml.load(sys.stdin)
+    print("done")
     return data["burndown"], data["project"], data.get("files"), data.get("people_sequence"), \
-           data.get("people"), data.get("people_interaction")
+           data.get("people"), data.get("people_interaction"), data.get("files_coocc"), \
+           data.get("people_coocc")
 
 
 def calculate_average_lifetime(matrix):
@@ -370,10 +377,129 @@ def plot_people(args, repo, names, people, date_range, last):
     deploy_plot("%s code ratio through time" % repo, output, args.style)
 
 
+def train_embeddings(coocc_tree, tmpdir, shard_size=4096):
+    from scipy.sparse import csr_matrix
+    import tensorflow as tf
+    try:
+        from . import swivel
+    except SystemError:
+        import swivel
+
+    index = coocc_tree["index"]
+    print("Reading the sparse matrix...")
+    data = []
+    indices = []
+    indptr = [0]
+    for row, cd in enumerate(coocc_tree["matrix"]):
+        for col, val in sorted(cd.items()):
+            data.append(val)
+            indices.append(col)
+        indptr.append(indptr[-1] + len(cd))
+    matrix = csr_matrix((data, indices, indptr), shape=(len(index), len(index)))
+    meta_index = []
+    for i, name in enumerate(index):
+        meta_index.append((name, matrix[i, i]))
+    with tempfile.TemporaryDirectory(prefix="hercules_labours_", dir=tmpdir or None) as tmproot:
+        print("Writing Swivel metadata...")
+        vocabulary = "\n".join(index)
+        with open(os.path.join(tmproot, "row_vocab.txt"), "w") as out:
+            out.write(vocabulary)
+        with open(os.path.join(tmproot, "col_vocab.txt"), "w") as out:
+            out.write(vocabulary)
+        del vocabulary
+        bool_sums = matrix.indptr[1:] - matrix.indptr[:-1]
+        bool_sums_str = "\n".join(map(str, bool_sums.tolist()))
+        with open(os.path.join(tmproot, "row_sums.txt"), "w") as out:
+            out.write(bool_sums_str)
+        with open(os.path.join(tmproot, "col_sums.txt"), "w") as out:
+            out.write(bool_sums_str)
+        del bool_sums_str
+        reorder = numpy.argsort(-bool_sums)
+        nshards = len(index) // shard_size
+        if nshards == 0:
+            nshards = 1
+            shard_size = len(index)
+        print("Writing Swivel shards...")
+        for row in range(nshards):
+            for col in range(nshards):
+                def _int64s(xs):
+                    return tf.train.Feature(
+                        int64_list=tf.train.Int64List(value=list(xs)))
+
+                def _floats(xs):
+                    return tf.train.Feature(
+                        float_list=tf.train.FloatList(value=list(xs)))
+
+                indices_row = reorder[row::nshards]
+                indices_col = reorder[col::nshards]
+                shard = matrix[indices_row][:, indices_col].tocoo()
+
+                example = tf.train.Example(features=tf.train.Features(feature={
+                    "global_row": _int64s(indices_row),
+                    "global_col": _int64s(indices_col),
+                    "sparse_local_row": _int64s(shard.row),
+                    "sparse_local_col": _int64s(shard.col),
+                    "sparse_value": _floats(shard.data)}))
+
+                with open(os.path.join(tmproot, "shard-%03d-%03d.pb" % (row, col)), "wb") as out:
+                    out.write(example.SerializeToString())
+        print("Training Swivel model...")
+        swivel.FLAGS.submatrix_rows = shard_size
+        swivel.FLAGS.submatrix_cols = shard_size
+        if len(index) < 10000:
+            embedding_size = 50
+            num_epochs = 40
+        elif len(index) < 100000:
+            embedding_size = 100
+            num_epochs = 50
+        elif len(index) < 500000:
+            embedding_size = 200
+            num_epochs = 60
+        else:
+            embedding_size = 300
+            num_epochs = 80
+        swivel.FLAGS.embedding_size = embedding_size
+        swivel.FLAGS.input_base_path = tmproot
+        swivel.FLAGS.output_base_path = tmproot
+        swivel.FLAGS.loss_multiplier = 1.0 / shard_size
+        swivel.FLAGS.num_epochs = num_epochs
+        swivel.main(None)
+        print("Reading Swivel embeddings...")
+        embeddings = []
+        with open(os.path.join(tmproot, "row_embedding.tsv")) as frow:
+            with open(os.path.join(tmproot, "col_embedding.tsv")) as fcol:
+                for i, (lrow, lcol) in enumerate(zip(frow, fcol)):
+                    prow, pcol = (l.split("\t", 1) for l in (lrow, lcol))
+                    assert prow[0] == pcol[0]
+                    erow, ecol = \
+                        (numpy.fromstring(p[1], dtype=numpy.float32, sep="\t")
+                         for p in (prow, pcol))
+                    embeddings.append((erow + ecol) / 2)
+    return meta_index, embeddings
+
+
+def write_embeddings(name, output, index, embeddings):
+    print("Writing Tensorflow Projector files...")
+    if not output:
+        output = "couples_" + name
+    metaf = "%s_%s_meta.tsv" % (output, name)
+    with open(metaf, "w") as fout:
+        fout.write("name\tcommits\n")
+        for pair in index:
+            fout.write("%s\t%s\n" % pair)
+    print("Wrote", metaf)
+    dataf = "%s_%s_data.tsv" % (output, name)
+    with open(dataf, "w") as fout:
+        for vec in embeddings:
+            fout.write("\t".join(str(v) for v in vec))
+            fout.write("\n")
+    print("Wrote", dataf)
+
+
 def main():
     args = parse_args()
-    header, main_contents, files_contents, people_sequence, people_contents, people_matrix = \
-        read_input(args)
+    header, main_contents, files_contents, people_sequence, people_contents, people_matrix, \
+        files_coocc, people_coocc = read_input(args)
     name = next(iter(main_contents))
 
     files_warning = "Files stats were not collected. Re-run hercules with -files."
@@ -402,6 +528,11 @@ def main():
             print(people_warning)
             return
         plot_people(args, name, *load_people(header, people_sequence, people_contents))
+    elif args.mode == "couples":
+        write_embeddings("files", args.output,
+                         *train_embeddings(files_coocc, args.couples_tmp_dir))
+        write_embeddings("people", args.output,
+                         *train_embeddings(people_coocc, args.couples_tmp_dir))
     elif args.mode == "all":
         plot_burndown(args, "project",
                       *load_main(header, name, main_contents[name], args.resample))
@@ -411,6 +542,12 @@ def main():
             plot_many(args, "person", header, people_contents)
             plot_matrix(args, name, people_sequence, load_matrix(people_matrix))
             plot_people(args, name, *load_people(header, people_sequence, people_contents))
+        if people_coocc:
+            assert files_coocc
+            write_embeddings("files", args.output,
+                             *train_embeddings(files_coocc, args.couples_tmp_dir))
+            write_embeddings("people", args.output,
+                             *train_embeddings(people_coocc, args.couples_tmp_dir))
 
 if __name__ == "__main__":
     sys.exit(main())
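
A sketch of how the new mode might be invoked, assuming labours.py's pre-existing -i/--input and -o/--output flags (only --couples-tmp-dir is added by this commit):

python3 labours.py -i hercules.yaml -o myrepo -m couples --couples-tmp-dir /big/tmp

Given -o myrepo, write_embeddings() produces myrepo_files_meta.tsv, myrepo_files_data.tsv, myrepo_people_meta.tsv and myrepo_people_data.tsv; each data/meta pair can be loaded as the vectors and metadata TSVs at https://projector.tensorflow.org to browse the trained embeddings.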

+ 484 - 0
swivel.py

@@ -0,0 +1,484 @@
+#!/usr/bin/env python3
+#
+# Copyright 2016 Google Inc. All Rights Reserved.
+# Copyright 2017 Sourced Technologies S. L.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Submatrix-wise Vector Embedding Learner.
+
+Implementation of SwiVel algorithm described at:
+http://arxiv.org/abs/1602.02215
+
+This program expects an input directory that contains the following files.
+
+  row_vocab.txt, col_vocab.txt
+
+    The row and column vocabulary files.  Each file should contain one token
+    per line; these will be used to generate a tab-separated file containing
+    the trained embeddings.
+
+  row_sums.txt, col_sums.txt
+
+    The matrix row and column marginal sums.  Each file should contain one
+    decimal floating point number per line which corresponds to the marginal
+    count of the matrix for that row or column.
+
+  shard-*.pb
+
+    The sub-matrix shards, stored one per file.  Each shard is expected to be
+    a serialized tf.Example protocol buffer with the following properties:
+
+      global_row: the global row indices contained in the shard
+      global_col: the global column indices contained in the shard
+      sparse_local_row, sparse_local_col, sparse_value: three parallel arrays
+      that are a sparse representation of the submatrix counts.
+
+It will generate embeddings, training from the input directory for
+the specified number of epochs.  When complete, it will output the trained
+vectors to a tab-separated file that contains one line per embedding.  Row and
+column embeddings are stored in separate files.
+
+"""
+
+import glob
+import math
+import os
+import time
+import threading
+
+import numpy as np
+import tensorflow as tf
+from tensorflow.python.client import device_lib
+
+flags = tf.app.flags
+
+flags.DEFINE_string("input_base_path", None,
+                    "Directory containing input shards, vocabularies, "
+                    "and marginals.")
+flags.DEFINE_string("output_base_path", None,
+                    "Path where to write the trained embeddings.")
+flags.DEFINE_integer("embedding_size", 300, "Size of the embeddings")
+flags.DEFINE_boolean("trainable_bias", False, "Biases are trainable")
+flags.DEFINE_integer("submatrix_rows", 4096,
+                     "Rows in each training submatrix. This must match"
+                     "the training data.")
+flags.DEFINE_integer("submatrix_cols", 4096,
+                     "Rows in each training submatrix. This must match"
+                     "the training data.")
+flags.DEFINE_float("loss_multiplier", 1.0 / 4096,
+                   "constant multiplier on loss.")
+flags.DEFINE_float("confidence_exponent", 0.5,
+                   "Exponent for l2 confidence function")
+flags.DEFINE_float("confidence_scale", 0.25,
+                   "Scale for l2 confidence function")
+flags.DEFINE_float("confidence_base", 0.1, "Base for l2 confidence function")
+flags.DEFINE_float("learning_rate", 1.0, "Initial learning rate")
+flags.DEFINE_string("optimizer", "Adagrad",
+                    "SGD optimizer (tf.train.*Optimizer)")
+flags.DEFINE_integer("num_concurrent_steps", 2,
+                     "Number of threads to train with")
+flags.DEFINE_integer("num_readers", 4,
+                     "Number of threads to read the input data and feed it")
+flags.DEFINE_float("num_epochs", 40, "Number epochs to train for")
+flags.DEFINE_float("per_process_gpu_memory_fraction", 0,
+                   "Fraction of GPU memory to use, 0 means allow_growth")
+flags.DEFINE_integer("num_gpus", 0,
+                     "Number of GPUs to use, 0 means all available")
+flags.DEFINE_string("logs", "",
+                    "Path for TensorBoard logs (empty value disables them)")
+
+FLAGS = flags.FLAGS
+
+
+def log(message, *args, **kwargs):
+    tf.logging.info(message, *args, **kwargs)
+
+
+def get_available_gpus():
+    return [d.name for d in device_lib.list_local_devices()
+            if d.device_type == "GPU"]
+
+
+def embeddings_with_init(vocab_size, embedding_dim, name):
+    """Creates and initializes the embedding tensors."""
+    return tf.get_variable(name=name,
+                           shape=[vocab_size, embedding_dim],
+                           initializer=tf.random_normal_initializer(
+                               stddev=math.sqrt(1.0 / embedding_dim)))
+
+
+def count_matrix_input(filenames, submatrix_rows, submatrix_cols):
+    """Reads submatrix shards from disk."""
+    filename_queue = tf.train.string_input_producer(filenames)
+    reader = tf.WholeFileReader()
+    _, serialized_example = reader.read(filename_queue)
+    features = tf.parse_single_example(
+        serialized_example,
+        features={
+            "global_row": tf.FixedLenFeature([submatrix_rows], dtype=tf.int64),
+            "global_col": tf.FixedLenFeature([submatrix_cols], dtype=tf.int64),
+            "sparse_local_row": tf.VarLenFeature(dtype=tf.int64),
+            "sparse_local_col": tf.VarLenFeature(dtype=tf.int64),
+            "sparse_value": tf.VarLenFeature(dtype=tf.float32)
+        })
+
+    global_row = features["global_row"]
+    global_col = features["global_col"]
+
+    sparse_local_row = features["sparse_local_row"].values
+    sparse_local_col = features["sparse_local_col"].values
+    sparse_count = features["sparse_value"].values
+
+    sparse_indices = tf.concat(axis=1, values=[tf.expand_dims(sparse_local_row, 1),
+                                               tf.expand_dims(sparse_local_col, 1)])
+    count = tf.sparse_to_dense(sparse_indices, [submatrix_rows, submatrix_cols],
+                               sparse_count, validate_indices=False)
+
+    queued_global_row, queued_global_col, queued_count = tf.train.batch(
+        [global_row, global_col, count],
+        batch_size=1,
+        num_threads=FLAGS.num_readers,
+        capacity=32)
+
+    queued_global_row = tf.reshape(queued_global_row, [submatrix_rows])
+    queued_global_col = tf.reshape(queued_global_col, [submatrix_cols])
+    queued_count = tf.reshape(queued_count, [submatrix_rows, submatrix_cols])
+
+    return queued_global_row, queued_global_col, queued_count
+
+
+def read_marginals_file(filename):
+    """Reads text file with one number per line to an array."""
+    with open(filename) as lines:
+        return [float(line) for line in lines]
+
+
+def write_embedding_tensor_to_disk(vocab_path, output_path, sess, embedding):
+    """Writes tensor to output_path as tsv"""
+    # Fetch the embedding values from the model
+    embeddings = sess.run(embedding)
+
+    with open(output_path, "w") as out_f:
+        with open(vocab_path) as vocab_f:
+            for index, word in enumerate(vocab_f):
+                word = word.strip()
+                embedding = embeddings[index]
+                out_f.write(word + "\t" + "\t".join(
+                    [str(x) for x in embedding]) + "\n")
+
+
+def write_embeddings_to_disk(config, model, sess):
+    """Writes row and column embeddings disk"""
+    # Row Embedding
+    row_vocab_path = config.input_base_path + "/row_vocab.txt"
+    row_embedding_output_path = config.output_base_path + "/row_embedding.tsv"
+    log("Writing row embeddings to: %s", row_embedding_output_path)
+    write_embedding_tensor_to_disk(row_vocab_path, row_embedding_output_path,
+                                   sess, model.row_embedding)
+
+    # Column Embedding
+    col_vocab_path = config.input_base_path + "/col_vocab.txt"
+    col_embedding_output_path = config.output_base_path + "/col_embedding.tsv"
+    log("Writing column embeddings to: %s", col_embedding_output_path)
+    write_embedding_tensor_to_disk(col_vocab_path, col_embedding_output_path,
+                                   sess, model.col_embedding)
+
+
+class SwivelModel:
+    """Small class to gather needed pieces from a Graph being built."""
+
+    def __init__(self, config):
+        """Construct graph for dmc."""
+        self._config = config
+
+        # Create paths to input data files
+        log("Reading model from: %s", config.input_base_path)
+        count_matrix_files = glob.glob(config.input_base_path + "/shard-*.pb")
+        row_sums_path = config.input_base_path + "/row_sums.txt"
+        col_sums_path = config.input_base_path + "/col_sums.txt"
+
+        # Read marginals
+        row_sums = read_marginals_file(row_sums_path)
+        col_sums = read_marginals_file(col_sums_path)
+
+        self.n_rows = len(row_sums)
+        self.n_cols = len(col_sums)
+        log("Matrix dim: (%d,%d) SubMatrix dim: (%d,%d)",
+            self.n_rows, self.n_cols, config.submatrix_rows,
+            config.submatrix_cols)
+        if self.n_cols < config.submatrix_cols:
+            raise ValueError(
+                "submatrix_cols={0} cannot be bigger than the number of columns={1} "
+                "(specify submatrix_cols={1})".format(config.submatrix_cols, self.n_cols))
+        if self.n_rows < config.submatrix_rows:
+            raise ValueError(
+                "submatrix_rows={0} cannot be bigger than the number of rows={1} "
+                "(specify submatrix_rows={1})".format(config.submatrix_rows, self.n_rows))
+        self.n_submatrices = (self.n_rows * self.n_cols /
+                              (config.submatrix_rows * config.submatrix_cols))
+        log("n_submatrices: %d", self.n_submatrices)
+
+        with tf.device("/cpu:0"):
+            # ===== CREATE VARIABLES ======
+            # Get input
+            global_row, global_col, count = count_matrix_input(
+                count_matrix_files, config.submatrix_rows,
+                config.submatrix_cols)
+
+            # Embeddings
+            self.row_embedding = embeddings_with_init(
+                embedding_dim=config.embedding_size,
+                vocab_size=self.n_rows,
+                name="row_embedding")
+            self.col_embedding = embeddings_with_init(
+                embedding_dim=config.embedding_size,
+                vocab_size=self.n_cols,
+                name="col_embedding")
+            tf.summary.histogram("row_emb", self.row_embedding)
+            tf.summary.histogram("col_emb", self.col_embedding)
+
+            matrix_log_sum = math.log(np.sum(row_sums) + 1)
+            row_bias_init = [math.log(x + 1) for x in row_sums]
+            col_bias_init = [math.log(x + 1) for x in col_sums]
+            self.row_bias = tf.Variable(
+                row_bias_init, trainable=config.trainable_bias)
+            self.col_bias = tf.Variable(
+                col_bias_init, trainable=config.trainable_bias)
+            tf.summary.histogram("row_bias", self.row_bias)
+            tf.summary.histogram("col_bias", self.col_bias)
+
+            # Add optimizer
+            l2_losses = []
+            sigmoid_losses = []
+            self.global_step = tf.Variable(0, name="global_step")
+            learning_rate = tf.Variable(config.learning_rate,
+                                        name="learning_rate")
+            opt = getattr(tf.train, FLAGS.optimizer + "Optimizer")(
+                learning_rate)
+            tf.summary.scalar("learning_rate", learning_rate)
+
+            all_grads = []
+
+        devices = ["/gpu:%d" % i for i in range(FLAGS.num_gpus)] \
+            if FLAGS.num_gpus > 0 else get_available_gpus()
+        self.devices_number = len(devices)
+        if not self.devices_number:
+            devices = ["/cpu:0"]
+            self.devices_number = 1
+        for dev in devices:
+            with tf.device(dev):
+                with tf.name_scope(dev[1:].replace(":", "_")):
+                    # ===== CREATE GRAPH =====
+                    # Fetch embeddings.
+                    selected_row_embedding = tf.nn.embedding_lookup(
+                        self.row_embedding, global_row)
+                    selected_col_embedding = tf.nn.embedding_lookup(
+                        self.col_embedding, global_col)
+
+                    # Fetch biases.
+                    selected_row_bias = tf.nn.embedding_lookup(
+                        [self.row_bias], global_row)
+                    selected_col_bias = tf.nn.embedding_lookup(
+                        [self.col_bias], global_col)
+
+                    # Multiply the row and column embeddings to generate
+                    # predictions.
+                    predictions = tf.matmul(
+                        selected_row_embedding, selected_col_embedding,
+                        transpose_b=True)
+
+                    # These binary masks separate zero from non-zero values.
+                    count_is_nonzero = tf.to_float(tf.cast(count, tf.bool))
+                    count_is_zero = 1 - count_is_nonzero
+
+                    objectives = count_is_nonzero * tf.log(count + 1e-30)
+                    objectives -= tf.reshape(
+                        selected_row_bias, [config.submatrix_rows, 1])
+                    objectives -= selected_col_bias
+                    objectives += matrix_log_sum
+
+                    err = predictions - objectives
+
+                    # The confidence function scales the L2 loss based on
+                    # the raw co-occurrence count.
+                    l2_confidence = (
+                        config.confidence_base +
+                        config.confidence_scale * tf.pow(
+                            count, config.confidence_exponent))
+
+                    l2_loss = config.loss_multiplier * tf.reduce_sum(
+                        0.5 * l2_confidence * err * err * count_is_nonzero)
+                    l2_losses.append(tf.expand_dims(l2_loss, 0))
+
+                    sigmoid_loss = config.loss_multiplier * tf.reduce_sum(
+                        tf.nn.softplus(err) * count_is_zero)
+                    sigmoid_losses.append(tf.expand_dims(sigmoid_loss, 0))
+
+                    loss = l2_loss + sigmoid_loss
+                    grads = opt.compute_gradients(loss)
+                    all_grads.append(grads)
+
+        with tf.device("/cpu:0"):
+            # ===== MERGE LOSSES =====
+            l2_loss = tf.reduce_mean(tf.concat(axis=0, values=l2_losses), 0,
+                                     name="l2_loss")
+            sigmoid_loss = tf.reduce_mean(
+                tf.concat(axis=0, values=sigmoid_losses), 0,
+                name="sigmoid_loss")
+            overall_loss = l2_loss + sigmoid_loss
+            average = tf.train.ExponentialMovingAverage(0.999)
+            loss_average_op = average.apply(
+                (overall_loss, l2_loss, sigmoid_loss))
+            self.loss = average.average(overall_loss)
+            tf.summary.scalar("overall_loss", self.loss)
+            tf.summary.scalar("l2_loss", average.average(l2_loss))
+            tf.summary.scalar("sigmoid_loss", average.average(sigmoid_loss))
+
+            # Apply the gradients to adjust the shared variables.
+            apply_gradient_ops = []
+            for grads in all_grads:
+                apply_gradient_ops.append(opt.apply_gradients(
+                    grads, global_step=self.global_step))
+
+            self.train_op = tf.group(loss_average_op, *apply_gradient_ops)
+            self.saver = tf.train.Saver(sharded=True)
+
+    def initialize_summary(self, sess):
+        log("creating TensorBoard stuff...")
+        self.summary = tf.summary.merge_all()
+        self.writer = tf.summary.FileWriter(FLAGS.logs, sess.graph)
+        projector_config = \
+            tf.contrib.tensorboard.plugins.projector.ProjectorConfig()
+        embedding_config = projector_config.embeddings.add()
+        length = min(10000, self.n_rows, self.n_cols)
+        self.embedding10k = tf.Variable(
+            tf.zeros((length, self._config.embedding_size)),
+            name="top10k_embedding")
+        embedding_config.tensor_name = self.embedding10k.name
+        tf.contrib.tensorboard.plugins.projector.visualize_embeddings(
+            self.writer, projector_config)
+        self.saver = tf.train.Saver((self.embedding10k,), max_to_keep=1)
+
+    def write_summary(self, sess):
+        log("writing the summary...")
+        length = min(10000, self.n_rows, self.n_cols)
+        assignment = self.embedding10k.assign(
+            (self.row_embedding[:length] + self.col_embedding[:length]) / 2)
+        summary, _, global_step = sess.run(
+            (self.summary, assignment, self.global_step))
+        self.writer.add_summary(summary, global_step)
+        self.saver.save(
+            sess, os.path.join(FLAGS.logs, "embeddings10k.checkpoint"),
+            global_step)
+
+
+def main(_):
+    tf.logging.set_verbosity(tf.logging.INFO)
+    start_time = time.time()
+
+    # Create the output path.  If this fails, it really ought to fail now. :)
+    if not os.path.isdir(FLAGS.output_base_path):
+        os.makedirs(FLAGS.output_base_path)
+
+    # Create and run model
+    with tf.Graph().as_default():
+        log("creating the model...")
+        model = SwivelModel(FLAGS)
+
+        # Create a session for running Ops on the Graph.
+        gpu_opts = {}
+        if FLAGS.per_process_gpu_memory_fraction > 0:
+            gpu_opts["per_process_gpu_memory_fraction"] = \
+                FLAGS.per_process_gpu_memory_fraction
+        else:
+            gpu_opts["allow_growth"] = True
+        gpu_options = tf.GPUOptions(**gpu_opts)
+        sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
+        if FLAGS.logs:
+            model.initialize_summary(sess)
+
+        # Run the Op to initialize the variables.
+        log("initializing the variables...")
+        sess.run(tf.global_variables_initializer())
+
+        # Start feeding input
+        log("starting the input threads...")
+        coord = tf.train.Coordinator()
+        threads = tf.train.start_queue_runners(sess=sess, coord=coord)
+
+        # Calculate how many steps each thread should run
+        n_total_steps = int(FLAGS.num_epochs * model.n_rows * model.n_cols) / (
+            FLAGS.submatrix_rows * FLAGS.submatrix_cols)
+        n_steps_per_thread = n_total_steps / (
+            FLAGS.num_concurrent_steps * model.devices_number)
+        n_submatrices_to_train = model.n_submatrices * FLAGS.num_epochs
+        t0 = [time.time()]
+        n_steps_between_status_updates = 100
+        n_steps_between_summary_updates = 10000
+        status_i = [0, 0]
+        status_lock = threading.Lock()
+        msg = ("%%%dd/%%d submatrices trained (%%.1f%%%%), "
+               "%%5.1f submatrices/sec | loss %%f") % \
+            len(str(n_submatrices_to_train))
+
+        def TrainingFn():
+            for _ in range(int(n_steps_per_thread)):
+                _, global_step, loss = sess.run((
+                    model.train_op, model.global_step, model.loss))
+
+                show_status = False
+                update_summary = False
+                with status_lock:
+                    new_i = global_step // n_steps_between_status_updates
+                    if new_i > status_i[0]:
+                        status_i[0] = new_i
+                        show_status = True
+                    new_i = global_step // n_steps_between_summary_updates
+                    if new_i > status_i[1]:
+                        status_i[1] = new_i
+                        update_summary = True
+                if show_status:
+                    elapsed = float(time.time() - t0[0])
+                    log(msg, global_step, n_submatrices_to_train,
+                        100.0 * global_step / n_submatrices_to_train,
+                        n_steps_between_status_updates / elapsed, loss)
+                    t0[0] = time.time()
+                if update_summary and FLAGS.logs:
+                    model.write_summary(sess)
+
+        # Start training threads
+        train_threads = []
+        for _ in range(FLAGS.num_concurrent_steps):
+            t = threading.Thread(target=TrainingFn)
+            train_threads.append(t)
+            t.start()
+
+        # Wait for threads to finish.
+        for t in train_threads:
+            t.join()
+
+        coord.request_stop()
+        coord.join(threads)
+
+        # Write out vectors
+        write_embeddings_to_disk(FLAGS, model, sess)
+
+        # Shutdown
+        sess.close()
+        log("Elapsed: %s", time.time() - start_time)
+
+
+if __name__ == "__main__":
+    tf.app.run()
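
As a closing aside, a minimal sketch (not part of the commit) of inspecting one of the shard-*.pb files that labours.py writes and this script trains on; the file name here is assumed:

import tensorflow as tf

# Each shard is a serialized tf.Example protocol buffer.
example = tf.train.Example()
with open("shard-000-000.pb", "rb") as fin:
    example.ParseFromString(fin.read())

# The dense row/column index lists and the parallel sparse value array
# match the feature spec parsed by count_matrix_input() above.
feat = example.features.feature
print("rows:", len(feat["global_row"].int64_list.value))
print("cols:", len(feat["global_col"].int64_list.value))
print("non-zeros:", len(feat["sparse_value"].float_list.value))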