import os
import shutil
import sys
import tempfile
from typing import List, Optional, Tuple

import numpy
from scipy.sparse import csr_matrix

from labours.cors_web_server import web_server

IDEAL_SHARD_SIZE = 4096


def train_embeddings(
    index: List[str],
    matrix: csr_matrix,
    tmpdir: Optional[str],
    shard_size: int = IDEAL_SHARD_SIZE,
) -> Tuple[List[Tuple[str, numpy.int64]], List[numpy.ndarray]]:
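    """Train Swivel embeddings over the given square co-occurrence matrix.

    Returns the (name, number of commits) pairs for the retained items and
    the corresponding embedding vectors.
    """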
    import tensorflow as tf
    from labours._vendor import swivel

    assert matrix.shape[0] == matrix.shape[1]
    assert len(index) <= matrix.shape[0]
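    # Clip co-occurrence counts at the 99th percentile so that a few extreme
    # values do not dominate the training loss.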
    outlier_threshold = numpy.percentile(matrix.data, 99)
    matrix.data[matrix.data > outlier_threshold] = outlier_threshold
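    # Pick the number of shards and shrink shard_size so that the shards
    # cover the index as evenly as possible.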
    nshards = len(index) // shard_size
    if nshards * shard_size < len(index):
        nshards += 1
        shard_size = len(index) // nshards
        nshards = len(index) // shard_size
    remainder = len(index) - nshards * shard_size
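    # The shards cannot cover every row exactly; drop the `remainder` sparsest
    # rows (fewest non-zero entries) and keep the rest in the original order.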
    if remainder > 0:
        lengths = matrix.indptr[1:] - matrix.indptr[:-1]
        filtered = sorted(numpy.argsort(lengths)[remainder:])
    else:
        filtered = list(range(len(index)))
    if len(filtered) < matrix.shape[0]:
        print("Truncating the sparse matrix...")
        matrix = matrix[filtered, :][:, filtered]
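    # Pair each retained name with its diagonal value, the number of commits
    # it appears in, for later use as Projector metadata.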
    meta_index = []
    for i, j in enumerate(filtered):
        meta_index.append((index[j], matrix[i, i]))
    index = [mi[0] for mi in meta_index]
    with tempfile.TemporaryDirectory(
        prefix="hercules_labours_", dir=tmpdir or None
    ) as tmproot:
        print("Writing Swivel metadata...")
        vocabulary = "\n".join(index)
        with open(os.path.join(tmproot, "row_vocab.txt"), "w") as out:
            out.write(vocabulary)
        with open(os.path.join(tmproot, "col_vocab.txt"), "w") as out:
            out.write(vocabulary)
        del vocabulary
        bool_sums = matrix.indptr[1:] - matrix.indptr[:-1]
        bool_sums_str = "\n".join(map(str, bool_sums.tolist()))
        with open(os.path.join(tmproot, "row_sums.txt"), "w") as out:
            out.write(bool_sums_str)
        with open(os.path.join(tmproot, "col_sums.txt"), "w") as out:
            out.write(bool_sums_str)
        del bool_sums_str
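        # Order rows by decreasing density so that the strided sharding below
        # spreads the non-zero entries evenly across the shards.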
        reorder = numpy.argsort(-bool_sums)
        print("Writing Swivel shards...")
        for row in range(nshards):
            for col in range(nshards):
                def _int64s(xs):
                    return tf.train.Feature(
                        int64_list=tf.train.Int64List(value=list(xs))
                    )

                def _floats(xs):
                    return tf.train.Feature(
                        float_list=tf.train.FloatList(value=list(xs))
                    )

                indices_row = reorder[row::nshards]
                indices_col = reorder[col::nshards]
                shard = matrix[indices_row][:, indices_col].tocoo()
                example = tf.train.Example(
                    features=tf.train.Features(
                        feature={
                            "global_row": _int64s(indices_row),
                            "global_col": _int64s(indices_col),
                            "sparse_local_row": _int64s(shard.row),
                            "sparse_local_col": _int64s(shard.col),
                            "sparse_value": _floats(shard.data),
                        }
                    )
                )
                with open(
                    os.path.join(tmproot, "shard-%03d-%03d.pb" % (row, col)), "wb"
                ) as out:
                    out.write(example.SerializeToString())
        print("Training Swivel model...")
        swivel.FLAGS.submatrix_rows = shard_size
        swivel.FLAGS.submatrix_cols = shard_size
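        # Heuristic training schedule: small vocabularies get a smaller
        # embedding size and many epochs; large vocabularies get the reverse.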
- if len(meta_index) <= IDEAL_SHARD_SIZE / 16:
- embedding_size = 50
- num_epochs = 100000
- elif len(meta_index) <= IDEAL_SHARD_SIZE:
- embedding_size = 50
- num_epochs = 50000
- elif len(meta_index) <= IDEAL_SHARD_SIZE * 2:
- embedding_size = 60
- num_epochs = 10000
- elif len(meta_index) <= IDEAL_SHARD_SIZE * 4:
- embedding_size = 70
- num_epochs = 8000
- elif len(meta_index) <= IDEAL_SHARD_SIZE * 10:
- embedding_size = 80
- num_epochs = 5000
- elif len(meta_index) <= IDEAL_SHARD_SIZE * 25:
- embedding_size = 100
- num_epochs = 1000
- elif len(meta_index) <= IDEAL_SHARD_SIZE * 100:
- embedding_size = 200
- num_epochs = 600
- else:
- embedding_size = 300
- num_epochs = 300
- if os.getenv("CI"):
- # Travis, AppVeyor etc. during the integration tests
- num_epochs /= 10
        swivel.FLAGS.embedding_size = embedding_size
        swivel.FLAGS.input_base_path = tmproot
        swivel.FLAGS.output_base_path = tmproot
        swivel.FLAGS.loss_multiplier = 1.0 / shard_size
        swivel.FLAGS.num_epochs = num_epochs
        # Tensorflow 1.5 parses sys.argv unconditionally *applause*
        argv_backup = sys.argv[1:]
        del sys.argv[1:]
        swivel.main(None)
        sys.argv.extend(argv_backup)
        print("Reading Swivel embeddings...")
        embeddings = []
        with open(os.path.join(tmproot, "row_embedding.tsv")) as frow:
            with open(os.path.join(tmproot, "col_embedding.tsv")) as fcol:
                for i, (lrow, lcol) in enumerate(zip(frow, fcol)):
                    prow, pcol = (l.split("\t", 1) for l in (lrow, lcol))
                    assert prow[0] == pcol[0]
                    erow, ecol = (
                        numpy.fromstring(p[1], dtype=numpy.float32, sep="\t")
                        for p in (prow, pcol)
                    )
                    embeddings.append((erow + ecol) / 2)
    return meta_index, embeddings
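

# A minimal usage sketch (hypothetical call site; `index` and `matrix` would
# normally come from the couples analysis loaded elsewhere in labours):
#
#     meta_index, embeddings = train_embeddings(index, matrix, tmpdir=None)
#     write_embeddings("files", "couples", False, meta_index, embeddings)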


def write_embeddings(
    name: str,
    output: str,
    run_server: bool,
    index: List[Tuple[str, numpy.int64]],
    embeddings: List[numpy.ndarray],
) -> None:
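    """Write the Tensorflow Projector files and optionally serve them locally.

    `index` holds the (name, number of commits) pairs and `embeddings` the
    matching vectors produced by train_embeddings().
    """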
- print("Writing Tensorflow Projector files...")
- if not output:
- output = "couples"
- if output.endswith(".json"):
- output = os.path.join(output[:-5], "couples")
- run_server = False
- metaf = "%s_%s_meta.tsv" % (output, name)
- with open(metaf, "w") as fout:
- fout.write("name\tcommits\n")
- for pair in index:
- fout.write("%s\t%s\n" % pair)
- print("Wrote", metaf)
- dataf = "%s_%s_data.tsv" % (output, name)
- with open(dataf, "w") as fout:
- for vec in embeddings:
- fout.write("\t".join(str(v) for v in vec))
- fout.write("\n")
- print("Wrote", dataf)
- jsonf = "%s_%s.json" % (output, name)
- with open(jsonf, "w") as fout:
- fout.write(
- """{
- "embeddings": [
- {
- "tensorName": "%s %s coupling",
- "tensorShape": [%s, %s],
- "tensorPath": "http://0.0.0.0:8000/%s",
- "metadataPath": "http://0.0.0.0:8000/%s"
- }
- ]
- }
- """
- % (output, name, len(embeddings), len(embeddings[0]), dataf, metaf)
- )
- print("Wrote %s" % jsonf)
- if run_server and not web_server.running:
- web_server.start()
- url = "http://projector.tensorflow.org/?config=http://0.0.0.0:8000/" + jsonf
- print(url)
- if run_server:
- if shutil.which("xdg-open") is not None:
- os.system("xdg-open " + url)
- else:
- browser = os.getenv("BROWSER", "")
- if browser:
- os.system(browser + " " + url)
- else:
- print("\t" + url)