data_utils.py 4.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120
  1. # Copyright 2016 Google Inc. All Rights Reserved.
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. """Utilities for parsing Kaggle baby names files."""
  15. from __future__ import absolute_import
  16. from __future__ import division
  17. from __future__ import print_function
  18. import collections
  19. import os
  20. import numpy as np
  21. import tensorflow as tf
  22. import pandas as pd
# End-of-name token. _letter_to_number maps 'a'..'z' to 1..26, so 0 never
# collides with a letter code; it is also the value the np.zeros batch buffers
# are padded with.
_EON = 0
  25. def read_names(names_path):
  26. """read data from downloaded file. See SmallNames.txt for example format
  27. or go to https://www.kaggle.com/kaggle/us-baby-names for full lists
  28. Args:
  29. names_path: path to the csv file similar to the example type
  30. Returns:
  31. Dataset: a namedtuple of two elements: deduped names and their associated
  32. counts. The names contain only 26 chars and are all lower case
  33. """
  34. names_data = pd.read_csv(names_path)
  35. names_data.Name = names_data.Name.str.lower()
  36. name_data = names_data.groupby(by=["Name"])["Count"].sum()
  37. name_counts = np.array(name_data.tolist())
  38. names_deduped = np.array(name_data.index.tolist())
  39. Dataset = collections.namedtuple('Dataset', ['Name', 'Count'])
  40. return Dataset(names_deduped, name_counts)
  41. def _letter_to_number(letter):
  42. """converts letters to numbers between 1 and 27"""
  43. # ord of lower case 'a' is 97
  44. return ord(letter) - 96
  45. def namignizer_iterator(names, counts, batch_size, num_steps, epoch_size):
  46. """Takes a list of names and counts like those output from read_names, and
  47. makes an iterator yielding a batch_size by num_steps array of random names
  48. separated by an end of name token. The names are choosen randomly according
  49. to their counts. The batch may end mid-name
  50. Args:
  51. names: a set of lowercase names composed of 26 characters
  52. counts: a list of the frequency of those names
  53. batch_size: int
  54. num_steps: int
  55. epoch_size: number of batches to yield
  56. Yields:
  57. (x, y): a batch_size by num_steps array of ints representing letters, where
  58. x will be the input and y will be the target
  59. """
  60. name_distribution = counts / counts.sum()
  61. for i in range(epoch_size):
  62. data = np.zeros(batch_size * num_steps + 1)
  63. samples = np.random.choice(names, size=batch_size * num_steps // 2,
  64. replace=True, p=name_distribution)
  65. data_index = 0
  66. for sample in samples:
  67. if data_index >= batch_size * num_steps:
  68. break
  69. for letter in map(_letter_to_number, sample) + [_EON]:
  70. if data_index >= batch_size * num_steps:
  71. break
  72. data[data_index] = letter
  73. data_index += 1
  74. x = data[:batch_size * num_steps].reshape((batch_size, num_steps))
  75. y = data[1:batch_size * num_steps + 1].reshape((batch_size, num_steps))
  76. yield (x, y)
  77. def name_to_batch(name, batch_size, num_steps):
  78. """ Takes a single name and fills a batch with it
  79. Args:
  80. name: lowercase composed of 26 characters
  81. batch_size: int
  82. num_steps: int
  83. Returns:
  84. x, y: a batch_size by num_steps array of ints representing letters, where
  85. x will be the input and y will be the target. The array is filled up
  86. to the length of the string, the rest is filled with zeros
  87. """
  88. data = np.zeros(batch_size * num_steps + 1)
  89. data_index = 0
  90. for letter in map(_letter_to_number, name) + [_EON]:
  91. data[data_index] = letter
  92. data_index += 1
  93. x = data[:batch_size * num_steps].reshape((batch_size, num_steps))
  94. y = data[1:batch_size * num_steps + 1].reshape((batch_size, num_steps))
  95. return x, y