// DRAGNN Configuration proto. See go/dragnn-design for more information.

syntax = "proto2";

package syntaxnet.dragnn;

// Proto to specify a set of DRAGNN components (transition systems) that are
// trained and evaluated jointly. Each component gets one ComponentSpec.
//
// The order of components is important: a component can only link to
// components that come before it (for now).
// NEXT ID: 6
message MasterSpec {
  // Specifications of the jointly trained components. Order matters; see the
  // message-level comment.
  repeated ComponentSpec component = 1;

  // Whether to extract debug traces. Off by default.
  optional bool debug_tracing = 4 [default = false];

  // Reserved field numbers; these must not be assigned to new fields.
  reserved 2, 3, 5;
}

// Complete specification for a single task (one DRAGNN component).
message ComponentSpec {
  // Name for this component. Linked features refer to a component by this name
  // via their "source_component" field.
  optional string name = 1;

  // TransitionSystem to use.
  optional RegisteredModuleSpec transition_system = 2;

  // Resources that this component depends on. These are copied to TaskInputs
  // when calling SAFT code.
  repeated Resource resource = 3;

  // Feature space configurations: channels that are a fixed function of the
  // input, and channels that link to the activations of other components.
  repeated FixedFeatureChannel fixed_feature = 4;
  repeated LinkedFeatureChannel linked_feature = 5;

  // Neural network builder specification.
  optional RegisteredModuleSpec network_unit = 6;

  // The registered C++ implementation of the dragnn::Component class; e.g.
  // "SyntaxNetComponent".
  optional RegisteredModuleSpec backend = 7;

  // Number of possible actions from every state.
  optional int32 num_actions = 8;

  // Name of the lower-level component that this component has attention on.
  // Defaults to the empty string.
  // NOTE(review): presumably empty means "no attention" -- confirm.
  optional string attention_component = 9 [default = ""];

  // Options for the ComponentBuilder. If this is empty, the regular
  // tf.while_loop based builder is assumed.
  optional RegisteredModuleSpec component_builder = 10;

  // Default maximum number of active states for beam training. Defaults to 1.
  optional int32 training_beam_size = 11 [default = 1];

  // Default maximum number of active states for beam inference. Defaults to 1.
  optional int32 inference_beam_size = 12 [default = 1];
}

// Generic container for any registered sub-piece of DRAGNN: the name of a
// registered class plus free-form string parameters used to initialize it.
message RegisteredModuleSpec {
  // Name of the registered class.
  optional string registered_name = 1;

  // Parameters to set while initializing this module, keyed by parameter name.
  // These are copied to Parameters in a TaskSpec when calling SAFT code, or
  // passed via kwargs in TF Python code.
  map<string, string> parameters = 2;
}

// Fixed resources that will be converted into TaskInputs when calling SAFT
// code.
message Resource {
  // Name of this resource.
  // NOTE(review): presumably becomes the name of the resulting TaskInput --
  // confirm against the conversion code.
  optional string name = 1;

  // The parts that make up this resource.
  repeated Part part = 2;
}

// A single part of a Resource. The Parts here should be more or less
// compatible with TaskInput.
// NOTE(review): the field meanings below are presumed to mirror the
// same-named fields of TaskInput's parts -- confirm against that definition.
message Part {
  // Pattern identifying the file(s) backing this part.
  optional string file_pattern = 1;

  // Format of the file(s).
  optional string file_format = 2;

  // Format of the records stored in the file(s).
  optional string record_format = 3;
}

// ------------------------------------------------------------------------
// Feature specifications.
//
// A *feature channel* is a named collection of feature templates that share an
// embedding matrix. Thus all features in the channel are assumed to use the
// same vocabulary: e.g., words, POS tags, hidden layer activations, etc. These
// are extracted, embedded, and then concatenated together as a group.

// Specification for a feature channel that is a *fixed* function of the input.
// NEXT ID: 10
message FixedFeatureChannel {
  // Interpretable name for this feature channel. NN builders might depend on
  // this to determine how to hook different channels up internally.
  optional string name = 1;

  // String describing the FML for this feature channel.
  optional string fml = 2;

  // Size of parameters for this space:

  // Dimension of the embedding space, or -1 if the feature should not be
  // embedded.
  optional int32 embedding_dim = 3;

  // Number of possible values returned.
  optional int32 vocabulary_size = 4;

  // Number of different feature templates in the channel, i.e. the number of
  // features that will be concatenated but share the embedding for this
  // channel.
  optional int32 size = 5;

  // Whether the embeddings for this channel should be held constant at their
  // pretrained values, instead of being trained. Pretrained embeddings are
  // required when true.
  optional bool is_constant = 9;

  // Resources for this space:

  // Predicate map for compacting feature values.
  optional string predicate_map = 6;

  // Pointer to a pretrained embedding matrix for this feature set.
  optional Resource pretrained_embedding_matrix = 7;

  // Vocab file, containing all vocabulary words, one per line.
  optional Resource vocab = 8;
}

// Specification for a feature channel that *links* to component
// activations. Note that the "vocabulary" of these features is the activations
// that they are linked to, so it is determined by the other components in the
// spec.
message LinkedFeatureChannel {
  // Interpretable name for this feature channel. NN builders might depend on
  // this to determine how to hook different channels up internally.
  optional string name = 1;

  // Feature function specification. Note: these should all be of type
  // LinkedFeatureType.
  optional string fml = 2;

  // Embedding dimension, or -1 if the link should not be embedded.
  optional int32 embedding_dim = 3;

  // Number of different feature templates in the channel, i.e. the number of
  // features that will be concatenated but share the embedding for this
  // channel.
  optional int32 size = 4;

  // Name of the component (see ComponentSpec.name) to use for translation,
  // e.g. "tagger".
  optional string source_component = 5;

  // Translator target, e.g. "token" or "last_action", used to translate raw
  // feature values into indices. This must be interpretable by the Component
  // referenced by source_component.
  optional string source_translator = 6;

  // Layer that these features should connect to.
  optional string source_layer = 7;
}

// A vector of hyperparameter configurations to search over, together with the
// training targets to build.
message TrainingGridSpec {
  // Grid points to search over; each one is a complete hyperparameter
  // configuration for a training run.
  repeated GridPoint grid_point = 1;

  // Training targets to create in the graph builder stage.
  repeated TrainTarget target = 2;
}

// A hyperparameter configuration for a training run.
// NEXT ID: 22
message GridPoint {
  // Global learning rate initialization point.
  optional double learning_rate = 1 [default = 0.1];

  // Momentum coefficient when using MomentumOptimizer.
  optional double momentum = 2 [default = 0.9];

  // Decay base for global learning rate decay. The learning rate is reduced
  // by a factor of |decay_base| every |decay_steps|.
  optional double decay_base = 16 [default = 0.96];
  // Number of steps over which one factor of |decay_base| is applied.
  optional int32 decay_steps = 3 [default = 1000];

  // Whether to decay the learning rate in a "staircase" manner. If true, the
  // rate is adjusted exactly once every |decay_steps|. Otherwise, the rate is
  // adjusted in smaller increments on every step, such that the overall rate of
  // decay is still |decay_base| every |decay_steps|.
  optional bool decay_staircase = 17 [default = true];

  // Random seed to initialize parameters.
  optional int32 seed = 4 [default = 0];

  // Name of the optimizer used in training. The default, "momentum", selects
  // MomentumOptimizer.
  optional string learning_method = 7 [default = 'momentum'];

  // Whether or not to use a moving average of the weights at inference time.
  optional bool use_moving_average = 8 [default = false];

  // Rolling average update coefficient.
  optional double average_weight = 9 [default = 0.9999];

  // The dropout *keep* probability used in the model. 1.0 = no dropout.
  // Note that, despite the field name, this is the probability of keeping a
  // value, not of dropping it.
  optional double dropout_rate = 10 [default = 1.0];

  // The dropout *keep* probability for recurrent connections. If < 0.0,
  // recurrent connections should use |dropout_rate| instead. 1.0 = no dropout.
  optional double recurrent_dropout_rate = 20 [default = -1.0];

  // Gradient clipping threshold, applied if greater than zero. A value in the
  // range 1-20 seems to work well to prevent large learning rates from causing
  // problems for updates at the start of training.
  optional double gradient_clip_norm = 11 [default = 0.0];

  // A spec for using multiple optimization methods: training starts with
  // |method1| and switches to |method2| after |switch_after_steps| steps.
  message CompositeOptimizerSpec {
    // First optimizer.
    optional GridPoint method1 = 1;

    // Second optimizer.
    optional GridPoint method2 = 2;

    // After this number of steps, switch from first to second.
    optional int32 switch_after_steps = 3;
  }
  // Configuration for combining two optimization methods; see
  // CompositeOptimizerSpec.
  optional CompositeOptimizerSpec composite_optimizer_spec = 12;

  // Parameters for Adam training.
  // NOTE(review): the default adam_beta1 of 0.01 is far from the 0.9 commonly
  // used for Adam -- confirm it is intentional before relying on it.
  optional double adam_beta1 = 13 [default = 0.01];
  optional double adam_beta2 = 14 [default = 0.9999];
  optional double adam_eps = 15 [default = 1e-8];

  // Coefficient for global L2 regularization.
  optional double l2_regularization_coefficient = 18 [default = 1e-4];

  // Coefficient for global self normalization regularization.
  // A value of zero turns it off.
  optional double self_norm_alpha = 19 [default = 0.0];

  // Comma-separated list of components to which self_norm_alpha
  // should be restricted. If left empty, no filtering will take
  // place. Typically a single component.
  optional string self_norm_components_filter = 21;

  // Reserved field numbers; these must not be assigned to new fields.
  reserved 5, 6;
}

// Training target to be built into the graph.
message TrainTarget {
  // Name for this target. This should be unique across all targets.
  optional string name = 1;

  // Weights for the different components. This should be the same size as the
  // number of components in the spec, or empty (defaults to equal weights).
  // Weights are normalized across the components being trained to sum to one.
  repeated double component_weights = 2;

  // Whether to train each component using a supervised signal or not. This
  // should be the same size as the number of components in the spec, or empty
  // (defaults to all true).
  repeated bool unroll_using_oracle = 3;

  // Maximum length of the pipeline to train. E.g. if max_index is 1, then only
  // the first component will be trained via this target.
  // NOTE(review): the default of -1 presumably means "no limit, train all
  // components" -- confirm against the graph builder.
  optional int32 max_index = 4 [default = -1];
}