// DRAGNN Configuration proto. See go/dragnn-design for more information.

syntax = "proto2";

package syntaxnet.dragnn;

// Proto to specify a set of DRAGNN components (transition systems) that are
// trained and evaluated jointly. Each component gets one ComponentSpec.
//
// The order of component is important: a component can only link to components
// that come before (for now.)
// NEXT ID: 6
message MasterSpec {
  // The components of the model, in linking order (see the note above).
  repeated ComponentSpec component = 1;

  // Whether to extract debug traces.
  optional bool debug_tracing = 4 [default = false];

  // Field numbers that must not be reused.
  reserved 2, 3, 5;
}

// Complete specification for a single task.
message ComponentSpec {
  // Name for this component: this is used in linked features via the
  // "source_component" field.
  optional string name = 1;

  // TransitionSystem to use.
  optional RegisteredModuleSpec transition_system = 2;

  // Resources that this component depends on. These are copied to TaskInputs
  // when calling SAFT code.
  repeated Resource resource = 3;

  // Feature space configurations.
  repeated FixedFeatureChannel fixed_feature = 4;
  repeated LinkedFeatureChannel linked_feature = 5;

  // Neural Network builder specification.
  optional RegisteredModuleSpec network_unit = 6;

  // The registered C++ implementation of the dragnn::Component class; e.g.
  // "SyntaxNetComponent".
  optional RegisteredModuleSpec backend = 7;

  // Number of possible actions from every state.
  optional int32 num_actions = 8;

  // Specify the name of the lower level component on which it has attention.
  optional string attention_component = 9 [default = ""];

  // Options for the ComponentBuilder. If this is empty, the regular
  // tf.while_loop based builder is assumed.
  optional RegisteredModuleSpec component_builder = 10;

  // Default max number of active states for beam training.
  optional int32 training_beam_size = 11 [default = 1];

  // Default max number of active states for beam inference.
  optional int32 inference_beam_size = 12 [default = 1];
}

// Super generic container for any registered sub-piece of DRAGNN.
message RegisteredModuleSpec {
  // Name of the registered class.
  optional string registered_name = 1;

  // Parameters to set while initializing this system; these are copied to
  // Parameters in a TaskSpec when calling SAFT code, or via kwargs in TF Python
  // code. Both keys and values are strings. Map iteration order is undefined,
  // so consumers must not depend on it.
  map<string, string> parameters = 2;
}

// Fixed resources that will be converted into TaskInput's when calling SAFT
// code.
message Resource {
  // Name of the resource.
  optional string name = 1;

  // The parts that make up this resource.
  repeated Part part = 2;
}

// The Parts here should be more or less compatible with TaskInput.
message Part {
  // NOTE(review): the three fields below presumably mirror the same-named
  // TaskInput part fields; confirm against the TaskInput definition.
  optional string file_pattern = 1;
  optional string file_format = 2;
  optional string record_format = 3;
}

// ------------------------------------------------------------------------
// Feature specifications.
//
// A *feature channel* is a named collection of feature templates that share an
// embedding matrix. Thus all features in the channel are assumed to use the
// same vocabulary: e.g., words, POS tags, hidden layer activations, etc. These
// are extracted, embedded, and then concatenated together as a group.

// Specification for a feature channel that is a *fixed* function of the input.
// NEXT_ID: 10
message FixedFeatureChannel {
  // Interpretable name for this feature channel. NN builders might depend on
  // this to determine how to hook different channels up internally.
  optional string name = 1;

  // String describing the FML for this feature channel.
  optional string fml = 2;

  // Size of parameters for this space:

  // Dimensions of embedding space, or -1 if the feature should not be embedded.
  optional int32 embedding_dim = 3;

  // No. of possible values returned.
  optional int32 vocabulary_size = 4;

  // No. of different feature templates in the channel, i.e. the # of features
  // that will be concatenated but share the embedding for this channel.
  optional int32 size = 5;

  // Whether the embeddings for this channel should be held constant at their
  // pretrained values, instead of being trained. Pretrained embeddings are
  // required when true.
  optional bool is_constant = 9;

  // Resources for this space:

  // Predicate map for compacting feature values.
  optional string predicate_map = 6;

  // Pointer to a pretrained embedding matrix for this feature set.
  optional Resource pretrained_embedding_matrix = 7;

  // Vocab file, containing all vocabulary words one per line.
  optional Resource vocab = 8;
}

// Specification for a feature channel that *links* to component
// activations. Note that the "vocabulary" of these features is the activations
// that they are linked to, so it is determined by the other components in the
// spec.
message LinkedFeatureChannel {
  // Interpretable name for this feature channel. NN builders might depend on
  // this to determine how to hook different channels up internally.
  optional string name = 1;

  // Feature function specification. Note: these should all be of type
  // LinkedFeatureType.
  optional string fml = 2;

  // Embedding dimension, or -1 if the link should not be embedded.
  optional int32 embedding_dim = 3;

  // No. of different feature templates in the channel, i.e. the # of features
  // that will be concatenated but share the embedding for this channel.
  optional int32 size = 4;

  // Component to use for translation, e.g. "tagger"
  optional string source_component = 5;

  // Translator target, e.g. "token" or "last_action", to translate raw feature
  // values into indices. This must be interpretable by the Component referenced
  // by source_component.
  optional string source_translator = 6;

  // Layer that these features should connect to.
  optional string source_layer = 7;
}

// A vector of hyperparameter configurations to search over.
message TrainingGridSpec {
  // Grid points to search over.
  repeated GridPoint grid_point = 1;

  // Training targets to create in the graph builder stage.
  repeated TrainTarget target = 2;
}

// A hyperparameter configuration for a training run.
// NEXT ID: 22
message GridPoint {
  // Global learning rate initialization point.
  optional double learning_rate = 1 [default = 0.1];

  // Momentum coefficient when using MomentumOptimizer.
  optional double momentum = 2 [default = 0.9];

  // Decay rate and base for global learning rate decay. The learning rate is
  // reduced by a factor of |decay_base| every |decay_steps|.
  optional double decay_base = 16 [default = 0.96];
  optional int32 decay_steps = 3 [default = 1000];

  // Whether to decay the learning rate in a "staircase" manner. If true, the
  // rate is adjusted exactly once every |decay_steps|. Otherwise, the rate is
  // adjusted in smaller increments on every step, such that the overall rate of
  // decay is still |decay_base| every |decay_steps|.
  optional bool decay_staircase = 17 [default = true];

  // Random seed to initialize parameters.
  optional int32 seed = 4 [default = 0];

  // Specify the optimizer used in training, the default is MomentumOptimizer.
  optional string learning_method = 7 [default = 'momentum'];

  // Whether or not to use a moving average of the weights in inference time.
  optional bool use_moving_average = 8 [default = false];

  // Rolling average update co-efficient.
  optional double average_weight = 9 [default = 0.9999];

  // The dropout *keep* probability rate used in the model. 1.0 = no dropout.
  optional double dropout_rate = 10 [default = 1.0];

  // The dropout *keep* probability rate for recurrent connections. If < 0.0,
  // recurrent connections should use |dropout_rate| instead. 1.0 = no dropout.
  optional double recurrent_dropout_rate = 20 [default = -1.0];

  // Gradient clipping threshold, applied if greater than zero. A value in the
  // range 1-20 seems to work well to prevent large learning rates from causing
  // problems for updates at the start of training.
  optional double gradient_clip_norm = 11 [default = 0.0];

  // A spec for using multiple optimization methods.
  message CompositeOptimizerSpec {
    // First optimizer.
    optional GridPoint method1 = 1;

    // Second optimizer.
    optional GridPoint method2 = 2;

    // After this number of steps, switch from first to second.
    optional int32 switch_after_steps = 3;
  }
  optional CompositeOptimizerSpec composite_optimizer_spec = 12;

  // Parameters for Adam training.
  optional double adam_beta1 = 13 [default = 0.01];
  optional double adam_beta2 = 14 [default = 0.9999];
  optional double adam_eps = 15 [default = 1e-8];

  // Coefficient for global L2 regularization.
  optional double l2_regularization_coefficient = 18 [default = 1e-4];

  // Coefficient for global self normalization regularization.
  // A value of zero turns it off.
  optional double self_norm_alpha = 19 [default = 0.0];

  // Comma separated list of components to which self_norm_alpha
  // should be restricted. If left empty, no filtering will take
  // place. Typically a single component.
  optional string self_norm_components_filter = 21;

  // Field numbers that must not be reused.
  reserved 5, 6;
}

// Training target to be built into the graph.
message TrainTarget {
  // Name for this target. This should be unique across all targets.
  optional string name = 1;

  // Specify the weights for different components. This should be the same size
  // as the number of components in the spec, or empty (defaults to equal
  // weights). Weights are normalized across the components being trained to sum
  // to one.
  repeated double component_weights = 2;

  // Specify whether to train a component using supervised signal or not. This
  // should be the same size as the number of components in the spec, or empty
  // (defaults to all true).
  repeated bool unroll_using_oracle = 3;

  // Maximum length of the pipeline to train. E.g. if max_index is 1, then only
  // the first component will be trained via this target.
  optional int32 max_index = 4 [default = -1];
}