1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768 |
- # coding=utf-8
- # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- import argparse
- import json
- import os
- import time
- """
- This script adds an id to each JSON object in a JSON-lines file (one object
- per line), stored under the key 'adlr_id'. The user supplies a prefix for the ids.
- """
def add_ids(f_input, f_output, id_prefix, log_interval=100):
    """Copy JSON-lines rows from ``f_input`` to ``f_output``, adding an id.

    Every non-blank row is parsed as JSON, given an ``adlr_id`` entry of the
    form ``<id_prefix>-<counter>`` (counter zero-padded to 10 digits and
    starting at 1), and written back as a single UTF-8 encoded JSON line.

    Args:
        f_input: Iterable of text lines, one JSON object per line.
        f_output: Writable binary object exposing ``write(bytes)``.
        id_prefix: String placed in front of the zero-padded counter.
        log_interval: Print a progress line every this many documents;
            ``0`` disables progress output.

    Returns:
        The number of documents written.

    Raises:
        json.JSONDecodeError: If a non-blank row is not valid JSON.
    """
    start_time = time.time()
    num_docs = 0
    for row in f_input:
        # A blank line (e.g. a trailing newline at the end of the file) is
        # not a document; skip it rather than abort, and do not spend an id.
        if not row.strip():
            continue
        num_docs += 1

        each_row = json.loads(row)
        each_row['adlr_id'] = '{}-{:010d}'.format(id_prefix, num_docs)

        # ensure_ascii=False keeps non-ASCII text readable in the output;
        # encode once per row, newline included.
        myjson = json.dumps(each_row, ensure_ascii=False)
        f_output.write((myjson + '\n').encode('utf-8'))

        # A zero interval would divide by zero; treat it as "no logging".
        if log_interval and num_docs % log_interval == 0:
            print(' processed {:9d} documents in {:.2f} seconds ...'.format(
                num_docs, time.time() - start_time), flush=True)

    return num_docs


if __name__ == '__main__':

    print('parsing the arguments ...')

    parser = argparse.ArgumentParser()
    # The three string options are mandatory: without them the script cannot
    # do anything useful, so let argparse report the omission up front.
    parser.add_argument('--input-file', type=str, required=True,
                        help='Input json file where id needs to be added')
    parser.add_argument('--output-file', type=str, required=True,
                        help='Output file name with id')
    parser.add_argument('--id-prefix', type=str, required=True,
                        help='Id prefix')
    parser.add_argument('--log-interval', type=int, default=100,
                        help='Log interval')
    args = parser.parse_args()

    print('Adding ids to dataset ...')

    # Context managers close both files even if a row fails to parse.
    with open(args.input_file, 'r', encoding='utf-8') as f_input, \
            open(args.output_file, 'wb') as f_output:
        add_ids(f_input, f_output, args.id_prefix, args.log_interval)

    print('done :-)', flush=True)
|