
separate solution vs challenge

zenodia · 3 years ago
commit 585e5c91bc

+ 25 - 25
ai/Megatron/English/Python/jupyter_notebook/Day2-5_Observe_GPT_runs_vs_performance.ipynb

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "governing-exclusion",
+   "id": "jewish-polish",
    "metadata": {},
    "source": [
     "# \n",
@@ -30,7 +30,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "supported-romantic",
+   "id": "collected-going",
    "metadata": {},
    "source": [
     "----------------------------------------------------------\n",
@@ -42,7 +42,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "friendly-lightning",
+   "id": "rational-wings",
    "metadata": {},
    "source": [
     "----------------------------------------------------------\n",
@@ -54,7 +54,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "permanent-mainstream",
+   "id": "organizational-quality",
    "metadata": {},
    "source": [
     "----------------------------------------------------------\n",
@@ -76,7 +76,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "meaning-rubber",
+   "id": "healthy-addition",
    "metadata": {},
    "source": [
     "---\n",
@@ -86,7 +86,7 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "id": "ecological-layer",
+   "id": "induced-brazil",
    "metadata": {},
    "outputs": [
     {
@@ -104,7 +104,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "precise-genius",
+   "id": "planned-composer",
    "metadata": {},
    "source": [
     "---\n",
@@ -129,7 +129,7 @@
   {
    "cell_type": "code",
    "execution_count": 9,
-   "id": "educated-story",
+   "id": "forty-peeing",
    "metadata": {},
    "outputs": [
     {
@@ -396,7 +396,7 @@
   {
    "cell_type": "code",
    "execution_count": 11,
-   "id": "numeric-relevance",
+   "id": "perceived-spray",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -405,7 +405,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "dominican-limit",
+   "id": "rural-bermuda",
    "metadata": {},
    "source": [
     "---\n",
@@ -416,7 +416,7 @@
   {
    "cell_type": "code",
    "execution_count": 10,
-   "id": "preliminary-feeding",
+   "id": "built-scout",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -425,7 +425,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "vanilla-dakota",
+   "id": "transsexual-investor",
    "metadata": {},
    "source": [
     "----------------------------------------------------------\n",
@@ -458,7 +458,7 @@
   {
    "cell_type": "code",
    "execution_count": 4,
-   "id": "responsible-senegal",
+   "id": "handed-gospel",
    "metadata": {},
    "outputs": [
     {
@@ -752,7 +752,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "global-network",
+   "id": "regulation-diary",
    "metadata": {},
    "source": [
     "--------------------------------------------------\n",
@@ -764,7 +764,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "handled-great",
+   "id": "flying-firmware",
    "metadata": {},
    "source": [
     "---\n",
@@ -791,8 +791,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
-   "id": "academic-aquatic",
+   "execution_count": 1,
+   "id": "positive-colleague",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -802,7 +802,7 @@
   {
    "cell_type": "code",
    "execution_count": 6,
-   "id": "confidential-steam",
+   "id": "theoretical-valley",
    "metadata": {},
    "outputs": [
     {
@@ -1093,7 +1093,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "earlier-priest",
+   "id": "headed-journal",
    "metadata": {},
    "source": [
     "--------------------------------------------------\n",
@@ -1104,7 +1104,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "optimum-resource",
+   "id": "developmental-japan",
    "metadata": {},
    "source": [
     "<a id=\"TheChallenge\"></a>"
@@ -1112,7 +1112,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "connected-lincoln",
+   "id": "stretch-block",
    "metadata": {},
    "source": [
     "----------------\n",
@@ -1122,11 +1122,11 @@
     "        - use your current given # of gpus \n",
     "        - do NOT changing the following parameters --train-samples 100 \n",
     "        - you cannot go OOM \n",
-    "        - you must sustain >80% GPUs utilization in the **training** phase \n",
+    "        - you must sustain >60% GPUs utilization in the **training** phase \n",
     "        - training run must be finished and checkpoint must be saved successfully\n",
     "    - task : \n",
     "            given the above constraints, get as good training GPUs utilizations as possible\n",
-    "    - Pass : sustain 80% gpus utils ( across all gpus) in the **training** phase !\n",
+    "    - Pass : sustain 60% gpus utils ( across all gpus) in the **training** phase !\n",
     " \n",
     "\n",
     "\n",
@@ -1151,7 +1151,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "fatal-crawford",
+   "id": "manual-birthday",
    "metadata": {},
    "source": [
     "---\n",
@@ -1161,7 +1161,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "agreed-gasoline",
+   "id": "sorted-translator",
    "metadata": {},
    "source": [
     "-----\n",

+ 15 - 15
ai/Megatron/English/Python/jupyter_notebook/Day3-4_customize_process2mmap.ipynb

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "smart-angola",
+   "id": "expressed-hello",
    "metadata": {},
    "source": [
     "# \n",
@@ -56,7 +56,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "velvet-abuse",
+   "id": "regulated-minnesota",
    "metadata": {},
    "source": [
     "---\n",
@@ -66,7 +66,7 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "id": "undefined-target",
+   "id": "integral-choice",
    "metadata": {},
    "outputs": [
     {
@@ -86,7 +86,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "asian-twenty",
+   "id": "interim-works",
    "metadata": {},
    "source": [
     "---\n",
@@ -96,7 +96,7 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "id": "capital-military",
+   "id": "affecting-chuck",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -106,7 +106,7 @@
   {
    "cell_type": "code",
    "execution_count": 2,
-   "id": "greek-night",
+   "id": "patient-certificate",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -141,7 +141,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "generous-seventh",
+   "id": "pressing-dancing",
    "metadata": {},
    "source": [
     "----------------\n",
@@ -156,7 +156,7 @@
   {
    "cell_type": "code",
    "execution_count": 2,
-   "id": "impaired-reasoning",
+   "id": "terminal-shell",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -370,7 +370,7 @@
   {
    "cell_type": "code",
    "execution_count": 11,
-   "id": "sublime-soccer",
+   "id": "regular-fifth",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -383,7 +383,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "approved-perth",
+   "id": "assisted-ceiling",
    "metadata": {},
    "source": [
     "---\n",
@@ -395,7 +395,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "loved-making",
+   "id": "fallen-theater",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -414,7 +414,7 @@
   {
    "cell_type": "code",
    "execution_count": 12,
-   "id": "horizontal-barrel",
+   "id": "familiar-collar",
    "metadata": {
     "collapsed": true,
     "jupyter": {
@@ -12974,7 +12974,7 @@
   {
    "cell_type": "code",
    "execution_count": 13,
-   "id": "collaborative-reason",
+   "id": "enclosed-excerpt",
    "metadata": {
     "collapsed": true,
     "jupyter": {
@@ -13001,7 +13001,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "naked-puzzle",
+   "id": "dried-assignment",
    "metadata": {},
    "source": [
     "---\n",
@@ -13015,7 +13015,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "practical-research",
+   "id": "cardiovascular-trailer",
    "metadata": {},
    "source": [
     "-----\n",

BIN
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/pics/modifyLSH_setuppy.JPG


+ 0 - 233
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/SOLUTION_preprocess_data.py

@@ -1,233 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Processing data for pretraining."""
-
-import argparse
-import json
-import multiprocessing
-import os
-import sys
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
-                                             os.path.pardir)))
-import time
-import re
-import torch
-
-try:
-    import nltk
-    nltk_available = True
-
-    import nltk
-    from nltk.tokenize import sent_tokenize
-except ImportError:
-    nltk_available = False
-
-from megatron.tokenizer import build_tokenizer
-from megatron.data import indexed_dataset
-
-
-# https://stackoverflow.com/questions/33139531/preserve-empty-lines-with-nltks-punkt-tokenizer
-class CustomLanguageVars(nltk.tokenize.punkt.PunktLanguageVars):
-
-    _period_context_fmt = r"""
-        \S*                          # some word material
-        %(SentEndChars)s             # a potential sentence ending
-        \s*                       #  <-- THIS is what I changed
-        (?=(?P<after_tok>
-            %(NonWord)s              # either other punctuation
-            |
-            (?P<next_tok>\S+)     #  <-- Normally you would have \s+ here
-        ))"""
-
-class IdentitySplitter(object):
-    def tokenize(self, *text):
-        return text
-
-def normal_cut_sentence(temp):
-    return sent_tokenize(temp)
-
-def cut_sentence_with_quotation_marks(text):
-    p = re.compile("“.*?”")
-    list = []
-    index = 0
-    length = len(text)
-    for i in p.finditer(text):
-        temp = ''
-        start = i.start()
-        end = i.end()
-        for j in range(index, start):
-            temp += text[j]
-        if temp != '':
-            temp_list = normal_cut_sentence(temp)
-            list += temp_list
-        temp = ''
-        for k in range(start, end):
-            temp += text[k]
-        if temp != ' ':
-            list.append(temp)
-        index = end
-    return list
-
-class Encoder(object):
-    def __init__(self, args):
-        self.args = args
-
-    def initializer(self):
-        # Use Encoder class as a container for global data
-        Encoder.tokenizer = build_tokenizer(self.args)
-        if self.args.split_sentences:
-            if not nltk_available:
-                print("NLTK is not available to split sentences.")
-                exit()
-            splitter = nltk.load("tokenizers/punkt/english.pickle")
-            if self.args.keep_newlines:
-                # this prevents punkt from eating newlines after sentences
-                Encoder.splitter = nltk.tokenize.punkt.PunktSentenceTokenizer(
-                    train_text = splitter._params,
-                    lang_vars = CustomLanguageVars())
-            else:
-                Encoder.splitter = splitter
-
-        else:
-            Encoder.splitter = IdentitySplitter()
-
-    def encode(self, json_line):
-        data = json.loads(json_line)
-        ids = {}
-        for key in self.args.json_keys:
-            text = data[key]
-            doc_ids = []
-            sents=cut_sentence_with_quotation_marks(text)
-            for sentence in sents:
-                sentence_ids = Encoder.tokenizer.tokenize(sentence)
-                if len(sentence_ids) > 0:
-                    doc_ids.append(sentence_ids)
-            if len(doc_ids) > 0 and self.args.append_eod:
-                doc_ids[-1].append(Encoder.tokenizer.eod)
-            ids[key] = doc_ids
-        return ids, len(json_line)
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    group = parser.add_argument_group(title='input data')
-    group.add_argument('--input', type=str, required=True,
-                       help='Path to input JSON')
-    group.add_argument('--json-keys', nargs='+', default=['text'],
-                       help='space separate listed of keys to extract from json')
-    group.add_argument('--split-sentences', action='store_true',
-                       help='Split documents into sentences.')
-    group.add_argument('--keep-newlines', action='store_true',
-                       help='Keep newlines between sentences when splitting.')
-
-    group = parser.add_argument_group(title='tokenizer')
-    group.add_argument('--tokenizer-type', type=str, required=True,
-                       choices=['BertWordPieceLowerCase','BertWordPieceCase',
-                                'GPT2BPETokenizer'],
-                       help='What type of tokenizer to use.')
-    group.add_argument('--vocab-file', type=str, default=None,
-                       help='Path to the vocab file')
-    group.add_argument('--merge-file', type=str, default=None,
-                       help='Path to the BPE merge file (if necessary).')
-    group.add_argument('--append-eod', action='store_true',
-                       help='Append an <eod> token to the end of a document.')
-
-
-    group = parser.add_argument_group(title='output data')
-    group.add_argument('--output-prefix', type=str, required=True,
-                       help='Path to binary output file without suffix')
-    group.add_argument('--dataset-impl', type=str, default='mmap',
-                       choices=['lazy', 'cached', 'mmap'])
-
-    group = parser.add_argument_group(title='runtime')
-    group.add_argument('--workers', type=int, default=1,
-                       help='Number of worker processes to launch')
-    group.add_argument('--log-interval', type=int, default=100,
-                       help='Interval between progress updates')
-    args = parser.parse_args()
-    args.keep_empty = False
-
-    if args.tokenizer_type.lower().startswith('bert'):
-        if not args.split_sentences:
-            print("Bert tokenizer detected, are you sure you don't want to split sentences?")
-
-    # some default/dummy values for the tokenizer
-    args.rank = 0
-    args.make_vocab_size_divisible_by = 128
-    args.tensor_model_parallel_size = 1
-    args.vocab_extra_ids = 0
-
-    return args
-
-def main():
-    args = get_args()
-    startup_start = time.time()
-
-    print("Opening", args.input)
-    fin = open(args.input, 'r', encoding='utf-8')
-
-    if nltk_available and args.split_sentences:
-        nltk.download("punkt", quiet=True)
-
-    encoder = Encoder(args)
-    tokenizer = build_tokenizer(args)
-    pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer)
-    encoded_docs = pool.imap(encoder.encode, fin, 25)
-    #encoded_docs = map(encoder.encode, fin)
-
-    level = "document"
-    if args.split_sentences:
-        level = "sentence"
-
-    print(f"Vocab size: {tokenizer.vocab_size}")
-    print(f"Output prefix: {args.output_prefix}")
-    output_bin_files = {}
-    output_idx_files = {}
-    builders = {}
-    for key in args.json_keys:
-        output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix,
-                                                      key, level)
-        output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix,
-                                                      key, level)
-        builders[key] = indexed_dataset.make_builder(output_bin_files[key],
-                                               impl=args.dataset_impl,
-                                               vocab_size=tokenizer.vocab_size)
-
-    startup_end = time.time()
-    proc_start = time.time()
-    total_bytes_processed = 0
-    print("Time to startup:", startup_end - startup_start)
-
-    for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1):
-        total_bytes_processed += bytes_processed
-        for key, sentences in doc.items():
-            if len(sentences) == 0:
-                continue
-            for sentence in sentences:
-                builders[key].add_item(torch.IntTensor(sentence))
-            builders[key].end_document()
-        if i % args.log_interval == 0:
-            current = time.time()
-            elapsed = current - proc_start
-            mbs = total_bytes_processed/elapsed/1024/1024
-            print(f"Processed {i} documents",
-                  f"({i/elapsed} docs/s, {mbs} MB/s).",
-                  file=sys.stderr)
-
-    for key in args.json_keys:
-        builders[key].finalize(output_idx_files[key])
-
-if __name__ == '__main__':
-    main()

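For reference, the removed SOLUTION_preprocess_data.py above swaps Megatron-LM's stock sentence splitter for a quotation-aware one: cut_sentence_with_quotation_marks keeps each curly-quoted span (“…”) as a single unit and runs NLTK's sent_tokenize only on the text between quotes. As written, the removed function also drops any text after the final quoted span and returns an empty list for documents containing no curly quotes at all. The standalone sketch below mirrors that logic for illustration but additionally keeps the trailing text; the sample string and its shown output are illustrative assumptions, not part of the original script.

# Standalone sketch of the quotation-aware splitting used in the removed
# SOLUTION_preprocess_data.py (with the trailing text kept, unlike the original).
import re
import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt", quiet=True)  # same model the original script downloads

def cut_sentence_with_quotation_marks(text):
    """Split text into sentences, keeping each “…” quotation as one unit."""
    pattern = re.compile("“.*?”")
    sentences, index = [], 0
    for match in pattern.finditer(text):
        before = text[index:match.start()].strip()
        if before:
            sentences += sent_tokenize(before)   # plain text before the quote
        sentences.append(match.group())          # the quoted span, left unsplit
        index = match.end()
    tail = text[index:].strip()                  # the original drops this tail
    if tail:
        sentences += sent_tokenize(tail)
    return sentences

sample = 'He said “Stay calm. Do not run.” Then he left. It was quiet.'
print(cut_sentence_with_quotation_marks(sample))
# -> ['He said', '“Stay calm. Do not run.”', 'Then he left.', 'It was quiet.']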
File diff suppressed because it is too large
+ 226 - 167
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/Day3-2_SentenceBoundary_and_Deduplicate.ipynb


BIN
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/LSH_modification_setup.JPG


File diff suppressed because it is too large
+ 74 - 0
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/df2.csv


File diff suppressed because it is too large
+ 63 - 63
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/groundtruth.txt