
separate solution vs challenge

zenodia · 3 years ago
commit 585e5c91bc

+ 25 - 25
ai/Megatron/English/Python/jupyter_notebook/Day2-5_Observe_GPT_runs_vs_performance.ipynb

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "governing-exclusion",
+   "id": "jewish-polish",
    "metadata": {},
    "source": [
     "# \n",
@@ -30,7 +30,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "supported-romantic",
+   "id": "collected-going",
    "metadata": {},
    "source": [
     "----------------------------------------------------------\n",
@@ -42,7 +42,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "friendly-lightning",
+   "id": "rational-wings",
    "metadata": {},
    "source": [
     "----------------------------------------------------------\n",
@@ -54,7 +54,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "permanent-mainstream",
+   "id": "organizational-quality",
    "metadata": {},
    "source": [
     "----------------------------------------------------------\n",
@@ -76,7 +76,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "meaning-rubber",
+   "id": "healthy-addition",
    "metadata": {},
    "source": [
     "---\n",
@@ -86,7 +86,7 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "id": "ecological-layer",
+   "id": "induced-brazil",
    "metadata": {},
    "outputs": [
     {
@@ -104,7 +104,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "precise-genius",
+   "id": "planned-composer",
    "metadata": {},
    "source": [
     "---\n",
@@ -129,7 +129,7 @@
   {
    "cell_type": "code",
    "execution_count": 9,
-   "id": "educated-story",
+   "id": "forty-peeing",
    "metadata": {},
    "outputs": [
     {
@@ -396,7 +396,7 @@
   {
    "cell_type": "code",
    "execution_count": 11,
-   "id": "numeric-relevance",
+   "id": "perceived-spray",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -405,7 +405,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "dominican-limit",
+   "id": "rural-bermuda",
    "metadata": {},
    "source": [
     "---\n",
@@ -416,7 +416,7 @@
   {
    "cell_type": "code",
    "execution_count": 10,
-   "id": "preliminary-feeding",
+   "id": "built-scout",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -425,7 +425,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "vanilla-dakota",
+   "id": "transsexual-investor",
    "metadata": {},
    "source": [
     "----------------------------------------------------------\n",
@@ -458,7 +458,7 @@
   {
    "cell_type": "code",
    "execution_count": 4,
-   "id": "responsible-senegal",
+   "id": "handed-gospel",
    "metadata": {},
    "outputs": [
     {
@@ -752,7 +752,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "global-network",
+   "id": "regulation-diary",
    "metadata": {},
    "source": [
     "--------------------------------------------------\n",
@@ -764,7 +764,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "handled-great",
+   "id": "flying-firmware",
    "metadata": {},
    "source": [
     "---\n",
@@ -791,8 +791,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
-   "id": "academic-aquatic",
+   "execution_count": 1,
+   "id": "positive-colleague",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -802,7 +802,7 @@
   {
    "cell_type": "code",
    "execution_count": 6,
-   "id": "confidential-steam",
+   "id": "theoretical-valley",
    "metadata": {},
    "outputs": [
     {
@@ -1093,7 +1093,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "earlier-priest",
+   "id": "headed-journal",
    "metadata": {},
    "source": [
     "--------------------------------------------------\n",
@@ -1104,7 +1104,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "optimum-resource",
+   "id": "developmental-japan",
    "metadata": {},
    "source": [
     "<a id=\"TheChallenge\"></a>"
@@ -1112,7 +1112,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "connected-lincoln",
+   "id": "stretch-block",
    "metadata": {},
    "source": [
     "----------------\n",
@@ -1122,11 +1122,11 @@
     "        - use your current given # of gpus \n",
     "        - do NOT changing the following parameters --train-samples 100 \n",
     "        - you cannot go OOM \n",
-    "        - you must sustain >80% GPUs utilization in the **training** phase \n",
+    "        - you must sustain >60% GPUs utilization in the **training** phase \n",
     "        - training run must be finished and checkpoint must be saved successfully\n",
     "    - task : \n",
     "            given the above constraints, get as good training GPUs utilizations as possible\n",
-    "    - Pass : sustain 80% gpus utils ( across all gpus) in the **training** phase !\n",
+    "    - Pass : sustain 60% gpus utils ( across all gpus) in the **training** phase !\n",
     " \n",
     "\n",
     "\n",
@@ -1151,7 +1151,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "fatal-crawford",
+   "id": "manual-birthday",
    "metadata": {},
    "source": [
     "---\n",
@@ -1161,7 +1161,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "agreed-gasoline",
+   "id": "sorted-translator",
    "metadata": {},
    "source": [
     "-----\n",

+ 15 - 15
ai/Megatron/English/Python/jupyter_notebook/Day3-4_customize_process2mmap.ipynb

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "smart-angola",
+   "id": "expressed-hello",
    "metadata": {},
    "source": [
     "# \n",
@@ -56,7 +56,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "velvet-abuse",
+   "id": "regulated-minnesota",
    "metadata": {},
    "source": [
     "---\n",
@@ -66,7 +66,7 @@
   {
    "cell_type": "code",
    "execution_count": 3,
-   "id": "undefined-target",
+   "id": "integral-choice",
    "metadata": {},
    "outputs": [
     {
@@ -86,7 +86,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "asian-twenty",
+   "id": "interim-works",
    "metadata": {},
    "source": [
     "---\n",
@@ -96,7 +96,7 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "id": "capital-military",
+   "id": "affecting-chuck",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -106,7 +106,7 @@
   {
    "cell_type": "code",
    "execution_count": 2,
-   "id": "greek-night",
+   "id": "patient-certificate",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -141,7 +141,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "generous-seventh",
+   "id": "pressing-dancing",
    "metadata": {},
    "source": [
     "----------------\n",
@@ -156,7 +156,7 @@
   {
    "cell_type": "code",
    "execution_count": 2,
-   "id": "impaired-reasoning",
+   "id": "terminal-shell",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -370,7 +370,7 @@
   {
    "cell_type": "code",
    "execution_count": 11,
-   "id": "sublime-soccer",
+   "id": "regular-fifth",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -383,7 +383,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "approved-perth",
+   "id": "assisted-ceiling",
    "metadata": {},
    "source": [
     "---\n",
@@ -395,7 +395,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "loved-making",
+   "id": "fallen-theater",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -414,7 +414,7 @@
   {
    "cell_type": "code",
    "execution_count": 12,
-   "id": "horizontal-barrel",
+   "id": "familiar-collar",
    "metadata": {
     "collapsed": true,
     "jupyter": {
@@ -12974,7 +12974,7 @@
   {
    "cell_type": "code",
    "execution_count": 13,
-   "id": "collaborative-reason",
+   "id": "enclosed-excerpt",
    "metadata": {
     "collapsed": true,
     "jupyter": {
@@ -13001,7 +13001,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "naked-puzzle",
+   "id": "dried-assignment",
    "metadata": {},
    "source": [
     "---\n",
@@ -13015,7 +13015,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "practical-research",
+   "id": "cardiovascular-trailer",
    "metadata": {},
    "source": [
     "-----\n",

BIN
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/pics/modifyLSH_setuppy.JPG


+ 0 - 233
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/SOLUTION_preprocess_data.py

@@ -1,233 +0,0 @@
-# coding=utf-8
-# Copyright (c) 2020, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""Processing data for pretraining."""
-
-import argparse
-import json
-import multiprocessing
-import os
-import sys
-sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__),
-                                             os.path.pardir)))
-import time
-import re
-import torch
-
-try:
-    import nltk
-    nltk_available = True
-
-    import nltk
-    from nltk.tokenize import sent_tokenize
-except ImportError:
-    nltk_available = False
-
-from megatron.tokenizer import build_tokenizer
-from megatron.data import indexed_dataset
-
-
-# https://stackoverflow.com/questions/33139531/preserve-empty-lines-with-nltks-punkt-tokenizer
-class CustomLanguageVars(nltk.tokenize.punkt.PunktLanguageVars):
-
-    _period_context_fmt = r"""
-        \S*                          # some word material
-        %(SentEndChars)s             # a potential sentence ending
-        \s*                       #  <-- THIS is what I changed
-        (?=(?P<after_tok>
-            %(NonWord)s              # either other punctuation
-            |
-            (?P<next_tok>\S+)     #  <-- Normally you would have \s+ here
-        ))"""
-
-class IdentitySplitter(object):
-    def tokenize(self, *text):
-        return text
-
-def normal_cut_sentence(temp):
-    return sent_tokenize(temp)
-
-def cut_sentence_with_quotation_marks(text):
-    p = re.compile("“.*?”")
-    list = []
-    index = 0
-    length = len(text)
-    for i in p.finditer(text):
-        temp = ''
-        start = i.start()
-        end = i.end()
-        for j in range(index, start):
-            temp += text[j]
-        if temp != '':
-            temp_list = normal_cut_sentence(temp)
-            list += temp_list
-        temp = ''
-        for k in range(start, end):
-            temp += text[k]
-        if temp != ' ':
-            list.append(temp)
-        index = end
-    return list
-
-class Encoder(object):
-    def __init__(self, args):
-        self.args = args
-
-    def initializer(self):
-        # Use Encoder class as a container for global data
-        Encoder.tokenizer = build_tokenizer(self.args)
-        if self.args.split_sentences:
-            if not nltk_available:
-                print("NLTK is not available to split sentences.")
-                exit()
-            splitter = nltk.load("tokenizers/punkt/english.pickle")
-            if self.args.keep_newlines:
-                # this prevents punkt from eating newlines after sentences
-                Encoder.splitter = nltk.tokenize.punkt.PunktSentenceTokenizer(
-                    train_text = splitter._params,
-                    lang_vars = CustomLanguageVars())
-            else:
-                Encoder.splitter = splitter
-
-        else:
-            Encoder.splitter = IdentitySplitter()
-
-    def encode(self, json_line):
-        data = json.loads(json_line)
-        ids = {}
-        for key in self.args.json_keys:
-            text = data[key]
-            doc_ids = []
-            sents=cut_sentence_with_quotation_marks(text)
-            for sentence in sents:
-                sentence_ids = Encoder.tokenizer.tokenize(sentence)
-                if len(sentence_ids) > 0:
-                    doc_ids.append(sentence_ids)
-            if len(doc_ids) > 0 and self.args.append_eod:
-                doc_ids[-1].append(Encoder.tokenizer.eod)
-            ids[key] = doc_ids
-        return ids, len(json_line)
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    group = parser.add_argument_group(title='input data')
-    group.add_argument('--input', type=str, required=True,
-                       help='Path to input JSON')
-    group.add_argument('--json-keys', nargs='+', default=['text'],
-                       help='space separate listed of keys to extract from json')
-    group.add_argument('--split-sentences', action='store_true',
-                       help='Split documents into sentences.')
-    group.add_argument('--keep-newlines', action='store_true',
-                       help='Keep newlines between sentences when splitting.')
-
-    group = parser.add_argument_group(title='tokenizer')
-    group.add_argument('--tokenizer-type', type=str, required=True,
-                       choices=['BertWordPieceLowerCase','BertWordPieceCase',
-                                'GPT2BPETokenizer'],
-                       help='What type of tokenizer to use.')
-    group.add_argument('--vocab-file', type=str, default=None,
-                       help='Path to the vocab file')
-    group.add_argument('--merge-file', type=str, default=None,
-                       help='Path to the BPE merge file (if necessary).')
-    group.add_argument('--append-eod', action='store_true',
-                       help='Append an <eod> token to the end of a document.')
-
-
-    group = parser.add_argument_group(title='output data')
-    group.add_argument('--output-prefix', type=str, required=True,
-                       help='Path to binary output file without suffix')
-    group.add_argument('--dataset-impl', type=str, default='mmap',
-                       choices=['lazy', 'cached', 'mmap'])
-
-    group = parser.add_argument_group(title='runtime')
-    group.add_argument('--workers', type=int, default=1,
-                       help='Number of worker processes to launch')
-    group.add_argument('--log-interval', type=int, default=100,
-                       help='Interval between progress updates')
-    args = parser.parse_args()
-    args.keep_empty = False
-
-    if args.tokenizer_type.lower().startswith('bert'):
-        if not args.split_sentences:
-            print("Bert tokenizer detected, are you sure you don't want to split sentences?")
-
-    # some default/dummy values for the tokenizer
-    args.rank = 0
-    args.make_vocab_size_divisible_by = 128
-    args.tensor_model_parallel_size = 1
-    args.vocab_extra_ids = 0
-
-    return args
-
-def main():
-    args = get_args()
-    startup_start = time.time()
-
-    print("Opening", args.input)
-    fin = open(args.input, 'r', encoding='utf-8')
-
-    if nltk_available and args.split_sentences:
-        nltk.download("punkt", quiet=True)
-
-    encoder = Encoder(args)
-    tokenizer = build_tokenizer(args)
-    pool = multiprocessing.Pool(args.workers, initializer=encoder.initializer)
-    encoded_docs = pool.imap(encoder.encode, fin, 25)
-    #encoded_docs = map(encoder.encode, fin)
-
-    level = "document"
-    if args.split_sentences:
-        level = "sentence"
-
-    print(f"Vocab size: {tokenizer.vocab_size}")
-    print(f"Output prefix: {args.output_prefix}")
-    output_bin_files = {}
-    output_idx_files = {}
-    builders = {}
-    for key in args.json_keys:
-        output_bin_files[key] = "{}_{}_{}.bin".format(args.output_prefix,
-                                                      key, level)
-        output_idx_files[key] = "{}_{}_{}.idx".format(args.output_prefix,
-                                                      key, level)
-        builders[key] = indexed_dataset.make_builder(output_bin_files[key],
-                                               impl=args.dataset_impl,
-                                               vocab_size=tokenizer.vocab_size)
-
-    startup_end = time.time()
-    proc_start = time.time()
-    total_bytes_processed = 0
-    print("Time to startup:", startup_end - startup_start)
-
-    for i, (doc, bytes_processed) in enumerate(encoded_docs, start=1):
-        total_bytes_processed += bytes_processed
-        for key, sentences in doc.items():
-            if len(sentences) == 0:
-                continue
-            for sentence in sentences:
-                builders[key].add_item(torch.IntTensor(sentence))
-            builders[key].end_document()
-        if i % args.log_interval == 0:
-            current = time.time()
-            elapsed = current - proc_start
-            mbs = total_bytes_processed/elapsed/1024/1024
-            print(f"Processed {i} documents",
-                  f"({i/elapsed} docs/s, {mbs} MB/s).",
-                  file=sys.stderr)
-
-    for key in args.json_keys:
-        builders[key].finalize(output_idx_files[key])
-
-if __name__ == '__main__':
-    main()

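For reference, the removed SOLUTION_preprocess_data.py above swaps Megatron-LM's stock sentence splitter for a quotation-aware one: cut_sentence_with_quotation_marks keeps each curly-quoted span (“…”) as a single unit and runs NLTK's sent_tokenize only on the text between quotes. As written, the removed function also drops any text after the final quoted span and returns an empty list for documents containing no curly quotes at all. The standalone sketch below mirrors that logic for illustration but additionally keeps the trailing text; the sample string and its shown output are illustrative assumptions, not part of the original script.

# Standalone sketch of the quotation-aware splitting used in the removed
# SOLUTION_preprocess_data.py (with the trailing text kept, unlike the original).
import re
import nltk
from nltk.tokenize import sent_tokenize

nltk.download("punkt", quiet=True)  # same model the original script downloads

def cut_sentence_with_quotation_marks(text):
    """Split text into sentences, keeping each “…” quotation as one unit."""
    pattern = re.compile("“.*?”")
    sentences, index = [], 0
    for match in pattern.finditer(text):
        before = text[index:match.start()].strip()
        if before:
            sentences += sent_tokenize(before)   # plain text before the quote
        sentences.append(match.group())          # the quoted span, left unsplit
        index = match.end()
    tail = text[index:].strip()                  # the original drops this tail
    if tail:
        sentences += sent_tokenize(tail)
    return sentences

sample = 'He said “Stay calm. Do not run.” Then he left. It was quiet.'
print(cut_sentence_with_quotation_marks(sample))
# -> ['He said', '“Stay calm. Do not run.”', 'Then he left.', 'It was quiet.']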
File diff suppressed because it is too large
+ 226 - 167
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/Day3-2_SentenceBoundary_and_Deduplicate.ipynb


BIN
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/LSH_modification_setup.JPG


File diff suppressed because it is too large
+ 74 - 0
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/df2.csv


File diff suppressed because it is too large
+ 63 - 63
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/groundtruth.txt