4 лет назад · f053575d86
--- a/ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/Day3-1_acquiring_data.ipynb
+++ b/ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/Day3-1_acquiring_data.ipynb
@@ -2,7 +2,7 @@
 
				  "cells": [
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "atlantic-kingston",
			
 
				+   "id": "vital-advertising",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				     "# \n",
			
@@ -24,7 +24,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "aerial-mills",
			
 
				+   "id": "permanent-reception",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				     "--------------------------------------------------------------------------------------------------------------------\n",
			
@@ -34,7 +34,7 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": 2,
			
 
				-   "id": "organic-owner",
			
 
				+   "id": "steady-henry",
			
 
				    "metadata": {},
			
 
				    "outputs": [
			
 
				     {
			
@@ -66,7 +66,7 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": 13,
			
 
				-   "id": "apparent-newark",
			
 
				+   "id": "exposed-mouth",
			
 
				    "metadata": {},
			
 
				    "outputs": [
			
 
				     {
			
@@ -84,7 +84,7 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": 15,
			
 
				-   "id": "brave-plasma",
			
 
				+   "id": "turned-navigator",
			
 
				    "metadata": {},
			
 
				    "outputs": [],
			
 
				    "source": [
			
@@ -94,7 +94,7 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": 17,
			
 
				-   "id": "forward-telling",
			
 
				+   "id": "level-discipline",
			
 
				    "metadata": {},
			
 
				    "outputs": [
			
 
				     {
			
@@ -112,7 +112,7 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": 18,
			
 
				-   "id": "approximate-surname",
			
 
				+   "id": "separated-payday",
			
 
				    "metadata": {},
			
 
				    "outputs": [
			
 
				     {
			
@@ -140,7 +140,7 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": 20,
			
 
				-   "id": "occupational-globe",
			
 
				+   "id": "guilty-comparative",
			
 
				    "metadata": {},
			
 
				    "outputs": [
			
 
				     {
			
@@ -172,7 +172,7 @@
 
				     "    print(\"finish processing \",fname)\n",
			
 
				     "    f.close()\n",
			
 
				     "    \n",
			
 
				-    "out_path='../../../../dataset/SV/'\n",
			
 
				+    "out_path='./dataset/SV/'\n",
			
 
				     "xml_f=out_path+'webbnyheter2013.xml'\n",
			
 
				     "if xml_f.endswith('.xml') :    \n",
			
 
				     "    corpus = SBCorpusReader(xml_f)\n",
			
@@ -190,7 +190,7 @@
 
				   {
			
 
				    "cell_type": "code",
			
 
				    "execution_count": 21,
			
 
				-   "id": "banned-series",
			
 
				+   "id": "rubber-finnish",
			
 
				    "metadata": {},
			
 
				    "outputs": [
			
 
				     {
			
@@ -207,7 +207,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "senior-solomon",
			
 
				+   "id": "round-somewhere",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				     "---\n",
			
@@ -221,7 +221,7 @@
 
				   },
			
 
				   {
			
 
				    "cell_type": "markdown",
			
 
				-   "id": "nutritional-hammer",
			
 
				+   "id": "celtic-appreciation",
			
 
				    "metadata": {},
			
 
				    "source": [
			
 
				     "-----\n",
			
--- a/ai/Megatron/English/Python/source_code/__pycache__/sb_corpus_reader.cpython-38.pyc
+++ b/ai/Megatron/English/Python/source_code/__pycache__/sb_corpus_reader.cpython-38.pyc
--- a/ai/Megatron/English/Python/source_code/download_webnyheter2013.sh
+++ b/ai/Megatron/English/Python/source_code/download_webnyheter2013.sh
@@ -0,0 +1,12 @@
 
				+# Copyright (c) 2020 NVIDIA Corporation.  All rights reserved.
			
 
				# Download the webbnyheter2013 corpus and the SBCorpusReader helper, convert
				# the corpus to plain text with get_nyheterdata.py, and store the result in
				# dataset/SV/.
				#
				# All paths are relative: run this from the directory that contains
				# ./source_code/ (e.g. `bash ./source_code/download_webnyheter2013.sh`).

				# Abort on the first failing command (same effect as the former `&&` chain).
				set -e

				# Fetch and decompress the corpus, then place it next to the converter.
				wget http://spraakbanken.gu.se/lb/resurser/meningsmangder/webbnyheter2013.xml.bz2
				bunzip2 -d webbnyheter2013.xml.bz2
				mv webbnyheter2013.xml ./source_code/

				# Fetch the corpus reader module imported by get_nyheterdata.py.
				wget https://raw.githubusercontent.com/spraakbanken/sb-nltk-tools/master/sb_corpus_reader.py
				mv sb_corpus_reader.py ./source_code/

				# Convert the XML corpus to one sentence per line (writes webnyheter2013.txt).
				cd ./source_code/
				python get_nyheterdata.py

				# List the directory so the generated file is visible in the log.
				ls

				# Make sure the destination exists before moving the result there.
				mkdir -p ../dataset/SV/
				mv webnyheter2013.txt ../dataset/SV/

				# The decompressed XML is no longer needed once the text file exists.
				rm -f webbnyheter2013.xml
			
 
				+
			
--- a/ai/Megatron/English/Python/source_code/get_nyheterdata.py
+++ b/ai/Megatron/English/Python/source_code/get_nyheterdata.py
@@ -0,0 +1,30 @@
 
				+# Copyright (c) 2020 NVIDIA Corporation.  All rights reserved.
			
 
				+import json
			
 
				+import os, sys
			
 
				+import numpy as np
			
 
				+import nltk
			
 
				+from sb_corpus_reader import SBCorpusReader
			
 
				+import random
			
 
				+
			
 
				def write2csv(out_path, fname, sents):
				    """Append sentences to a text file, one space-joined sentence per line.

				    Despite the name, the output is plain text rather than CSV.

				    Parameters
				    ----------
				    out_path : str
				        Prefix concatenated verbatim with ``fname``; it must already end
				        with a path separator if it names a directory.
				    fname : str
				        Name of the output file. The file is opened in append mode, so
				        repeated calls add to the existing content.
				    sents : iterable of sequence of str
				        Sentences given as token sequences. Sentences with fewer than two
				        tokens are skipped.

				    Returns
				    -------
				    None
				    """
				    # `with` guarantees the handle is closed even if a write fails; UTF-8 is
				    # pinned so non-ASCII characters do not depend on the platform locale.
				    with open(out_path + fname, 'a', encoding='utf-8') as f:
				        f.writelines(' '.join(s) + '\n' for s in sents if len(s) >= 2)
				    print("finish processing ", fname)
			
 
				+    
			
 
				# Script entry: read the corpus XML from the current directory and append
				# its sentences to a text file in the same directory.
				out_path = './'
				xml_f = ''.join((out_path, 'webbnyheter2013.xml'))
				if xml_f.endswith('.xml'):
				    fname = 'webnyheter2013.txt'
				    corpus = SBCorpusReader(xml_f)
				    sents = corpus.sents()
				    # Print the first two entries before writing.
				    preview = sents[:2]
				    print(preview)
				    print("write to : ", fname)
				    write2csv(out_path, fname, sents)
				    print('-----' * 10)
			
--- a/ai/Megatron/README.md
+++ b/ai/Megatron/README.md
@@ -15,6 +15,15 @@ To run this tutorial you will need a machine with at least 2 x NVIDIA GPUs.
 
				 
			
 
				 - The base containers required for the lab may require users to create a NGC account and generate an API key (https://docs.nvidia.com/ngc/ngc-catalog-user-guide/index.html#registering-activating-ngc-account)
			
 
				 
			
 
				+- You will also need to run the commands below to obtain the toy data:
			
 
				+git clone https://github.com/gpuhackathons-org/gpubootcamp.git
			
 
				+cd gpubootcamp
			
 
				+git checkout megatron
			
 
				+cd ./ai/Megatron/English/Python/ 
			
 
				+mkdir ./dataset/SV/
			
 
				+mkdir ./dataset/EN/
			
 
				+bash ./source_code/download_webnyheter2013.sh 
			
 
				+
			
 
				 #Tutorial Duration
			
 
				 The total bootcamp material would take approximately 12 hours ( including solving mini-challenge ).