Просмотр исходного кода

add source_code bash script to download toy data

zenodia 3 лет назад
Родитель
Сommit
f053575d86

+ 12 - 12
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/Day3-1_acquiring_data.ipynb

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "atlantic-kingston",
+   "id": "vital-advertising",
    "metadata": {},
    "source": [
     "# \n",
@@ -24,7 +24,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "aerial-mills",
+   "id": "permanent-reception",
    "metadata": {},
    "source": [
     "--------------------------------------------------------------------------------------------------------------------\n",
@@ -34,7 +34,7 @@
   {
    "cell_type": "code",
    "execution_count": 2,
-   "id": "organic-owner",
+   "id": "steady-henry",
    "metadata": {},
    "outputs": [
     {
@@ -66,7 +66,7 @@
   {
    "cell_type": "code",
    "execution_count": 13,
-   "id": "apparent-newark",
+   "id": "exposed-mouth",
    "metadata": {},
    "outputs": [
     {
@@ -84,7 +84,7 @@
   {
    "cell_type": "code",
    "execution_count": 15,
-   "id": "brave-plasma",
+   "id": "turned-navigator",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -94,7 +94,7 @@
   {
    "cell_type": "code",
    "execution_count": 17,
-   "id": "forward-telling",
+   "id": "level-discipline",
    "metadata": {},
    "outputs": [
     {
@@ -112,7 +112,7 @@
   {
    "cell_type": "code",
    "execution_count": 18,
-   "id": "approximate-surname",
+   "id": "separated-payday",
    "metadata": {},
    "outputs": [
     {
@@ -140,7 +140,7 @@
   {
    "cell_type": "code",
    "execution_count": 20,
-   "id": "occupational-globe",
+   "id": "guilty-comparative",
    "metadata": {},
    "outputs": [
     {
@@ -172,7 +172,7 @@
     "    print(\"finish processing \",fname)\n",
     "    f.close()\n",
     "    \n",
-    "out_path='../../../../dataset/SV/'\n",
+    "out_path='./dataset/SV/'\n",
     "xml_f=out_path+'webbnyheter2013.xml'\n",
     "if xml_f.endswith('.xml') :    \n",
     "    corpus = SBCorpusReader(xml_f)\n",
@@ -190,7 +190,7 @@
   {
    "cell_type": "code",
    "execution_count": 21,
-   "id": "banned-series",
+   "id": "rubber-finnish",
    "metadata": {},
    "outputs": [
     {
@@ -207,7 +207,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "senior-solomon",
+   "id": "round-somewhere",
    "metadata": {},
    "source": [
     "---\n",
@@ -221,7 +221,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "nutritional-hammer",
+   "id": "celtic-appreciation",
    "metadata": {},
    "source": [
     "-----\n",

BIN
ai/Megatron/English/Python/source_code/__pycache__/sb_corpus_reader.cpython-38.pyc


+ 12 - 0
ai/Megatron/English/Python/source_code/download_webnyheter2013.sh

@@ -0,0 +1,12 @@
+# Copyright (c) 2020 NVIDIA Corporation.  All rights reserved.
+wget http://spraakbanken.gu.se/lb/resurser/meningsmangder/webbnyheter2013.xml.bz2 &&  # fetch the compressed corpus
+bunzip2 -d webbnyheter2013.xml.bz2 &&
+mv webbnyheter2013.xml ./source_code/ &&
+wget https://raw.githubusercontent.com/spraakbanken/sb-nltk-tools/master/sb_corpus_reader.py &&  # reader module imported by get_nyheterdata.py
+mv sb_corpus_reader.py ./source_code/ &&
+cd ./source_code/ &&
+python get_nyheterdata.py &&  # writes webnyheter2013.txt into the current directory
+ls &&  # list the directory so the generated file shows up in the output
+mkdir -p ../dataset/SV/ && mv webnyheter2013.txt ../dataset/SV/ &&  # create the destination if it is missing
+rm -fr webbnyheter2013.xml
+

+ 30 - 0
ai/Megatron/English/Python/source_code/get_nyheterdata.py

@@ -0,0 +1,30 @@
+# Copyright (c) 2020 NVIDIA Corporation.  All rights reserved.
+import json
+import os, sys
+import numpy as np
+import nltk
+from sb_corpus_reader import SBCorpusReader
+import random
+
+def write2csv(out_path, fname, sents):
+    f=open(out_path+fname,'a')
+    for s in sents:
+        if len(s)>=2:
+            s_text=' '.join(s)
+            f.write(s_text+'\n')
+    print("finish processing ",fname)
+    f.close()
+    
+out_path='./'
+xml_f=out_path+'webbnyheter2013.xml'
+if xml_f.endswith('.xml') :    
+    corpus = SBCorpusReader(xml_f)
+    sents=corpus.sents()
+    print(sents[:2])
+    #n=len(sents)
+    #rn=random.randint(0,n-1)
+    #print("a random sample of sentence : \n".format(' '.join(sents[rn])))
+    fname='webnyheter2013.txt'  
+    print("write to : ",fname)
+    write2csv(out_path,fname,sents)
+    print('-----'*10)

+ 9 - 0
ai/Megatron/README.md

@@ -15,6 +15,15 @@ To run this tutorial you will need a machine with at least 2 x NVIDIA GPUs.
 
 - The base containers required for the lab may require users to create a NGC account and generate an API key (https://docs.nvidia.com/ngc/ngc-catalog-user-guide/index.html#registering-activating-ngc-account)
 
+- You will also need to run the commands below to obtain the toy data:
+git clone https://github.com/gpuhackathons-org/gpubootcamp.git
+cd gpubootcamp
+git checkout megatron
+cd ./ai/Megatron/English/Python/ 
+mkdir -p ./dataset/SV/
+mkdir -p ./dataset/EN/
+bash ./source_code/download_webnyheter2013.sh 
+
 #Tutorial Duration
 The total bootcamp material would take approximately 12 hours ( including solving mini-challenge ).