
add notebook on scraping NVblog raw text, update Start_Here.ipynb

zenodia committed 2 years ago · commit f0fecdc8be

+ 9 - 0
ai/Megatron/English/Python/Start_Here.ipynb

@@ -238,6 +238,15 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "---\n",
+    "# create your own data - web crawling \n",
+    "please go through the notebook [link here](./jupyter_notebook/Megatron-LM/tools/openwebtext/Day3-Website_scrapping.ipynb) to scrape NVIDIA blog's data "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "### Tutorial Outline\n",
     "\n",
     "The following contents will be covered during the Bootcamp :\n",

File diff suppressed because it is too large
+ 535 - 0
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/Day3-Website_scrapping.ipynb
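The notebook diff above is suppressed, so as a rough orientation only: below is a minimal Python sketch of the kind of scraping Day3-Website_scrapping.ipynb performs, i.e. fetching an NVIDIA blog post and keeping its paragraph text. The URL, the length filter, and the output filename are illustrative assumptions, not taken from the notebook itself.

    import requests
    from bs4 import BeautifulSoup

    # Hypothetical example post; the notebook decides which NVIDIA blog pages to crawl.
    url = "https://blogs.nvidia.com/blog/example-post/"
    html = requests.get(url, timeout=30).text
    soup = BeautifulSoup(html, "html.parser")

    # Keep paragraph text and drop very short, boilerplate-sized fragments.
    paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]
    raw_text = "\n".join(p for p in paragraphs if len(p) > 40)

    # Append to a raw-text file; the filename mirrors the one used by the download script below.
    with open("extractedNVblogs.txt", "a", encoding="utf-8") as f:
        f.write(raw_text + "\n")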


+ 22 - 0
ai/Megatron/English/Python/source_code/download_NVblogtext.sh

@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Copyright (c) 2020 NVIDIA Corporation.  All rights reserved.
+# gdrive_download
+# script to download Google Drive files from command line
+# not guaranteed to work indefinitely
+# taken from Stack Overflow answer:
+# http://stackoverflow.com/a/38937732/7002068
+
+gURL=https://drive.google.com/file/d/1PpyVSvRGoja6M2dbVpiscxrZD3YupRgT/view?usp=sharing
+# extract the Google Drive file ID: a run of 26 or more word characters or hyphens
+ggID=$(echo "$gURL" | egrep -o '(\w|-){26,}')
+
+ggURL='https://drive.google.com/uc?export=download'
+
+curl -sc /tmp/gcokie "${ggURL}&id=${ggID}" >/dev/null
+getcode="$(awk '/_warning_/ {print $NF}' /tmp/gcokie)"
+
+cmd='curl --insecure -C - -LOJb /tmp/gcokie "${ggURL}&confirm=${getcode}&id=${ggID}"'
+echo -e "Downloading from "$gURL"...\n"
+eval $cmd
+mv extractedNVblogs.txt ../dataset/EN/
+
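For readers who prefer Python over curl, here is a sketch of the same cookie/confirm-token flow the script implements, using the requests library. The file ID and output location are copied from the script above; everything else is an assumption, not the bootcamp's official tooling.

    import shutil
    import requests

    FILE_ID = "1PpyVSvRGoja6M2dbVpiscxrZD3YupRgT"      # same ID embedded in gURL above
    BASE_URL = "https://drive.google.com/uc?export=download"

    session = requests.Session()

    # First request: for large files Google Drive replies with a warning page and sets a
    # download_warning cookie whose value is the confirm token (the script greps it with awk).
    response = session.get(BASE_URL, params={"id": FILE_ID}, stream=True)
    token = next((v for k, v in response.cookies.items() if k.startswith("download_warning")), None)

    # Second request: repeat the download, passing the confirm token if one was issued.
    params = {"id": FILE_ID}
    if token:
        params["confirm"] = token
    response = session.get(BASE_URL, params=params, stream=True)
    response.raise_for_status()

    # Stream to disk, then move the file where the bootcamp dataset layout expects it.
    with open("extractedNVblogs.txt", "wb") as handle:
        for chunk in response.iter_content(chunk_size=1 << 20):
            handle.write(chunk)
    shutil.move("extractedNVblogs.txt", "../dataset/EN/extractedNVblogs.txt")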