
add notebook on scraping NVblog raw text, update Start_Here.ipynb

zenodia committed 3 years ago
commit f0fecdc8be

+ 9 - 0
ai/Megatron/English/Python/Start_Here.ipynb

@@ -238,6 +238,15 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "---\n",
+    "# create your own data - web crawling \n",
+    "please go through the notebook [link here](./jupyter_notebook/Megatron-LM/tools/openwebtext/Day3-Website_scrapping.ipynb) to scrape NVIDIA blog's data "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
     "### Tutorial Outline\n",
     "\n",
     "The following contents will be covered during the Bootcamp :\n",

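The new markdown cell points readers to a web-scraping notebook whose 535-line diff is suppressed below. As a rough illustration of that kind of step (not the notebook's actual code), here is a minimal Python sketch that pulls paragraph text from an NVIDIA blog page; the `requests`/`beautifulsoup4` dependencies, the URL, and the output filename are assumptions.

```python
# Rough sketch only: fetch one NVIDIA blog page and keep its paragraph text.
# Assumes the requests and beautifulsoup4 packages; the URL and output path
# are illustrative, not taken from the commit.
import requests
from bs4 import BeautifulSoup

url = "https://developer.nvidia.com/blog/"  # placeholder entry point
resp = requests.get(url, timeout=30)
resp.raise_for_status()

soup = BeautifulSoup(resp.text, "html.parser")
paragraphs = [p.get_text(strip=True) for p in soup.find_all("p")]

# Write the collected raw text to one file, mirroring the extractedNVblogs.txt
# name that the download script below moves into ../dataset/EN/.
with open("extractedNVblogs.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(p for p in paragraphs if p))
```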
File diff suppressed because it is too large
+ 535 - 0
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/Day3-Website_scrapping.ipynb


+ 22 - 0
ai/Megatron/English/Python/source_code/download_NVblogtext.sh

@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+# Copyright (c) 2020 NVIDIA Corporation.  All rights reserved.
+# gdrive_download
+# script to download Google Drive files from command line
+# not guaranteed to work indefinitely
+# taken from Stack Overflow answer:
+# http://stackoverflow.com/a/38937732/7002068
+
+gURL=https://drive.google.com/file/d/1PpyVSvRGoja6M2dbVpiscxrZD3YupRgT/view?usp=sharing
+# extract the Google Drive file ID: the first run of 26 or more word characters/hyphens in the URL
+ggID=$(echo "$gURL" | egrep -o '(\w|-){26,}')
+
+ggURL='https://drive.google.com/uc?export=download'
+
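+# the first request only saves Google's cookies; the download-confirm token is the value of the "_warning_" cookie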
+curl -sc /tmp/gcokie "${ggURL}&id=${ggID}" >/dev/null
+getcode="$(awk '/_warning_/ {print $NF}' /tmp/gcokie)"
+
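+# second request performs the download; -O -J save the file under its server-supplied name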
+cmd='curl --insecure -C - -LOJb /tmp/gcokie "${ggURL}&confirm=${getcode}&id=${ggID}"'
+echo -e "Downloading from "$gURL"...\n"
+eval $cmd
+mv extractedNVblogs.txt ../dataset/EN/
+
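The final `mv` assumes the download arrives as `extractedNVblogs.txt` in the working directory and that the script is run from `ai/Megatron/English/Python/source_code/`, so that `../dataset/EN/` resolves to the dataset folder; that folder must already exist for the move to succeed.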