zenodia před 3 roky
rodič
revize
a0ad523f01

+ 42 - 21
ai/Megatron/English/Python/jupyter_notebook/Megatron-LM/tools/openwebtext/Day3-Website_scrapping.ipynb

@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "egyptian-reply",
+   "id": "wooden-street",
    "metadata": {},
    "source": [
     "# \n",
@@ -30,7 +30,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "spiritual-delight",
+   "id": "tight-divide",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -43,7 +43,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "stylish-ultimate",
+   "id": "celtic-beverage",
    "metadata": {},
    "source": [
     "## crawl NVblog landing page and obtain links to the individual blogs\n",
@@ -53,7 +53,7 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "id": "general-reaction",
+   "id": "civil-diagnosis",
    "metadata": {},
    "outputs": [
     {
@@ -81,7 +81,7 @@
   {
    "cell_type": "code",
    "execution_count": 2,
-   "id": "sufficient-cement",
+   "id": "absent-hometown",
    "metadata": {},
    "outputs": [
     {
@@ -109,7 +109,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "concrete-portal",
+   "id": "limited-release",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -119,7 +119,7 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "id": "spoken-scottish",
+   "id": "adjustable-debut",
    "metadata": {},
    "outputs": [
     {
@@ -231,7 +231,7 @@
   {
    "cell_type": "code",
    "execution_count": 5,
-   "id": "appreciated-blond",
+   "id": "freelance-usage",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -243,7 +243,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "foster-indonesia",
+   "id": "sapphire-equation",
    "metadata": {},
    "source": [
     "## fetch the urls of interest and convert to html files \n",
@@ -253,7 +253,7 @@
   {
    "cell_type": "code",
    "execution_count": 16,
-   "id": "gentle-estate",
+   "id": "continent-cheese",
    "metadata": {},
    "outputs": [
     {
@@ -284,7 +284,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "hundred-upset",
+   "id": "improved-banner",
    "metadata": {},
    "source": [
     "## fetch given url and save as .html file"
@@ -293,7 +293,7 @@
   {
    "cell_type": "code",
    "execution_count": 17,
-   "id": "indie-equity",
+   "id": "congressional-vacation",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -303,7 +303,7 @@
   {
    "cell_type": "code",
    "execution_count": 18,
-   "id": "meaning-northwest",
+   "id": "varying-presence",
    "metadata": {},
    "outputs": [
     {
@@ -391,7 +391,7 @@
   {
    "cell_type": "code",
    "execution_count": 11,
-   "id": "hindu-azerbaijan",
+   "id": "fleet-brazilian",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -408,7 +408,7 @@
   {
    "cell_type": "code",
    "execution_count": 12,
-   "id": "collectible-portal",
+   "id": "operational-collapse",
    "metadata": {},
    "outputs": [
     {
@@ -445,7 +445,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "hungarian-morocco",
+   "id": "juvenile-emperor",
    "metadata": {},
    "source": [
     "---\n",
@@ -455,7 +455,7 @@
   {
    "cell_type": "code",
    "execution_count": 13,
-   "id": "apparent-command",
+   "id": "dimensional-practitioner",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -464,7 +464,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "japanese-sterling",
+   "id": "sustained-winner",
    "metadata": {},
    "source": [
     "---\n",
@@ -474,7 +474,7 @@
   {
    "cell_type": "code",
    "execution_count": 20,
-   "id": "arabic-politics",
+   "id": "great-crash",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -485,7 +485,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "extensive-airfare",
+   "id": "lyric-accused",
    "metadata": {},
    "source": [
     "--- \n",
@@ -495,7 +495,7 @@
   {
    "cell_type": "code",
    "execution_count": 15,
-   "id": "alien-moment",
+   "id": "tender-rally",
    "metadata": {},
    "outputs": [
     {
@@ -509,6 +509,27 @@
    "source": [
     "!head -1 ../../../../dataset/EN/extractedNVblogs.txt"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "equal-violation",
+   "metadata": {},
+   "source": [
+    "## Back To Start Menu\n",
+    "[start menu](../../../../Start_Here.ipynb)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "gothic-slovak",
+   "metadata": {},
+   "source": [
+    "--- \n",
+    "\n",
+    "## Licensing\n",
+    "\n",
+    "This material is released by OpenACC-Standard.org, in collaboration with NVIDIA Corporation, under the Creative Commons Attribution 4.0 International (CC BY 4.0)."
+   ]
   }
  ],
  "metadata": {