mino98 · November 7, 2019 13:39
diff --git a/ngrams.ipynb b/ngrams.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# How many unique ngrams are there?\n",
    "\n",
    "Source of these downloads: https://fasttext.cc/docs/en/english-vectors.html"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current\n",
      "                                 Dload  Upload   Total   Spent    Left  Speed\n",
      "100 1453M  100 1453M    0     0  11.7M      0  0:02:03  0:02:03 --:--:-- 11.5M.1M\n"
     ]
    }
   ],
   "source": [
    "# Fetch and unpack the pretrained fastText vectors into /tmp.\n",
    "# curl -f: fail on an HTTP error instead of saving the error page as the .zip;\n",
    "# curl -L: follow redirects. unzip -o: overwrite already-extracted files without\n",
    "# asking, so the cell can be re-run without waiting on a prompt.\n",
    "\n",
    "# English crawl dataset:\n",
    "!curl -fL https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip -o /tmp/crawl-300d-2M.vec.zip\n",
    "!unzip -o /tmp/crawl-300d-2M.vec.zip -d /tmp\n",
    "\n",
    "# English wiki dataset:\n",
    "!curl -fL https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip -o /tmp/wiki-news-300d-1M.vec.zip\n",
    "!unzip -o /tmp/wiki-news-300d-1M.vec.zip -d /tmp"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's start parsing..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collect the vocabulary of one fastText .vec file.\n",
    "# The first row is a header whose first field is parsed as the word count;\n",
    "# for every following row only the first field (the word) is kept.\n",
    "nwords = None\n",
    "words = set()\n",
    "\n",
    "# The .vec files are UTF-8 text; pass the encoding explicitly so parsing does\n",
    "# not depend on the platform's locale default.\n",
    "# Pick one at a time:\n",
    "# with open(\"/tmp/crawl-300d-2M.vec\", newline=\"\", encoding=\"utf-8\") as fp:\n",
    "with open(\"/tmp/wiki-news-300d-1M.vec\", newline=\"\", encoding=\"utf-8\") as fp:\n",
    "    csvreader = csv.reader(fp, delimiter=' ', quoting=csv.QUOTE_NONE)\n",
    "    for row in csvreader:\n",
    "        # Identity test: None is a singleton, so `is` is the correct check.\n",
    "        if nwords is None:\n",
    "            nwords = int(row[0])\n",
    "            continue\n",
    "\n",
    "        words.add(row[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "999994"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "# default ngram parameters:\n",
    "minn=1\n",
    "maxn=5"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "476c967889ff491994ed07bd2f94ab8b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "HBox(children=(IntProgress(value=0, max=999994), HTML(value='')))"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Gather every distinct character n-gram of length minn..maxn (inclusive).\n",
    "ngrams = set()\n",
    "\n",
    "for token in tqdm.tqdm_notebook(words):\n",
    "    token_len = len(token)\n",
    "    for size in range(minn, maxn + 1):\n",
    "        # The range is empty when the word is shorter than `size`.\n",
    "        ngrams.update(\n",
    "            token[begin:begin + size] for begin in range(token_len - size + 1)\n",
    "        )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1315033"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(ngrams)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# How many unique ngrams are there?\n",
	"\n",
	"Source of these downloads: https://fasttext.cc/docs/en/english-vectors.html"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 53,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" % Total % Received % Xferd Average Speed Time Time Time Current\n",
	" Dload Upload Total Spent Left Speed\n",
	"100 1453M 100 1453M 0 0 11.7M 0 0:02:03 0:02:03 --:--:-- 11.5M.1M\n"
	]
	}
	],
	"source": [
	"# Fetch and unpack the pretrained fastText vectors into /tmp.\n",
	"# curl -f: fail on an HTTP error instead of saving the error page as the .zip;\n",
	"# curl -L: follow redirects. unzip -o: overwrite already-extracted files without\n",
	"# asking, so the cell can be re-run without waiting on a prompt.\n",
	"\n",
	"# English crawl dataset:\n",
	"!curl -fL https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip -o /tmp/crawl-300d-2M.vec.zip\n",
	"!unzip -o /tmp/crawl-300d-2M.vec.zip -d /tmp\n",
	"\n",
	"# English wiki dataset:\n",
	"!curl -fL https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip -o /tmp/wiki-news-300d-1M.vec.zip\n",
	"!unzip -o /tmp/wiki-news-300d-1M.vec.zip -d /tmp"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Let's start parsing..."
	]
	},
	{
	"cell_type": "code",
	"execution_count": 39,
	"metadata": {},
	"outputs": [],
	"source": [
	"import csv\n",
	"import tqdm"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 59,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Collect the vocabulary of one fastText .vec file.\n",
	"# The first row is a header whose first field is parsed as the word count;\n",
	"# for every following row only the first field (the word) is kept.\n",
	"nwords = None\n",
	"words = set()\n",
	"\n",
	"# The .vec files are UTF-8 text; pass the encoding explicitly so parsing does\n",
	"# not depend on the platform's locale default.\n",
	"# Pick one at a time:\n",
	"# with open(\"/tmp/crawl-300d-2M.vec\", newline=\"\", encoding=\"utf-8\") as fp:\n",
	"with open(\"/tmp/wiki-news-300d-1M.vec\", newline=\"\", encoding=\"utf-8\") as fp:\n",
	"    csvreader = csv.reader(fp, delimiter=' ', quoting=csv.QUOTE_NONE)\n",
	"    for row in csvreader:\n",
	"        # Identity test: None is a singleton, so `is` is the correct check.\n",
	"        if nwords is None:\n",
	"            nwords = int(row[0])\n",
	"            continue\n",
	"\n",
	"        words.add(row[0])"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 60,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"999994"
	]
	},
	"execution_count": 60,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(words)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 61,
	"metadata": {},
	"outputs": [],
	"source": [
	"# default ngram parameters:\n",
	"minn=1\n",
	"maxn=5"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 62,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"application/vnd.jupyter.widget-view+json": {
	"model_id": "476c967889ff491994ed07bd2f94ab8b",
	"version_major": 2,
	"version_minor": 0
	},
	"text/plain": [
	"HBox(children=(IntProgress(value=0, max=999994), HTML(value='')))"
	]
	},
	"metadata": {},
	"output_type": "display_data"
	}
	],
	"source": [
	"# Gather every distinct character n-gram of length minn..maxn (inclusive).\n",
	"ngrams = set()\n",
	"\n",
	"for word in tqdm.tqdm_notebook(words):\n",
	"    word_len = len(word)\n",
	"    for length in range(minn, maxn + 1):\n",
	"        # The range is empty when the word is shorter than `length`.\n",
	"        ngrams.update(\n",
	"            word[start:start + length] for start in range(word_len - length + 1)\n",
	"        )"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 63,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"1315033"
	]
	},
	"execution_count": 63,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(ngrams)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 3",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.7.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}