File size: 5,646 Bytes

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !pip install datasets\n",
    "# !huggingface-cli login"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# from datasets import load_dataset\n",
    "# load_dataset(\"balochiml/balochi-language-data\", data_dir=\"data\", cache_dir=\"../data\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generate the processed data without English characters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4294"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "\n",
    "def get_txt_file_paths(directory):\n",
    "    txt_file_paths = []\n",
    "    for root, dirs, files in os.walk(directory):\n",
    "        for file in files:\n",
    "            if file.endswith(\".txt\"):\n",
    "                file_path = os.path.join(root, file)\n",
    "                txt_file_paths.append(file_path)\n",
    "    return txt_file_paths\n",
    "\n",
    "# Replace \"directory_path\" with the actual path of the directory you want to search\n",
    "directory_path = \"../data/raw_text\"\n",
    "txt_paths = get_txt_file_paths(directory_path)\n",
    "\n",
    "len(txt_paths)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def clean_text(file_path):\n",
    "    # Open the file and read it into memory\n",
    "    with open(file_path, 'r', encoding='utf-8') as file:\n",
    "        text = file.read()\n",
    "\n",
    "    # Remove English-language characters and numbers\n",
    "    text = re.sub(r'[a-zA-Z0-9]', '', text)\n",
    "\n",
    "    # Remove any excess whitespace\n",
    "    text = re.sub(r'[^\\S\\n]+', ' ', text)\n",
    "\n",
    "    return text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "for path in txt_paths:\n",
    "    cleaned_text = clean_text(path)\n",
    "\n",
    "    # write the cleaned text to a new file with an incremented filename\n",
    "    # write the files all into the '../data/processed_text' directory\n",
    "    with open(f'../data/processed_text/{path.split(\"/\")[-1]}', 'w', encoding='utf-8') as file:\n",
    "        file.write(cleaned_text)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tokenizers import Tokenizer\n",
    "from tokenizers.models import BPE\n",
    "\n",
    "tokenizer = Tokenizer(BPE(unk_token=\"[UNK]\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tokenizers.trainers import BpeTrainer\n",
    "\n",
    "# trainer = BpeTrainer(vocab_size=25000, min_frequency=2)\n",
    "trainer = BpeTrainer(\n",
    "    min_frequency=2,\n",
    "    vocab_size=100000,\n",
    "    special_tokens=[\"[UNK]\", \"[CLS]\", \"[SEP]\", \"[PAD]\", \"[MASK]\"],\n",
    "    show_progress=True,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4294"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# get a list of all the txt files in\n",
    "# '/Users/strickvl/balochi/balochi-tokenizer/data/processed_text'\n",
    "\n",
    "processed_files = get_txt_file_paths(\"../data/processed_text\")\n",
    "assert len(processed_files) == len(txt_paths)\n",
    "len(processed_files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "tokenizer.train(processed_files, trainer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<tokenizers.models.BPE at 0x140d828f0>"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "100000"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.get_vocab_size()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "tokenizer.save(\"../models/balochi-tokenizer.json\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "balochi",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}