|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "markdown", |
| 5 | + "metadata": {}, |
| 6 | + "source": [ |
| 7 | + "# Kannada with CLTK" |
| 8 | + ] |
| 9 | + }, |
| 10 | + { |
| 11 | + "cell_type": "markdown", |
| 12 | + "metadata": {}, |
| 13 | + "source": [ |
| 14 | + "Analyse Kannada texts with CLTK!<br>\n" |
| 15 | + ] |
| 16 | + }, |
| 17 | + { |
| 18 | + "cell_type": "markdown", |
| 19 | + "metadata": {}, |
| 20 | + "source": [ |
| 21 | + "## Kannada Alphabets" |
| 22 | + ] |
| 23 | + }, |
| 24 | + { |
| 25 | + "cell_type": "markdown", |
| 26 | + "metadata": {}, |
| 27 | + "source": [ |
| 28 | + "Kannada has 14 <i>Swaras</i> (vowels), 25 structured and 11 unstructured consonants (collectively known as <i>Vyanjanas</i>), and 2 <i>Yogavaahakas</i>.<br>\n", |
| 29 | + "A consonant combined with a vowel sign forms a <i>kagunita</i>." |
| 30 | + ] |
| 31 | + }, |
| 32 | + { |
| 33 | + "cell_type": "code", |
| 34 | + "execution_count": 1, |
| 35 | + "metadata": {}, |
| 36 | + "outputs": [ |
| 37 | + { |
| 38 | + "name": "stdout", |
| 39 | + "output_type": "stream", |
| 40 | + "text": [ |
| 41 | + "Vowels: ['ಅ', 'ಆ', 'ಇ', 'ಈ', 'ಉ', 'ಊ', 'ಋ', 'ೠ', 'ಎ', 'ಏ', 'ಐಒ', 'ಒ', 'ಓ', 'ಔ']\n", |
| 42 | + "Yogavaahakas: ['ಅಂ', 'ಅಃ']\n", |
| 43 | + "Structured consonants: ['ಕ', 'ಖ', 'ಗ', 'ಘ', 'ಙಚ', 'ಚ', 'ಛ', 'ಜ', 'ಝ', 'ಞ', 'ಟ', 'ಠ', 'ಡ', 'ಢ', 'ಣ', 'ತ', 'ಥ', 'ದ', 'ಧ', 'ನ', 'ಪ', 'ಫ', 'ಬ', 'ಭ', 'ಮ']\n", |
| 44 | + "Unstructured consonants: ['ಯ', 'ರ', 'ಱ', 'ಲ', 'ವ', 'ಶ', 'ಷ', 'ಸ', 'ಹ', 'ಳ', 'ೞ']\n", |
| 45 | + "Numerals: ['೦', '೧', '೨', '೩', '೪', '೫', '೬', '೭', '೮', '೯']\n", |
| 46 | + "Vowel signs: ['', 'ಾ', 'ಿ', 'ೀ', 'ು', 'ೂ', 'ೃ', 'ೆ', 'ೇ', 'ೈ', 'ೊ', 'ೋ', 'ೌ', 'ಂ', 'ಃ']\n" |
| 47 | + ] |
| 48 | + } |
| 49 | + ], |
| 50 | + "source": [ |
| 51 | + "from cltk.corpus.kannada.alphabet import NUMERALS, STRUCTURED_CONSONANTS, UNSTRUCTURED_CONSONANTS, VOWELS, VOWEL_SIGNS, YOGAVAAHAKAS\n", |
| 52 | + "print(\"Vowels: \", VOWELS)\n", |
| 53 | + "print(\"Yogavaahakas: \", YOGAVAAHAKAS)\n", |
| 54 | + "print(\"Structured consonants: \", STRUCTURED_CONSONANTS)\n", |
| 55 | + "print(\"Unstructured consonants: \", UNSTRUCTURED_CONSONANTS)\n", |
| 56 | + "print(\"Numerals: \", NUMERALS)\n", |
| 57 | + "print(\"Vowel signs: \", VOWEL_SIGNS)" |
| 58 | + ] |
| 59 | + }, |
| 60 | + { |
| 61 | + "cell_type": "markdown", |
| 62 | + "metadata": {}, |
| 63 | + "source": [ |
| 64 | + "## Transliterations" |
| 65 | + ] |
| 66 | + }, |
| 67 | + { |
| 68 | + "cell_type": "markdown", |
| 69 | + "metadata": {}, |
| 70 | + "source": [ |
| 71 | + "We can transliterate Kannada text into the scripts of other Indic languages. Let us take a sample Kannada text and transliterate it into Devanagari, the script used for Hindi:" |
| 72 | + ] |
| 73 | + }, |
| 74 | + { |
| 75 | + "cell_type": "code", |
| 76 | + "execution_count": 2, |
| 77 | + "metadata": {}, |
| 78 | + "outputs": [ |
| 79 | + { |
| 80 | + "name": "stdout", |
| 81 | + "output_type": "stream", |
| 82 | + "text": [ |
| 83 | + "ಗ್ರಂಥಾಲಯಗಳು ಅರಿವಿನ ಜ್ಞಾನದೀವಿಗೆಗಳು. ಇಷ್ಟಪಟ್ಟು ಓದಲು ಬರುವವರಿಗೆ, ಜ್ಞಾನದ ಹೊಸ ಹೊಳಹನ್ನು ನೀಡುವ ಅಕ್ಷಯ ಭಂಡಾರಗಳು. ಗ್ರಂಥಾಲಯಗಳ ಸಂಪನ್ಮೂಲಗಳು ಎಂದಿಗೂ ಎಲ್ಲಿಯೂ ಬತ್ತಿಹೋಗುವುದಿಲ್ಲ. ಪ್ರಾಚೀನ ಕಾಲದಲ್ಲಿ ಮುದ್ರಾಣಾಲಯಗಳಿರಲಿಲ್ಲ. ಆದ್ದರಿಂದ ಜ್ಞಾನವನ್ನು ಸಂಪಾದಿಸಲು ಬಹಳ ಕಷ್ಟಪಡಬೇಕಾಗುತ್ತಿತ್ತು. ಈಗ ಗ್ರಂಥಗಳು ನಮಗೆ ಬೇಕಾದ ವಿಷಯಗಳನ್ನು ತಿಳಿಸಲು ಸಿದ್ಧವಿರುವುವು. ನಾವು ಬೇಕಾದಾಗ ಗ್ರಂಥಾಲಯಕ್ಕೆ ಹೋಗಿ ಬೇಕಾದ ಗ್ರಂಥಗಳನ್ನು ಓದಿ ಜ್ಞಾನ ಪಡೆಯಬಹುದು.\n" |
| 84 | + ] |
| 85 | + } |
| 86 | + ], |
| 87 | + "source": [ |
| 88 | + "kannada_text = \"ಗ್ರಂಥಾಲಯಗಳು ಅರಿವಿನ ಜ್ಞಾನದೀವಿಗೆಗಳು. ಇಷ್ಟಪಟ್ಟು ಓದಲು ಬರುವವರಿಗೆ, ಜ್ಞಾನದ ಹೊಸ ಹೊಳಹನ್ನು ನೀಡುವ ಅಕ್ಷಯ ಭಂಡಾರಗಳು. ಗ್ರಂಥಾಲಯಗಳ ಸಂಪನ್ಮೂಲಗಳು ಎಂದಿಗೂ ಎಲ್ಲಿಯೂ ಬತ್ತಿಹೋಗುವುದಿಲ್ಲ. ಪ್ರಾಚೀನ ಕಾಲದಲ್ಲಿ ಮುದ್ರಾಣಾಲಯಗಳಿರಲಿಲ್ಲ. ಆದ್ದರಿಂದ ಜ್ಞಾನವನ್ನು ಸಂಪಾದಿಸಲು ಬಹಳ ಕಷ್ಟಪಡಬೇಕಾಗುತ್ತಿತ್ತು. ಈಗ ಗ್ರಂಥಗಳು ನಮಗೆ ಬೇಕಾದ ವಿಷಯಗಳನ್ನು ತಿಳಿಸಲು ಸಿದ್ಧವಿರುವುವು. ನಾವು ಬೇಕಾದಾಗ ಗ್ರಂಥಾಲಯಕ್ಕೆ ಹೋಗಿ ಬೇಕಾದ ಗ್ರಂಥಗಳನ್ನು ಓದಿ ಜ್ಞಾನ ಪಡೆಯಬಹುದು.\"\n", |
| 89 | + "print(kannada_text)" |
| 90 | + ] |
| 91 | + }, |
| 92 | + { |
| 93 | + "cell_type": "code", |
| 94 | + "execution_count": 3, |
| 95 | + "metadata": {}, |
| 96 | + "outputs": [ |
| 97 | + { |
| 98 | + "data": { |
| 99 | + "text/plain": [ |
| 100 | + "'ग्रंथालयगळु अरिविन ज्ञानदीविगॆगळु. इष्टपट्टु ओदलु बरुववरिगॆ, ज्ञानद हॊस हॊळहन्नु नीडुव अक्षय भंडारगळु. ग्रंथालयगळ संपन्मूलगळु ऎंदिगू ऎल्लियू बत्तिहोगुवुदिल्ल. प्राचीन कालदल्लि मुद्राणालयगळिरलिल्ल. आद्दरिंद ज्ञानवन्नु संपादिसलु बहळ कष्टपडबेकागुत्तित्तु. ईग ग्रंथगळु नमगॆ बेकाद विषयगळन्नु तिळिसलु सिद्धविरुवुवु. नावु बेकादाग ग्रंथालयक्कॆ होगि बेकाद ग्रंथगळन्नु ओदि ज्ञान पडॆयबहुदु.'" |
| 101 | + ] |
| 102 | + }, |
| 103 | + "execution_count": 3, |
| 104 | + "metadata": {}, |
| 105 | + "output_type": "execute_result" |
| 106 | + } |
| 107 | + ], |
| 108 | + "source": [ |
| 109 | + "from cltk.corpus.sanskrit.itrans.unicode_transliterate import UnicodeIndicTransliterator\n", |
| 110 | + "UnicodeIndicTransliterator.transliterate(kannada_text,\"kn\",\"hi\")" |
| 111 | + ] |
| 112 | + }, |
| 113 | + { |
| 114 | + "cell_type": "markdown", |
| 115 | + "metadata": {}, |
| 116 | + "source": [ |
| 117 | + "We can also romanize Kannada text, as shown:" |
| 118 | + ] |
| 119 | + }, |
| 120 | + { |
| 121 | + "cell_type": "code", |
| 122 | + "execution_count": 4, |
| 123 | + "metadata": {}, |
| 124 | + "outputs": [ |
| 125 | + { |
| 126 | + "data": { |
| 127 | + "text/plain": [ |
| 128 | + "'shraddh.e mattu shramawu pratibh.eyannu solisabahudu'" |
| 129 | + ] |
| 130 | + }, |
| 131 | + "execution_count": 4, |
| 132 | + "metadata": {}, |
| 133 | + "output_type": "execute_result" |
| 134 | + } |
| 135 | + ], |
| 136 | + "source": [ |
| 137 | + "from cltk.corpus.sanskrit.itrans.unicode_transliterate import ItransTransliterator\n", |
| 138 | + "kannada_text_two = \"ಶ್ರದ್ಧೆ ಮತ್ತು ಶ್ರಮವು ಪ್ರತಿಭೆಯನ್ನು ಸೋಲಿಸಬಹುದು\"\n", |
| 139 | + "ItransTransliterator.to_itrans(kannada_text_two, 'kn')" |
| 140 | + ] |
| 141 | + }, |
| 142 | + { |
| 143 | + "cell_type": "markdown", |
| 144 | + "metadata": {}, |
| 145 | + "source": [ |
| 146 | + "Similarly, we can convert a text given in ITRANS transliteration back into Kannada script:" |
| 147 | + ] |
| 148 | + }, |
| 149 | + { |
| 150 | + "cell_type": "code", |
| 151 | + "execution_count": 5, |
| 152 | + "metadata": {}, |
| 153 | + "outputs": [ |
| 154 | + { |
| 155 | + "data": { |
| 156 | + "text/plain": [ |
| 157 | + "'ಪ್ರಾಚೀನ ಗ್ರಂಥಾಲಯಗಳು ಕೇವಲ ಹಸ್ತಪ್ರತಿ, ತಾಳೇಗರಿ, ಚರ್ಮಪಟ್ಟಿ ಮೊದಲಾದುವುಗಳ ಸಂಗ್ರಹಗಳಾಗಿದ್ದುವು'" |
| 158 | + ] |
| 159 | + }, |
| 160 | + "execution_count": 5, |
| 161 | + "metadata": {}, |
| 162 | + "output_type": "execute_result" |
| 163 | + } |
| 164 | + ], |
| 165 | + "source": [ |
| 166 | + "kannada_text_itrans = 'praachiina gra.mthaalayagaldu kewala hastaprati, taaldegari, charmapaTTi m.odalaaduwugalda sa.mgrahagaldaagidduwu'\n", |
| 167 | + "ItransTransliterator.from_itrans(kannada_text_itrans,'kn')" |
| 168 | + ] |
| 169 | + }, |
| 170 | + { |
| 171 | + "cell_type": "markdown", |
| 172 | + "metadata": {}, |
| 173 | + "source": [ |
| 174 | + "## Syllabifier" |
| 175 | + ] |
| 176 | + }, |
| 177 | + { |
| 178 | + "cell_type": "markdown", |
| 179 | + "metadata": {}, |
| 180 | + "source": [ |
| 181 | + "We can use the `indian_syllabifier` to syllabify Kannada words. To do this, we first have to import the models it needs, as follows. Importing `sanskrit_models_cltk` might take some time." |
| 182 | + ] |
| 183 | + }, |
| 184 | + { |
| 185 | + "cell_type": "code", |
| 186 | + "execution_count": 6, |
| 187 | + "metadata": { |
| 188 | + "scrolled": true |
| 189 | + }, |
| 190 | + "outputs": [], |
| 191 | + "source": [ |
| 192 | + "from cltk.corpus.utils.importer import CorpusImporter\n", |
| 193 | + "# Import the models needed for syllabification; this might take some time.\n", |
| 194 | + "phonetics_model_importer = CorpusImporter('sanskrit')\n", |
| 195 | + "phonetics_model_importer.import_corpus('sanskrit_models_cltk')" |
| 196 | + ] |
| 197 | + }, |
| 198 | + { |
| 199 | + "cell_type": "markdown", |
| 200 | + "metadata": {}, |
| 201 | + "source": [ |
| 202 | + "Now we import the syllabifier and syllabify as follows:" |
| 203 | + ] |
| 204 | + }, |
| 205 | + { |
| 206 | + "cell_type": "code", |
| 207 | + "execution_count": 7, |
| 208 | + "metadata": {}, |
| 209 | + "outputs": [], |
| 210 | + "source": [ |
| 211 | + "%%capture\n", |
| 212 | + "from cltk.stem.sanskrit.indian_syllabifier import Syllabifier\n", |
| 213 | + "kannada_syllabifier = Syllabifier('kannada')\n", |
| 214 | + "kannada_syllables = kannada_syllabifier.orthographic_syllabify('ಹಸ್ತಪ್ರತಿ')" |
| 215 | + ] |
| 216 | + }, |
| 217 | + { |
| 218 | + "cell_type": "markdown", |
| 219 | + "metadata": {}, |
| 220 | + "source": [ |
| 221 | + "The syllables of the word ಹಸ್ತಪ್ರತಿ will thus be:" |
| 222 | + ] |
| 223 | + }, |
| 224 | + { |
| 225 | + "cell_type": "code", |
| 226 | + "execution_count": 8, |
| 227 | + "metadata": {}, |
| 228 | + "outputs": [ |
| 229 | + { |
| 230 | + "name": "stdout", |
| 231 | + "output_type": "stream", |
| 232 | + "text": [ |
| 233 | + "['ಹ', 'ಸ್ತ', 'ಪ್ರ', 'ತಿ']\n" |
| 234 | + ] |
| 235 | + } |
| 236 | + ], |
| 237 | + "source": [ |
| 238 | + "print(kannada_syllables)" |
| 239 | + ] |
| 240 | + } |
| 241 | + ], |
| 242 | + "metadata": { |
| 243 | + "kernelspec": { |
| 244 | + "display_name": "Python 3", |
| 245 | + "language": "python", |
| 246 | + "name": "python3" |
| 247 | + }, |
| 248 | + "language_info": { |
| 249 | + "codemirror_mode": { |
| 250 | + "name": "ipython", |
| 251 | + "version": 3 |
| 252 | + }, |
| 253 | + "file_extension": ".py", |
| 254 | + "mimetype": "text/x-python", |
| 255 | + "name": "python", |
| 256 | + "nbconvert_exporter": "python", |
| 257 | + "pygments_lexer": "ipython3", |
| 258 | + "version": "3.6.5" |
| 259 | + } |
| 260 | + }, |
| 261 | + "nbformat": 4, |
| 262 | + "nbformat_minor": 2 |
| 263 | +} |
0 commit comments