Skip to content

Commit 19bc4f6

Browse files
committed
Edited jupyter notebook
Signed-off-by: Seungryeol Tae <jaketae1224@gmail.com>
1 parent 391ca38 commit 19bc4f6

2 files changed

Lines changed: 51 additions & 14 deletions

File tree

Granite Scripting/src/MOT.ipynb

Lines changed: 51 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -101,9 +101,9 @@
101101
" langchain_ollama \\\n",
102102
" langchain_milvus \\\n",
103103
" replicate \\\n",
104-
" wget \\\n",
105104
" gitpython \\\n",
106-
" requests"
105+
" requests \\\n",
106+
" pypdf"
107107
]
108108
},
109109
{
@@ -255,37 +255,67 @@
255255
"source": [
256256
"### Download the document\n",
257257
"\n",
258-
"Here we you can use ."
258+
"Here we can use any model's GitHub repository as a template (we plan to add model-name lookup through a web search API instead of relying on GitHub)."
259259
]
260260
},
261261
{
262262
"cell_type": "code",
263263
"execution_count": null,
264264
"metadata": {},
265-
"outputs": [],
265+
"outputs": [
266+
{
267+
"name": "stderr",
268+
"output_type": "stream",
269+
"text": [
270+
"<>:27: SyntaxWarning: invalid escape sequence '\\.'\n",
271+
"<>:27: SyntaxWarning: invalid escape sequence '\\.'\n",
272+
"/var/folders/r5/n4xqkxwd157fww2y2qptk5xr0000gn/T/ipykernel_78178/3183198176.py:27: SyntaxWarning: invalid escape sequence '\\.'\n",
273+
" return re.findall(r'https://github\\.com/[^\\s)]+', text\n"
274+
]
275+
},
276+
{
277+
"name": "stdout",
278+
"output_type": "stream",
279+
"text": [
280+
"Cloning https://github.com/dpfried/incoder into /var/folders/r5/n4xqkxwd157fww2y2qptk5xr0000gn/T/tmpbpqzb0a8\n",
281+
"\n",
282+
"All repo content written to 'repo_summary_input.txt'\n",
283+
"Feed it into Granite/Ollama to generate your YAML.\n"
284+
]
285+
}
286+
],
266287
"source": [
267288
"import os\n",
268289
"import re\n",
269290
"import requests\n",
270291
"import tempfile\n",
271292
"from git import Repo\n",
293+
"from pypdf import PdfReader\n",
272294
"\n",
273295
"def clone_repo(repo_url, dest_dir=os.getcwd()):\n",
274296
" print(f\"Cloning {repo_url} into {dest_dir}\")\n",
275297
" Repo.clone_from(repo_url, dest_dir)\n",
276298
"\n",
277-
"def search_files_for_info(dest_dir=os.getcwd(), extensions=(\".md\", \"LICENSE.txt\")):\n",
299+
"def search_files_for_info(dest_dir=os.getcwd(), extensions=(\".md\", \".txt\", \".pdf\")):\n",
278300
" info = []\n",
279301
" for subdir, _, files in os.walk(dest_dir):\n",
280302
" for file in files:\n",
281303
" if file.endswith(extensions):\n",
282304
" filepath = os.path.join(subdir, file)\n",
283-
" try:\n",
284-
" with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:\n",
285-
" content = f.read()\n",
305+
" if(file.endswith('.pdf')):\n",
306+
" try:\n",
307+
" reader = PdfReader(file)\n",
308+
" content = \"\\n\".join(page.extract_text() for page in reader.pages if page.extract_text())\n",
286309
" info.append((filepath, content))\n",
287-
" except Exception as e:\n",
288-
" print(f\"Failed to read {filepath}: {e}\")\n",
310+
" except Exception as e:\n",
311+
" print(f\"Failed to read {filepath}: {e}\")\n",
312+
" elif(file.endswith('.md') or file.endswith('.txt')):\n",
313+
" try:\n",
314+
" with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:\n",
315+
" content = f.read()\n",
316+
" info.append((filepath, content))\n",
317+
" except Exception as e:\n",
318+
" print(f\"Failed to read {filepath}: {e}\")\n",
289319
" return info\n",
290320
" \n",
291321
"'''experimental features for advanced search such as multi-repository\n",
@@ -306,7 +336,7 @@
306336
" with tempfile.TemporaryDirectory() as tmpdir:\n",
307337
" clone_repo(repo_url, tmpdir)\n",
308338
" all_info = search_files_for_info(tmpdir)\n",
309-
" all_text = \"\\n\\n\".join(f\"File: {filepath}\\n\\n{content}\" for filepath, content in all_info)\n",
339+
" all_text = \"\\n\\n\".join(f\"File: {filepath.replace(tmpdir,repo_url)}\\n\\n{content}\" for filepath, content in all_info)\n",
310340
"\n",
311341
" \n",
312342
" # external_links = extract_github_links(all_text)\n",
@@ -324,7 +354,7 @@
324354
" f.write(all_text)\n",
325355
"\n",
326356
" print(\"\\nAll repo content written to 'repo_summary_input.txt'\")\n",
327-
" print(\"Feed it into Granite/Ollama to generate your YAML.\")\n"
357+
" print(\"Feed it into Granite/Ollama to generate your YAML.\")"
328358
]
329359
},
330360
{
@@ -410,6 +440,13 @@
410440
"Search the database for similar documents by proximity of the embedded vector in vector space."
411441
]
412442
},
443+
{
444+
"cell_type": "code",
445+
"execution_count": null,
446+
"metadata": {},
447+
"outputs": [],
448+
"source": []
449+
},
413450
{
414451
"cell_type": "code",
415452
"execution_count": null,
@@ -678,7 +715,7 @@
678715
],
679716
"metadata": {
680717
"kernelspec": {
681-
"display_name": "Python 3 (ipykernel)",
718+
"display_name": "Python 3",
682719
"language": "python",
683720
"name": "python3"
684721
},
@@ -692,7 +729,7 @@
692729
"name": "python",
693730
"nbconvert_exporter": "python",
694731
"pygments_lexer": "ipython3",
695-
"version": "3.12.5"
732+
"version": "3.12.11"
696733
}
697734
},
698735
"nbformat": 4,
2.24 MB
Binary file not shown.

0 commit comments

Comments
 (0)