|
101 | 101 | " langchain_ollama \\\n", |
102 | 102 | " langchain_milvus \\\n", |
103 | 103 | " replicate \\\n", |
104 | | - " wget \\\n", |
105 | 104 | " gitpython \\\n", |
106 | | - " requests" |
| 105 | + " requests \\\n", |
| 106 | + " pypdf" |
107 | 107 | ] |
108 | 108 | }, |
109 | 109 | { |
|
255 | 255 | "source": [ |
256 | 256 | "### Download the document\n", |
257 | 257 | "\n", |
258 | | - "Here we you can use ." |
| 258 | + "Here we can use any model's GitHub repository as a template (we plan to add model-name lookup through a web search API instead of relying on GitHub alone)."
259 | 259 | ] |
260 | 260 | }, |
261 | 261 | { |
262 | 262 | "cell_type": "code", |
263 | 263 | "execution_count": null, |
264 | 264 | "metadata": {}, |
265 | | - "outputs": [], |
| 265 | + "outputs": [ |
| 266 | + { |
| 267 | + "name": "stderr", |
| 268 | + "output_type": "stream", |
| 269 | + "text": [ |
| 270 | + "<>:27: SyntaxWarning: invalid escape sequence '\\.'\n", |
| 271 | + "<>:27: SyntaxWarning: invalid escape sequence '\\.'\n", |
| 272 | + "/var/folders/r5/n4xqkxwd157fww2y2qptk5xr0000gn/T/ipykernel_78178/3183198176.py:27: SyntaxWarning: invalid escape sequence '\\.'\n", |
| 273 | + " return re.findall(r'https://github\\.com/[^\\s)]+', text\n" |
| 274 | + ] |
| 275 | + }, |
| 276 | + { |
| 277 | + "name": "stdout", |
| 278 | + "output_type": "stream", |
| 279 | + "text": [ |
| 280 | + "Cloning https://github.com/dpfried/incoder into /var/folders/r5/n4xqkxwd157fww2y2qptk5xr0000gn/T/tmpbpqzb0a8\n", |
| 281 | + "\n", |
| 282 | + "All repo content written to 'repo_summary_input.txt'\n", |
| 283 | + "Feed it into Granite/Ollama to generate your YAML.\n" |
| 284 | + ] |
| 285 | + } |
| 286 | + ], |
266 | 287 | "source": [ |
267 | 288 | "import os\n", |
268 | 289 | "import re\n", |
269 | 290 | "import requests\n", |
270 | 291 | "import tempfile\n", |
271 | 292 | "from git import Repo\n", |
| 293 | + "from pypdf import PdfReader\n", |
272 | 294 | "\n", |
273 | 295 | "def clone_repo(repo_url, dest_dir=os.getcwd()):\n", |
274 | 296 | " print(f\"Cloning {repo_url} into {dest_dir}\")\n", |
275 | 297 | " Repo.clone_from(repo_url, dest_dir)\n", |
276 | 298 | "\n", |
277 | | - "def search_files_for_info(dest_dir=os.getcwd(), extensions=(\".md\", \"LICENSE.txt\")):\n", |
| 299 | + "def search_files_for_info(dest_dir=os.getcwd(), extensions=(\".md\", \".txt\", \".pdf\")):\n", |
278 | 300 | " info = []\n", |
279 | 301 | " for subdir, _, files in os.walk(dest_dir):\n", |
280 | 302 | " for file in files:\n", |
281 | 303 | " if file.endswith(extensions):\n", |
282 | 304 | " filepath = os.path.join(subdir, file)\n", |
283 | | - " try:\n", |
284 | | - " with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:\n", |
285 | | - " content = f.read()\n", |
| 305 | + " if(file.endswith('.pdf')):\n", |
| 306 | + " try:\n", |
| 307 | + " reader = PdfReader(file)\n", |
| 308 | + " content = \"\\n\".join(page.extract_text() for page in reader.pages if page.extract_text())\n", |
286 | 309 | " info.append((filepath, content))\n", |
287 | | - " except Exception as e:\n", |
288 | | - " print(f\"Failed to read {filepath}: {e}\")\n", |
| 310 | + " except Exception as e:\n", |
| 311 | + " print(f\"Failed to read {filepath}: {e}\")\n", |
| 312 | + " elif(file.endswith('.md') or file.endswith('.txt')):\n", |
| 313 | + " try:\n", |
| 314 | + " with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:\n", |
| 315 | + " content = f.read()\n", |
| 316 | + " info.append((filepath, content))\n", |
| 317 | + " except Exception as e:\n", |
| 318 | + " print(f\"Failed to read {filepath}: {e}\")\n", |
289 | 319 | " return info\n", |
290 | 320 | " \n", |
291 | 321 | "'''experimental features for advanced search such as multi-repository\n", |
|
306 | 336 | " with tempfile.TemporaryDirectory() as tmpdir:\n", |
307 | 337 | " clone_repo(repo_url, tmpdir)\n", |
308 | 338 | " all_info = search_files_for_info(tmpdir)\n", |
309 | | - " all_text = \"\\n\\n\".join(f\"File: {filepath}\\n\\n{content}\" for filepath, content in all_info)\n", |
| 339 | + " all_text = \"\\n\\n\".join(f\"File: {filepath.replace(tmpdir,repo_url)}\\n\\n{content}\" for filepath, content in all_info)\n", |
310 | 340 | "\n", |
311 | 341 | " \n", |
312 | 342 | " # external_links = extract_github_links(all_text)\n", |
|
324 | 354 | " f.write(all_text)\n", |
325 | 355 | "\n", |
326 | 356 | " print(\"\\nAll repo content written to 'repo_summary_input.txt'\")\n", |
327 | | - " print(\"Feed it into Granite/Ollama to generate your YAML.\")\n" |
| 357 | + " print(\"Feed it into Granite/Ollama to generate your YAML.\")" |
328 | 358 | ] |
329 | 359 | }, |
330 | 360 | { |
|
410 | 440 | "Search the database for similar documents by proximity of the embedded vector in vector space." |
411 | 441 | ] |
412 | 442 | }, |
| 443 | + { |
| 444 | + "cell_type": "code", |
| 445 | + "execution_count": null, |
| 446 | + "metadata": {}, |
| 447 | + "outputs": [], |
| 448 | + "source": [] |
| 449 | + }, |
413 | 450 | { |
414 | 451 | "cell_type": "code", |
415 | 452 | "execution_count": null, |
|
678 | 715 | ], |
679 | 716 | "metadata": { |
680 | 717 | "kernelspec": { |
681 | | - "display_name": "Python 3 (ipykernel)", |
| 718 | + "display_name": "Python 3", |
682 | 719 | "language": "python", |
683 | 720 | "name": "python3" |
684 | 721 | }, |
|
692 | 729 | "name": "python", |
693 | 730 | "nbconvert_exporter": "python", |
694 | 731 | "pygments_lexer": "ipython3", |
695 | | - "version": "3.12.5" |
| 732 | + "version": "3.12.11" |
696 | 733 | } |
697 | 734 | }, |
698 | 735 | "nbformat": 4, |
|
0 commit comments