lfai
diff --git a/‎Granite Scripting/src/MOT.ipynb‎
Lines changed: 51 additions & 14 deletions b/‎Granite Scripting/src/MOT.ipynb‎
Lines changed: 51 additions & 14 deletions
diff --git a/‎Granite Scripting/src/repo_summary_input.txt‎
2.24 MB b/‎Granite Scripting/src/repo_summary_input.txt‎
2.24 MB
@@ -101,9 +101,9 @@
     "    langchain_ollama \\\n",
     "    langchain_milvus \\\n",
     "    replicate \\\n",
-    "    wget \\\n",
     "    gitpython \\\n",
-    "    requests"
+    "    requests \\\n",
+    "    pypdf"
    ]
   },
   {
@@ -255,37 +255,67 @@
    "source": [
     "### Download the document\n",
     "\n",
-    "Here we you can use ."
+    "Here we can use any model's GitHub repository as a template. (Planned: support looking a model up by name through a web search API instead of requiring a GitHub URL.)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<>:27: SyntaxWarning: invalid escape sequence '\\.'\n",
+      "<>:27: SyntaxWarning: invalid escape sequence '\\.'\n",
+      "/var/folders/r5/n4xqkxwd157fww2y2qptk5xr0000gn/T/ipykernel_78178/3183198176.py:27: SyntaxWarning: invalid escape sequence '\\.'\n",
+      "  return re.findall(r'https://github\\.com/[^\\s)]+', text\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Cloning https://github.com/dpfried/incoder into /var/folders/r5/n4xqkxwd157fww2y2qptk5xr0000gn/T/tmpbpqzb0a8\n",
+      "\n",
+      "All repo content written to 'repo_summary_input.txt'\n",
+      "Feed it into Granite/Ollama to generate your YAML.\n"
+     ]
+    }
+   ],
    "source": [
     "import os\n",
     "import re\n",
     "import requests\n",
     "import tempfile\n",
     "from git import Repo\n",
+    "from pypdf import PdfReader\n",
     "\n",
     "def clone_repo(repo_url, dest_dir=os.getcwd()):\n",
     "    print(f\"Cloning {repo_url} into {dest_dir}\")\n",
     "    Repo.clone_from(repo_url, dest_dir)\n",
     "\n",
-    "def search_files_for_info(dest_dir=os.getcwd(), extensions=(\".md\", \"LICENSE.txt\")):\n",
+    "def search_files_for_info(dest_dir=os.getcwd(), extensions=(\".md\", \".txt\", \".pdf\")):\n",
     "    info = []\n",
     "    for subdir, _, files in os.walk(dest_dir):\n",
     "        for file in files:\n",
     "            if file.endswith(extensions):\n",
     "                filepath = os.path.join(subdir, file)\n",
-    "                try:\n",
-    "                    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:\n",
-    "                        content = f.read()\n",
+    "                if(file.endswith('.pdf')):\n",
+    "                    try:\n",
+    "                        reader = PdfReader(file)\n",
+    "                        content = \"\\n\".join(page.extract_text() for page in reader.pages if page.extract_text())\n",
     "                        info.append((filepath, content))\n",
-    "                except Exception as e:\n",
-    "                    print(f\"Failed to read {filepath}: {e}\")\n",
+    "                    except Exception as e:\n",
+    "                        print(f\"Failed to read {filepath}: {e}\")\n",
+    "                elif(file.endswith('.md') or file.endswith('.txt')):\n",
+    "                    try:\n",
+    "                        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:\n",
+    "                            content = f.read()\n",
+    "                            info.append((filepath, content))\n",
+    "                    except Exception as e:\n",
+    "                        print(f\"Failed to read {filepath}: {e}\")\n",
     "    return info\n",
     "    \n",
     "'''experimental features for advanced search such as multi-repository\n",
@@ -306,7 +336,7 @@
     "    with tempfile.TemporaryDirectory() as tmpdir:\n",
     "        clone_repo(repo_url, tmpdir)\n",
     "        all_info = search_files_for_info(tmpdir)\n",
-    "        all_text = \"\\n\\n\".join(f\"File: {filepath}\\n\\n{content}\" for filepath, content in all_info)\n",
+    "        all_text = \"\\n\\n\".join(f\"File: {filepath.replace(tmpdir,repo_url)}\\n\\n{content}\" for filepath, content in all_info)\n",
     "\n",
     "        \n",
     "        # external_links = extract_github_links(all_text)\n",
@@ -324,7 +354,7 @@
     "        f.write(all_text)\n",
     "\n",
     "    print(\"\\nAll repo content written to 'repo_summary_input.txt'\")\n",
-    "    print(\"Feed it into Granite/Ollama to generate your YAML.\")\n"
+    "    print(\"Feed it into Granite/Ollama to generate your YAML.\")"
    ]
   },
   {
@@ -410,6 +440,13 @@
     "Search the database for similar documents by proximity of the embedded vector in vector space."
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -678,7 +715,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "Python 3",
    "language": "python",
    "name": "python3"
   },
@@ -692,7 +729,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.5"
+   "version": "3.12.11"
   }
  },
  "nbformat": 4,