# NeMoConformerASR-Android

Kotlin library for speech recognition using the NVIDIA NeMo Conformer CTC model on Android with ONNX Runtime.

## Features

- NVIDIA NeMo Conformer CTC Small model (13M parameters)
- **ONNX Runtime** for reliable cross-device inference
- Returns both full text and timestamped segments
- Automatic audio chunking for long audio (>20 seconds)
- BPE tokenization (1,024-token vocabulary)
- Pure Kotlin implementation

## Requirements

- Android API 26+
- Any ARM or x86 device (ONNX Runtime handles compatibility)
## Installation

### JitPack

Add JitPack to your root `settings.gradle.kts`:

```kotlin
dependencyResolutionManagement {
    repositories {
        maven { url = uri("https://jitpack.io") }
    }
}
```

Add the dependency to your module's `build.gradle.kts`:

```kotlin
dependencies {
    implementation("com.github.Otosaku:NeMoConformerASR-Android:1.0.0")
}
```
### Download Models

Download the ONNX models from Google Drive:

**[Download Models (65 MB)](https://drive.google.com/file/d/1F2QBIyvxONhufgIA5xD0aN07wuN6Bn9r/view?usp=sharing)**

The archive contains:
- `conformer_encoder.onnx` - Conformer encoder (64 MB)
- `conformer_decoder.onnx` - CTC decoder (0.7 MB)
- `vocabulary.json` - BPE vocabulary (1024 tokens)

Download the models to the app's internal storage at runtime; they are not bundled in the APK, which keeps the app size small.
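One way to get the models into internal storage is to copy them from the app's assets on first launch (the example project below ships them in `assets/`). A minimal sketch; `copyModelFromAssets` is an illustrative helper, not part of the library's API:

```kotlin
import android.content.Context
import java.io.File

// Hypothetical helper: copy a model file from assets into filesDir
// on first run, returning the on-disk File either way.
fun copyModelFromAssets(context: Context, name: String): File {
    val target = File(context.filesDir, name)
    if (!target.exists()) {
        context.assets.open(name).use { input ->
            target.outputStream().use { output -> input.copyTo(output) }
        }
    }
    return target
}
```

Call it once per model file (`conformer_encoder.onnx`, `conformer_decoder.onnx`, `vocabulary.json`) before constructing the recognizer, and pass the resulting paths in.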

## Usage

### Basic Recognition

```kotlin
import com.otosaku.nemoconformerasr.NeMoConformerASR

// Initialize with model file paths
val asr = NeMoConformerASR(
    context = context,
    encoderPath = "${context.filesDir}/conformer_encoder.onnx",
    decoderPath = "${context.filesDir}/conformer_decoder.onnx",
    vocabularyPath = "${context.filesDir}/vocabulary.json"
)

// Recognize speech (samples must be 16kHz mono Float32)
val audioSamples: FloatArray = loadAudio()
val result = asr.recognize(audioSamples)

// Full recognized text
println(result.text)

// Individual segments with timestamps
for (segment in result.segments) {
    println("[${segment.start}s - ${segment.end}s]: ${segment.text}")
}

// Audio duration
println("Duration: ${result.audioDuration}s")

// Don't forget to close when done
asr.close()
```
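`loadAudio()` above is left to the caller. One possible implementation, assuming a 16 kHz mono 16-bit PCM WAV file with a plain 44-byte header (real WAV files can carry extra chunks, so a production parser should walk the chunk list instead of hard-coding the offset):

```kotlin
import java.io.File

// Read a 16-bit PCM WAV file into normalized Float32 samples.
// Assumes a plain 44-byte header (RIFF + fmt + data, no extra chunks).
fun loadWavAsFloats(path: String): FloatArray {
    val bytes = File(path).readBytes()
    val pcm = bytes.copyOfRange(44, bytes.size)
    val samples = FloatArray(pcm.size / 2)
    for (i in samples.indices) {
        // Little-endian signed 16-bit sample, scaled to [-1.0, 1.0)
        val lo = pcm[2 * i].toInt() and 0xFF
        val hi = pcm[2 * i + 1].toInt()
        samples[i] = ((hi shl 8) or lo) / 32768.0f
    }
    return samples
}
```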

### ASRResult Structure

```kotlin
data class ASRResult(
    val text: String,               // Full recognized text
    val segments: List<ASRSegment>, // Timestamped segments
    val audioDuration: Double       // Total audio duration in seconds
)

data class ASRSegment(
    val start: Double, // Start time in seconds
    val end: Double,   // End time in seconds
    val text: String   // Recognized text for this segment
)
```
### Supported Input Durations

The model accepts up to 20 seconds of audio per inference. Longer audio is automatically split into chunks.

| Duration | Samples | Mel Frames | Encoded Frames |
|----------|---------|------------|----------------|
| 5 sec    | 80,000  | 501        | 126            |
| 10 sec   | 160,000 | 1,001      | 251            |
| 15 sec   | 240,000 | 1,501      | 376            |
| 20 sec   | 320,000 | 2,001      | 501            |
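The numbers in the table follow from a 10 ms hop (160 samples at 16 kHz) and 4x time subsampling in the Conformer encoder. A small sketch of the arithmetic, inferred from the table rather than taken from the library's code:

```kotlin
// Mel frames: one frame per 160-sample hop, plus one for the initial window.
fun melFrames(samples: Int): Int = samples / 160 + 1

// Encoded frames: the encoder subsamples time by 4x.
fun encodedFrames(mel: Int): Int = (mel - 1) / 4 + 1
```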

### Long Audio Processing

For audio longer than 20 seconds, the library automatically:
1. Splits the audio into 20-second chunks
2. Processes each chunk independently
3. Combines the results with corrected timestamps
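The chunking step can be pictured as follows. This is an illustrative sketch, not the library's internal implementation; each chunk carries its start offset in seconds so segment timestamps can be shifted back into the original timeline:

```kotlin
// Split audio into fixed 20-second chunks (320,000 samples at 16 kHz),
// pairing each chunk with its start offset in seconds.
fun chunkAudio(
    samples: FloatArray,
    chunkSize: Int = 20 * 16_000
): List<Pair<Double, FloatArray>> {
    val chunks = mutableListOf<Pair<Double, FloatArray>>()
    var offset = 0
    while (offset < samples.size) {
        val end = minOf(offset + chunkSize, samples.size)
        chunks += (offset / 16_000.0) to samples.copyOfRange(offset, end)
        offset = end
    }
    return chunks
}
```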

## Example Project

The repository includes a complete example app with audio recording and file import.

### Running the Example

1. Open the project in Android Studio

2. Download and add the models:
   - Download the models from the link above
   - Unzip the archive
   - Copy the files to `app/src/main/assets/`:
     - `conformer_encoder.onnx`
     - `conformer_decoder.onnx`
     - `vocabulary.json`

3. Build and run on a device
### Example Features

- **Record Audio**: Hold the button to record from the microphone
- **Test File**: Import an audio file for testing
- **Results**: Shows the recognized text, audio duration, and processing time
## Model Information

- **Model**: nvidia/stt_en_conformer_ctc_small
- **Parameters**: 13.15M
- **Architecture**: Conformer encoder (16 layers) + CTC decoder
- **Hidden dim**: 176
- **Attention heads**: 4
- **Vocabulary**: 1024 BPE tokens + 1 blank
## Audio Requirements

- Sample rate: 16,000 Hz
- Channels: mono
- Format: Float32
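Recordings from Android's `AudioRecord` typically arrive as signed 16-bit PCM, so a conversion to the required Float32 format is usually needed. An illustrative helper (not part of the library):

```kotlin
// Convert signed 16-bit PCM samples to normalized Float32 in [-1.0, 1.0).
fun pcm16ToFloat(pcm: ShortArray): FloatArray =
    FloatArray(pcm.size) { i -> pcm[i] / 32768.0f }
```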

## Model Architecture

| Component         | Input                | Output                | Size   |
|-------------------|----------------------|-----------------------|--------|
| Feature Extractor | audio (16 kHz)       | mel (80, frames)      | -      |
| Encoder           | mel (1, 80, 2001)    | hidden (1, 176, 501)  | 64 MB  |
| Decoder           | hidden (1, 176, 501) | logits (1, 501, 1025) | 0.7 MB |
## Dependencies

- [ONNX Runtime Android](https://onnxruntime.ai/) - ML inference runtime
- [NeMoFeatureExtractor-Android](https://github.com/Otosaku/NeMoFeatureExtractor-Android) - Mel spectrogram extraction
- [Gson](https://github.com/google/gson) - JSON parsing
## License

MIT License

## Acknowledgments

- [NVIDIA NeMo](https://github.com/NVIDIA/NeMo) - Original model and training
- [ONNX Runtime](https://onnxruntime.ai/) - Cross-platform ML inference