chore: sync sdk code with DeepLearning repo (#162)

zkleb-aai · AssemblyAI · web-flow · commit 43ff6f5eda24 · 2026-01-28T18:26:31.000-05:00
Co-authored-by: AssemblyAI <engineering.sdk@assemblyai.com>
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -86,8 +86,6 @@ jobs:
         if: ${{ steps.counter.outputs.count > 0 }}
         with:
           python-version: '3.9'
-          cache: 'pip'
-          cache-dependency-path: 'setup.py'
       - run: pip install mypy==1.5.1
         if: ${{ steps.counter.outputs.count > 0 }}
       - run: mypy ${{ steps.filter.outputs.python_files }} --follow-imports=silent --ignore-missing-imports
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -22,33 +22,19 @@ jobs:
         os:
           - ubuntu-22.04
     steps:
+      - name: Setup python for tox
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.py }}
+      - name: Install tox
+        run: python -m pip install tox
       - uses: actions/checkout@v3
         with:
           fetch-depth: 0
       - name: Setup python for test ${{ matrix.py }}
         uses: actions/setup-python@v4
         with:
           python-version: ${{ matrix.py }}
-          cache: 'pip'
-          cache-dependency-path: 'setup.py'
-      - name: Cache apt packages
-        uses: actions/cache@v3
-        with:
-          path: |
-            /var/cache/apt/archives
-            /var/lib/apt/lists
-          key: apt-${{ runner.os }}-portaudio
-          restore-keys: |
-            apt-${{ runner.os }}-
-      - name: Cache tox environments
-        uses: actions/cache@v3
-        with:
-          path: .tox
-          key: tox-${{ matrix.os }}-${{ matrix.py }}-${{ hashFiles('tox.ini', 'setup.py') }}
-          restore-keys: |
-            tox-${{ matrix.os }}-${{ matrix.py }}-
-      - name: Install tox
-        run: python -m pip install tox
       - name: Setup test suite
         run: |
           sudo apt-get update && sudo apt-get install -y portaudio19-dev
diff --git a/assemblyai/__version__.py b/assemblyai/__version__.py
@@ -1 +1 @@
-__version__ = "0.48.4"
+__version__ = "0.49.0"
diff --git a/assemblyai/transcriber.py b/assemblyai/transcriber.py
@@ -523,6 +523,22 @@ def webhook_auth(self) -> Optional[bool]:
 
         return self._impl.transcript.webhook_auth
 
+    @property
+    def language_code(self) -> Optional[Union[str, types.LanguageCode]]:
+        "The language code of the transcript"
+        if not self._impl.transcript:
+            raise ValueError("The internal Transcript object is None.")
+
+        return self._impl.transcript.language_code
+
+    @property
+    def language_codes(self) -> Optional[List[Union[str, types.LanguageCode]]]:
+        "The list of language codes for multilingual/code-switching audio"
+        if not self._impl.transcript:
+            raise ValueError("The internal Transcript object is None.")
+
+        return self._impl.transcript.language_codes
+
     @property
     def lemur(self) -> lemur.Lemur:
         """
diff --git a/assemblyai/types.py b/assemblyai/types.py
@@ -7,6 +7,7 @@
     Any,
     Dict,
     List,
+    Literal,
     Optional,
     Sequence,
     Tuple,
@@ -296,6 +297,9 @@ class EntityType(str, Enum):
     filename = "filename"
     "Names of computer files, including the extension or filepath (e.g., Taxes/2012/brad-tax-returns.pdf)"
 
+    gender = "gender"
+    "Terms indicating gender identity (e.g., female, male, non-binary)"
+
     gender_sexuality = "gender_sexuality"
     "Terms indicating gender identity or sexual orientation, including slang terms (e.g., female; bisexual; trans)"
 
@@ -314,6 +318,27 @@ class EntityType(str, Enum):
     location = "location"
     "Any Location reference including mailing address, postal code, city, state, province, country, or coordinates (e.g., Lake Victoria, 145 Windsor St., 90210)"
 
+    location_address = "location_address"
+    "Mailing address (e.g., 123 Main Street, Apartment 4B)"
+
+    location_address_street = "location_address_street"
+    "Street address (e.g., 123 Main Street)"
+
+    location_city = "location_city"
+    "City name (e.g., San Francisco, New York)"
+
+    location_coordinate = "location_coordinate"
+    "Geographic coordinates (e.g., 37.7749° N, 122.4194° W)"
+
+    location_country = "location_country"
+    "Country name (e.g., United States, Canada)"
+
+    location_state = "location_state"
+    "State or province name (e.g., California, Ontario)"
+
+    location_zip = "location_zip"
+    "Postal or ZIP code (e.g., 94102, M5V 3A8)"
+
     marital_status = "marital_status"
     "Terms indicating marital status (e.g., Single, common-law, ex-wife, married)"
 
@@ -338,6 +363,8 @@ class EntityType(str, Enum):
     organization = "organization"
     "Name of an organization (e.g., CNN, McDonalds, University of Alaska, Northwest General Hospital)"
 
+    organization_medical_facility = "organization_medical_facility"
+
     passport_number = "passport_number"
     "Passport numbers, issued by any country (e.g., PA4568332; NU3C6L86S12)"
 
@@ -362,6 +389,9 @@ class EntityType(str, Enum):
     religion = "religion"
     "Terms indicating religious affiliation (e.g., Hindu, Catholic)"
 
+    sexuality = "sexuality"
+    "Terms indicating sexual orientation (e.g., heterosexual, gay, bisexual)"
+
     statistics = "statistics"
     "Medical statistics (e.g., 18%, 18 percent)"
 
@@ -383,6 +413,40 @@ class EntityType(str, Enum):
     zodiac_sign = "zodiac_sign"
     "Names of Zodiac signs (e.g., Aries, Taurus)"
 
+    # BETA - English only
+    corporate_action = "corporate_action"
+    "Corporate actions (e.g., merger, acquisition, IPO)"
+
+    day = "day"
+    "Day reference (e.g., Monday, Friday)"
+
+    effect = "effect"
+    "Effect or result (e.g., increase, decrease)"
+
+    financial_metric = "financial_metric"
+    "Financial metrics (e.g., revenue, profit margin, EBITDA)"
+
+    medical_code = "medical_code"
+    "Medical codes (e.g., ICD-10, CPT codes)"
+
+    month = "month"
+    "Month reference (e.g., January, February)"
+
+    organization_id = "organization_id"
+    "Organization identification numbers (e.g., EIN, company registration number)"
+
+    product = "product"
+    "Product names (e.g., iPhone, Tesla Model 3)"
+
+    project = "project"
+    "Project names (e.g., Project Apollo, Manhattan Project)"
+
+    trend = "trend"
+    "Trend indicators (e.g., upward trend, downward trend)"
+
+    year = "year"
+    "Year reference (e.g., 2023, 1999)"
+
 
 # EntityType and PIIRedactionPolicy share the same values
 PIIRedactionPolicy = EntityType
@@ -704,6 +768,10 @@ class SpeakerOptions(BaseModel):
         None,
         description="Enable or disable two-stage clustering for speaker diarization",
     )
+    long_file_diarization_method: Optional[Literal["standard", "experimental"]] = Field(
+        None,
+        description="Diarization method for long files. Options: standard (default), experimental",
+    )
 
     if pydantic_v2:
 
@@ -861,7 +929,13 @@ class RawTranscriptionConfig(BaseModel):
     "The list of key terms used to generate the transcript with the Slam-1 speech model. Can't be used together with `prompt`."
 
     language_codes: Optional[List[Union[str, LanguageCode]]] = None
-    "List of language codes detected in the audio file when language detection is enabled"
+    """
+    A list of language codes associated with the transcript.
+
+    When submitting a transcript request, this can be used to provide multiple language codes
+    for multilingual/code-switching audio (equivalent to passing `language_codes` in the
+    `/v2/transcript` API request body).
+    """
 
     language_detection_results: Optional[LanguageDetectionResults] = None
     "Language detection results including code switching languages"
@@ -876,6 +950,7 @@ class TranscriptionConfig:
     def __init__(
         self,
         language_code: Optional[Union[str, LanguageCode]] = None,
+        language_codes: Optional[List[Union[str, LanguageCode]]] = None,
         punctuate: Optional[bool] = None,
         format_text: Optional[bool] = None,
         dual_channel: Optional[bool] = None,
@@ -922,6 +997,7 @@ def __init__(
         """
         Args:
             language_code: The language of your audio file. Possible values are found in Supported Languages.
+            language_codes: A list of language codes for multilingual/code-switching audio.
             punctuate: Enable Automatic Punctuation
             format_text: Enable Text Formatting
             dual_channel: Enable Dual Channel transcription
@@ -969,6 +1045,7 @@ def __init__(
 
         # explicit configurations have higher priority if `raw_transcription_config` has been passed as well
         self.language_code = language_code
+        self.language_codes = language_codes
         self.punctuate = punctuate
         self.format_text = format_text
         self.dual_channel = dual_channel
@@ -1455,10 +1532,17 @@ def speech_threshold(self, threshold: Optional[float]) -> None:
 
     @property
     def language_codes(self) -> Optional[List[Union[str, LanguageCode]]]:
-        "Returns the list of language codes detected in the audio file when language detection is enabled."
+        "Returns the list of language codes associated with this transcript/config."
 
         return self._raw_transcription_config.language_codes
 
+    @language_codes.setter
+    def language_codes(
+        self, language_codes: Optional[List[Union[str, LanguageCode]]]
+    ) -> None:
+        "Sets the list of language codes for multilingual/code-switching audio."
+        self._raw_transcription_config.language_codes = language_codes
+
     @property
     def language_detection_results(self) -> Optional[LanguageDetectionResults]:
         "Returns the language detection results including code switching languages."
@@ -1878,7 +1962,7 @@ class Utterance(UtteranceWord):
 class Chapter(BaseModel):
     summary: str
     headline: str
-    gist: str
+    gist: Optional[str] = None
     start: int
     end: int
 
diff --git a/tests/unit/test_auto_chapters.py b/tests/unit/test_auto_chapters.py
@@ -81,3 +81,41 @@ def test_auto_chapters_enabled(httpx_mock: HTTPXMock):
         assert transcript_chapter.gist == response_chapter["gist"]
         assert transcript_chapter.start == response_chapter["start"]
         assert transcript_chapter.end == response_chapter["end"]
+
+
+def test_auto_chapters_with_missing_gist(httpx_mock: HTTPXMock):
+    """
+    Tests that the SDK can handle Chapter responses where the `gist` field is missing.
+    The `gist` field is optional in the Chapter model and should default to None.
+    """
+    # Create a mock response with chapters that have missing gist fields
+    mock_response = factories.generate_dict_factory(AutoChaptersResponseFactory)()
+
+    # Remove the gist field from all chapters to simulate backend response without gist
+    for chapter in mock_response["chapters"]:
+        del chapter["gist"]
+
+    request_body, transcript = unit_test_utils.submit_mock_transcription_request(
+        httpx_mock,
+        mock_response=mock_response,
+        config=aai.TranscriptionConfig(auto_chapters=True),
+    )
+
+    # Check that request body was properly defined
+    assert request_body.get("auto_chapters") is True
+
+    # Check that transcript was properly parsed from JSON response
+    assert transcript.error is None
+    assert transcript.chapters is not None
+    assert len(transcript.chapters) > 0
+    assert len(transcript.chapters) == len(mock_response["chapters"])
+
+    # Verify that chapters can be parsed without gist field
+    for response_chapter, transcript_chapter in zip(
+        mock_response["chapters"], transcript.chapters
+    ):
+        assert transcript_chapter.summary == response_chapter["summary"]
+        assert transcript_chapter.headline == response_chapter["headline"]
+        assert transcript_chapter.gist is None  # Should be None when missing
+        assert transcript_chapter.start == response_chapter["start"]
+        assert transcript_chapter.end == response_chapter["end"]
diff --git a/tests/unit/test_speaker_options.py b/tests/unit/test_speaker_options.py
@@ -127,3 +127,52 @@ def test_transcription_config_with_two_stage_clustering():
     assert config.speaker_labels is True
     assert config.speaker_options == speaker_options
     assert config.speaker_options.use_two_stage_clustering is False
+
+
+def test_speaker_options_long_file_diarization_method():
+    """Test that SpeakerOptions can be created with long_file_diarization_method."""
+    speaker_options = aai.SpeakerOptions(long_file_diarization_method="experimental")
+    assert speaker_options.long_file_diarization_method == "experimental"
+
+
+def test_speaker_options_long_file_diarization_all_methods():
+    """Test all valid values for long_file_diarization_method."""
+    methods = ["standard", "experimental"]
+    for method in methods:
+        speaker_options = aai.SpeakerOptions(long_file_diarization_method=method)
+        assert speaker_options.long_file_diarization_method == method
+
+
+def test_transcription_config_with_long_file_experimental_diarization():
+    """Test the issue scenario: TranscriptionConfig with experimental diarization."""
+    speaker_options = aai.SpeakerOptions(long_file_diarization_method="experimental")
+
+    config = aai.TranscriptionConfig(
+        speaker_labels=True,
+        speaker_options=speaker_options,
+    )
+
+    assert config.speaker_labels is True
+    assert config.speaker_options == speaker_options
+    assert config.speaker_options.long_file_diarization_method == "experimental"
+    assert config.raw.speaker_options.long_file_diarization_method == "experimental"
+
+
+def test_transcription_config_with_all_speaker_options():
+    """Test TranscriptionConfig with all speaker options fields."""
+    speaker_options = aai.SpeakerOptions(
+        min_speakers_expected=2,
+        max_speakers_expected=5,
+        use_two_stage_clustering=False,
+        long_file_diarization_method="experimental",
+    )
+
+    config = aai.TranscriptionConfig(
+        speaker_labels=True,
+        speaker_options=speaker_options,
+    )
+
+    assert config.speaker_options.min_speakers_expected == 2
+    assert config.speaker_options.max_speakers_expected == 5
+    assert config.speaker_options.use_two_stage_clustering is False
+    assert config.speaker_options.long_file_diarization_method == "experimental"
diff --git a/tests/unit/test_transcriber.py b/tests/unit/test_transcriber.py
@@ -644,6 +644,36 @@ def test_language_detection(httpx_mock: HTTPXMock):
     assert request.get("language_code") is None
 
 
+def test_language_codes_request(httpx_mock: HTTPXMock):
+    mock_completed_json = factories.generate_dict_factory(
+        factories.TranscriptCompletedResponseFactory
+    )()
+
+    httpx_mock.add_response(
+        url=f"{aai.settings.base_url}{ENDPOINT_TRANSCRIPT}",
+        status_code=httpx.codes.OK,
+        method="POST",
+        json=mock_completed_json,
+    )
+
+    httpx_mock.add_response(
+        url=f"{aai.settings.base_url}{ENDPOINT_TRANSCRIPT}/{mock_completed_json['id']}",
+        status_code=httpx.codes.OK,
+        method="GET",
+        json=mock_completed_json,
+    )
+
+    aai.Transcriber().transcribe(
+        "https://example.org/audio.wav",
+        config=aai.TranscriptionConfig(
+            language_codes=["en", "es"],
+        ),
+    )
+
+    request = json.loads(httpx_mock.get_requests()[0].content.decode())
+    assert request.get("language_codes") == ["en", "es"]
+
+
 def test_language_code_string(httpx_mock: HTTPXMock):
     mock_completed_json = factories.generate_dict_factory(
         factories.TranscriptCompletedResponseFactory
diff --git a/tox.ini b/tox.ini

Rendered diff view of assemblyai/__version__.py (original file line 1 -> diff line 1):
@@ -1 +1 @@
-__version__ = "0.48.4"
+__version__ = "0.49.0"