chatGPTtools/bin/chatgpttranscription at develop · stoerr/chatGPTtools · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/bin/bash

#######################################
# Print the command-line help text and terminate the script.
# Arguments: none
# Outputs:   usage text on stdout
# Returns:   never returns; always exits with status 1 (also when invoked
#            for -h)
#######################################
show_usage() {
    # ${0##*/} strips the directory part without word-splitting, so a script
    # path that contains spaces is still printed in full.
    echo "Usage: ${0##*/} [-m model] [-l language] [-p prompt] [-r response_format] [-o output_file] [-t] <audio_file>"
    echo "  -m  Model ID to use for transcription. Default is 'whisper-1'."
    echo "  -l  Language code of the input audio. Optional but improves accuracy."
    echo "  -p  Text to guide the model's style. Optional."
    echo "  -r  Response format of the transcript output. Options: text, vtt, json, srt, verbose_json. Default is 'text'."
    echo "  -o  Output file to write the response to. If not provided, the response will be printed to stdout."
    echo "  -t  Trim the output - remove any whitespace at beginning and end of the response."
    echo "  -h  Show this help message and exit."
    echo "  <audio_file> The audio file to transcribe."
    exit 1
}

# If the file exceeds 26200000 bytes: first downmix stereo to mono, then recode if still too large.
# In the extreme case: ffmpeg -i audio.mp3 -vn -map_metadata -1 -ac 1 -c:a libopus -b:a 12k -application voip audio.ogg

#######################################
# Reset every option variable to its default and parse the given arguments.
# Globals:   model, response_format, output_file, trim_output, language,
#            prompt (written); OPTIND (left at the index of the first
#            non-option argument so the caller can shift the options away)
# Arguments: the script's command-line arguments
# Outputs:   an error message on stderr, followed by the usage text, for an
#            unknown option or a missing option argument
# Returns:   0; terminates the script through show_usage for -h and for
#            invalid options
#######################################
parse_options() {
  # Initialize default values
  model="whisper-1"
  response_format="text"
  output_file=""
  trim_output=0
  # Clear the optional values explicitly: otherwise variables of the same
  # name inherited from the environment would be sent with the request.
  language=""
  prompt=""

  local opt
  OPTIND=1
  # Leading ':' selects silent error reporting, handled by the \? and : arms.
  while getopts ":m:l:p:r:o:th" opt; do
    case "$opt" in
      m) model="$OPTARG" ;;
      l) language="$OPTARG" ;;
      p) prompt="$OPTARG" ;;
      r) response_format="$OPTARG" ;;
      o) output_file="$OPTARG" ;;
      # -t is accepted by the option string, so it needs an arm of its own;
      # without one the flag is silently ignored.
      t) trim_output=1 ;;
      h) show_usage ;;
      \?)
        echo "Invalid option -$OPTARG" >&2
        show_usage
        ;;
      :)
        echo "Option -$OPTARG requires an argument." >&2
        show_usage
        ;;
    esac
  done
}

# Parse options
parse_options "$@"

# Remove the options from the positional parameters
shift $((OPTIND - 1))

# Check for the audio file argument: after the options have been shifted
# away exactly one positional parameter must remain.
# Diagnostics go to stderr because stdout carries the transcript when no
# output file is given.
if [[ $# -ne 1 ]]; then
    echo "Error: You must provide an audio file to transcribe." >&2
    show_usage
fi

audio_file=$1

# Verify that the audio file exists (and is a regular file)
if [[ ! -f "$audio_file" ]]; then
    echo "Error: The file $audio_file does not exist." >&2
    exit 1
fi

# Shrink the audio file when it is larger than the size limit: downmix to
# mono if the first audio stream has more than one channel, then re-encode
# at a low bitrate if the result is still too large.
# Progress output goes to stderr because stdout carries the transcript when
# no output file is given.
max_upload_bytes=26200000

# Print the size of file $1 in bytes as bare digits (some wc implementations
# pad the number with blanks). Prints 0 and returns 1 if $1 is not a file.
file_size_bytes() {
  if [ ! -f "$1" ]; then
    echo 0
    return 1
  fi
  wc -c < "$1" | tr -d '[:space:]'
}

audio_size=$(file_size_bytes "$audio_file")
if (( audio_size > max_upload_bytes )); then
  # All intermediate files live in a private directory ($TMPDIR or /tmp)
  # that is removed when the script exits; this avoids the race inherent in
  # picking a name with "mktemp -u" and creating the file later.
  scratch_dir=$(mktemp -d "${TMPDIR:-/tmp}/chatgpttranscription.XXXXXXXXXX") || {
    echo "Error: could not create a temporary directory." >&2
    exit 1
  }
  trap 'rm -rf -- "$scratch_dir"' EXIT

  # check number of audio channels; a missing ffprobe or unparsable output
  # leaves $channels empty/non-numeric and the downmix step is skipped
  channels=$(ffprobe -v error -select_streams a:0 -show_entries stream=channels -of csv=p=0 "$audio_file" 2>/dev/null)
  if [ "$channels" -gt 1 ] 2>/dev/null; then
    mono_file="$scratch_dir/mono.mp3"
    echo "Downmixing stereo to mono..." >&2
    if ! ffmpeg -i "$audio_file" -vn -map_metadata -1 -ac 1 -c:a libmp3lame "$mono_file"; then
      echo "Error: downmixing $audio_file to mono failed." >&2
      exit 1
    fi
    audio_file=$mono_file
    ls -l "$audio_file" >&2
  fi

  # recode if still too large
  audio_size=$(file_size_bytes "$audio_file")
  if (( audio_size > max_upload_bytes )); then
    echo "Recoding to reduce file size..." >&2
    # if lame is in the path, use it to resample and reduce the bitrate
    if command -v lame > /dev/null 2>&1; then
      recoded_file="$scratch_dir/recoded.mp3"
      lame -h -a -S -t --resample 12 --abr 16 -m m "$audio_file" "$recoded_file"
      recode_status=$?
    else
      # The fallback encodes with libopus, which does not fit an MP3
      # container, so the output must be an Ogg file.
      # NOTE(review): assumes the transcription endpoint accepts .ogg uploads - confirm.
      recoded_file="$scratch_dir/recoded.ogg"
      ffmpeg -i "$audio_file" -vn -map_metadata -1 -ac 1 -c:a libopus -b:a 12k -application voip "$recoded_file"
      recode_status=$?
    fi
    if (( recode_status != 0 )); then
      echo "Error: recoding $audio_file failed." >&2
      exit 1
    fi
    audio_file=$recoded_file
    ls -l "$audio_file" >&2
  fi
fi

# Read the OpenAI API key: an OPENAI_API_KEY that is already set wins,
# otherwise the key is loaded from ~/.openai-api-key.txt.
# Errors are reported on stderr because stdout carries the transcript when
# no output file is given.
if [[ -z "${OPENAI_API_KEY:-}" ]]; then
    api_key_file="$HOME/.openai-api-key.txt"
    if [[ -f "$api_key_file" ]]; then
        # $(< file) reads the file without spawning cat; trailing newlines
        # are stripped by the command substitution.
        OPENAI_API_KEY=$(< "$api_key_file")
    else
        echo "Error: OPENAI_API_KEY is not set and no config file found at $HOME/.openai-api-key.txt." >&2
        exit 1
    fi
    # An empty key file would otherwise produce an empty bearer token.
    if [[ -z "$OPENAI_API_KEY" ]]; then
        echo "Error: the config file $api_key_file does not contain an API key." >&2
        exit 1
    fi
fi

# Build the curl command as an argument array instead of a string passed to
# eval: every value stays one argument no matter which characters it
# contains, so a quote in the prompt or file name can neither break the
# command nor inject shell code.
# NOTE(review): the API key is passed as a command-line argument and may be
# visible in the process list while curl runs.

# Inside -F, a file name wrapped in double quotes may contain ',' and ';'
# (otherwise parsed by curl as separators); '\' and '"' must be escaped.
curl_file_path=${audio_file//\\/\\\\}
curl_file_path=${curl_file_path//\"/\\\"}

curl_args=(
  -X POST 'https://api.openai.com/v1/audio/transcriptions' -s -S
  -H "Authorization: Bearer $OPENAI_API_KEY"
  -H 'Content-Type: multipart/form-data'
  -F "file=@\"$curl_file_path\""
  # --form-string sends the value literally; -F would treat a leading
  # '@' or '<' in the value as a request to read a local file.
  --form-string "model=$model"
)

# Add optional parameters if provided
[ -n "$language" ] && curl_args+=(--form-string "language=$language")
[ -n "$prompt" ] && curl_args+=(--form-string "prompt=$prompt")
curl_args+=(--form-string "response_format=$response_format")

# Execute the curl command and handle the response. The script exits with
# curl's status, so a failed transfer is reported as an error.
if [ -n "$output_file" ]; then
    curl "${curl_args[@]}" -o "$output_file"
    curl_status=$?
    # Only claim success when curl itself succeeded.
    if [ "$curl_status" -eq 0 ]; then
        echo "Response written to $output_file"
    fi
elif [ "${trim_output:-0}" -eq 1 ]; then
    # Strip leading and trailing whitespace from every line of the response.
    curl "${curl_args[@]}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'
    # Status of curl, not of sed.
    curl_status=${PIPESTATUS[0]}
else
    curl "${curl_args[@]}"
    curl_status=$?
fi

exit "$curl_status"