-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathchatgpttranscription
More file actions
executable file
·131 lines (115 loc) · 4.26 KB
/
chatgpttranscription
File metadata and controls
executable file
·131 lines (115 loc) · 4.26 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/bin/bash
#######################################
# Print the command-line help and terminate the script.
# Globals:   none
# Arguments: none
# Outputs:   the usage text on stdout
# Returns:   never returns; exits with status 1 (also when reached via -h)
#######################################
show_usage() {
  # Parameter expansion instead of $(basename $0): no fork, and it is safe
  # when the script path contains spaces.
  local prog=${0##*/}

  cat <<EOF
Usage: $prog [-m model] [-l language] [-p prompt] [-r response_format] [-o output_file] [-t] <audio_file>
 -m Model ID to use for transcription. Default is 'whisper-1'.
 -l Language code of the input audio. Optional but improves accuracy.
 -p Text to guide the model's style. Optional.
 -r Response format of the transcript output. Options: text, vtt, json, srt, verbose_json. Default is 'text'.
 -o Output file to write the response to. If not provided, the response will be printed to stdout.
 -t Trim the output - remove any whitespace at beginning and end of the response.
 -h Show this help message and exit.
 <audio_file> The audio file to transcribe.
EOF
  exit 1
}
# If the file exceeds 26200000 bytes: first downmix stereo to mono, then recode if still too large.
# In the extreme case: ffmpeg -i audio.mp3 -vn -map_metadata -1 -ac 1 -c:a libopus -b:a 12k -application voip audio.ogg
# Initialize default values.
# language and prompt are set explicitly so that same-named variables in the
# caller's environment cannot leak into the request.
model="whisper-1"
language=""
prompt=""
response_format="text"
output_file=""
trim_output=0

#######################################
# Parse the command-line options into the global settings.
# Globals:   model, language, prompt, response_format, output_file,
#            trim_output (written); OPTIND (written, left pointing at the
#            first non-option argument so the caller can shift)
# Arguments: the script's command-line arguments
# Outputs:   an error line on stderr for an unknown option or a missing
#            option argument
# Returns:   0 when parsing finished; -h and option errors call show_usage
#######################################
parse_options() {
  local opt
  OPTIND=1
  while getopts ":m:l:p:r:o:th" opt; do
    case "$opt" in
      m) model=$OPTARG ;;
      l) language=$OPTARG ;;
      p) prompt=$OPTARG ;;
      r) response_format=$OPTARG ;;
      o) output_file=$OPTARG ;;
      # -t was accepted by the option string but never acted on.
      t) trim_output=1 ;;
      h) show_usage ;;
      \?)
        echo "Invalid option -$OPTARG" >&2
        show_usage
        ;;
      :)
        echo "Option -$OPTARG requires an argument." >&2
        show_usage
        ;;
    esac
  done
}

# Parse options
parse_options "$@"
# Remove the options from the positional parameters
shift $((OPTIND - 1))
# Check for the audio file argument: exactly one positional argument must be
# left after the options. Errors go to stderr, like the option-parsing errors,
# so they never mix with a transcript printed on stdout.
if [ "$#" -ne 1 ]; then
  echo "Error: You must provide an audio file to transcribe." >&2
  show_usage
fi
audio_file=$1

# Verify that the audio file exists (and is a regular file)
if [ ! -f "$audio_file" ]; then
  echo "Error: The file $audio_file does not exist." >&2
  exit 1
fi
#######################################
# Shrink the audio file when it is too large to upload: downmix to mono if
# the first audio stream has more than one channel, then recode at a low
# bitrate if the result is still too large.
# Globals:   audio_file (read; replaced by the path of the converted copy)
#            transcode_dir (written; scratch directory removed on EXIT)
#            TMPDIR (read; /tmp is used when it is unset or empty)
# Arguments: $1 - size limit in bytes, optional, default 26200000
# Outputs:   progress messages and file listings on stderr, so stdout stays
#            reserved for the transcript
# Returns:   0 when audio_file is usable as-is or was converted,
#            1 when a required conversion step failed
#######################################
shrink_audio_for_upload() {
  local limit=${1:-26200000}
  local size channels mono recoded

  # Nothing to measure; a missing file is reported elsewhere.
  [[ -f "$audio_file" ]] || return 0
  size=$(wc -c < "$audio_file") || return 1
  (( size > limit )) || return 0

  # A private directory from `mktemp -d` replaces the `mktemp -u` names, which
  # only print an unused name and leave a window for another process to
  # create the file first.
  transcode_dir=$(mktemp -d "${TMPDIR:-/tmp}/tmp.XXXXXXXXXX") || {
    echo "Error: could not create a temporary directory." >&2
    return 1
  }
  trap 'rm -rf -- "$transcode_dir"' EXIT

  # check number of audio channels; a non-numeric answer (ffprobe missing or
  # failing) makes the test fail quietly and skips the downmix
  channels=$(ffprobe -v error -select_streams a:0 -show_entries stream=channels -of csv=p=0 "$audio_file" 2>/dev/null)
  if [ "$channels" -gt 1 ] 2>/dev/null; then
    mono="$transcode_dir/downmix.mp3"
    echo "Downmixing stereo to mono..." >&2
    # -nostdin keeps ffmpeg from reading the script's standard input.
    if ! ffmpeg -nostdin -i "$audio_file" -vn -map_metadata -1 -ac 1 -c:a libmp3lame "$mono"; then
      echo "Error: ffmpeg could not downmix $audio_file." >&2
      return 1
    fi
    audio_file=$mono
    ls -l "$audio_file" >&2
    size=$(wc -c < "$audio_file") || return 1
  fi

  # recode if still too large
  (( size > limit )) || return 0
  echo "Recoding to reduce file size..." >&2
  recoded="$transcode_dir/recoded.mp3"
  # if lame is in the path, use it to resample and reduce the bitrate
  if command -v lame > /dev/null 2>&1 \
    && lame -h -a -S -t --resample 12 --abr 16 -m m "$audio_file" "$recoded"; then
    :
  else
    # The fallback produces Opus, which needs an Ogg container; writing it to
    # a .mp3 path makes ffmpeg reject the output.
    recoded="$transcode_dir/recoded.ogg"
    if ! ffmpeg -nostdin -i "$audio_file" -vn -map_metadata -1 -ac 1 -c:a libopus -b:a 12k -application voip "$recoded"; then
      echo "Error: could not recode $audio_file." >&2
      return 1
    fi
  fi
  audio_file=$recoded
  ls -l "$audio_file" >&2
}

# downmix to mono if larger than 26200000 and file is stereo, recode if still too large
shrink_audio_for_upload || exit 1
# Read the OpenAI API key: the OPENAI_API_KEY environment variable wins,
# otherwise the key is taken from $HOME/.openai-api-key.txt.
# Failures are reported on stderr so they never mix with transcript output.
if [ -z "$OPENAI_API_KEY" ]; then
  api_key_file="$HOME/.openai-api-key.txt"
  if [ -f "$api_key_file" ]; then
    # $(< file) reads the file without spawning cat; trailing newlines are
    # removed by the command substitution.
    OPENAI_API_KEY=$(< "$api_key_file")
  else
    echo "Error: OPENAI_API_KEY is not set and no config file found at $HOME/.openai-api-key.txt." >&2
    exit 1
  fi
  # An empty key file would otherwise be sent as an empty bearer token.
  if [ -z "$OPENAI_API_KEY" ]; then
    echo "Error: $api_key_file is empty." >&2
    exit 1
  fi
fi
# Build the curl command as an argument array instead of a string run through
# eval: every value reaches curl as exactly one argument, so quotes, spaces or
# $(...) in the file name, prompt, model or API key can neither break the
# request nor be executed by the shell.

# curl parses the text after "file=@"; wrapping the path in double quotes
# (with embedded backslashes and quotes escaped) keeps ',' and ';' in a file
# name from being read as curl form modifiers.
upload_path=${audio_file//\\/\\\\}
upload_path=${upload_path//\"/\\\"}

curl_args=(
  -X POST 'https://api.openai.com/v1/audio/transcriptions' -s -S
  -H "Authorization: Bearer $OPENAI_API_KEY"
  -H 'Content-Type: multipart/form-data'
  -F "file=@\"$upload_path\""
  # --form-string sends the value literally; with -F a value starting with
  # '@' or '<' would make curl read a local file instead.
  --form-string "model=$model"
)

# Add optional parameters if provided
if [ -n "$language" ]; then
  curl_args+=(--form-string "language=$language")
fi
if [ -n "$prompt" ]; then
  curl_args+=(--form-string "prompt=$prompt")
fi
curl_args+=(--form-string "response_format=$response_format")

# Execute the curl command and handle the response.
# With -o the response goes to the file as received (trimming is not applied
# in that case); otherwise it is printed to stdout, optionally trimmed.
if [ -n "$output_file" ]; then
  curl "${curl_args[@]}" -o "$output_file"
  status=$?
  # Only claim success when curl actually succeeded.
  if [ "$status" -eq 0 ]; then
    echo "Response written to $output_file"
  fi
elif [ "$trim_output" -eq 1 ]; then
  curl "${curl_args[@]}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//'
  # Report curl's status, not sed's.
  status=${PIPESTATUS[0]}
else
  curl "${curl_args[@]}"
  status=$?
fi

# Propagate curl's exit status instead of always reporting success.
exit "$status"