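#!/usr/bin/env bash
#
# Transcribe a local or remote .mp3/.mp4 file with whisper.
# Remote inputs are downloaded with wget; .mp4 inputs are transcoded to .mp3
# with ffmpeg before whisper is run.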
# Strict mode: abort on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -o errexit
set -o nounset
set -o pipefail

# -------- Defaults --------
# Each of these can be overridden by the matching command-line flag.
LANGUAGE='en'
MODEL='medium'
OUTPUT_FORMAT='txt' # one of: txt|vtt|srt|json|tsv|lrc|all
# Filled in later once the whisper executable has been located.
WHISPER_EXEC=''
# -------- Usage --------
# Print the command-line help text to stderr.
# Globals: LANGUAGE, MODEL, OUTPUT_FORMAT (read) - their current values are
#          shown as the defaults.
# Outputs: help text on stderr; nothing on stdout.
usage() {
  local -a help_lines=(
    'Usage:'
    "$0 <file.mp3|file.mp4|URL> [--language <code>] [--model <name>] [--output_format <fmt>]"
    'Options:'
    "--language <code> ISO code (default: ${LANGUAGE})"
    "--model <name> whisper model name (default: ${MODEL})"
    "--output_format <fmt> One of: txt|vtt|srt|json|tsv|lrc|all (default: ${OUTPUT_FORMAT})"
    '-h, --help Show this help'
    'Notes:'
    '- Input may be a local .mp3/.mp4 file or an http(s) URL to .mp3/.mp4'
    '- .mp4 will be transcoded to .mp3 via ffmpeg'
  )
  printf '%s\n' "${help_lines[@]}" >&2
}
# -------- Arg check --------
# The first positional argument is the input (file path or URL); without it
# there is nothing to do, so show the help and fail.
if [[ $# -lt 1 ]]; then
  usage
  exit 1
fi
# Honor -h/--help when it is the very first argument. Otherwise it would be
# consumed as the input and later rejected as a missing file.
case "$1" in
  -h | --help)
    usage
    exit 0
    ;;
esac
INPUT="$1"
shift
# -------- Parse flags --------
# Parse the "--flag value" options that follow the input argument.
# Globals:   LANGUAGE, MODEL, OUTPUT_FORMAT (written)
# Arguments: the remaining command-line words
# Exits:     0 after printing help for -h/--help; 1 (after printing help) on
#            an unknown option or a flag with a missing/empty value.
parse_flags() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --language | --model | --output_format)
        # Without this check a trailing flag makes `shift 2` fail and the
        # script dies silently under `set -e`.
        if [[ $# -lt 2 || -z "$2" ]]; then
          echo "Option $1 requires a value" >&2
          usage
          exit 1
        fi
        case "$1" in
          --language) LANGUAGE="$2" ;;
          --model) MODEL="$2" ;;
          --output_format) OUTPUT_FORMAT="$2" ;;
        esac
        shift 2
        ;;
      -h | --help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown option: $1" >&2
        usage
        exit 1
        ;;
    esac
  done
}
parse_flags "$@"
# All options have been consumed; leave no positional parameters behind.
set --
# -------- Validate deps --------
# Abort the script unless the given command can be resolved by the shell.
# Arguments: $1 - command name to look up
# Outputs:   an error message on stderr when the command is missing
# Exits:     1 when the command is missing; otherwise returns 0
need() {
  if ! command -v "$1" >/dev/null 2>&1; then
    echo "Missing dependency: $1" >&2
    exit 1
  fi
}
# ffmpeg stays an unconditional requirement, as before (this script itself
# invokes it only for .mp4 inputs).
need ffmpeg
# wget is used solely to download URL inputs, so do not demand it when the
# input is a local file.
if [[ "${INPUT:-}" =~ ^https?:// ]]; then
  need wget
fi
# Locate the whisper executable: prefer one on PATH, otherwise fall back to the
# first user-executable regular file named "whisper" found under $HOME.
if command -v whisper >/dev/null 2>&1; then
  WHISPER_EXEC="$(command -v whisper)"
else
  # -print -quit makes find stop at the first hit instead of walking the rest
  # of $HOME. `|| true` keeps find's non-zero status (e.g. unreadable
  # directories) from aborting the script under `set -e`.
  WHISPER_EXEC="$(find "$HOME" -type f -perm -u=x -name whisper -print -quit 2>/dev/null || true)"
fi
if [[ -z "$WHISPER_EXEC" ]]; then
  echo "Could not find 'whisper' executable in PATH or \$HOME" >&2
  exit 1
fi
# -------- Helpers --------
# Print the lowercased file extension of a path or URL (no trailing newline).
# Any "?query" and "#fragment" suffix is ignored. Only the last path component
# is inspected, so a dot in a directory or host name is not mistaken for an
# extension; a last component without a dot yields an empty string.
# Arguments: $1 - file path or URL
# Outputs:   the extension without the dot, lowercased, on stdout
lower_ext() {
  local target="$1"
  local trimmed="${target%%\?*}" # strip query
  trimmed="${trimmed%%\#*}"      # strip fragment
  local leaf="${trimmed##*/}"    # last path component
  local suffix=""
  if [[ "$leaf" == *.* ]]; then
    suffix="${leaf##*.}"
  fi
  printf '%s' "${suffix,,}"
}
# Succeed (status 0) when the argument starts with "http://" or "https://".
# The match is case-sensitive.
# Arguments: $1 - string to test
is_url() {
  case "$1" in
    http://* | https://*) return 0 ;;
    *) return 1 ;;
  esac
}
# Translate an --output_format value into the whisper output flag(s).
# Arguments: $1 - one of txt|vtt|srt|json|tsv|lrc|all
# Outputs:   the space-separated flag list on stdout
# Exits:     1, with a message on stderr, for any other value
output_flags_for() {
  local fmt="$1"
  local flags
  case "$fmt" in
    txt | vtt | srt | json | lrc)
      flags="--output-${fmt}"
      ;;
    tsv)
      # whisper.cpp uses CSV/TSV-ish output; adjust if your build differs.
      flags="--output-csv"
      ;;
    all)
      flags="--output-txt --output-vtt --output-srt --output-json --output-csv --output-lrc"
      ;;
    *)
      printf '%s\n' "Invalid --output_format: ${fmt}. Use txt|vtt|srt|json|tsv|lrc|all" >&2
      exit 1
      ;;
  esac
  printf '%s\n' "$flags"
}
# -------- Temp workspace & cleanup --------
# Paths filled in by the staging and transcoding steps below.
SOURCE_PATH=""
MP3_PATH=""
# Private scratch directory for downloads and transcoded audio; it is removed
# when the script exits.
WORKDIR="$(mktemp -d)"
cleanup_workdir() {
  rm -rf "$WORKDIR"
}
trap cleanup_workdir EXIT
# -------- Stage input (download if URL) --------
# Resolve INPUT to a local file in SOURCE_PATH and record its extension in ext.
# URLs are downloaded into WORKDIR; local paths are used in place.
if is_url "$INPUT"; then
  ext="$(lower_ext "$INPUT")"
  case "$ext" in
    mp3 | mp4) : ;;
    *) ext="mp3" ;; # unknown or missing extension: treat the download as MP3
  esac
  SOURCE_PATH="$WORKDIR/input.$ext"
  echo "↓ Downloading: $INPUT"
  # wget -q suppresses wget's own diagnostics, so report a failed download
  # here instead of letting `set -e` end the script without a message.
  if ! wget -q --show-progress -O "$SOURCE_PATH" "$INPUT"; then
    echo "Error: download failed: $INPUT" >&2
    exit 1
  fi
  # A zero-byte result cannot be transcribed; fail before running anything else.
  if [[ ! -s "$SOURCE_PATH" ]]; then
    echo "Error: downloaded file is empty: $INPUT" >&2
    exit 1
  fi
else
  if [[ ! -f "$INPUT" ]]; then
    echo "Error: file not found: $INPUT" >&2
    exit 1
  fi
  SOURCE_PATH="$INPUT"
  ext="$(lower_ext "$SOURCE_PATH")"
fi
# -------- Transcode if needed --------
# Make sure MP3_PATH points at an MP3: use .mp3 inputs directly, transcode .mp4
# inputs into WORKDIR, and reject every other extension.
case "$ext" in
  mp3)
    MP3_PATH="$SOURCE_PATH"
    ;;
  mp4)
    MP3_PATH="$WORKDIR/audio.mp3"
    echo "🎞 Transcoding MP4 → MP3 with ffmpeg..."
    # -nostdin: keep ffmpeg from reading the script's stdin.
    # -hide_banner / -loglevel error: stay quiet on success but let real
    # errors reach stderr, so a failed transcode is not silent.
    if ! ffmpeg -nostdin -hide_banner -loglevel error -y -i "$SOURCE_PATH" \
      -vn -acodec libmp3lame -q:a 2 "$MP3_PATH" >/dev/null; then
      echo "Error: ffmpeg failed to transcode: $SOURCE_PATH" >&2
      exit 1
    fi
    ;;
  *)
    echo "Error: unsupported extension '$ext'. Only .mp3 or .mp4 are handled." >&2
    exit 1
    ;;
esac
# -------- Run whisper --------
# Resolve the output flags first: an invalid --output_format makes
# output_flags_for fail, which aborts the script here under `set -e`.
OUT_FLAGS="$(output_flags_for "$OUTPUT_FORMAT")"
# Split the flag list into an array so each flag is passed as its own argument
# without relying on unquoted word-splitting (which would also glob).
read -r -a out_flag_args <<<"$OUT_FLAGS"
echo "▶ Running whisper"
echo " model: $MODEL"
echo " language: $LANGUAGE"
echo " outputs: $OUTPUT_FORMAT"
echo " input: $MP3_PATH"
# NOTE(review): `--device cuda` with a positional audio path looks like the
# openai-whisper CLI, while the --output-* flags follow whisper.cpp naming.
# Confirm which CLI "$WHISPER_EXEC" actually is and whether it accepts both.
"$WHISPER_EXEC" "$MP3_PATH" --model "$MODEL" --device cuda --language "$LANGUAGE" "${out_flag_args[@]}"