#!/usr/bin/env bash
#
# Transcribe an .mp3/.mp4 input with the `whisper` command-line tool.
#
# The input is either a local file or an http(s) URL. URLs are downloaded
# with wget into a temporary directory; .mp4 inputs are transcoded to .mp3
# with ffmpeg before whisper is invoked. The temporary directory is removed
# on exit.
#
# Usage: see usage() below, or run with --help.

set -euo pipefail

# -------- Defaults --------
# Kept separately from the live settings so the help text always shows the
# real defaults, even when it is printed after some options were parsed.
readonly DEFAULT_LANGUAGE="en"
readonly DEFAULT_MODEL="medium"
readonly DEFAULT_OUTPUT_FORMAT="txt" # txt|vtt|srt|json|tsv|lrc|all

LANGUAGE="$DEFAULT_LANGUAGE"
MODEL="$DEFAULT_MODEL"
OUTPUT_FORMAT="$DEFAULT_OUTPUT_FORMAT"
WHISPER_EXEC=""

# -------- Helpers --------

# Print the help text to stderr.
usage() {
  cat >&2 <<EOF
Usage: ${0##*/} <input> [--language <code>] [--model <name>] [--output_format <fmt>]

Options:
  --language <code>      ISO code (default: ${DEFAULT_LANGUAGE})
  --model <name>         whisper model name (default: ${DEFAULT_MODEL})
  --output_format <fmt>  One of: txt|vtt|srt|json|tsv|lrc|all (default: ${DEFAULT_OUTPUT_FORMAT})
  -h, --help             Show this help

Notes:
  - Input may be a local .mp3/.mp4 file or an http(s) URL to .mp3/.mp4
  - .mp4 will be transcoded to .mp3 via ffmpeg
EOF
}

# Print a message to stderr and abort with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Abort unless the command named by $1 is available on PATH.
need() {
  command -v "$1" >/dev/null 2>&1 || die "Missing dependency: $1"
}

# Succeed when $1 looks like an http:// or https:// URL.
is_url() {
  [[ "$1" =~ ^https?:// ]]
}

# Print the lowercased file extension of $1 (empty when there is none).
# For URLs the query string and fragment are removed first; local paths are
# left untouched because '?' and '#' are legal filename characters.
lower_ext() {
  local path="$1"
  if is_url "$path"; then
    path="${path%%\?*}" # strip query
    path="${path%%\#*}" # strip fragment
  fi
  # Look only at the last path component so a dot in a directory name is
  # never mistaken for the start of the extension.
  local base="${path##*/}"
  local ext=""
  if [[ "$base" == *.* ]]; then
    ext="${base##*.}"
  fi
  printf '%s' "${ext,,}"
}

# Print the space-separated whisper output flag(s) for format $1 on a single
# line. Prints an error and returns 1 for an unknown format.
output_flags_for() {
  case "$1" in
    txt) echo "--output-txt" ;;
    vtt) echo "--output-vtt" ;;
    srt) echo "--output-srt" ;;
    json) echo "--output-json" ;;
    tsv) echo "--output-csv" ;; # whisper.cpp uses CSV/TSV-ish; adjust if your build differs
    lrc) echo "--output-lrc" ;;
    all) echo "--output-txt --output-vtt --output-srt --output-json --output-csv --output-lrc" ;;
    *)
      echo "Invalid --output_format: $1. Use txt|vtt|srt|json|tsv|lrc|all" >&2
      return 1
      ;;
  esac
}

# -------- Arg check --------
if [[ $# -lt 1 ]]; then
  usage
  exit 1
fi

# Allow asking for help without supplying an input.
case "$1" in
  -h | --help)
    usage
    exit 0
    ;;
esac

INPUT="$1"
shift

# -------- Parse flags --------
while [[ $# -gt 0 ]]; do
  case "$1" in
    --language | --model | --output_format)
      # Without this check a trailing option would make `shift 2` fail and,
      # under `set -e`, terminate the script with no explanation.
      if [[ -z "${2:-}" ]]; then
        echo "Option $1 requires a value" >&2
        usage
        exit 1
      fi
      case "$1" in
        --language) LANGUAGE="$2" ;;
        --model) MODEL="$2" ;;
        --output_format) OUTPUT_FORMAT="$2" ;;
      esac
      shift 2
      ;;
    -h | --help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown option: $1" >&2
      usage
      exit 1
      ;;
  esac
done

# Reject a bad --output_format before any download or transcode work starts.
OUT_FLAGS="$(output_flags_for "$OUTPUT_FORMAT")" || exit 1

# -------- Validate deps --------
need ffmpeg
# wget is only used to fetch URL inputs.
if is_url "$INPUT"; then
  need wget
fi

# Prefer PATH whisper; fallback to $HOME scan
if command -v whisper >/dev/null 2>&1; then
  WHISPER_EXEC="$(command -v whisper)"
else
  # `|| true`: with pipefail, find may be killed by SIGPIPE once head exits,
  # and a scan that finds nothing must fall through to the check below.
  WHISPER_EXEC="$(find "$HOME" -type f -perm -u=x -name whisper 2>/dev/null | head -n1 || true)"
fi
[[ -n "$WHISPER_EXEC" ]] || die "Could not find 'whisper' executable in PATH or \$HOME"

# -------- Temp workspace & cleanup --------
WORKDIR="$(mktemp -d)" || die "Error: could not create temporary directory"
cleanup() {
  # ${WORKDIR:?} aborts instead of expanding to an empty path.
  rm -rf -- "${WORKDIR:?}"
}
trap cleanup EXIT

SOURCE_PATH=""
MP3_PATH=""

# -------- Stage input (download if URL) --------
if is_url "$INPUT"; then
  ext="$(lower_ext "$INPUT")"
  case "$ext" in
    mp3 | mp4) : ;;
    *) ext="mp3" ;; # default if unknown
  esac
  SOURCE_PATH="$WORKDIR/input.$ext"
  echo "↓ Downloading: $INPUT"
  wget -q --show-progress -O "$SOURCE_PATH" "$INPUT" \
    || die "Error: download failed: $INPUT"
else
  [[ -f "$INPUT" ]] || die "Error: file not found: $INPUT"
  SOURCE_PATH="$INPUT"
  ext="$(lower_ext "$SOURCE_PATH")"
fi

# -------- Transcode if needed --------
case "$ext" in
  mp3)
    MP3_PATH="$SOURCE_PATH"
    ;;
  mp4)
    MP3_PATH="$WORKDIR/audio.mp3"
    echo "🎞 Transcoding MP4 → MP3 with ffmpeg..."
    # Keep ffmpeg quiet on success but let real errors reach the terminal;
    # -nostdin stops it from consuming the script's standard input.
    ffmpeg -nostdin -hide_banner -loglevel error -y \
      -i "$SOURCE_PATH" -vn -acodec libmp3lame -q:a 2 "$MP3_PATH" \
      || die "Error: ffmpeg failed to transcode: $SOURCE_PATH"
    ;;
  *)
    die "Error: unsupported extension '$ext'. Only .mp3 or .mp4 are handled."
    ;;
esac

# -------- Run whisper --------
# Split the flag string into an array so each flag is passed as its own
# argument without relying on unquoted word-splitting.
read -r -a WHISPER_OUT_ARGS <<<"$OUT_FLAGS"

echo "▶ Running whisper"
echo "   model:    $MODEL"
echo "   language: $LANGUAGE"
echo "   outputs:  $OUTPUT_FORMAT"
echo "   input:    $MP3_PATH"

# NOTE(review): the --output-* flags look like whisper.cpp options, while
# `--device cuda` and a positional input look like the openai-whisper CLI
# (which takes `--output_format <fmt>` instead). Confirm against the
# installed `whisper` binary and adjust output_flags_for if needed.
"$WHISPER_EXEC" "$MP3_PATH" --model "$MODEL" --device cuda --language "$LANGUAGE" "${WHISPER_OUT_ARGS[@]}"