Как изменить это приложение, чтобы отключить ввод из командной строки?
Это исходный код:
#include <stdio.h> #include <string.h> #include <assert.h> #if defined(_WIN32) && !defined(__CYGWIN__) #include <windows.h> #else #include <sys/select.h> #endif #include <sphinxbase/err.h> #include <sphinxbase/ad.h> #include "pocketsphinx.h" static const arg_t cont_args_def[] = { POCKETSPHINX_OPTIONS, /* Argument file. */ {"-argfile", ARG_STRING, NULL, "Argument file giving extra arguments."}, {"-adcdev", ARG_STRING, NULL, "Name of audio device to use for input."}, {"-infile", ARG_STRING, NULL, "Audio file to transcribe."}, {"-inmic", ARG_BOOLEAN, "no", "Transcribe audio from microphone."}, {"-time", ARG_BOOLEAN, "no", "Print word times in file transcription."}, CMDLN_EMPTY_OPTION }; static ps_decoder_t *ps; static cmd_ln_t *config; static FILE *rawfd; static void print_word_times() { int frame_rate = cmd_ln_int32_r(config, "-frate"); ps_seg_t *iter = ps_seg_iter(ps); while (iter != NULL) { int32 sf, ef, pprob; float conf; ps_seg_frames(iter, &sf, &ef); pprob = ps_seg_prob(iter, NULL, NULL, NULL); conf = logmath_exp(ps_get_logmath(ps), pprob); printf("%s %.3f %.3f %f\n", ps_seg_word(iter), ((float)sf / frame_rate), ((float) ef / frame_rate), conf); iter = ps_seg_next(iter); } } static int check_wav_header(char *header, int expected_sr) { int sr; if (header[34] != 0x10) { E_ERROR("Input audio file has [%d] bits per sample instead of 16\n", header[34]); return 0; } if (header[20] != 0x1) { E_ERROR("Input audio file has compression [%d] and not required PCM\n", header[20]); return 0; } if (header[22] != 0x1) { E_ERROR("Input audio file has [%d] channels, expected single channel mono\n", header[22]); return 0; } sr = ((header[24] & 0xFF) | ((header[25] & 0xFF) << 8) | ((header[26] & 0xFF) << 16) | ((header[27] & 0xFF) << 24)); if (sr != expected_sr) { E_ERROR("Input audio file has sample rate [%d], but decoder expects [%d]\n", sr, expected_sr); return 0; } return 1; } /* * Continuous recognition from a file */ static void recognize_from_file() { int16 adbuf[2048]; 
const char *fname; const char *hyp; int32 k; uint8 utt_started, in_speech; int32 print_times = cmd_ln_boolean_r(config, "-time"); fname = cmd_ln_str_r(config, "-infile"); if ((rawfd = fopen(fname, "rb")) == NULL) { E_FATAL_SYSTEM("Failed to open file '%s' for reading", fname); } if (strlen(fname) > 4 && strcmp(fname + strlen(fname) - 4, ".wav") == 0) { char waveheader[44]; fread(waveheader, 1, 44, rawfd); if (!check_wav_header(waveheader, (int)cmd_ln_float32_r(config, "-samprate"))) E_FATAL("Failed to process file '%s' due to format mismatch.\n", fname); } if (strlen(fname) > 4 && strcmp(fname + strlen(fname) - 4, ".mp3") == 0) { E_FATAL("Can not decode mp3 files, convert input file to WAV 16kHz 16-bit mono before decoding.\n"); } ps_start_utt(ps); utt_started = FALSE; while ((k = fread(adbuf, sizeof(int16), 2048, rawfd)) > 0) { ps_process_raw(ps, adbuf, k, FALSE, FALSE); in_speech = ps_get_in_speech(ps); if (in_speech && !utt_started) { utt_started = TRUE; } if (!in_speech && utt_started) { ps_end_utt(ps); hyp = ps_get_hyp(ps, NULL); if (hyp != NULL) printf("%s\n", hyp); if (print_times) print_word_times(); fflush(stdout); ps_start_utt(ps); utt_started = FALSE; } } ps_end_utt(ps); if (utt_started) { hyp = ps_get_hyp(ps, NULL); if (hyp != NULL) { printf("%s\n", hyp); if (print_times) { print_word_times(); } } } fclose(rawfd); } /* Sleep for specified msec */ static void sleep_msec(int32 ms) { #if (defined(_WIN32) && !defined(GNUWINCE)) || defined(_WIN32_WCE) Sleep(ms); #else /* ------------------- Unix ------------------ */ struct timeval tmo; tmo.tv_sec = 0; tmo.tv_usec = ms * 1000; select(0, NULL, NULL, NULL, &tmo); #endif } /* * Main utterance processing loop: * for (;;) { * start utterance and wait for speech to process * decoding till end-of-utterance silence will be detected * print utterance result; * } */ static void recognize_from_microphone() { ad_rec_t *ad; int16 adbuf[2048]; uint8 utt_started, in_speech; int32 k; char const *hyp; if ((ad = 
ad_open_dev(cmd_ln_str_r(config, "-adcdev"), (int) cmd_ln_float32_r(config, "-samprate"))) == NULL) E_FATAL("Failed to open audio device\n"); if (ad_start_rec(ad) < 0) E_FATAL("Failed to start recording\n"); if (ps_start_utt(ps) < 0) E_FATAL("Failed to start utterance\n"); utt_started = FALSE; E_INFO("Ready....\n"); for (;;) { if ((k = ad_read(ad, adbuf, 2048)) < 0) E_FATAL("Failed to read audio\n"); ps_process_raw(ps, adbuf, k, FALSE, FALSE); in_speech = ps_get_in_speech(ps); if (in_speech && !utt_started) { utt_started = TRUE; E_INFO("Listening...\n"); } if (!in_speech && utt_started) { /* speech -> silence transition, time to start new utterance */ ps_end_utt(ps); hyp = ps_get_hyp(ps, NULL ); if (hyp != NULL) { printf("%s\n", hyp); fflush(stdout); } if (ps_start_utt(ps) < 0) E_FATAL("Failed to start utterance\n"); utt_started = FALSE; E_INFO("Ready....\n"); } sleep_msec(100); } ad_close(ad); } int main(int argc, char *argv[]) { char const *cfg; config = cmd_ln_parse_r(NULL, cont_args_def, argc, argv, TRUE); /* Handle argument file as -argfile. 
*/ if (config && (cfg = cmd_ln_str_r(config, "-argfile")) != NULL) { config = cmd_ln_parse_file_r(config, cont_args_def, cfg, FALSE); } if (config == NULL || (cmd_ln_str_r(config, "-infile") == NULL && cmd_ln_boolean_r(config, "-inmic") == FALSE)) { E_INFO("Specify '-infile <file.wav>' to recognize from file or '-inmic yes' to recognize from microphone.\n"); cmd_ln_free_r(config); return 1; } ps_default_search_args(config); ps = ps_init(config); if (ps == NULL) { cmd_ln_free_r(config); return 1; } E_INFO("%s COMPILED ON: %s, AT: %s\n\n", argv[0], __DATE__, __TIME__); if (cmd_ln_str_r(config, "-infile") != NULL) { recognize_from_file(); } else if (cmd_ln_boolean_r(config, "-inmic")) { recognize_from_microphone(); } ps_free(ps); cmd_ln_free_r(config); return 0; } #if defined(_WIN32_WCE) #pragma comment(linker,"/entry:mainWCRTStartup") #include <windows.h> //Windows Mobile has the Unicode main only int wmain(int32 argc, wchar_t * wargv[]) { char **argv; size_t wlen; size_t len; int i; argv = malloc(argc * sizeof(char *)); for (i = 0; i < argc; i++) { wlen = lstrlenW(wargv[i]); len = wcstombs(NULL, wargv[i], wlen); argv[i] = malloc(len + 1); wcstombs(argv[i], wargv[i], wlen); } //assuming ASCII parameters return main(argc, argv); } #endif
Я могу скомпилировать его с помощью этой команды:
g++ -o output continuous.cpp -DMODELDIR=\"`pkg-config --variable=modeldir pocketsphinx`\" `pkg-config --cflags --libs pocketsphinx sphinxbase`
И запустить его этой командой:
output -inmic yes
Что я уже пробовал:
Я хочу изменить код так, чтобы не нужно было передавать "yes" и программа автоматически запускала распознавание с микрофона. Но я получил ошибку сегментации (core dumped), когда изменил эти части:
/*
 * NOTE(review): first attempt, kept exactly as posted. It is collapsed onto
 * one line, so as written the trailing line comment swallows everything
 * after it, including ps_free(), cmd_ln_free_r(), the return and the
 * closing brace.
 *
 * - cont_args_def is a single arg_t here, yet it is passed to
 *   cmd_ln_parse_r() in the same position as the original array, which
 *   starts with POCKETSPHINX_OPTIONS and ends with CMDLN_EMPTY_OPTION.
 * - This main() never assigns ps and never checks config for NULL. The
 *   original main() does both (ps = ps_init(config), early return on NULL)
 *   before recognize_from_microphone() passes ps to ps_start_utt().
 * - The microphone call is still guarded by "-inmic", whose default in the
 *   definition is "no".
 */
static const arg_t cont_args_def= {"-inmic", ARG_BOOLEAN, "no", "Transcribe audio from microphone."}; int main(int argc, char *argv[]) { config = cmd_ln_parse_r(NULL, cont_args_def, argc, argv, TRUE); if (cmd_ln_boolean_r(config, "-inmic")) { recognize_from_microphone(); } // recognize_from_microphone(); ps_free(ps); cmd_ln_free_r(config); return 0; }
Я много искал и читал документацию, но не смог понять, в чём проблема.
EDIT: я изменил код следующим образом:
/*
 * NOTE(review): reduced option table, kept exactly as posted. It holds the
 * PocketSphinx options plus "-inmic" and ends with CMDLN_EMPTY_OPTION.
 *
 * "-argfile", "-adcdev", "-infile" and "-time" are no longer defined, but
 * recognize_from_microphone() still calls cmd_ln_str_r(config, "-adcdev").
 * Presumably a lookup of an undefined option yields NULL - TODO confirm
 * against the sphinxbase cmd_ln documentation.
 */
static const arg_t cont_args_def[] = { POCKETSPHINX_OPTIONS, {"-inmic", ARG_BOOLEAN, "no", "Transcribe audio from microphone."}, CMDLN_EMPTY_OPTION };
/*
 * NOTE(review): second attempt, kept exactly as posted. It is collapsed onto
 * one line, so as written everything after the first line comment marker is
 * comment text; the original line breaks are not recoverable from here.
 *
 * - Judging by the commented-out "if", the intent is to call
 *   recognize_from_microphone() unconditionally.
 * - ps is a file-scope static that this main() never assigns, and the result
 *   of cmd_ln_parse_r() is not checked for NULL. The original main() calls
 *   ps_default_search_args(config) and ps = ps_init(config), returning early
 *   when either config or ps is NULL.
 * - The parse is done with the last argument TRUE. Presumably that is what
 *   prints the option table when the program is run without arguments -
 *   TODO confirm against the sphinxbase cmd_ln documentation.
 */
int main(int argc, char *argv[]) { config = cmd_ln_parse_r(NULL, cont_args_def, argc, argv, TRUE); // if (cmd_ln_boolean_r(config, "-inmic")) { recognize_from_microphone(); // } // recognize_from_microphone(); ps_free(ps); cmd_ln_free_r(config); return 0; }
Но результат такой:
Arguments list definition: [NAME] [DEFLT] [DESCR] -agc none Automatic gain control for c0 ('max', 'emax', 'noise', or 'none') -agcthresh 2.0 Initial threshold for automatic gain control -allphone Perform phoneme decoding with phonetic lm -allphone_ci no Perform phoneme decoding with phonetic lm and context-independent units only -alpha 0.97 Preemphasis parameter -ascale 20.0 Inverse of acoustic model scale for confidence score calculation -aw 1 Inverse weight applied to acoustic scores. -backtrace no Print results and backtraces to log. -beam 1e-48 Beam width applied to every frame in Viterbi search (smaller values mean wider beam) -bestpath yes Run bestpath (Dijkstra) search over word lattice (3rd pass) -bestpathlw 9.5 Language model probability weight for bestpath search -ceplen 13 Number of components in the input feature vector -cmn live Cepstral mean normalization scheme ('live', 'batch', or 'none') -cmninit 40,3,-1 Initial values (comma-separated) for cepstral mean when 'live' is used -compallsen no Compute all senone scores in every frame (can be faster when there are many senones) -debug Verbosity level for debugging messages -dict Main pronunciation dictionary (lexicon) input file -dictcase no Dictionary is case sensitive (NOTE: case insensitivity applies to ASCII characters only) -dither no Add 1/2-bit noise -doublebw no Use double bandwidth filters (same center freq) -ds 1 Frame GMM computation downsampling ratio -fdict Noise word pronunciation dictionary input file -feat 1s_c_d_dd Feature stream type, depends on the acoustic model -featparams File containing feature extraction parameters. -fillprob 1e-8 Filler word transition probability -frate 100 Frame rate -fsg Sphinx format finite state grammar file -fsgusealtpron yes Add alternate pronunciations to FSG -fsgusefiller yes Insert filler words at each state. 
-fwdflat yes Run forward flat-lexicon search over word lattice (2nd pass) -fwdflatbeam 1e-64 Beam width applied to every frame in second-pass flat search -fwdflatefwid 4 Minimum number of end frames for a word to be searched in fwdflat search -fwdflatlw 8.5 Language model probability weight for flat lexicon (2nd pass) decoding -fwdflatsfwin 25 Window of frames in lattice to search for successor words in fwdflat search -fwdflatwbeam 7e-29 Beam width applied to word exits in second-pass flat search -fwdtree yes Run forward lexicon-tree search (1st pass) -hmm Directory containing acoustic model files. -inmic no Transcribe audio from microphone. -input_endian little Endianness of input data, big or little, ignored if NIST or MS Wav -jsgf JSGF grammar file -keyphrase Keyphrase to spot -kws A file with keyphrases to spot, one per line -kws_delay 10 Delay to wait for best detection score -kws_plp 1e-1 Phone loop probability for keyphrase spotting -kws_threshold 1 Threshold for p(hyp)/p(alternatives) ratio -latsize 5000 Initial backpointer table size -lda File containing transformation matrix to be applied to features (single-stream features only) -ldadim 0 Dimensionality of output of feature transformation (0 to use entire matrix) -lifter 0 Length of sin-curve for liftering, or 0 for no liftering. 
-lm Word trigram language model input file -lmctl Specify a set of language model -lmname Which language model in -lmctl to use by default -logbase 1.0001 Base in which all log-likelihoods calculated -logfn File to write log messages in -logspec no Write out logspectral files instead of cepstra -lowerf 133.33334 Lower edge of filters -lpbeam 1e-40 Beam width applied to last phone in words -lponlybeam 7e-29 Beam width applied to last phone in single-phone words -lw 6.5 Language model probability weight -maxhmmpf 30000 Maximum number of active HMMs to maintain at each frame (or -1 for no pruning) -maxwpf -1 Maximum number of distinct word exits at each frame (or -1 for no pruning) -mdef Model definition input file -mean Mixture gaussian means input file -mfclogdir Directory to log feature files to -min_endfr 0 Nodes ignored in lattice construction if they persist for fewer than N frames -mixw Senone mixture weights input file (uncompressed) -mixwfloor 0.0000001 Senone mixture weights floor (applied to data from -mixw file) -mllr MLLR transformation to apply to means and variances -mmap yes Use memory-mapped I/O (if possible) for model files -ncep 13 Number of cep coefficients -nfft 512 Size of FFT -nfilt 40 Number of filter banks -nwpen 1.0 New word transition penalty -pbeam 1e-48 Beam width applied to phone transitions -pip 1.0 Phone insertion penalty -pl_beam 1e-10 Beam width applied to phone loop search for lookahead -pl_pbeam 1e-10 Beam width applied to phone loop transitions for lookahead -pl_pip 1.0 Phone insertion penalty for phone loop -pl_weight 3.0 Weight for phoneme lookahead penalties -pl_window 5 Phoneme lookahead window size, in frames -rawlogdir Directory to log raw audio files to -remove_dc no Remove DC offset from each frame -remove_noise yes Remove noise with spectral subtraction in mel-energies -remove_silence yes Enables VAD, removes silence frames from processing -round_filters yes Round mel filter frequencies to DFT points -samprate 16000 
Sampling rate -seed -1 Seed for random number generator; if less than zero, pick our own -sendump Senone dump (compressed mixture weights) input file -senlogdir Directory to log senone score files to -senmgau Senone to codebook mapping input file (usually not needed) -silprob 0.005 Silence word transition probability -smoothspec no Write out cepstral-smoothed logspectral files -svspec Subvector specification (e.g., 24,0-11/25,12-23/26-38 or 0-12/13-25/26-38) -tmat HMM state transition matrix input file -tmatfloor 0.0001 HMM state transition probability floor (applied to -tmat file) -topn 4 Maximum number of top Gaussians to use in scoring. -topn_beam 0 Beam width used to determine top-N Gaussians (or a list, per-feature) -toprule Start rule for JSGF (first public rule is default) -transform legacy Which type of transform to use to calculate cepstra (legacy, dct, or htk) -unit_area yes Normalize mel filters to unit area -upperf 6855.4976 Upper edge of filters -uw 1.0 Unigram weight -vad_postspeech 50 Num of silence frames to keep after from speech to silence. -vad_prespeech 20 Num of speech frames to keep before silence to speech. -vad_startspeech 10 Num of speech frames to trigger vad from silence to speech. -vad_threshold 2.0 Threshold for decision between noise and silence frames. Log-ratio between signal level and noise level. -var Mixture gaussian variances input file -varfloor 0.0001 Mixture gaussian variance floor (applied to data from -var file) -varnorm no Variance normalize each utterance (only if CMN == current) -verbose no Show input filenames -warp_params Parameters defining the warping function -warp_type inverse_linear Warping function type (or shape) -wbeam 7e-29 Beam width applied to word exits -wip 0.65 Word insertion penalty -wlen 0.025625 Hamming window length Segmentation fault (core dumped)