Member 13376650    Answers: 1

How can I change this application so that it does not take its input from the command line?


This is the source code:

#include <stdio.h>
#include <string.h>
#include <assert.h>

#if defined(_WIN32) && !defined(__CYGWIN__)
#include <windows.h>
#else
#include <sys/select.h>
#endif

#include <sphinxbase/err.h>
#include <sphinxbase/ad.h>

#include "pocketsphinx.h"

static const arg_t cont_args_def[] = {
    POCKETSPHINX_OPTIONS,
    /* Argument file. */
    {"-argfile",
     ARG_STRING,
     NULL,
     "Argument file giving extra arguments."},
    {"-adcdev",
     ARG_STRING,
     NULL,
     "Name of audio device to use for input."},
    {"-infile",
     ARG_STRING,
     NULL,
     "Audio file to transcribe."},
    {"-inmic",
     ARG_BOOLEAN,
     "no",
     "Transcribe audio from microphone."},
    {"-time",
     ARG_BOOLEAN,
     "no",
     "Print word times in file transcription."},
    CMDLN_EMPTY_OPTION
};

static ps_decoder_t *ps;
static cmd_ln_t *config;
static FILE *rawfd;

static void
print_word_times()
{
    int frame_rate = cmd_ln_int32_r(config, "-frate");
    ps_seg_t *iter = ps_seg_iter(ps);
    while (iter != NULL) {
        int32 sf, ef, pprob;
        float conf;

        ps_seg_frames(iter, &sf, &ef);
        pprob = ps_seg_prob(iter, NULL, NULL, NULL);
        conf = logmath_exp(ps_get_logmath(ps), pprob);
        printf("%s %.3f %.3f %f\n", ps_seg_word(iter), ((float)sf / frame_rate),
               ((float) ef / frame_rate), conf);
        iter = ps_seg_next(iter);
    }
}

static int
check_wav_header(char *header, int expected_sr)
{
    int sr;

    if (header[34] != 0x10) {
        E_ERROR("Input audio file has [%d] bits per sample instead of 16\n", header[34]);
        return 0;
    }
    if (header[20] != 0x1) {
        E_ERROR("Input audio file has compression [%d] and not required PCM\n", header[20]);
        return 0;
    }
    if (header[22] != 0x1) {
        E_ERROR("Input audio file has [%d] channels, expected single channel mono\n", header[22]);
        return 0;
    }
    sr = ((header[24] & 0xFF) | ((header[25] & 0xFF) << 8) | ((header[26] & 0xFF) << 16) | ((header[27] & 0xFF) << 24));
    if (sr != expected_sr) {
        E_ERROR("Input audio file has sample rate [%d], but decoder expects [%d]\n", sr, expected_sr);
        return 0;
    }
    return 1;
}

/*
 * Continuous recognition from a file
 */
static void
recognize_from_file()
{
    int16 adbuf[2048];
    const char *fname;
    const char *hyp;
    int32 k;
    uint8 utt_started, in_speech;
    int32 print_times = cmd_ln_boolean_r(config, "-time");

    fname = cmd_ln_str_r(config, "-infile");
    if ((rawfd = fopen(fname, "rb")) == NULL) {
        E_FATAL_SYSTEM("Failed to open file '%s' for reading",
                       fname);
    }

    if (strlen(fname) > 4 && strcmp(fname + strlen(fname) - 4, ".wav") == 0) {
        char waveheader[44];
        fread(waveheader, 1, 44, rawfd);
        if (!check_wav_header(waveheader, (int) cmd_ln_float32_r(config, "-samprate")))
            E_FATAL("Failed to process file '%s' due to format mismatch.\n", fname);
    }

    if (strlen(fname) > 4 && strcmp(fname + strlen(fname) - 4, ".mp3") == 0) {
        E_FATAL("Can not decode mp3 files, convert input file to WAV 16kHz 16-bit mono before decoding.\n");
    }

    ps_start_utt(ps);
    utt_started = FALSE;

    while ((k = fread(adbuf, sizeof(int16), 2048, rawfd)) > 0) {
        ps_process_raw(ps, adbuf, k, FALSE, FALSE);
        in_speech = ps_get_in_speech(ps);
        if (in_speech && !utt_started) {
            utt_started = TRUE;
        } 
        if (!in_speech && utt_started) {
            ps_end_utt(ps);
            hyp = ps_get_hyp(ps, NULL);
            if (hyp != NULL)
                printf("%s\n", hyp);
            if (print_times)
                print_word_times();
            fflush(stdout);

            ps_start_utt(ps);
            utt_started = FALSE;
        }
    }
    ps_end_utt(ps);
    if (utt_started) {
        hyp = ps_get_hyp(ps, NULL);
        if (hyp != NULL) {
            printf("%s\n", hyp);
            if (print_times) {
                print_word_times();
            }
        }
    }

    fclose(rawfd);
}

/* Sleep for specified msec */
static void
sleep_msec(int32 ms)
{
#if (defined(_WIN32) && !defined(GNUWINCE)) || defined(_WIN32_WCE)
    Sleep(ms);
#else
    /* ------------------- Unix ------------------ */
    struct timeval tmo;

    tmo.tv_sec = 0;
    tmo.tv_usec = ms * 1000;

    select(0, NULL, NULL, NULL, &tmo);
#endif
}

/*
 * Main utterance processing loop:
 *     for (;;) {
 *        start utterance and wait for speech to process
 *        decoding till end-of-utterance silence will be detected
 *        print utterance result;
 *     }
 */
static void
recognize_from_microphone()
{
    ad_rec_t *ad;
    int16 adbuf[2048];
    uint8 utt_started, in_speech;
    int32 k;
    char const *hyp;

    if ((ad = ad_open_dev(cmd_ln_str_r(config, "-adcdev"),
                          (int) cmd_ln_float32_r(config,
                                                 "-samprate"))) == NULL)
        E_FATAL("Failed to open audio device\n");
    if (ad_start_rec(ad) < 0)
        E_FATAL("Failed to start recording\n");

    if (ps_start_utt(ps) < 0)
        E_FATAL("Failed to start utterance\n");
    utt_started = FALSE;
    E_INFO("Ready....\n");

    for (;;) {
        if ((k = ad_read(ad, adbuf, 2048)) < 0)
            E_FATAL("Failed to read audio\n");
        ps_process_raw(ps, adbuf, k, FALSE, FALSE);
        in_speech = ps_get_in_speech(ps);
        if (in_speech && !utt_started) {
            utt_started = TRUE;
            E_INFO("Listening...\n");
        }
        if (!in_speech && utt_started) {
            /* speech -> silence transition, time to start new utterance  */
            ps_end_utt(ps);
            hyp = ps_get_hyp(ps, NULL );
            if (hyp != NULL) {
                printf("%s\n", hyp);
                fflush(stdout);
            }

            if (ps_start_utt(ps) < 0)
                E_FATAL("Failed to start utterance\n");
            utt_started = FALSE;
            E_INFO("Ready....\n");
        }
        sleep_msec(100);
    }
    ad_close(ad);
}

int
main(int argc, char *argv[])
{
    char const *cfg;

    config = cmd_ln_parse_r(NULL, cont_args_def, argc, argv, TRUE);

    /* Handle argument file as -argfile. */
    if (config && (cfg = cmd_ln_str_r(config, "-argfile")) != NULL) {
        config = cmd_ln_parse_file_r(config, cont_args_def, cfg, FALSE);
    }

    if (config == NULL || (cmd_ln_str_r(config, "-infile") == NULL && cmd_ln_boolean_r(config, "-inmic") == FALSE)) {
    E_INFO("Specify '-infile <file.wav>' to recognize from file or '-inmic yes' to recognize from microphone.\n");
        cmd_ln_free_r(config);
    return 1;
    }

    ps_default_search_args(config);
    ps = ps_init(config);
    if (ps == NULL) {
        cmd_ln_free_r(config);
        return 1;
    }

    E_INFO("%s COMPILED ON: %s, AT: %s\n\n", argv[0], __DATE__, __TIME__);

    if (cmd_ln_str_r(config, "-infile") != NULL) {
        recognize_from_file();
    } else if (cmd_ln_boolean_r(config, "-inmic")) {
        recognize_from_microphone();
    }

    ps_free(ps);
    cmd_ln_free_r(config);

    return 0;
}

#if defined(_WIN32_WCE)
#pragma comment(linker,"/entry:mainWCRTStartup")
#include <windows.h>
//Windows Mobile has the Unicode main only
int
wmain(int32 argc, wchar_t * wargv[])
{
    char **argv;
    size_t wlen;
    size_t len;
    int i;

    argv = malloc(argc * sizeof(char *));
    for (i = 0; i < argc; i++) {
        wlen = lstrlenW(wargv[i]);
        len = wcstombs(NULL, wargv[i], wlen);
        argv[i] = malloc(len + 1);
        wcstombs(argv[i], wargv[i], wlen);
    }

    //assuming ASCII parameters
    return main(argc, argv);
}
#endif


I can compile it with this command:

g++ -o output continuous.cpp -DMODELDIR=\"`pkg-config --variable=modeldir pocketsphinx`\" `pkg-config --cflags --libs pocketsphinx sphinxbase`


And run it with this command:
output -inmic yes


What I have tried:

I would like to change the code so that I do not have to pass "yes" on the command line and the program automatically starts recognizing from the microphone. However, I got a segmentation fault (core dumped) when I changed these parts:

static const arg_t cont_args_def= {"-inmic",
     ARG_BOOLEAN,
     "no",
     "Transcribe audio from microphone."};

int main(int argc, char *argv[])
{
    config = cmd_ln_parse_r(NULL, cont_args_def, argc, argv, TRUE);

    if (cmd_ln_boolean_r(config, "-inmic")) {
        recognize_from_microphone();
    }

    // recognize_from_microphone();
    ps_free(ps);
    cmd_ln_free_r(config);

    return 0;
}


I have searched a lot and read through the documentation, but I could not figure out what the problem is.



EDIT: I changed the code as follows:

static const arg_t cont_args_def[] = {
    POCKETSPHINX_OPTIONS,

    {"-inmic",
     ARG_BOOLEAN,
     "no",
     "Transcribe audio from microphone."},

    CMDLN_EMPTY_OPTION
};


int main(int argc, char *argv[])
{
    config = cmd_ln_parse_r(NULL, cont_args_def, argc, argv, TRUE);

    // if (cmd_ln_boolean_r(config, "-inmic")) {
        recognize_from_microphone();
    // }

    // recognize_from_microphone();
    ps_free(ps);
    cmd_ln_free_r(config);

    return 0;
}



But the result is:

Arguments list definition:
[NAME]			[DEFLT]		[DESCR]
-agc			none		Automatic gain control for c0 ('max', 'emax', 'noise', or 'none')
-agcthresh		2.0		Initial threshold for automatic gain control
-allphone				Perform phoneme decoding with phonetic lm
-allphone_ci		no		Perform phoneme decoding with phonetic lm and context-independent units only
-alpha			0.97		Preemphasis parameter
-ascale			20.0		Inverse of acoustic model scale for confidence score calculation
-aw			1		Inverse weight applied to acoustic scores.
-backtrace		no		Print results and backtraces to log.
-beam			1e-48		Beam width applied to every frame in Viterbi search (smaller values mean wider beam)
-bestpath		yes		Run bestpath (Dijkstra) search over word lattice (3rd pass)
-bestpathlw		9.5		Language model probability weight for bestpath search
-ceplen			13		Number of components in the input feature vector
-cmn			live		Cepstral mean normalization scheme ('live', 'batch', or 'none')
-cmninit		40,3,-1		Initial values (comma-separated) for cepstral mean when 'live' is used
-compallsen		no		Compute all senone scores in every frame (can be faster when there are many senones)
-debug					Verbosity level for debugging messages
-dict					Main pronunciation dictionary (lexicon) input file
-dictcase		no		Dictionary is case sensitive (NOTE: case insensitivity applies to ASCII characters only)
-dither			no		Add 1/2-bit noise
-doublebw		no		Use double bandwidth filters (same center freq)
-ds			1		Frame GMM computation downsampling ratio
-fdict					Noise word pronunciation dictionary input file
-feat			1s_c_d_dd	Feature stream type, depends on the acoustic model
-featparams				File containing feature extraction parameters.
-fillprob		1e-8		Filler word transition probability
-frate			100		Frame rate
-fsg					Sphinx format finite state grammar file
-fsgusealtpron		yes		Add alternate pronunciations to FSG
-fsgusefiller		yes		Insert filler words at each state.
-fwdflat		yes		Run forward flat-lexicon search over word lattice (2nd pass)
-fwdflatbeam		1e-64		Beam width applied to every frame in second-pass flat search
-fwdflatefwid		4		Minimum number of end frames for a word to be searched in fwdflat search
-fwdflatlw		8.5		Language model probability weight for flat lexicon (2nd pass) decoding
-fwdflatsfwin		25		Window of frames in lattice to search for successor words in fwdflat search 
-fwdflatwbeam		7e-29		Beam width applied to word exits in second-pass flat search
-fwdtree		yes		Run forward lexicon-tree search (1st pass)
-hmm					Directory containing acoustic model files.
-inmic			no		Transcribe audio from microphone.
-input_endian		little		Endianness of input data, big or little, ignored if NIST or MS Wav
-jsgf					JSGF grammar file
-keyphrase				Keyphrase to spot
-kws					A file with keyphrases to spot, one per line
-kws_delay		10		Delay to wait for best detection score
-kws_plp		1e-1		Phone loop probability for keyphrase spotting
-kws_threshold		1		Threshold for p(hyp)/p(alternatives) ratio
-latsize		5000		Initial backpointer table size
-lda					File containing transformation matrix to be applied to features (single-stream features only)
-ldadim			0		Dimensionality of output of feature transformation (0 to use entire matrix)
-lifter			0		Length of sin-curve for liftering, or 0 for no liftering.
-lm					Word trigram language model input file
-lmctl					Specify a set of language model
-lmname					Which language model in -lmctl to use by default
-logbase		1.0001		Base in which all log-likelihoods calculated
-logfn					File to write log messages in
-logspec		no		Write out logspectral files instead of cepstra
-lowerf			133.33334	Lower edge of filters
-lpbeam			1e-40		Beam width applied to last phone in words
-lponlybeam		7e-29		Beam width applied to last phone in single-phone words
-lw			6.5		Language model probability weight
-maxhmmpf		30000		Maximum number of active HMMs to maintain at each frame (or -1 for no pruning)
-maxwpf			-1		Maximum number of distinct word exits at each frame (or -1 for no pruning)
-mdef					Model definition input file
-mean					Mixture gaussian means input file
-mfclogdir				Directory to log feature files to
-min_endfr		0		Nodes ignored in lattice construction if they persist for fewer than N frames
-mixw					Senone mixture weights input file (uncompressed)
-mixwfloor		0.0000001	Senone mixture weights floor (applied to data from -mixw file)
-mllr					MLLR transformation to apply to means and variances
-mmap			yes		Use memory-mapped I/O (if possible) for model files
-ncep			13		Number of cep coefficients
-nfft			512		Size of FFT
-nfilt			40		Number of filter banks
-nwpen			1.0		New word transition penalty
-pbeam			1e-48		Beam width applied to phone transitions
-pip			1.0		Phone insertion penalty
-pl_beam		1e-10		Beam width applied to phone loop search for lookahead
-pl_pbeam		1e-10		Beam width applied to phone loop transitions for lookahead
-pl_pip			1.0		Phone insertion penalty for phone loop
-pl_weight		3.0		Weight for phoneme lookahead penalties
-pl_window		5		Phoneme lookahead window size, in frames
-rawlogdir				Directory to log raw audio files to
-remove_dc		no		Remove DC offset from each frame
-remove_noise		yes		Remove noise with spectral subtraction in mel-energies
-remove_silence		yes		Enables VAD, removes silence frames from processing
-round_filters		yes		Round mel filter frequencies to DFT points
-samprate		16000		Sampling rate
-seed			-1		Seed for random number generator; if less than zero, pick our own
-sendump				Senone dump (compressed mixture weights) input file
-senlogdir				Directory to log senone score files to
-senmgau				Senone to codebook mapping input file (usually not needed)
-silprob		0.005		Silence word transition probability
-smoothspec		no		Write out cepstral-smoothed logspectral files
-svspec					Subvector specification (e.g., 24,0-11/25,12-23/26-38 or 0-12/13-25/26-38)
-tmat					HMM state transition matrix input file
-tmatfloor		0.0001		HMM state transition probability floor (applied to -tmat file)
-topn			4		Maximum number of top Gaussians to use in scoring.
-topn_beam		0		Beam width used to determine top-N Gaussians (or a list, per-feature)
-toprule				Start rule for JSGF (first public rule is default)
-transform		legacy		Which type of transform to use to calculate cepstra (legacy, dct, or htk)
-unit_area		yes		Normalize mel filters to unit area
-upperf			6855.4976	Upper edge of filters
-uw			1.0		Unigram weight
-vad_postspeech		50		Num of silence frames to keep after from speech to silence.
-vad_prespeech		20		Num of speech frames to keep before silence to speech.
-vad_startspeech	10		Num of speech frames to trigger vad from silence to speech.
-vad_threshold		2.0		Threshold for decision between noise and silence frames. Log-ratio between signal level and noise level.
-var					Mixture gaussian variances input file
-varfloor		0.0001		Mixture gaussian variance floor (applied to data from -var file)
-varnorm		no		Variance normalize each utterance (only if CMN == current)
-verbose		no		Show input filenames
-warp_params				Parameters defining the warping function
-warp_type		inverse_linear	Warping function type (or shape)
-wbeam			7e-29		Beam width applied to word exits
-wip			0.65		Word insertion penalty
-wlen			0.025625	Hamming window length

Segmentation fault (core dumped)

1 Answer

Rating:
2

Jochen Arndt

Compare your arg_t variable with the one in the original source code and read the documentation for the cmd_ln_parse_r() function, or look at its implementation if there is no documentation.

arg_t must be an array with a terminating entry so that the function knows when to stop parsing the supported arguments:

// Must be an array!
//static const arg_t cont_args_def= {"-inmic",
static const arg_t cont_args_def[] = {
    {"-inmic",
     ARG_BOOLEAN,
     "no",
     "Transcribe audio from microphone."},
    // This must be always the last array entry
    CMDLN_EMPTY_OPTION
}; 
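
To see why the terminator matters: option tables like this are typically scanned until a sentinel entry is found. The sketch below only illustrates the idea; the field names (name, doc) are assumed from sphinxbase's arg_t declaration, and this is not the library's actual parsing loop:

/* Illustration only: walking a sentinel-terminated option table.
 * Without the CMDLN_EMPTY_OPTION terminator there is no NULL name
 * to stop at, so the loop runs past the end of the array into
 * unrelated memory -- the kind of access that ends in a
 * segmentation fault. */
const arg_t *a;
for (a = cont_args_def; a->name != NULL; a++) {
    printf("supported option: %s (%s)\n", a->name, a->doc);
}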

[EDIT]
To always use the microphone, remove the -inmic and -infile options from the argument list and always call recognize_from_microphone().

But before doing that, you must set up the decoder from the command-line parameters:
ps_default_search_args(config);
ps = ps_init(config);
if (ps == NULL) {
    cmd_ln_free_r(config);
    return 1;
}
/* Always use microphone */
recognize_from_microphone();
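
Put together, a minimal main() for the "always use the microphone" variant could look roughly like the sketch below. It reuses only functions that already appear in the program above and is meant as an illustration, not a tested drop-in replacement:

int
main(int argc, char *argv[])
{
    /* Parse the remaining options (acoustic model, sample rate, ...).
       The option table must still end with CMDLN_EMPTY_OPTION. */
    config = cmd_ln_parse_r(NULL, cont_args_def, argc, argv, TRUE);
    if (config == NULL) {
        E_INFO("Failed to parse the command line.\n");
        return 1;
    }

    ps_default_search_args(config);
    ps = ps_init(config);
    if (ps == NULL) {
        cmd_ln_free_r(config);
        return 1;
    }

    /* Always use the microphone; no -inmic flag needed. */
    recognize_from_microphone();

    ps_free(ps);
    cmd_ln_free_r(config);
    return 0;
}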

If you still get a segmentation fault, it happens somewhere else.

When modifying existing code to adapt it to your own needs, it is important to understand what the existing code does.

If, for example, you leave out the block above that assigns a value to ps, then ps is uninitialized. In that case you must not call ps_free(ps) either.
[/EDIT]


Member 13376650

I have indeed edited my question.