sapi_textbased_lipsync::callback

Basically, a "SPEI_RECOGNITION" event and a "SPEI_HYPOTHESIS" event are processed in exactly the same way. Hypotheses are by far the more common of the two: for all but very short files, "SPEI_RECOGNITION" is a rarity.
Since successive hypotheses include duplicate data, we have to decide which one to keep: we can save the newest hypothesis, or we can save the one which generates the most alignments. Empirically, it seems that sticking with the longest result works best, but perhaps this is not always so.
00525 {
00526     //USES_CONVERSION;
00527     CSpEvent event;
00528 
00529     ISpRecoResult *pRecoResult; // recoResult from the event
00530     SPPHRASE *pSpPhrase;    // phrase from recoResult
00531     SPRECORESULTTIMES pRecoResultTimes; // result times from RecoResult
00532     WCHAR phone_buffer[256];            // buffer for the phonemes
00533     UINT msStart;                       // start time of the phrase
00534     
00535 
00536     // Process the events
00537     while (event.GetFrom(this->m_recogCntxt) == S_OK)
00538     {
00539         if (event.eEventId == SPEI_RECOGNITION || event.eEventId == SPEI_HYPOTHESIS)
00540         {
00541             // text based has to accept hypothesis or it mostly fails unless the
00542             // script is very short
00543 
00544             // pull out the result object
00545             pRecoResult = event.RecoResult();
00546 
00547             // pull the whole text from the result
00548             CSpDynamicString pSapiText;
00549             pRecoResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, FALSE, &pSapiText, NULL);
00550 
00551             // get the start time for the phrase. we use this as an offset for the phrase
00552             // elements. Not sure if this is correct.
00553             pRecoResult->GetResultTimes(&pRecoResultTimes);
00554             msStart = sapi_time_to_milli(pRecoResultTimes.ullStart);
00555 
00556             std::wstring strPrintText = pSapiText;
00557             std::cerr << "hypothesis: " << wstring_2_string(strPrintText) << std::endl;
00558             // if the new results are longer than existing results in orthographic form
00559             // we accept the results and process the phonemes. Otherwise, we skip it
00560             if ((wcslen(pSapiText) > this->m_strResults.size()))
00561             {
00562                 m_strResults = pSapiText;
00563                 // clear the old results. This hypothesis trumps it
00564                 this->m_results.clear();
00565                 
00566                 // extract the phrase object
00567                 pRecoResult->GetPhrase(&pSpPhrase);
00568 
00569                 if (pSpPhrase != NULL)
00570                 {
00571                     // Process each element of the phrase. These should be our
00572                     // orthorgraphs
00573                     const SPPHRASEELEMENT *p = pSpPhrase->pElements;
00574                     const SPPHRASEELEMENT *pEnd = p + pSpPhrase->Rule.ulCountOfElements;
00575                     while (p != pEnd)
00576                     {
00577                         // for each phrase element we create a marker 
00578                     // that contains the time stamps along with the 
00579                     // phonemes. associated with it.
00580                         alignment_result al;
00581                         al.m_orthography = p->pszDisplayText;
00582                         // Get the phonemes
00583                         ULONG j = 0;
00584                         SPPHONEID phn[2];
00585                         phn[1] = 0x00;
00586                         while (p->pszPronunciation[j] != 0)
00587                         {
00588                             // process each phoneme
00589                             phn[0] = p->pszPronunciation[j];
00590                             m_phnCvt->IdToPhone(phn, phone_buffer);
00591                             al.m_phonemes.push_back(phone_buffer);
00592                             j++;
00593                         }
00594                                              
00595                         // start time of the ortheme
00596                         al.m_msStart= msStart + bytes_to_milli(p->ulAudioStreamOffset);
00597                         // end time of the ortheme
00598                         al.m_msEnd = bytes_to_milli(p->ulAudioSizeBytes);
00599                         al.m_msEnd += al.m_msStart;
00600                         // add it to the results
00601                         m_results.push_back(al);
00602                         
00603                         p++;
00604                     }
00605                 }
00606             }
00607         }
00608         else if (event.eEventId == SPEI_END_SR_STREAM)
00609         {
00610             // This event occurs when the stream has finished processing.
00611             // we set a flag to indicate that things are done.
00612             m_bDone = TRUE;        
00613         }
00614     }
00615 }