Logo

sapi_lipsync.cpp

Go to the documentation of this file.
00001 /* 
00002 sapi_lipsync.cpp
00003 
00004 Copyright (C) 2005 Annosoft, LLC. Richardson, Texas. 
00005 All rights reserved.  
00006     
00007 Permission is hereby granted, free of charge, to use and distribute
00008 this software and its documentation without restriction, including   
00009 without limitation the rights to use, copy, modify, merge, publish,  
00010 distribute, sublicense, and/or sell copies of this work, and to      
00011 permit persons to whom this work is furnished to do so, subject to   
00012 the following conditions:                                            
00013 1. The code must retain the above copyright notice, this list of    
00014     conditions and the following disclaimer.                        
00015 2. Any modifications must be clearly marked as such.                
00016 3. Original authors' names are not deleted.                         
00017 4. The name "Annosoft" and the authors' names can be not used to endorse or 
00018    promote products derived from this software without specific prior written       
00019    permission.                                            
00020 
00021 ANNOSOFT AND THE CONTRIBUTORS TO THIS WORK DISCLAIM ALL WARRANTIES 
00022 WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF 
00023 MERCHANTABILITY AND FITNESS, IN NO EVENT ANNOSOFT NOR THE CONTRIBUTORS 
00024 BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY 
00025 DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   
00026 AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING 
00027 OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
00028 
00029 */  
00030     
00052 #include "stdafx.h"
00053 #include <comdef.h>
00054 #include <string>
00055 #include "sapi_lipsync.h"
00056 #include "phone_estimate.h"
00057 #include "sapi_util.h"
00058 
00062 
00063 #define GID_LIPSYNC   0   // grammar identifier. 
00064 
00066 const ULONGLONG ullInterest = SPFEI(SPEI_SOUND_START) | SPFEI(SPEI_SOUND_END) |
00067                                       SPFEI(SPEI_PHRASE_START) | SPFEI(SPEI_RECOGNITION) |
00068                                       SPFEI(SPEI_FALSE_RECOGNITION) | SPFEI(SPEI_HYPOTHESIS) |
00069                                       SPFEI(SPEI_INTERFERENCE) | SPFEI(SPEI_RECO_OTHER_CONTEXT) |
00070                                       SPFEI(SPEI_REQUEST_UI) | SPFEI(SPEI_RECO_STATE_CHANGE) |
00071                                       SPFEI(SPEI_END_SR_STREAM) | 
00072                                       SPFEI(SPEI_PROPERTY_NUM_CHANGE) | SPFEI(SPEI_PROPERTY_STRING_CHANGE);
00073 
00074 
00075 
00076 
00079 // sapi_lipsync class implementation
00082 
00084 sapi_lipsync::sapi_lipsync()
00085 {
00086     m_pPhnEstimator = NULL;
00087 }
00088 
00096 sapi_lipsync::sapi_lipsync(phoneme_estimator* pEstimator)
00097 {
00098     m_pPhnEstimator = pEstimator;
00099 }
00100 
00101 
/*
** Destructor. Releases the SAPI interfaces and the cached wave format via
** close().
*/
sapi_lipsync::~sapi_lipsync()
{
    close();
}
00107 
00108 
00110 
00113 void sapi_lipsync::close()
00114 {
00115     this->m_recogCntxt.Release();
00116     this->m_grammar.Release();
00117     this->m_recog.Release();
00118     this->m_phnCvt.Release();
00119     if (m_pWaveFmt)
00120     {
00122         CoTaskMemFree(m_pWaveFmt);
00123     }
00124 }
00125 
00126 
00128 
00132 bool sapi_lipsync::initializeObjects()
00133 {
00134     HRESULT hr = S_OK;
00135     m_err = L"";
00136     try
00137     {
00138         // create the recognizer (inproc)
00139         hr = this->m_recog.CoCreateInstance(CLSID_SpInprocRecognizer);
00140         if (hr != S_OK) 
00141         {
00142             m_err = L"Error: Can't create SAPI Speech Recognizer (ISpRecognizer)";
00143             throw (hr);
00144         }
00145 
00146         // create the recognition context from the recognizer
00147         hr = this->m_recog->CreateRecoContext(&this->m_recogCntxt);
00148         if (hr != S_OK)
00149         {
00150             m_err = L"Error: Cannot create SAPI Recognition Context (ISpRecoContext)";
00151             throw (hr);
00152         }            
00153 
00154         hr = m_recogCntxt->SetNotifyCallbackFunction(&this->sapi_callback, 0, LPARAM(this));
00155         if (hr != S_OK)
00156         {
00157             m_err = L"Error: Cannot set notify callback function. (SetNofifyCallbackFunction)";
00158             throw (hr);
00159         }
00160         // initialize and disable the grammar
00161         hr = m_recogCntxt->CreateGrammar(GID_LIPSYNC, &m_grammar);
00162         if (hr != S_OK)
00163         {
00164             m_err = L"Error: Failed to create grammar for lipsync";
00165             throw (hr);
00166         }
00167         // not totally sure here!
00168         //hr = m_grammar->SetGrammarState(SPGS_DISABLED); Let subclasses handle the grammer state
00169         if (hr != S_OK)
00170         {
00171             m_err = L"Error: Failed to disable the grammar.";
00172             throw (hr);
00173         }
00174 
00175         // need a phoneme converter to map SPHONEID into phoneme strings
00176         hr = SpCreatePhoneConverter(MAKELANGID(LANG_ENGLISH, SUBLANG_ENGLISH_US), 
00177             NULL, NULL, &this->m_phnCvt);
00178         if (hr != S_OK)
00179         {
00180             m_err = L"Error: Failed create phoneme converter";
00181             throw (hr);
00182         }
00183 
00184         // Set interest level for events, we want all events, just in case.                
00185         hr = m_recogCntxt->SetInterest(ullInterest, ullInterest);
00186         if (hr != S_OK)
00187         {
00188             m_err = L"Error: Cannot correctly set notifications for the Speech Recognizer";
00189             throw(hr);
00190         }
00191         // turn off recognizer while we initialize things. HMMM
00192         //m_recog->SetRecoState(SPRST_INACTIVE);
00193     } 
00194     catch (HRESULT& _hr)
00195     {
00196         hr = _hr;
00197     }
00198     return (hr == S_OK);
00199 }
00200 
00201 
00208 bool sapi_lipsync::loadAudio(const std::wstring& audioFile)
00209 {
00210     HRESULT hr = S_OK;
00211 
00212     this->m_strAudioFile = audioFile;
00213     try
00214     {
00215         m_bDone = false;
00216         
00217         hr = SPBindToFile(audioFile.c_str(), SPFM_OPEN_READONLY, &this->m_audioStream);
00218         if (hr != S_OK)
00219         {
00220             m_err = L"Error: Can't open audio file";
00221             throw(hr);
00222         }
00223         GUID guid; // unused
00224         hr = this->m_audioStream->GetFormat(&guid, &m_pWaveFmt);
00225         if (hr != S_OK)
00226         {
00227             m_err = L"Error: cannot get audio formatting information";
00228             throw (hr);
00229         }
00230         
00232         hr = this->m_recog->SetInput(this->m_audioStream, TRUE);
00233         if (hr != S_OK)
00234         {
00235             m_err = L"Error: cannot set the input stream for ASR";
00236             throw (hr);
00237         }
00238     }
00239     catch (HRESULT& _hr)
00240     {
00241         hr = _hr;
00242     }
00243     return (hr == S_OK);
00244 
00245 }
00246 
00252 const std::wstring& 
00253 sapi_lipsync::getErrorString()
00254 {
00255     return (m_err);
00256 }
00257 
00262 void _stdcall 
00263 sapi_lipsync::sapi_callback(WPARAM wParam, LPARAM lParam)
00264 {
00265     sapi_lipsync* pThis = (sapi_lipsync*)lParam;
00266     pThis->callback();
00267 }
00268 
00279 std::vector<alignment_result>& 
00280 sapi_lipsync::get_phoneme_alignment()
00281 {
00282     return (m_results);
00283 }
00284 
00285 
00297 void sapi_lipsync::finalize_phoneme_alignment()
00298 {
00299     std::vector<alignment_result>::iterator p, pEnd;
00300     std::vector<alignment_result> result;
00301     result.reserve(m_results.size() * 3 + 2);
00302     p = m_results.begin();
00303     pEnd = m_results.end();
00304     long lastEndTime = 0;
00305     while (p != pEnd)
00306     {
00307         if (p->m_msStart > lastEndTime)
00308         {
00309             alignment_result r;
00310             r.m_msEnd = p->m_msStart;
00311             r.m_msStart = lastEndTime;
00312             r.m_phonemes.push_back(std::wstring(L"x"));
00313             r.m_phonemeEndTimes.push_back(p->m_msStart);            
00314             lastEndTime = p->m_msStart;
00315             result.push_back(r);
00316         }
00317         if (m_pPhnEstimator)
00318             m_pPhnEstimator->EstimatePhonemeAlignment(*p);
00319         else
00320             phoneme_estimator::TrivialPhonemeAlignment(*p);
00321         result.push_back(*p);
00322         lastEndTime = p->m_msEnd;
00323         p++;
00324     }
00326     this->m_results = result;
00327 }
00328 
00329 
00347 void sapi_lipsync::print_results(std::ostream& os)
00348 {
00349 
00350     std::vector<alignment_result>::iterator p, pEnd;
00351     p = m_results.begin();
00352     pEnd = m_results.end();
00353     // print the audio result
00354     if (m_strAudioFile.size())
00355     {
00356         os << "audio " << wstring_2_string(m_strAudioFile) << std::endl;
00357     }
00358     while (p != pEnd)
00359     {
00360         // print the word marker
00361         if (p->m_orthography.size())
00362         {
00363             os << "word " << p->m_msStart << ' ' << p->m_msEnd <<
00364                 ' ' << wstring_2_string(p->m_orthography) << std::endl;
00365         }
00366         // print the phn markers
00367         long pos = p->m_msStart;
00368         for (unsigned long j = 0; j < p->m_phonemes.size(); j++)
00369         {
00370             long start = pos; 
00371             long end = p->m_phonemeEndTimes[j];
00372             os << "phn " << start << ' ' << end << ' ' << 75 <<
00373                 ' ' << wstring_2_string(p->m_phonemes[j]) << std::endl;
00374             pos = end;
00375         }
00376         p++;
00377     }
00378 }
00379 
00380 
00383 // sapi_textbased_lipsync class implementation
00386 
/*
** Default constructor. Phoneme estimation falls back to the trivial
** alignment (the base class sets m_pPhnEstimator to NULL).
*/
sapi_textbased_lipsync::sapi_textbased_lipsync()
{

}
00394 
/*
** Constructor with an explicit phoneme estimator, forwarded to the base
** class.
**
** pEstimator - estimator used during finalize_phoneme_alignment; may be NULL.
*/
sapi_textbased_lipsync::sapi_textbased_lipsync(phoneme_estimator* pEstimator) :
sapi_lipsync(pEstimator)
{
    
}
00407 
00408 
/*
** Destructor. Cleanup happens in the base class destructor (close()).
*/
sapi_textbased_lipsync::~sapi_textbased_lipsync()
{

}
00416 
00440 bool 
00441 sapi_textbased_lipsync::lipsync(const std::wstring& strAudioFile, const std::wstring& strText)
00442 {
00443     HRESULT hr;
00444     try
00445     {
00446         m_strInputText = strText;
00447         if (!this->initializeObjects())
00448             throw (HRESULT(E_FAIL));
00449         
00450         if (!this->loadAudio(strAudioFile))
00451              throw (HRESULT(E_FAIL));
00452         
00453         // initialize the grammar
00454             
00455         SPSTATEHANDLE hLipsyncRule;
00458         hr = this->m_grammar->GetRule(L"TextLipsync", NULL,
00459                             SPRAF_TopLevel | SPRAF_Active, TRUE,
00460                             &hLipsyncRule);
00461 
00462         if (hr != S_OK)
00463         {
00464             m_err = L"Failed to create grammar rule for text based lipsync";
00465             throw (hr);
00466         }
00467         
00468         // prepare text for text based lipsync. Tokenize out formatting, punctuation
00469         std::wstring strIn = preprocess_text(strText);
00470         // create the phrase inside the rule
00471         hr = m_grammar->AddWordTransition(hLipsyncRule, NULL, strIn.c_str(), 
00472             L" ", SPWT_LEXICAL, 1, NULL);
00473 
00474         if (hr != S_OK)
00475         {
00476             m_err = L"Failed to create lipsync rule for specified text transcription";
00477             throw (hr);
00478         }
00479         
00480         // finalize the grammar
00481         hr = m_grammar->Commit(0);
00482         if (hr != S_OK)
00483         {
00484             m_err = L"Failed to commit lipsync text rule for specified text transcription.";
00485             throw (hr);
00486         }
00487                 
00488         // turn the grammar on
00489         hr = m_grammar->SetGrammarState(SPGS_ENABLED);
00490         if (hr != S_OK)
00491         {
00492             m_err = L"Error: Failed to disable the grammar.";
00493             throw (hr);
00494         }
00495         // start up recognition
00496         m_recog->SetRecoState(SPRST_ACTIVE);
00497         // enable the rule
00498         m_grammar->SetRuleState(NULL, NULL, SPRS_ACTIVE);
00499 
00500         // now we should be running!
00501 
00502     }
00503     catch (HRESULT _hr)
00504     {
00505         hr = _hr;
00506     }
00507     return (hr == S_OK);
00508 }
00509 
00524 void sapi_textbased_lipsync::callback()
00525 {
00526     //USES_CONVERSION;
00527     CSpEvent event;
00528 
00529     ISpRecoResult *pRecoResult; // recoResult from the event
00530     SPPHRASE *pSpPhrase;    // phrase from recoResult
00531     SPRECORESULTTIMES pRecoResultTimes; // result times from RecoResult
00532     WCHAR phone_buffer[256];            // buffer for the phonemes
00533     UINT msStart;                       // start time of the phrase
00534     
00535 
00536     // Process the events
00537     while (event.GetFrom(this->m_recogCntxt) == S_OK)
00538     {
00539         if (event.eEventId == SPEI_RECOGNITION || event.eEventId == SPEI_HYPOTHESIS)
00540         {
00541             // text based has to accept hypothesis or it mostly fails unless the
00542             // script is very short
00543 
00544             // pull out the result object
00545             pRecoResult = event.RecoResult();
00546 
00547             // pull the whole text from the result
00548             CSpDynamicString pSapiText;
00549             pRecoResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, FALSE, &pSapiText, NULL);
00550 
00551             // get the start time for the phrase. we use this as an offset for the phrase
00552             // elements. Not sure if this is correct.
00553             pRecoResult->GetResultTimes(&pRecoResultTimes);
00554             msStart = sapi_time_to_milli(pRecoResultTimes.ullStart);
00555 
00556             std::wstring strPrintText = pSapiText;
00557             std::cerr << "hypothesis: " << wstring_2_string(strPrintText) << std::endl;
00558             // if the new results are longer than existing results in orthographic form
00559             // we accept the results and process the phonemes. Otherwise, we skip it
00560             if ((wcslen(pSapiText) > this->m_strResults.size()))
00561             {
00562                 m_strResults = pSapiText;
00563                 // clear the old results. This hypothesis trumps it
00564                 this->m_results.clear();
00565                 
00566                 // extract the phrase object
00567                 pRecoResult->GetPhrase(&pSpPhrase);
00568 
00569                 if (pSpPhrase != NULL)
00570                 {
00571                     // Process each element of the phrase. These should be our
00572                     // orthorgraphs
00573                     const SPPHRASEELEMENT *p = pSpPhrase->pElements;
00574                     const SPPHRASEELEMENT *pEnd = p + pSpPhrase->Rule.ulCountOfElements;
00575                     while (p != pEnd)
00576                     {
00577                         // for each phrase element we create a marker 
00578                     // that contains the time stamps along with the 
00579                     // phonemes. associated with it.
00580                         alignment_result al;
00581                         al.m_orthography = p->pszDisplayText;
00582                         // Get the phonemes
00583                         ULONG j = 0;
00584                         SPPHONEID phn[2];
00585                         phn[1] = 0x00;
00586                         while (p->pszPronunciation[j] != 0)
00587                         {
00588                             // process each phoneme
00589                             phn[0] = p->pszPronunciation[j];
00590                             m_phnCvt->IdToPhone(phn, phone_buffer);
00591                             al.m_phonemes.push_back(phone_buffer);
00592                             j++;
00593                         }
00594                                              
00595                         // start time of the ortheme
00596                         al.m_msStart= msStart + bytes_to_milli(p->ulAudioStreamOffset);
00597                         // end time of the ortheme
00598                         al.m_msEnd = bytes_to_milli(p->ulAudioSizeBytes);
00599                         al.m_msEnd += al.m_msStart;
00600                         // add it to the results
00601                         m_results.push_back(al);
00602                         
00603                         p++;
00604                     }
00605                 }
00606             }
00607         }
00608         else if (event.eEventId == SPEI_END_SR_STREAM)
00609         {
00610             // This event occurs when the stream has finished processing.
00611             // we set a flag to indicate that things are done.
00612             m_bDone = TRUE;        
00613         }
00614     }
00615 }
00616 
00621 void sapi_textbased_lipsync::print_results(std::ostream& os)
00622 {
00623     sapi_lipsync::print_results(os);
00624     // use bstr_t to convert to single byte
00625     bstr_t annoText = m_strInputText.c_str();
00626     os << "%%-begin-anno-text-%% " << std::endl;
00627     os << (const char*)annoText << std::endl;
00628     os << "%%-end-anno-text-%%" << std::endl;
00629 
00630 }
00631 
00644 std::wstring 
00645 sapi_textbased_lipsync::preprocess_text(const std::wstring& in)
00646 {
00647     std::wstring::const_iterator p, pEnd;
00648     std::wstring out;
00649     p = in.begin();
00650     pEnd = in.end();
00651     while (p != pEnd)
00652     {
00653         if (is_dirty_char(*p))
00654         {
00655             while (p != pEnd && is_dirty_char(*p))
00656                 p++;
00657             out += L" ";
00658         }
00659         if (p != pEnd)
00660         {
00661             out += *p;
00662             p++;
00663         }
00664     }
00665     bstr_t ist = out.c_str();
00666     std::cerr << (const char*)ist << std::endl;
00667     return (out);
00668 }
00669 
00680 bool sapi_textbased_lipsync::is_dirty_char(wchar_t in)
00681 {
00682     if (iswspace(in) || (iswpunct(in) && in != L'\'')) 
00683         return (true);
00684     return (false);
00685     if (iswalnum(in))
00686         return (false);
00687     return (true);
00688 }
00689 
00692 // sapi_textless_lipsync class implementation
00695 
00696 
/*
** Default constructor. Phoneme estimation falls back to the trivial
** alignment (the base class sets m_pPhnEstimator to NULL).
*/
sapi_textless_lipsync::sapi_textless_lipsync()
{
}
00703 
00704 
/*
** Constructor with an explicit phoneme estimator, forwarded to the base
** class.
**
** pEstimator - estimator used during finalize_phoneme_alignment; may be NULL.
*/
sapi_textless_lipsync::sapi_textless_lipsync(phoneme_estimator* pEstimator) :
sapi_lipsync(pEstimator)
{
    
}
00717 
/*
** Destructor. Cleanup happens in the base class destructor (close()).
*/
sapi_textless_lipsync::~sapi_textless_lipsync()
{

}
00725 
00737 bool sapi_textless_lipsync::lipsync(const std::wstring& strAudioFile)
00738 {
00739     HRESULT hr;
00740     try
00741     {
00742         
00743         if (!this->initializeObjects())
00744             throw (HRESULT(E_FAIL));
00745         
00746         if (!this->loadAudio(strAudioFile))
00747              throw (HRESULT(E_FAIL));
00748         
00749         // initialize the grammar
00750         hr = m_grammar->LoadDictation(NULL, SPLO_STATIC);
00751         if (hr != S_OK)
00752         {
00753             m_err = L"Error: Cannot load SAPI Dictation Grammar";
00754         }
00755     
00756         hr = m_grammar->SetDictationState(SPRS_ACTIVE);
00757 
00758         if (hr != S_OK)
00759         {
00760             m_err = L"Cannot activate the SAPI Dictation Grammar";
00761             throw (hr);
00762         }
00763         
00764         m_recog->SetRecoState(SPRST_ACTIVE);
00765        
00766 
00767         // now we should be running!
00768 
00769     }
00770     catch (HRESULT _hr)
00771     {
00772         hr = _hr;
00773     }
00774     return (hr == S_OK);
00775 }
00776 
00777     
00778 
00788 void sapi_textless_lipsync::callback()
00789 {
00790     CSpEvent event; // the event
00791 
00792     ISpRecoResult *pRecoResult;         // recoResult from the event
00793     SPPHRASE      *pSpPhrase;           // phrase from recoResult
00794     SPRECORESULTTIMES pRecoResultTimes; // result times from RecoResult
00795     WCHAR phone_buffer[256];            // phoneme buffer for conversion
00796     long msStart;                       // time stamp of the result 
00797     
00798 
00799     while (event.GetFrom(this->m_recogCntxt) == S_OK)
00800     {
00801         if (event.eEventId == SPEI_RECOGNITION /*|| event.eEventId == SPEI_HYPOTHESIS */)
00802         {   
00803             // for textless we only accept full recognition. This might be an area
00804             // to watch out for
00805             
00806             // pull out the result object
00807             pRecoResult = event.RecoResult();
00808 
00809             // pull the whole text from the result
00810             CSpDynamicString pSapiText;
00811             pRecoResult->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, FALSE, &pSapiText, NULL);
00812 
00813             // get the start time for the phrase. we use this as an offset for the phrase
00814             // elements. Not sure if this is correct.
00815             pRecoResult->GetResultTimes(&pRecoResultTimes);
00816             msStart = sapi_time_to_milli(pRecoResultTimes.ullStart);
00817 
00818             // extract the phrase object
00819             pRecoResult->GetPhrase(&pSpPhrase);
00820 
00821             if (pSpPhrase != NULL)
00822             {
00823                 // Process each element of the phrase. These should be our
00824                 // orthorgraphs
00825                 const SPPHRASEELEMENT *p = pSpPhrase->pElements;
00826                 const SPPHRASEELEMENT *pEnd = p + pSpPhrase->Rule.ulCountOfElements;
00827                 while (p != pEnd)
00828                 {
00829                     // for each phrase element we create a marker 
00830                     // that contains the time stamps along with the 
00831                     // phonemes. associated with it.
00832                     alignment_result al;
00833                     al.m_orthography = p->pszDisplayText;
00834                     // Get the phonemes
00835                     ULONG j = 0;
00836                     SPPHONEID phn[2];
00837                     phn[1] = 0x00;
00838                     while (p->pszPronunciation[j] != 0)
00839                     {
00840                         // process each phoneme
00841                         phn[0] = p->pszPronunciation[j];
00842                         m_phnCvt->IdToPhone(phn, phone_buffer);
00843                         al.m_phonemes.push_back(phone_buffer);
00844                         j++;
00845                     }
00846                     // start time of the ortheme
00847                     al.m_msStart= msStart + bytes_to_milli(p->ulAudioStreamOffset);
00848                     // end time of the ortheme
00849                     al.m_msEnd = bytes_to_milli(p->ulAudioSizeBytes);
00850                     al.m_msEnd += al.m_msStart;
00851                     // add it to the results
00852                     m_results.push_back(al);
00853 
00854                     p++;
00855                 }
00856             }
00857         }
00858         else if (event.eEventId == SPEI_END_SR_STREAM)
00859         {
00860             // This event occurs when the stream has finished processing.
00861             // we set a flag to indicate that things are done.
00862             m_bDone = TRUE;        
00863         }
00864     }
00865 }

Copyright (C) 2002-2005 Annosoft LLC. All Rights Reserved.
Visit us at www.annosoft.com