sapi_lipsync/sapi_lipsync.h Source File

00001 /* 
00002 sapi_lipsync.h
00003 
00004 Copyright (C) 2005 Annosoft, LLC. Richardson, Texas. 
00005 All rights reserved.  
00006     
00007 Permission is hereby granted, free of charge, to use and distribute
00008 this software and its documentation without restriction, including   
00009 without limitation the rights to use, copy, modify, merge, publish,  
00010 distribute, sublicense, and/or sell copies of this work, and to      
00011 permit persons to whom this work is furnished to do so, subject to   
00012 the following conditions:                                            
00013 1. The code must retain the above copyright notice, this list of    
00014     conditions and the following disclaimer.                        
00015 2. Any modifications must be clearly marked as such.                
00016 3. Original authors' names are not deleted.                         
00017 4. The name "Annosoft" and the authors' names can be not used to endorse or 
00018    promote products derived from this software without specific prior written       
00019    permission.                                            
00020 
00021 ANNOSOFT AND THE CONTRIBUTORS TO THIS WORK DISCLAIM ALL WARRANTIES 
00022 WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF 
00023 MERCHANTABILITY AND FITNESS, IN NO EVENT ANNOSOFT NOR THE CONTRIBUTORS 
00024 BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY 
00025 DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   
00026 AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING 
00027 OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
00028 
00029 */  
00030 
00050 #ifndef _H_SAPI_LIPSYNC
00051 #define _H_SAPI_LIPSYNC
00052 
00053 // forwards 
00054 class phoneme_estimator;
00055 
/// A single entry of lipsync output: a time span together with its text
/// label and the phonemes attributed to it.
///
/// MODIFIED from the original Annosoft source: a default constructor was
/// added so that the two time fields start at zero instead of holding
/// indeterminate values when an instance is default-constructed.
class alignment_result
{
public:
    /// Creates an empty result: both timestamps are zero and the string and
    /// vectors are empty. (Added; not part of the original source.)
    alignment_result() : m_msStart(0), m_msEnd(0) {}

    /// Start of the span. Presumably milliseconds, going by the "ms" prefix
    /// -- confirm against the code that fills it in.
    long        m_msStart;

    /// End of the span, in the same unit as m_msStart.
    long        m_msEnd;

    /// Text (orthographic form) associated with the span.
    std::wstring m_orthography;

    /// Phoneme labels for the span.
    std::vector<std::wstring> m_phonemes;

    /// End time for phonemes of the span. Presumably one entry per element
    /// of m_phonemes, in the same unit as m_msEnd -- TODO confirm.
    std::vector<long> m_phonemeEndTimes;
};
00094 
00122 class sapi_lipsync
00123 {
00124 public:
00126     sapi_lipsync();
00127 
00129     sapi_lipsync(phoneme_estimator* pEstimator);
00130     
00131     // destructor
00132     virtual ~sapi_lipsync();
00133 
00135     virtual void close();
00136 
00138     bool initializeObjects();
00139 
00141     bool loadAudio(const std::wstring& audioFile);
00142 
00144     const std::wstring& getErrorString();
00145 
00147     virtual bool isDone() { return (m_bDone); }
00148 
00150     std::vector<alignment_result>& get_phoneme_alignment();
00151 
00153     virtual void finalize_phoneme_alignment();
00154 
00156     virtual void print_results(std::ostream& os);
00157 
00162     virtual void callback() = 0;
00163 
00168     long sapi_time_to_milli(ULONGLONG ts)
00169     {
00170         return (long(ts / 10000));
00171     }
00176     long bytes_to_milli(DWORD dwBytes)
00177     {
00178         return (UINT)((dwBytes * 1000 )/ m_pWaveFmt->nAvgBytesPerSec); 
00179     }
00180 
00181 protected:
00182     // SAPI COM objects
00184     CComPtr<ISpRecognizer>      m_recog;
00186     CComPtr<ISpRecoContext>     m_recogCntxt;
00188     CComPtr<ISpRecoGrammar>     m_grammar;
00190     CComPtr<ISpPhoneConverter>  m_phnCvt;
00192     CComPtr<ISpStream>          m_audioStream;
00193 
00194 
00196     WAVEFORMATEX                *m_pWaveFmt;
00197 
00203     phoneme_estimator           *m_pPhnEstimator;           
00204 
00206     std::wstring m_err;
00207 
00209     std::wstring m_strAudioFile;
00210 
00213     static void _stdcall sapi_callback(WPARAM wParam, LPARAM lParam);
00214 
00215 
00217     std::vector<alignment_result> m_results;
00218 
00219 
00221     bool                          m_bDone;
00222 
00223 };
00224 
00225 
00248 class sapi_textbased_lipsync : public sapi_lipsync
00249 {
00250 public:
00252     sapi_textbased_lipsync();
00253 
00255     sapi_textbased_lipsync(phoneme_estimator* pEstimator);
00256 
00258     virtual ~sapi_textbased_lipsync();
00259     
00261     virtual bool lipsync(const std::wstring& strAudioFile, const std::wstring& strText);
00262 
00263 
00265     virtual void callback();
00266 
00268     virtual void print_results(std::ostream& os);
00269 
00270     // the method cleans the transription text for use in text based lipsync.
00271     static std::wstring preprocess_text(const std::wstring& in);
00272     
00275     static bool is_dirty_char(wchar_t in);
00276 
00277 protected:
00282     std::wstring                  m_strResults;
00283 
00285     std::wstring                  m_strInputText;
00286 
00287 };
00288 
00296 class sapi_textless_lipsync : public sapi_lipsync
00297 {
00298 public:
00300     sapi_textless_lipsync();
00301 
00303     sapi_textless_lipsync(phoneme_estimator* pEstimator);
00304 
00306     virtual ~sapi_textless_lipsync();
00307     
00309     virtual bool lipsync(const std::wstring& strAudioFile);
00310     
00312     virtual void callback();
00313 
00314 };
00315 
00316 
00317 #endif
00318 
00319 
00320