sapi_lipsync/sapi_lipsync_main.cpp Source File

00001 /* 
00002 sapi_lipsync_main.cpp
00003 
00004 Copyright (C) 2005 Annosoft, LLC. Richardson, Texas. 
00005 All rights reserved.  
00006     
00007 Permission is hereby granted, free of charge, to use and distribute
00008 this software and its documentation without restriction, including   
00009 without limitation the rights to use, copy, modify, merge, publish,  
00010 distribute, sublicense, and/or sell copies of this work, and to      
00011 permit persons to whom this work is furnished to do so, subject to   
00012 the following conditions:                                            
00013 1. The code must retain the above copyright notice, this list of    
00014     conditions and the following disclaimer.                        
00015 2. Any modifications must be clearly marked as such.                
00016 3. Original authors' names are not deleted.                         
00017 4. The name "Annosoft" and the authors' names can be not used to endorse or 
00018    promote products derived from this software without specific prior written       
00019    permission.                                            
00020 
00021 ANNOSOFT AND THE CONTRIBUTORS TO THIS WORK DISCLAIM ALL WARRANTIES 
00022 WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF 
00023 MERCHANTABILITY AND FITNESS, IN NO EVENT ANNOSOFT NOR THE CONTRIBUTORS 
00024 BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY 
00025 DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   
00026 AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING 
00027 OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
00028 
00029 */  
00030 
00031 
00071 #include "stdafx.h"
00072 #include <comdef.h>
00073 #include <fstream>
00074 #include "sapi_lipsync.h"
00075 #include "sapi_util.h"
00076 #include "phone_estimate.h"
00077 
00078 
/// Writes the command-line help text to @p os.
///
/// Two invocation forms are described: audio file only, and audio file
/// plus a text transcript. Each line is terminated with std::endl, so the
/// stream is flushed after every line.
///
/// @param os  Stream that receives the help text.
void usage(std::ostream& os)
{
    static const char* const kHelpLines[] = {
        "USAGE:",
        "       Lipsync without a text transcription:",
        " DOS:\\>sapi_lipsync.exe path-to-audio-file.wav",
        "       Lipsync with a text transcription:",
        " DOS:\\>sapi_lipsync.exe path-to-audio-file.wav path-to-text-file.txt"
    };
    const unsigned int lineCount = sizeof(kHelpLines) / sizeof(kHelpLines[0]);

    for (unsigned int i = 0; i < lineCount; ++i)
    {
        os << kHelpLines[i] << std::endl;
    }
}
00088 
00090 void banner(std::ostream& os, std::wstring& strAudioFile, 
00091             TCHAR* strTextFile)
00092 {
00093     USES_CONVERSION;
00094     os << "SAPI Lipsync version 1.0 Copyright (C) Annosoft LLC 2005. All Rights Reserved" << std::endl;
00095     os << std::endl << std::endl;
00096     os << "This program will generate phoneme timings from an audio file using SAPI 5.1." << std::endl << 
00097         "It takes as input a Windows RIFF WAV and an optional text transcript" << std::endl <<
00098         "and generates phoneme and word markers from the results " << std::endl << std::endl;
00099 
00100     if (strAudioFile.size() && strTextFile)
00101     {
00102         os << "Text based Lipsync: " << std::endl;
00103         os << "audio: " << wstring_2_string(strAudioFile) << std::endl;
00104         os << "text: " << T2A(strTextFile) << std::endl;
00105         
00106     }
00107     else if (strAudioFile.size())
00108     {
00109         os << "Textless Lipsync: " << std::endl;
00110         os << "audio: " << wstring_2_string(strAudioFile) << std::endl;
00111     }
00112     os << std::endl << std::endl;
00113 }
00114 
00124 void
00125 run_lipsync_message_loop(sapi_lipsync& lsp)
00126 {
00127     MSG msg;
00128     while (!lsp.isDone())
00129     {
00130         if (GetMessage(&msg, NULL, 0, 0))
00131         {
00132             TranslateMessage(&msg);
00133             DispatchMessage(&msg);
00134             Sleep(100);
00135         }       
00136     }        
00137 }
00138 
00139 
00153 void
00154 run_sapi_textbased_lipsync(std::wstring& strAudioFile, TCHAR *strTextFile)
00155 {
00156     // 1. [optional] declare the SAPI 5.1 estimator. 
00157     // NOTE: for different phoneme sets, create a new estimator
00158     phoneme_estimator sapi51Estimator;
00159 
00160     // 2. Load the text file into memory
00161     WCHAR * pwszCoMem = 0;
00162     ULONG cch = 0;
00163     HRESULT hr = GetTextFile(strTextFile, &pwszCoMem, &cch);
00164     if (hr == S_OK)
00165     {
00166         std::wstring strText(pwszCoMem, cch);
00167 
00168         // 3. declare the sapi lipsync object and call the lipsync method 
00169         // to start the lipsync process
00170         sapi_textbased_lipsync lsp(&sapi51Estimator);
00171         if (lsp.lipsync(strAudioFile, strText))
00172         {
00173 
00174             // 4. Run the message loop and wait till the lipsync is 
00175             // finished
00176             run_lipsync_message_loop(lsp);
00177               
00178             // 5. finalize the lipsync results for printing
00179             // this call will estimate phoneme timings 
00180             lsp.finalize_phoneme_alignment();
00181             
00182             // 6. print the results to the output stream
00183             lsp.print_results(std::cout);
00184         }
00185         else
00186         {
00187             std::wcerr << lsp.getErrorString() << std::endl;            
00188         }
00189     }
00190     else
00191     {
00192         std::wcerr << L"Can't open text transcript file" << std::endl;
00193     }
00194 }
00195 
00207 void
00208 run_sapi_textless_lipsync(std::wstring& strAudioFile)
00209 {
00210     // 1. [optional] declare the SAPI 5.1 estimator. 
00211     // NOTE: for different phoneme sets: create a new estimator
00212     phoneme_estimator sapi51Estimator;
00213 
00214     // 2. declare the sapi lipsync object and call the lipsync method to
00215     // start the lipsync process
00216     sapi_textless_lipsync lsp(&sapi51Estimator);
00217     if (lsp.lipsync(strAudioFile))
00218     {
00219         // 3. Run the message loop and wait till the lipsync is finished
00220         run_lipsync_message_loop(lsp);
00221         
00222         // 4. finalize the lipsync results for printing
00223         // this call will estimate phoneme timings 
00224         lsp.finalize_phoneme_alignment();
00225 
00226         // 5. print the results to the output stream
00227         lsp.print_results(std::cout);
00228     
00229     }
00230     else
00231     {
00232         std::wcerr << lsp.getErrorString() << std::endl;
00233     }
00234 }
00235 
00236 
00244 int _tmain(int argc, TCHAR* argv[], TCHAR* envp[])
00245 {
00246     
00247 
00248     CoInitialize(NULL);
00249     std::wstring strAudioFile;
00250     TCHAR *strTextFile = NULL;
00251     if (argc >= 2)
00252     {
00253         strAudioFile = TCHAR_2_wstring(argv[1]);
00254     }
00255     if (argc == 3)
00256     {
00257         strTextFile = argv[2];
00258     }
00259 
00260     if (argc < 2 || argc > 3)
00261     {
00262         usage(std::cerr);
00263         return (-1);
00264     }
00265     banner(std::cerr, strAudioFile, strTextFile);   
00266 
00267     // lipsync!
00268     if (strAudioFile.size() && strTextFile)
00269     {
00270         run_sapi_textbased_lipsync(strAudioFile, strTextFile);
00271     }
00272     else if (strAudioFile.size())
00273     {
00274         run_sapi_textless_lipsync(strAudioFile);
00275     }       
00276     else
00277     {
00278         usage(std::cerr);
00279     }
00280     return (0);
00281 }
00282 
00283