sapi_lipsync/phone_estimate.cpp Source File

00001 /* 
00002 phone_estimate.cpp
00003 
00004 Copyright (C) 2005 Annosoft, LLC. Richardson, Texas. 
00005 All rights reserved.  
00006     
00007 Permission is hereby granted, free of charge, to use and distribute
00008 this software and its documentation without restriction, including   
00009 without limitation the rights to use, copy, modify, merge, publish,  
00010 distribute, sublicense, and/or sell copies of this work, and to      
00011 permit persons to whom this work is furnished to do so, subject to   
00012 the following conditions:                                            
00013 1. The code must retain the above copyright notice, this list of    
00014     conditions and the following disclaimer.                        
00015 2. Any modifications must be clearly marked as such.                
00016 3. Original authors' names are not deleted.                         
00017 4. The name "Annosoft" and the authors' names can be not used to endorse or 
00018    promote products derived from this software without specific prior written       
00019    permission.                                            
00020 
00021 ANNOSOFT AND THE CONTRIBUTORS TO THIS WORK DISCLAIM ALL WARRANTIES 
00022 WITH REGARD TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF 
00023 MERCHANTABILITY AND FITNESS, IN NO EVENT ANNOSOFT NOR THE CONTRIBUTORS 
00024 BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY 
00025 DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   
00026 AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING 
00027 OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
00028 
00029 */  
00030 
00045 #include "stdafx.h"
00046 #include <comdef.h>
00047 #include <string>
00048 #include "sapi_lipsync.h"
00049 #include "phone_estimate.h"
00050 #include "sapi_util.h"
00051 
00052 
00063 engine_phoneme_spec SapiEnglish51[] =
00064 {
00065 engine_phoneme_spec(L"-", L"x",  L"syllable", engine_phoneme_spec::silence),
00066 engine_phoneme_spec(L"!", L"x", L"Sentence" , engine_phoneme_spec::silence),
00067 engine_phoneme_spec(L"aa", L"AH", L"father", engine_phoneme_spec::voiced), 
00068 engine_phoneme_spec(L"ae", L"AE", L"cat", engine_phoneme_spec::voiced), 
00069 engine_phoneme_spec(L"ah", L"AH", L"cut", engine_phoneme_spec::voiced),
00070 engine_phoneme_spec(L"ao", L"AO", L"dog", engine_phoneme_spec::dipthong),
00071 engine_phoneme_spec(L"aw", L"AW", L"foul", engine_phoneme_spec::dipthong),
00072 engine_phoneme_spec(L"ax", L"AH", L"ago", engine_phoneme_spec::voiced),
00073 engine_phoneme_spec(L"ay", L"AY", L"bite", engine_phoneme_spec::voiced),
00074 engine_phoneme_spec(L"b", L"b", L"big", engine_phoneme_spec::voiced),
00075 engine_phoneme_spec(L"ch", L"CH", L"chin", engine_phoneme_spec::unvoiced),
00076 engine_phoneme_spec(L"d", L"d", L"dig", engine_phoneme_spec::unvoiced),
00077 engine_phoneme_spec(L"dh", L"DH", L"then", engine_phoneme_spec::unvoiced),
00078 engine_phoneme_spec(L"eh", L"EH", L"pet", engine_phoneme_spec::voiced),
00079 engine_phoneme_spec(L"er", L"ER", L"fur", engine_phoneme_spec::dipthong),
00080 engine_phoneme_spec(L"ey", L"EY", L"ate", engine_phoneme_spec::dipthong),
00081 engine_phoneme_spec(L"f", L"f", L"fork", engine_phoneme_spec::unvoiced),
00082 engine_phoneme_spec(L"g", L"g", L"gut", engine_phoneme_spec::unvoiced),
00083 engine_phoneme_spec(L"h", L"h", L"help", engine_phoneme_spec::unvoiced),
00084 engine_phoneme_spec(L"ih", L"IH", L"fill", engine_phoneme_spec::silence),
00085 engine_phoneme_spec(L"iy", L"IY", L"feel", engine_phoneme_spec::silence),
00086 engine_phoneme_spec(L"jh", L"j", L"joy", engine_phoneme_spec::unvoiced),
00087 engine_phoneme_spec(L"k", L"k", L"cut", engine_phoneme_spec::unvoiced),
00088 engine_phoneme_spec(L"l", L"l", L"lid", engine_phoneme_spec::voiced),
00089 engine_phoneme_spec(L"m", L"m", L"mat", engine_phoneme_spec::unvoiced),
00090 engine_phoneme_spec(L"n", L"n", L"no", engine_phoneme_spec::unvoiced),
00091 engine_phoneme_spec(L"ng", L"NG", L"sing", engine_phoneme_spec::unvoiced),
00092 engine_phoneme_spec(L"ow", L"OW", L"go", engine_phoneme_spec::dipthong),
00093 engine_phoneme_spec(L"oy", L"OY", L"toy", engine_phoneme_spec::dipthong),
00094 engine_phoneme_spec(L"p", L"p", L"put", engine_phoneme_spec::unvoiced),
00095 engine_phoneme_spec(L"r", L"r", L"red", engine_phoneme_spec::voiced),
00096 engine_phoneme_spec(L"s", L"s", L"sit", engine_phoneme_spec::unvoiced),
00097 engine_phoneme_spec(L"sh", L"SH", L"she", engine_phoneme_spec::unvoiced),
00098 engine_phoneme_spec(L"t", L"t", L"talk", engine_phoneme_spec::unvoiced),
00099 engine_phoneme_spec(L"th", L"TH", L"thin", engine_phoneme_spec::unvoiced),
00100 engine_phoneme_spec(L"uh", L"UH", L"book", engine_phoneme_spec::dipthong),
00101 engine_phoneme_spec(L"uw", L"UW", L"too", engine_phoneme_spec::dipthong),
00102 engine_phoneme_spec(L"v", L"v", L"vat", engine_phoneme_spec::unvoiced),
00103 engine_phoneme_spec(L"w", L"w", L"with", engine_phoneme_spec::voiced),
00104 engine_phoneme_spec(L"y", L"y", L"yard", engine_phoneme_spec::unvoiced),
00105 engine_phoneme_spec(L"z", L"z", L"zap", engine_phoneme_spec::unvoiced),
00106 engine_phoneme_spec(L"zh", L"ZH", L"pleasure", engine_phoneme_spec::unvoiced),
00107 // end marker
00108 engine_phoneme_spec(L"", L"", L"", engine_phoneme_spec::silence)
00109 };
00110 
00111 
00114 // phoneme_estimator implementation
00117 
00122 phoneme_estimator::phoneme_estimator()
00123 {
00124     this->m_pSpec = SapiEnglish51;
00125 }
00126 
00140 phoneme_estimator::phoneme_estimator(engine_phoneme_spec* pSpec)
00141 {
00142     m_pSpec = pSpec;
00143 }
00144 
00160 void phoneme_estimator::EstimatePhonemeAlignment(alignment_result& align)
00161 {
00162     float actual_duration = float(align.m_msEnd - align.m_msStart);
00163     
00164     // silence, unvoiced = 30
00165     // voiced = 50
00166     // dipthong = 60
00167     // what we do is calculate the "optimal" time given the 30, 50, 60 rule
00168     // then look at the difference between the actual duration and the optimal
00169     // duration and scale the numbers by the difference. Very simplistic.
00170     std::vector<long> optimalEndTimes;
00171     unsigned long i;
00172     long lastEndTime = 0;
00173     for (i = 0; i < align.m_phonemes.size(); i++)
00174     {
00175         bool bFound = false;
00176         engine_phoneme_spec *p = m_pSpec;
00177         while (p->enginePhoneme.size() && !bFound)
00178         {
00179             if (align.m_phonemes[i] == p->enginePhoneme)
00180             {
00181                 bFound = true;
00182                 if (p->outputPhoneme.size())
00183                 {  // swap the phoneme
00184                     align.m_phonemes[i] = p->outputPhoneme;
00185                 }
00186                 switch (p->m_type)
00187                 {
00188                     
00189                     case engine_phoneme_spec::voiced:
00190                         optimalEndTimes.push_back(lastEndTime + 50);
00191                         lastEndTime += 50;
00192                         break;
00193                     case engine_phoneme_spec::dipthong:
00194                         optimalEndTimes.push_back(lastEndTime + 60);
00195                         lastEndTime += 60;
00196                         break;
00197                     case engine_phoneme_spec::unvoiced:
00198                     case engine_phoneme_spec::silence:                      
00199                     default:
00200                         optimalEndTimes.push_back(lastEndTime + 30);
00201                         lastEndTime += 30;
00202                 }
00203             }
00204             else
00205                 p++;
00206         }
00207         if (!bFound)
00208         {
00209             std::wcerr << L"Phoneme label" << align.m_phonemes[i].c_str() << L" not found in phoneme mapper" << std::endl; 
00210             optimalEndTimes.push_back(lastEndTime + 50);
00211             lastEndTime += 50;
00212         }
00213     }
00214     if (lastEndTime > 0)
00215     {
00216         align.m_phonemeEndTimes.clear(); // clear the results
00217         float opt_duration = (float)lastEndTime;
00218         float diff = actual_duration/opt_duration;
00219         for (i = 0; i < optimalEndTimes.size(); i++)
00220         {
00221             float act = optimalEndTimes[i] * diff;
00222             align.m_phonemeEndTimes.push_back(long(act) + align.m_msStart);
00223         }
00224     }
00225 }
00226 
00227 
00246 void phoneme_estimator::TrivialPhonemeAlignment(alignment_result& align)
00247 {
00248     float duration = float(align.m_msEnd - align.m_msStart);
00249     if (align.m_phonemes.size())
00250     {
00251         float inc = duration/float(align.m_phonemes.size());
00252 
00253         long pos = align.m_msStart;
00254         align.m_phonemeEndTimes.clear();
00255         for (unsigned long i = 0; i < align.m_phonemes.size(); i++)
00256         {
00257             long endTime = pos + (long)inc;
00258             align.m_phonemeEndTimes.push_back(pos);
00259             pos = endTime;
00260         }
00261     }
00262 }