phoneme_estimator::EstimatePhonemeAlignment

On entry, the alignment result must have alignment_results::m_phonemes populated with the phonemes of the word, and the start and end times of the word (m_msStart and m_msEnd) must already be set.

This method calculates alignment_results::m_phonemeEndTimes from the phoneme specifications. If an entry in the engine_phoneme_spec list specifies an output phoneme, the phoneme label in m_phonemes is also replaced with that output phoneme.

This method uses no signal processing; the estimate is based solely on the type of each phoneme being timed.

00161 {
00162     float actual_duration = float(align.m_msEnd - align.m_msStart);
00163     
00164     // silence, unvoiced = 30
00165     // voiced = 50
00166     // dipthong = 60
00167     // what we do is calculate the "optimal" time given the 30, 50, 60 rule
00168     // then look at the difference between the actual duration and the optimal
00169     // duration and scale the numbers by the difference. Very simplistic.
00170     std::vector<long> optimalEndTimes;
00171     unsigned long i;
00172     long lastEndTime = 0;
00173     for (i = 0; i < align.m_phonemes.size(); i++)
00174     {
00175         bool bFound = false;
00176         engine_phoneme_spec *p = m_pSpec;
00177         while (p->enginePhoneme.size() && !bFound)
00178         {
00179             if (align.m_phonemes[i] == p->enginePhoneme)
00180             {
00181                 bFound = true;
00182                 if (p->outputPhoneme.size())
00183                 {  // swap the phoneme
00184                     align.m_phonemes[i] = p->outputPhoneme;
00185                 }
00186                 switch (p->m_type)
00187                 {
00188                     
00189                     case engine_phoneme_spec::voiced:
00190                         optimalEndTimes.push_back(lastEndTime + 50);
00191                         lastEndTime += 50;
00192                         break;
00193                     case engine_phoneme_spec::dipthong:
00194                         optimalEndTimes.push_back(lastEndTime + 60);
00195                         lastEndTime += 60;
00196                         break;
00197                     case engine_phoneme_spec::unvoiced:
00198                     case engine_phoneme_spec::silence:                      
00199                     default:
00200                         optimalEndTimes.push_back(lastEndTime + 30);
00201                         lastEndTime += 30;
00202                 }
00203             }
00204             else
00205                 p++;
00206         }
00207         if (!bFound)
00208         {
00209             std::wcerr << L"Phoneme label" << align.m_phonemes[i].c_str() << L" not found in phoneme mapper" << std::endl; 
00210             optimalEndTimes.push_back(lastEndTime + 50);
00211             lastEndTime += 50;
00212         }
00213     }
00214     if (lastEndTime > 0)
00215     {
00216         align.m_phonemeEndTimes.clear(); // clear the results
00217         float opt_duration = (float)lastEndTime;
00218         float diff = actual_duration/opt_duration;
00219         for (i = 0; i < optimalEndTimes.size(); i++)
00220         {
00221             float act = optimalEndTimes[i] * diff;
00222             align.m_phonemeEndTimes.push_back(long(act) + align.m_msStart);
00223         }
00224     }
00225 }