FeatureExtractionModelTransformer.cpp

Go to the documentation of this file.
00001 /* -*- c-basic-offset: 4 indent-tabs-mode: nil -*-  vi:set ts=8 sts=4 sw=4: */
00002 
00003 /*
00004     Sonic Visualiser
00005     An audio file viewer and annotation editor.
00006     Centre for Digital Music, Queen Mary, University of London.
00007     This file copyright 2006 Chris Cannam and QMUL.
00008     
00009     This program is free software; you can redistribute it and/or
00010     modify it under the terms of the GNU General Public License as
00011     published by the Free Software Foundation; either version 2 of the
00012     License, or (at your option) any later version.  See the file
00013     COPYING included with this distribution for more information.
00014 */
00015 
00016 #include "FeatureExtractionModelTransformer.h"
00017 
00018 #include "plugin/FeatureExtractionPluginFactory.h"
00019 #include "plugin/PluginXml.h"
00020 #include "vamp-sdk/Plugin.h"
00021 
00022 #include "data/model/Model.h"
00023 #include "base/Window.h"
00024 #include "data/model/SparseOneDimensionalModel.h"
00025 #include "data/model/SparseTimeValueModel.h"
00026 #include "data/model/EditableDenseThreeDimensionalModel.h"
00027 #include "data/model/DenseTimeValueModel.h"
00028 #include "data/model/NoteModel.h"
00029 #include "data/model/FFTModel.h"
00030 #include "data/model/WaveFileModel.h"
00031 
00032 #include "TransformFactory.h"
00033 
00034 #include <QMessageBox>
00035 
00036 #include <iostream>
00037 
00038 FeatureExtractionModelTransformer::FeatureExtractionModelTransformer(Input in,
00039                                                                      const Transform &transform) :
00040     ModelTransformer(in, transform),
00041     m_plugin(0),
00042     m_descriptor(0),
00043     m_outputFeatureNo(0)
00044 {
00045 //    std::cerr << "FeatureExtractionModelTransformer::FeatureExtractionModelTransformer: plugin " << pluginId.toStdString() << ", outputName " << m_transform.getOutput().toStdString() << std::endl;
00046 
00047     QString pluginId = transform.getPluginIdentifier();
00048 
00049     FeatureExtractionPluginFactory *factory =
00050         FeatureExtractionPluginFactory::instanceFor(pluginId);
00051 
00052     if (!factory) {
00053         m_message = tr("No factory available for feature extraction plugin id \"%1\" (unknown plugin type, or internal error?)").arg(pluginId);
00054         return;
00055     }
00056 
00057     DenseTimeValueModel *input = getConformingInput();
00058     if (!input) {
00059         m_message = tr("Input model for feature extraction plugin \"%1\" is of wrong type (internal error?)").arg(pluginId);
00060         return;
00061     }
00062 
00063     m_plugin = factory->instantiatePlugin(pluginId, input->getSampleRate());
00064     if (!m_plugin) {
00065         m_message = tr("Failed to instantiate plugin \"%1\"").arg(pluginId);
00066         return;
00067     }
00068 
00069     TransformFactory::getInstance()->makeContextConsistentWithPlugin
00070         (m_transform, m_plugin);
00071 
00072     TransformFactory::getInstance()->setPluginParameters
00073         (m_transform, m_plugin);
00074 
00075     size_t channelCount = input->getChannelCount();
00076     if (m_plugin->getMaxChannelCount() < channelCount) {
00077         channelCount = 1;
00078     }
00079     if (m_plugin->getMinChannelCount() > channelCount) {
00080         m_message = tr("Cannot provide enough channels to feature extraction plugin \"%1\" (plugin min is %2, max %3; input model has %4)")
00081             .arg(pluginId)
00082             .arg(m_plugin->getMinChannelCount())
00083             .arg(m_plugin->getMaxChannelCount())
00084             .arg(input->getChannelCount());
00085         return;
00086     }
00087 
00088     std::cerr << "Initialising feature extraction plugin with channels = "
00089               << channelCount << ", step = " << m_transform.getStepSize()
00090               << ", block = " << m_transform.getBlockSize() << std::endl;
00091 
00092     if (!m_plugin->initialise(channelCount,
00093                               m_transform.getStepSize(),
00094                               m_transform.getBlockSize())) {
00095 
00096         size_t pstep = m_transform.getStepSize();
00097         size_t pblock = m_transform.getBlockSize();
00098 
00099         m_transform.setStepSize(0);
00100         m_transform.setBlockSize(0);
00101         TransformFactory::getInstance()->makeContextConsistentWithPlugin
00102             (m_transform, m_plugin);
00103 
00104         if (m_transform.getStepSize() != pstep ||
00105             m_transform.getBlockSize() != pblock) {
00106             
00107             if (!m_plugin->initialise(channelCount,
00108                                       m_transform.getStepSize(),
00109                                       m_transform.getBlockSize())) {
00110 
00111                 m_message = tr("Failed to initialise feature extraction plugin \"%1\"").arg(pluginId);
00112                 return;
00113 
00114             } else {
00115 
00116                 m_message = tr("Feature extraction plugin \"%1\" rejected the given step and block sizes (%2 and %3); using plugin defaults (%4 and %5) instead")
00117                     .arg(pluginId)
00118                     .arg(pstep)
00119                     .arg(pblock)
00120                     .arg(m_transform.getStepSize())
00121                     .arg(m_transform.getBlockSize());
00122             }
00123 
00124         } else {
00125 
00126             m_message = tr("Failed to initialise feature extraction plugin \"%1\"").arg(pluginId);
00127             return;
00128         }
00129     }
00130 
00131     if (m_transform.getPluginVersion() != "") {
00132         QString pv = QString("%1").arg(m_plugin->getPluginVersion());
00133         if (pv != m_transform.getPluginVersion()) {
00134             QString vm = tr("Transform was configured for version %1 of plugin \"%2\", but the plugin being used is version %3")
00135                 .arg(m_transform.getPluginVersion())
00136                 .arg(pluginId)
00137                 .arg(pv);
00138             if (m_message != "") {
00139                 m_message = QString("%1; %2").arg(vm).arg(m_message);
00140             } else {
00141                 m_message = vm;
00142             }
00143         }
00144     }
00145 
00146     Vamp::Plugin::OutputList outputs = m_plugin->getOutputDescriptors();
00147 
00148     if (outputs.empty()) {
00149         m_message = tr("Plugin \"%1\" has no outputs").arg(pluginId);
00150         return;
00151     }
00152     
00153     for (size_t i = 0; i < outputs.size(); ++i) {
00154         if (m_transform.getOutput() == "" ||
00155             outputs[i].identifier == m_transform.getOutput().toStdString()) {
00156             m_outputFeatureNo = i;
00157             m_descriptor = new Vamp::Plugin::OutputDescriptor
00158                 (outputs[i]);
00159             break;
00160         }
00161     }
00162 
00163     if (!m_descriptor) {
00164         m_message = tr("Plugin \"%1\" has no output named \"%2\"")
00165             .arg(pluginId)
00166             .arg(m_transform.getOutput());
00167         return;
00168     }
00169 
00170 //    std::cerr << "FeatureExtractionModelTransformer: output sample type "
00171 //            << m_descriptor->sampleType << std::endl;
00172 
00173     int binCount = 1;
00174     float minValue = 0.0, maxValue = 0.0;
00175     bool haveExtents = false;
00176     
00177     if (m_descriptor->hasFixedBinCount) {
00178         binCount = m_descriptor->binCount;
00179     }
00180 
00181 //    std::cerr << "FeatureExtractionModelTransformer: output bin count "
00182 //            << binCount << std::endl;
00183 
00184     if (binCount > 0 && m_descriptor->hasKnownExtents) {
00185         minValue = m_descriptor->minValue;
00186         maxValue = m_descriptor->maxValue;
00187         haveExtents = true;
00188     }
00189 
00190     size_t modelRate = input->getSampleRate();
00191     size_t modelResolution = 1;
00192     
00193     switch (m_descriptor->sampleType) {
00194 
00195     case Vamp::Plugin::OutputDescriptor::VariableSampleRate:
00196         if (m_descriptor->sampleRate != 0.0) {
00197             modelResolution = size_t(modelRate / m_descriptor->sampleRate + 0.001);
00198         }
00199         break;
00200 
00201     case Vamp::Plugin::OutputDescriptor::OneSamplePerStep:
00202         modelResolution = m_transform.getStepSize();
00203         break;
00204 
00205     case Vamp::Plugin::OutputDescriptor::FixedSampleRate:
00206         modelRate = size_t(m_descriptor->sampleRate + 0.001);
00207         break;
00208     }
00209 
00210     if (binCount == 0) {
00211 
00212         m_output = new SparseOneDimensionalModel(modelRate, modelResolution,
00213                                                  false);
00214 
00215     } else if (binCount == 1) {
00216 
00217         SparseTimeValueModel *model;
00218         if (haveExtents) {
00219             model = new SparseTimeValueModel
00220                 (modelRate, modelResolution, minValue, maxValue, false);
00221         } else {
00222             model = new SparseTimeValueModel
00223                 (modelRate, modelResolution, false);
00224         }
00225         model->setScaleUnits(outputs[m_outputFeatureNo].unit.c_str());
00226 
00227         m_output = model;
00228 
00229     } else if (m_descriptor->sampleType ==
00230                Vamp::Plugin::OutputDescriptor::VariableSampleRate) {
00231 
00232         // We don't have a sparse 3D model, so interpret this as a
00233         // note model.  There's nothing to define which values to use
00234         // as which parameters of the note -- for the moment let's
00235         // treat the first as pitch, second as duration in frames,
00236         // third (if present) as velocity. (Our note model doesn't
00237         // yet store velocity.)
00239         
00240         NoteModel *model;
00241         if (haveExtents) {
00242             model = new NoteModel
00243                 (modelRate, modelResolution, minValue, maxValue, false);
00244         } else {
00245             model = new NoteModel
00246                 (modelRate, modelResolution, false);
00247         }            
00248         model->setScaleUnits(outputs[m_outputFeatureNo].unit.c_str());
00249 
00250         m_output = model;
00251 
00252     } else {
00253 
00254         EditableDenseThreeDimensionalModel *model =
00255             new EditableDenseThreeDimensionalModel
00256             (modelRate, modelResolution, binCount, false);
00257 
00258         if (!m_descriptor->binNames.empty()) {
00259             std::vector<QString> names;
00260             for (size_t i = 0; i < m_descriptor->binNames.size(); ++i) {
00261                 names.push_back(m_descriptor->binNames[i].c_str());
00262             }
00263             model->setBinNames(names);
00264         }
00265         
00266         m_output = model;
00267     }
00268 
00269     if (m_output) m_output->setSourceModel(input);
00270 }
00271 
00272 FeatureExtractionModelTransformer::~FeatureExtractionModelTransformer()
00273 {
00274     std::cerr << "FeatureExtractionModelTransformer::~FeatureExtractionModelTransformer()" << std::endl;
00275     delete m_plugin;
00276     delete m_descriptor;
00277 }
00278 
00279 DenseTimeValueModel *
00280 FeatureExtractionModelTransformer::getConformingInput()
00281 {
00282     DenseTimeValueModel *dtvm =
00283         dynamic_cast<DenseTimeValueModel *>(getInputModel());
00284     if (!dtvm) {
00285         std::cerr << "FeatureExtractionModelTransformer::getConformingInput: WARNING: Input model is not conformable to DenseTimeValueModel" << std::endl;
00286     }
00287     return dtvm;
00288 }
00289 
00290 void
00291 FeatureExtractionModelTransformer::run()
00292 {
00293     DenseTimeValueModel *input = getConformingInput();
00294     if (!input) return;
00295 
00296     if (!m_output) return;
00297 
00298     while (!input->isReady()) {
00299 /*
00300         if (dynamic_cast<WaveFileModel *>(input)) {
00301             std::cerr << "FeatureExtractionModelTransformer::run: Model is not ready, but it's not a WaveFileModel (it's a " << typeid(input).name() << "), so that's OK" << std::endl;
00302             sleep(2);
00303             break; // no need to wait
00304         }
00305 */
00306         std::cerr << "FeatureExtractionModelTransformer::run: Waiting for input model to be ready..." << std::endl;
00307         sleep(1);
00308     }
00309 
00310     size_t sampleRate = input->getSampleRate();
00311 
00312     size_t channelCount = input->getChannelCount();
00313     if (m_plugin->getMaxChannelCount() < channelCount) {
00314         channelCount = 1;
00315     }
00316 
00317     float **buffers = new float*[channelCount];
00318     for (size_t ch = 0; ch < channelCount; ++ch) {
00319         buffers[ch] = new float[m_transform.getBlockSize() + 2];
00320     }
00321 
00322     size_t stepSize = m_transform.getStepSize();
00323     size_t blockSize = m_transform.getBlockSize();
00324 
00325     bool frequencyDomain = (m_plugin->getInputDomain() ==
00326                             Vamp::Plugin::FrequencyDomain);
00327     std::vector<FFTModel *> fftModels;
00328 
00329     if (frequencyDomain) {
00330         for (size_t ch = 0; ch < channelCount; ++ch) {
00331             FFTModel *model = new FFTModel
00332                                   (getConformingInput(),
00333                                    channelCount == 1 ? m_input.getChannel() : ch,
00334                                    m_transform.getWindowType(),
00335                                    blockSize,
00336                                    stepSize,
00337                                    blockSize,
00338                                    false,
00339                                    StorageAdviser::PrecisionCritical);
00340             if (!model->isOK()) {
00341                 QMessageBox::critical
00342                     (0, tr("FFT cache failed"),
00343                      tr("Failed to create the FFT model for this transform.\n"
00344                         "There may be insufficient memory or disc space to continue."));
00345                 delete model;
00346                 setCompletion(100);
00347                 return;
00348             }
00349             model->resume();
00350             fftModels.push_back(model);
00351         }
00352     }
00353 
00354     long startFrame = m_input.getModel()->getStartFrame();
00355     long   endFrame = m_input.getModel()->getEndFrame();
00356 
00357     RealTime contextStartRT = m_transform.getStartTime();
00358     RealTime contextDurationRT = m_transform.getDuration();
00359 
00360     long contextStart =
00361         RealTime::realTime2Frame(contextStartRT, sampleRate);
00362 
00363     long contextDuration =
00364         RealTime::realTime2Frame(contextDurationRT, sampleRate);
00365 
00366     if (contextStart == 0 || contextStart < startFrame) {
00367         contextStart = startFrame;
00368     }
00369 
00370     if (contextDuration == 0) {
00371         contextDuration = endFrame - contextStart;
00372     }
00373     if (contextStart + contextDuration > endFrame) {
00374         contextDuration = endFrame - contextStart;
00375     }
00376 
00377     long blockFrame = contextStart;
00378 
00379     long prevCompletion = 0;
00380 
00381     setCompletion(0);
00382 
00383     while (!m_abandoned) {
00384 
00385         if (frequencyDomain) {
00386             if (blockFrame - int(blockSize)/2 >
00387                 contextStart + contextDuration) break;
00388         } else {
00389             if (blockFrame >= 
00390                 contextStart + contextDuration) break;
00391         }
00392 
00393 //      std::cerr << "FeatureExtractionModelTransformer::run: blockFrame "
00394 //                << blockFrame << ", endFrame " << endFrame << ", blockSize "
00395 //                  << blockSize << std::endl;
00396 
00397         long completion =
00398             (((blockFrame - contextStart) / stepSize) * 99) /
00399             (contextDuration / stepSize);
00400 
00401         // channelCount is either m_input.getModel()->channelCount or 1
00402 
00403         if (frequencyDomain) {
00404             for (size_t ch = 0; ch < channelCount; ++ch) {
00405                 int column = (blockFrame - startFrame) / stepSize;
00406                 for (size_t i = 0; i <= blockSize/2; ++i) {
00407                     fftModels[ch]->getValuesAt
00408                         (column, i, buffers[ch][i*2], buffers[ch][i*2+1]);
00409                 }
00410             }
00411         } else {
00412             getFrames(channelCount, blockFrame, blockSize, buffers);
00413         }
00414 
00415         Vamp::Plugin::FeatureSet features = m_plugin->process
00416             (buffers, Vamp::RealTime::frame2RealTime(blockFrame, sampleRate));
00417 
00418         for (size_t fi = 0; fi < features[m_outputFeatureNo].size(); ++fi) {
00419             Vamp::Plugin::Feature feature =
00420                 features[m_outputFeatureNo][fi];
00421             addFeature(blockFrame, feature);
00422         }
00423 
00424         if (blockFrame == contextStart || completion > prevCompletion) {
00425             setCompletion(completion);
00426             prevCompletion = completion;
00427         }
00428 
00429         blockFrame += stepSize;
00430     }
00431 
00432     if (m_abandoned) return;
00433 
00434     Vamp::Plugin::FeatureSet features = m_plugin->getRemainingFeatures();
00435 
00436     for (size_t fi = 0; fi < features[m_outputFeatureNo].size(); ++fi) {
00437         Vamp::Plugin::Feature feature =
00438             features[m_outputFeatureNo][fi];
00439         addFeature(blockFrame, feature);
00440     }
00441 
00442     if (frequencyDomain) {
00443         for (size_t ch = 0; ch < channelCount; ++ch) {
00444             delete fftModels[ch];
00445         }
00446     }
00447 
00448     setCompletion(100);
00449 }
00450 
00451 void
00452 FeatureExtractionModelTransformer::getFrames(int channelCount,
00453                                              long startFrame, long size,
00454                                              float **buffers)
00455 {
00456     long offset = 0;
00457 
00458     if (startFrame < 0) {
00459         for (int c = 0; c < channelCount; ++c) {
00460             for (int i = 0; i < size && startFrame + i < 0; ++i) {
00461                 buffers[c][i] = 0.0f;
00462             }
00463         }
00464         offset = -startFrame;
00465         size -= offset;
00466         if (size <= 0) return;
00467         startFrame = 0;
00468     }
00469 
00470     DenseTimeValueModel *input = getConformingInput();
00471     if (!input) return;
00472     
00473     long got = 0;
00474 
00475     if (channelCount == 1) {
00476 
00477         got = input->getData(m_input.getChannel(), startFrame, size,
00478                              buffers[0] + offset);
00479 
00480         if (m_input.getChannel() == -1 && input->getChannelCount() > 1) {
00481             // use mean instead of sum, as plugin input
00482             float cc = float(input->getChannelCount());
00483             for (long i = 0; i < size; ++i) {
00484                 buffers[0][i + offset] /= cc;
00485             }
00486         }
00487 
00488     } else {
00489 
00490         float **writebuf = buffers;
00491         if (offset > 0) {
00492             writebuf = new float *[channelCount];
00493             for (int i = 0; i < channelCount; ++i) {
00494                 writebuf[i] = buffers[i] + offset;
00495             }
00496         }
00497 
00498         got = input->getData(0, channelCount-1, startFrame, size, writebuf);
00499 
00500         if (writebuf != buffers) delete[] writebuf;
00501     }
00502 
00503     while (got < size) {
00504         for (int c = 0; c < channelCount; ++c) {
00505             buffers[c][got + offset] = 0.0;
00506         }
00507         ++got;
00508     }
00509 }
00510 
00511 void
00512 FeatureExtractionModelTransformer::addFeature(size_t blockFrame,
00513                                              const Vamp::Plugin::Feature &feature)
00514 {
00515     size_t inputRate = m_input.getModel()->getSampleRate();
00516 
00517 //    std::cerr << "FeatureExtractionModelTransformer::addFeature("
00518 //            << blockFrame << ")" << std::endl;
00519 
00520     int binCount = 1;
00521     if (m_descriptor->hasFixedBinCount) {
00522         binCount = m_descriptor->binCount;
00523     }
00524 
00525     size_t frame = blockFrame;
00526 
00527     if (m_descriptor->sampleType ==
00528         Vamp::Plugin::OutputDescriptor::VariableSampleRate) {
00529 
00530         if (!feature.hasTimestamp) {
00531             std::cerr
00532                 << "WARNING: FeatureExtractionModelTransformer::addFeature: "
00533                 << "Feature has variable sample rate but no timestamp!"
00534                 << std::endl;
00535             return;
00536         } else {
00537             frame = Vamp::RealTime::realTime2Frame(feature.timestamp, inputRate);
00538         }
00539 
00540     } else if (m_descriptor->sampleType ==
00541                Vamp::Plugin::OutputDescriptor::FixedSampleRate) {
00542 
00543         if (feature.hasTimestamp) {
00545             frame = Vamp::RealTime::realTime2Frame(feature.timestamp,
00546                                                    lrintf(m_descriptor->sampleRate));
00547         } else {
00548             frame = m_output->getEndFrame();
00549         }
00550     }
00551         
00552     if (binCount == 0) {
00553 
00554         SparseOneDimensionalModel *model =
00555             getConformingOutput<SparseOneDimensionalModel>();
00556         if (!model) return;
00557 
00558         model->addPoint(SparseOneDimensionalModel::Point(frame, feature.label.c_str()));
00559         
00560     } else if (binCount == 1) {
00561 
00562         float value = 0.0;
00563         if (feature.values.size() > 0) value = feature.values[0];
00564 
00565         SparseTimeValueModel *model =
00566             getConformingOutput<SparseTimeValueModel>();
00567         if (!model) return;
00568 
00569         model->addPoint(SparseTimeValueModel::Point(frame, value, feature.label.c_str()));
00570 //        std::cerr << "SparseTimeValueModel::addPoint(" << frame << ", " << value << "), " << feature.label.c_str() << std::endl;
00571 
00572     } else if (m_descriptor->sampleType == 
00573                Vamp::Plugin::OutputDescriptor::VariableSampleRate) {
00574 
00575         float pitch = 0.0;
00576         if (feature.values.size() > 0) pitch = feature.values[0];
00577 
00578         float duration = 1;
00579         if (feature.values.size() > 1) duration = feature.values[1];
00580         
00581         float velocity = 100;
00582         if (feature.values.size() > 2) velocity = feature.values[2];
00583         if (velocity < 0) velocity = 127;
00584         if (velocity > 127) velocity = 127;
00585 
00586         NoteModel *model = getConformingOutput<NoteModel>();
00587         if (!model) return;
00588 
00589         model->addPoint(NoteModel::Point(frame, pitch,
00590                                          lrintf(duration),
00591                                          velocity / 127.f,
00592                                          feature.label.c_str()));
00593         
00594     } else {
00595         
00596         DenseThreeDimensionalModel::Column values = feature.values;
00597         
00598         EditableDenseThreeDimensionalModel *model =
00599             getConformingOutput<EditableDenseThreeDimensionalModel>();
00600         if (!model) return;
00601 
00602         model->setColumn(frame / model->getResolution(), values);
00603     }
00604 }
00605 
00606 void
00607 FeatureExtractionModelTransformer::setCompletion(int completion)
00608 {
00609     int binCount = 1;
00610     if (m_descriptor->hasFixedBinCount) {
00611         binCount = m_descriptor->binCount;
00612     }
00613 
00614 //    std::cerr << "FeatureExtractionModelTransformer::setCompletion("
00615 //              << completion << ")" << std::endl;
00616 
00617     if (binCount == 0) {
00618 
00619         SparseOneDimensionalModel *model =
00620             getConformingOutput<SparseOneDimensionalModel>();
00621         if (!model) return;
00622         model->setCompletion(completion, true); 
00623 
00624     } else if (binCount == 1) {
00625 
00626         SparseTimeValueModel *model =
00627             getConformingOutput<SparseTimeValueModel>();
00628         if (!model) return;
00629         model->setCompletion(completion, true); 
00630 
00631     } else if (m_descriptor->sampleType ==
00632                Vamp::Plugin::OutputDescriptor::VariableSampleRate) {
00633 
00634         NoteModel *model =
00635             getConformingOutput<NoteModel>();
00636         if (!model) return;
00637         model->setCompletion(completion, true); 
00638 
00639     } else {
00640 
00641         EditableDenseThreeDimensionalModel *model =
00642             getConformingOutput<EditableDenseThreeDimensionalModel>();
00643         if (!model) return;
00644         model->setCompletion(completion, true); 
00645     }
00646 }
00647 

Generated on Wed Feb 20 15:45:25 2008 for SonicVisualiser by  doxygen 1.5.1