001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030/** 031 * 032 */ 033package org.openimaj.demos.sandbox.audio; 034 035import java.io.DataOutputStream; 036import java.io.File; 037import java.io.FileFilter; 038import java.io.FileNotFoundException; 039import java.io.FileOutputStream; 040import java.io.IOException; 041import java.util.ArrayList; 042import java.util.Arrays; 043import java.util.Collections; 044import java.util.List; 045 046import org.apache.commons.io.filefilter.WildcardFileFilter; 047import org.kohsuke.args4j.Argument; 048import org.kohsuke.args4j.CmdLineException; 049import org.kohsuke.args4j.CmdLineParser; 050import org.kohsuke.args4j.Option; 051import org.openimaj.audio.AudioAnnotator.AudioAnnotatorType; 052import org.openimaj.audio.SampleChunk; 053import org.openimaj.audio.conversion.MultichannelToMonoProcessor; 054import org.openimaj.audio.features.MFCC; 055import org.openimaj.feature.DoubleFV; 056import org.openimaj.feature.FeatureExtractor; 057import org.openimaj.io.IOUtils; 058import org.openimaj.ml.annotation.Annotated; 059import org.openimaj.ml.annotation.AnnotatedObject; 060import org.openimaj.ml.training.IncrementalTrainer; 061import org.openimaj.video.xuggle.XuggleAudio; 062 063/** 064 * A trainer for the speech detector. Uses MFCCs to train a KNN classifier. 065 * 066 * @author David Dupplaw (dpd@ecs.soton.ac.uk) 067 * @created 6 Dec 2012 068 * @version $Author$, $Revision$, $Date$ 069 */ 070public class SpeechDetectorTrainer 071{ 072 /** 073 * Feature extractor for extracting MFCC features from a sample chunk. 074 * It assumes the sample chunk is mono, and will only return the MFCC 075 * features from the first channel. 076 * 077 * @author David Dupplaw (dpd@ecs.soton.ac.uk) 078 * @created 6 Mar 2013 079 */ 080 protected static class MFCCFeatureExtractor 081 implements FeatureExtractor<DoubleFV,SampleChunk> 082 { 083 private final MFCC mfcc = new MFCC(); 084 085 @Override 086 public DoubleFV extractFeature( final SampleChunk object ) 087 { 088 final double[] d = this.mfcc.calculateMFCC( object.getSampleBuffer() )[0]; 089 return new DoubleFV(d); 090 } 091 } 092 093 /** 094 * Options for the tool 095 * 096 * @author David Dupplaw (dpd@ecs.soton.ac.uk) 097 * @created 6 Dec 2012 098 * @version $Author$, $Revision$, $Date$ 099 */ 100 protected static class Options 101 { 102 /** Directory containing input files */ 103 @Argument(required=true,usage="Directory containing positive examples",metaVar="INPUT-DIR") 104 public File speechDir; 105 106 /** Output filename for the classifier */ 107 @Argument(required=true,usage="Classifier output file",index=1,metaVar="CLASSIFIER-OUTPUT-FILE") 108 public File outputFile; 109 110 /** Directory containing negative input files */ 111 @Option(name="--negative",aliases="-n",usage="Directory containing negative examples") 112 public File nonSpeechDir; 113 114 /** The trainer to use */ 115 @Option(name="--annotator",aliases="-a",usage="Classifier type (default:KNN)") 116 public AudioAnnotatorType trainer = AudioAnnotatorType.KNN; 117 118 /** The filename filter to use */ 119 @Option(name="--filter",aliases="-f",usage="Filename filter (default:*.wav,*.mp3)") 120 public List<String> filePattern = new ArrayList<String>( 121 Arrays.asList( new String[]{"*.wav","*.mp3"}) ); 122 123 /** Whether to recurse subdirectories */ 124 @Option(name="--recurse",aliases="-R",usage="Recurse subdirectories" ) 125 public boolean recurseSubdirectories = false; 126 127 @Option(name="--limit",aliases="-l",usage="Limit number of example files") 128 public int limitNumber = -1; 129 } 130 131 /** The options being used in this instance of the trainer */ 132 private final Options options; 133 134 /** The number of files processed so far by this class */ 135 // Note this means this class should only be used to process 136 // one directory at a time (don't call process from multiple Threads) 137 private int numProcessed; 138 139 /** MFCC Calculator */ 140 private final MFCC mfcc = new MFCC(); 141 142 /** 143 * Constructor for the trainer tool 144 * @param options the options to use. 145 * @throws FileNotFoundException 146 */ 147 public SpeechDetectorTrainer( final Options options ) 148 throws FileNotFoundException 149 { 150 this.options = options; 151 152 // Create the trainer 153 final IncrementalTrainer<Annotated<DoubleFV, String>> t = 154 options.trainer.getAnnotator(); 155 156 // Add the speech data to the space 157 this.processDirectory( options.speechDir, true, t, options.limitNumber ); 158 159 // Add the non-speech stuff to the space 160 if( options.nonSpeechDir != null ) 161 this.processDirectory( options.nonSpeechDir, false, t, options.limitNumber ); 162 163 // Write the classifier to a file 164 try 165 { 166 IOUtils.write( t, new DataOutputStream( new FileOutputStream( options.outputFile ) ) ); 167 } 168 catch( final IOException e ) 169 { 170 e.printStackTrace(); 171 } 172 } 173 174 /** 175 * Processes the input directory as a set of files whose speech content 176 * is given by the boolean parameter 177 * @param inputDir The input directory 178 * @param speechFiles Whether the files contain speech 179 * @param t The trainer to use 180 * @param limit The number of files to limit to 181 * @throws FileNotFoundException if the input directory cannot be found 182 */ 183 private void processDirectory( final File inputDir, final boolean speechFiles, 184 final IncrementalTrainer<Annotated<DoubleFV, String>> t, final int limit ) 185 throws FileNotFoundException 186 { 187 if( !inputDir.exists() ) 188 throw new FileNotFoundException( inputDir+" does not exist." ); 189 190 System.out.println( "Entering directory "+inputDir ); 191 192 // Instantiate our filename filter 193 final FileFilter fileFilter = new WildcardFileFilter( 194 this.options.filePattern ); 195 196 // Go through all the files in the directory 197 int chunkCount = 0; 198 final File[] files = inputDir.listFiles( fileFilter ); 199 this.numProcessed = 0; 200 for( final File file: files ) 201 { 202 if( limit > 0 && this.numProcessed >= limit ) 203 break; 204 205 if( file.isDirectory() ) 206 { 207 if( this.options.recurseSubdirectories ) 208 this.processDirectory( file, speechFiles, t, limit ); 209 210 continue; 211 } 212 213 System.out.println( "Processing "+file ); 214 215 try 216 { 217 int fileChunkCount = 0; 218 219 // Create an audio object for the input file 220 final XuggleAudio xa = new XuggleAudio( file ); 221 final MultichannelToMonoProcessor mtm = 222 new MultichannelToMonoProcessor( xa ); 223 224 // Loop through all the chunks in the audio file 225 SampleChunk sc = null; 226 while( (sc = mtm.nextSampleChunk()) != null ) 227 { 228 // Calculate the MFCC for this frame. 229 final double[][] calculatedMFCC = 230 this.mfcc.calculateMFCC( sc.getSampleBuffer() ); 231 232 // Create an annotated object that says that this sample chunk 233 // either does or doesn't represent speech 234 final AnnotatedObject<DoubleFV,String> o = 235 new AnnotatedObject<DoubleFV, String>( 236 new DoubleFV(calculatedMFCC[0]), 237 speechFiles?"Speech":"Non-Speech" ); 238 239 // Now train on that data. Training will involve extracting 240 // a feature and inserting it into some feature space. 241 t.train( Collections.singleton( o ) ); 242 243 // Show a little counter thingy 244 if( fileChunkCount % 1000 == 0 ) 245 System.out.print( fileChunkCount+"..." ); 246 247 // count how many items we've trained on 248 fileChunkCount++; 249 } 250 251 chunkCount += fileChunkCount; 252 this.numProcessed++; 253 } 254 catch( final Exception e ) 255 { 256 e.printStackTrace(); 257 } 258 finally 259 { 260 } 261 } 262 263 System.out.println( "Trained on "+chunkCount+" sample frames."); 264 } 265 266 /** 267 * Returns the trainer used for this trainer 268 * @return The trainer 269 */ 270 public AudioAnnotatorType getTrainer() 271 { 272 return this.options.trainer; 273 } 274 275 276 277 // ====================================================================== 278 /** 279 * Parses the command line arguments to create an options object. 280 * @param args 281 */ 282 private static Options parseArgs( final String args[] ) 283 { 284 final Options o = new Options(); 285 final CmdLineParser p = new CmdLineParser( o ); 286 try 287 { 288 p.parseArgument( args ); 289 } 290 catch( final CmdLineException e ) 291 { 292 System.err.println( e.getMessage() ); 293 System.err.println( "java SpeechDetectorTrainer INPUT-DIR CLASSIFIER-OUTPUT-FILE"); 294 p.printUsage( System.err ); 295 System.exit(1); 296 } 297 298 return o; 299 } 300 301 /** 302 * 303 * @param args 304 */ 305 public static void main( final String[] args ) 306 { 307 try 308 { 309 final Options options = SpeechDetectorTrainer.parseArgs( args ); 310 new SpeechDetectorTrainer( options ); 311 } 312 catch( final FileNotFoundException e ) 313 { 314 e.printStackTrace(); 315 } 316 } 317}