/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   * Redistributions of source code must retain the above copyright notice,
 *     this list of conditions and the following disclaimer.
 *
 *   * Redistributions in binary form must reproduce the above copyright notice,
 *     this list of conditions and the following disclaimer in the documentation
 *     and/or other materials provided with the distribution.
 *
 *   * Neither the name of the University of Southampton nor the names of its
 *     contributors may be used to endorse or promote products derived from this
 *     software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */ 030/** 031 * 032 */ 033package org.openimaj.demos.sandbox.audio; 034 035import java.io.File; 036import java.io.FileFilter; 037import java.io.FileNotFoundException; 038import java.util.ArrayList; 039import java.util.Arrays; 040import java.util.List; 041 042import org.apache.commons.io.filefilter.WildcardFileFilter; 043import org.kohsuke.args4j.Argument; 044import org.kohsuke.args4j.CmdLineException; 045import org.kohsuke.args4j.CmdLineParser; 046import org.kohsuke.args4j.Option; 047import org.openimaj.audio.AudioAnnotator; 048import org.openimaj.audio.AudioAnnotator.AudioAnnotatorType; 049import org.openimaj.audio.SampleChunk; 050import org.openimaj.audio.conversion.MultichannelToMonoProcessor; 051import org.openimaj.audio.features.MFCC; 052import org.openimaj.data.dataset.ListBackedDataset; 053import org.openimaj.data.dataset.ListDataset; 054import org.openimaj.data.dataset.MapBackedDataset; 055import org.openimaj.feature.DoubleFV; 056import org.openimaj.video.xuggle.XuggleAudio; 057 058/** 059 * 060 * 061 * @author David Dupplaw (dpd@ecs.soton.ac.uk) 062 * @created 12 Mar 2013 063 */ 064public class SpeechDetector 065{ 066 /** 067 * Options for the speech detector 068 * 069 * @author David Dupplaw (dpd@ecs.soton.ac.uk) 070 * @created 6 Dec 2012 071 * @version $Author$, $Revision$, $Date$ 072 */ 073 protected static class SpeechDetectorOptions 074 { 075 /** Directory containing input files */ 076 @Argument(required=true,usage="Directory containing positive examples",metaVar="INPUT-DIR") 077 public File speechDir; 078 079 /** Output filename for the audioAnnotator */ 080 @Argument(required=true,usage="Classifier output file",index=1,metaVar="CLASSIFIER-OUTPUT-FILE") 081 public File outputFile; 082 083 // ---------------------------------------------------------------- // 084 085 /** Directory containing negative input files */ 086 @Option(name="--negative",aliases="-n",usage="Directory containing negative examples") 087 public File nonSpeechDir; 088 089 /** 
The filename filter to use */ 090 @Option(name="--filter",aliases="-f",usage="Filename filter (default:*.wav,*.mp3)") 091 public List<String> filePattern = new ArrayList<String>( 092 Arrays.asList( new String[]{"*.wav","*.mp3"}) ); 093 094 /** Whether to recurse subdirectories */ 095 @Option(name="--recurse",aliases="-R",usage="Recurse subdirectories" ) 096 public boolean recurseSubdirectories = false; 097 098 /** The maximum number of files to read from (or -1 for all) */ 099 @Option(name="--limit",aliases="-l",usage="Limit number of example files") 100 public int limitNumber = -1; 101 102 /** The annotator type : default KNN */ 103 @Option(name="--annotator",aliases="-a",usage="Annotator type (default: KNN)") 104 public AudioAnnotatorType audioAnnotatorType = AudioAnnotatorType.KNN; 105 } 106 107 /** The options for this detector */ 108 private SpeechDetectorOptions options; 109 110 /** The number of files processed so far by this class */ 111 // Note this means this class should only be used to process 112 // one directory at a time (don't call process from multiple Threads) 113 private int numProcessed; 114 115 /** The classifier/annotator we're going to use */ 116 private AudioAnnotator audioAnnotator; 117 118 /** 119 * Returns a new instance of the audio annotator 120 * that we're going to use to extract features. 121 * @return The audio annotator 122 */ 123 public AudioAnnotator getNewAnnotator() 124 { 125 final MFCC a = new MFCC(); 126// a.setAnnotator( this.options.audioAnnotatorType ); 127 return null; 128 } 129 130 /** 131 * Generates a dataset based on this speech detector's options that 132 * contains both positive and negative examples. 133 * 134 * @return A {@link MapBackedDataset} containing both positive 135 * and negative classes 136 */ 137 public MapBackedDataset<String,ListDataset<DoubleFV>,DoubleFV> 138 generateDataset() 139 { 140 try 141 { 142 // Create a dataset from the input directories.. 
first the positive 143 System.out.println( "----------- POSITIVE EXAMPLES ------------"); 144 MapBackedDataset<String, ListDataset<DoubleFV>, DoubleFV> ds = 145 this.generateDataset( this.options.speechDir, 146 true, this.options.limitNumber ); 147 148 // Update the dataset with the negative examples 149 System.out.println( "----------- NEGATIVE EXAMPLES ------------"); 150 ds = this.generateDataset( this.options.nonSpeechDir, 151 false, this.options.limitNumber, ds ); 152 153 return ds; 154 } 155 catch( final FileNotFoundException e ) 156 { 157 e.printStackTrace(); 158 } 159 160 return null; 161 } 162 163 /** 164 * Processes the input directory as a set of files whose speech content 165 * is given by the boolean parameter and creates a {@link GroupedDataset}. 166 * 167 * @param inputDir The input directory 168 * @param speechFiles Whether the files contain speech 169 * @param limit The number of files to limit to 170 * @throws FileNotFoundException if the input directory cannot be found 171 */ 172 private MapBackedDataset<String, ListDataset<DoubleFV>, DoubleFV> 173 generateDataset( final File inputDir, final boolean speechFiles, final int limit ) 174 throws FileNotFoundException 175 { 176 // This will be the dataset we'll return 177 final MapBackedDataset<String, ListDataset<DoubleFV>, DoubleFV> newDataset 178 = new MapBackedDataset<String, ListDataset<DoubleFV>, DoubleFV>(); 179 180 // This is the recursive call 181 this.generateDataset( inputDir, speechFiles, limit, newDataset ); 182 183 return newDataset; 184 } 185 186 /** 187 * Processes the input directory as a set of files whose speech content 188 * is given by the boolean parameter and creates a {@link GroupedDataset}. 
189 * 190 * @param inputDir The input directory 191 * @param speechFiles Whether the files contain speech 192 * @param limit The number of files to limit to 193 * @param dataset The dataset object to fill with data 194 * @throws FileNotFoundException if the input directory cannot be found 195 */ 196 private MapBackedDataset<String,ListDataset<DoubleFV>, DoubleFV> 197 generateDataset( final File inputDir, final boolean speechFiles, final int limit, 198 final MapBackedDataset<String,ListDataset<DoubleFV>,DoubleFV> dataset ) 199 throws FileNotFoundException 200 { 201 if( !inputDir.exists() ) 202 throw new FileNotFoundException( inputDir+" does not exist." ); 203 204 System.out.println( "Entering directory "+inputDir ); 205 206 // Instantiate our filename filter 207 final FileFilter fileFilter = new WildcardFileFilter( 208 this.options.filePattern ); 209 210 // The name of the group TODO: maybe pass this in? 211 final String groupName = (speechFiles? "Speech" : "Non-Speech" ); 212 213 // This will be the dataset for this group 214 ListBackedDataset<DoubleFV> lbds = null; 215 if( (lbds = (ListBackedDataset<DoubleFV>)dataset.getMap().get( groupName ) ) == null ) 216 dataset.getMap().put( groupName, lbds = new ListBackedDataset<DoubleFV>() ); 217 218 // Go through all the files in the directory 219 int chunkCount = 0; 220 final File[] files = inputDir.listFiles( fileFilter ); 221 this.numProcessed = 0; 222 for( final File file: files ) 223 { 224 if( limit > 0 && this.numProcessed >= limit ) 225 break; 226 227 if( file.isDirectory() ) 228 { 229 if( this.options.recurseSubdirectories ) 230 this.generateDataset( file, speechFiles, limit, dataset ); 231 232 continue; 233 } 234 235 System.out.println( "Processing "+file ); 236 237 try 238 { 239 int fileChunkCount = 0; 240 241 // Create an audio object for the input file 242 final XuggleAudio xa = new XuggleAudio( file ); 243 final MultichannelToMonoProcessor mtm = 244 new MultichannelToMonoProcessor( xa ); 245 246 // Loop 
through all the chunks in the audio file 247 SampleChunk sc = null; 248 while( (sc = mtm.nextSampleChunk()) != null ) 249 { 250 // Calculate the MFCC for this frame. 251 final DoubleFV mfcc = this.audioAnnotator.extractFeature( sc ); 252 253 // We know there's only one channel (we're using a multichannel 254 // to mono processor), so we can just take the first array element. 255 if( mfcc != null ) 256 lbds.add( mfcc ); 257 else System.out.println( "WARNING: Null MFCC at "+fileChunkCount ); 258 259 // Show a little counter thingy 260 if( fileChunkCount % 1000 == 0 ) 261 System.out.print( fileChunkCount+"..." ); 262 263 // count how many items we've trained on 264 fileChunkCount++; 265 } 266 267 System.out.println( fileChunkCount+". " ); 268 chunkCount += fileChunkCount; 269 this.numProcessed++; 270 } 271 catch( final Exception e ) 272 { 273 e.printStackTrace(); 274 } 275 finally 276 { 277 } 278 } 279 280 System.out.println( "Loaded "+chunkCount+" sample frames."); 281 282 return dataset; 283 } 284 285 // ====================================================================== 286 /** 287 * Parses the command line arguments to create an options object. 288 * @param args The arguments from the command-line 289 * @return The options that were parsed from the command-line 290 */ 291 public SpeechDetectorOptions parseArgs( final String args[] ) 292 { 293 final SpeechDetectorOptions o = new SpeechDetectorOptions(); 294 final CmdLineParser p = new CmdLineParser( o ); 295 try 296 { 297 p.parseArgument( args ); 298 } 299 catch( final CmdLineException e ) 300 { 301 System.err.println( e.getMessage() ); 302 System.err.println( "java SpeechDetectorTrainer INPUT-DIR CLASSIFIER-OUTPUT-FILE"); 303 p.printUsage( System.err ); 304 System.exit(1); 305 } 306 307 this.options = o; 308 this.audioAnnotator = this.getNewAnnotator(); 309 return o; 310 } 311}