Source code

001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030/**
031 *
032 */
033package org.openimaj.demos.sandbox.audio;
034
035import java.io.DataOutputStream;
036import java.io.File;
037import java.io.FileFilter;
038import java.io.FileNotFoundException;
039import java.io.FileOutputStream;
040import java.io.IOException;
041import java.util.ArrayList;
042import java.util.Arrays;
043import java.util.Collections;
044import java.util.List;
045
046import org.apache.commons.io.filefilter.WildcardFileFilter;
047import org.kohsuke.args4j.Argument;
048import org.kohsuke.args4j.CmdLineException;
049import org.kohsuke.args4j.CmdLineParser;
050import org.kohsuke.args4j.Option;
051import org.openimaj.audio.AudioAnnotator.AudioAnnotatorType;
052import org.openimaj.audio.SampleChunk;
053import org.openimaj.audio.conversion.MultichannelToMonoProcessor;
054import org.openimaj.audio.features.MFCC;
055import org.openimaj.feature.DoubleFV;
056import org.openimaj.feature.FeatureExtractor;
057import org.openimaj.io.IOUtils;
058import org.openimaj.ml.annotation.Annotated;
059import org.openimaj.ml.annotation.AnnotatedObject;
060import org.openimaj.ml.training.IncrementalTrainer;
061import org.openimaj.video.xuggle.XuggleAudio;
062
063/**
064 *      A trainer for the speech detector. Uses MFCCs to train a KNN classifier.
065 *
066 *      @author David Dupplaw (dpd@ecs.soton.ac.uk)
067 *  @created 6 Dec 2012
068 *      @version $Author$, $Revision$, $Date$
069 */
070public class SpeechDetectorTrainer
071{
072        /**
073         *      Feature extractor for extracting MFCC features from a sample chunk.
074         *      It assumes the sample chunk is mono, and will only return the MFCC
075         *      features from the first channel.
076         *
077         *      @author David Dupplaw (dpd@ecs.soton.ac.uk)
078         *  @created 6 Mar 2013
079         */
080        protected static class MFCCFeatureExtractor
081                implements FeatureExtractor<DoubleFV,SampleChunk>
082        {
083                private final MFCC mfcc = new MFCC();
084
085                @Override
086                public DoubleFV extractFeature( final SampleChunk object )
087                {
088                        final double[] d = this.mfcc.calculateMFCC( object.getSampleBuffer() )[0];
089                        return new DoubleFV(d);
090                }
091        }
092
093        /**
094         *      Options for the tool
095         *
096         *      @author David Dupplaw (dpd@ecs.soton.ac.uk)
097         *  @created 6 Dec 2012
098         *      @version $Author$, $Revision$, $Date$
099         */
100        protected static class Options
101        {
102                /** Directory containing input files */
103                @Argument(required=true,usage="Directory containing positive examples",metaVar="INPUT-DIR")
104                public File speechDir;
105
106                /** Output filename for the classifier */
107                @Argument(required=true,usage="Classifier output file",index=1,metaVar="CLASSIFIER-OUTPUT-FILE")
108                public File outputFile;
109
110                /** Directory containing negative input files */
111                @Option(name="--negative",aliases="-n",usage="Directory containing negative examples")
112                public File nonSpeechDir;
113
114                /** The trainer to use */
115                @Option(name="--annotator",aliases="-a",usage="Classifier type (default:KNN)")
116                public AudioAnnotatorType trainer = AudioAnnotatorType.KNN;
117
118                /** The filename filter to use */
119                @Option(name="--filter",aliases="-f",usage="Filename filter (default:*.wav,*.mp3)")
120                public List<String> filePattern = new ArrayList<String>(
121                                Arrays.asList( new String[]{"*.wav","*.mp3"}) );
122
123                /** Whether to recurse subdirectories */
124                @Option(name="--recurse",aliases="-R",usage="Recurse subdirectories" )
125                public boolean recurseSubdirectories = false;
126
127                @Option(name="--limit",aliases="-l",usage="Limit number of example files")
128                public int limitNumber = -1;
129        }
130
131        /** The options being used in this instance of the trainer */
132        private final Options options;
133
134        /** The number of files processed so far by this class */
135        // Note this means this class should only be used to process
136        // one directory at a time (don't call process from multiple Threads)
137        private int numProcessed;
138
139        /** MFCC Calculator */
140        private final MFCC mfcc = new MFCC();
141
142        /**
143         *      Constructor for the trainer tool
144         *      @param options the options to use.
145         * @throws FileNotFoundException
146         */
147        public SpeechDetectorTrainer( final Options options )
148                        throws FileNotFoundException
149        {
150                this.options = options;
151
152                // Create the trainer
153                final IncrementalTrainer<Annotated<DoubleFV, String>> t =
154                                options.trainer.getAnnotator();
155
156                // Add the speech data to the space
157                this.processDirectory( options.speechDir, true, t, options.limitNumber );
158
159                // Add the non-speech stuff to the space
160                if( options.nonSpeechDir != null )
161                        this.processDirectory( options.nonSpeechDir, false, t, options.limitNumber );
162
163                // Write the classifier to a file
164                try
165                {
166                        IOUtils.write( t, new DataOutputStream( new FileOutputStream( options.outputFile ) ) );
167                }
168                catch( final IOException e )
169                {
170                        e.printStackTrace();
171                }
172        }
173
174        /**
175         *      Processes the input directory as a set of files whose speech content
176         *      is given by the boolean parameter
177         *      @param inputDir The input directory
178         *      @param speechFiles Whether the files contain speech
179         *      @param t The trainer to use
180         *      @param limit The number of files to limit to
181         *      @throws FileNotFoundException if the input directory cannot be found
182         */
183        private void processDirectory( final File inputDir, final boolean speechFiles,
184                        final IncrementalTrainer<Annotated<DoubleFV, String>> t, final int limit )
185                        throws FileNotFoundException
186        {
187                if( !inputDir.exists() )
188                        throw new FileNotFoundException( inputDir+" does not exist." );
189
190                System.out.println( "Entering directory "+inputDir );
191
192                // Instantiate our filename filter
193                final FileFilter fileFilter = new WildcardFileFilter(
194                                this.options.filePattern );
195
196                // Go through all the files in the directory
197                int chunkCount = 0;
198                final File[] files = inputDir.listFiles( fileFilter );
199                this.numProcessed = 0;
200                for( final File file: files )
201                {
202                        if( limit > 0 && this.numProcessed >= limit )
203                                break;
204
205                        if( file.isDirectory() )
206                        {
207                                if( this.options.recurseSubdirectories )
208                                        this.processDirectory( file, speechFiles, t, limit );
209
210                                continue;
211                        }
212
213                        System.out.println( "Processing "+file );
214
215                        try
216                        {
217                                int fileChunkCount = 0;
218
219                                // Create an audio object for the input file
220                                final XuggleAudio xa = new XuggleAudio( file );
221                                final MultichannelToMonoProcessor mtm =
222                                                new MultichannelToMonoProcessor( xa );
223
224                                // Loop through all the chunks in the audio file
225                                SampleChunk sc = null;
226                                while( (sc = mtm.nextSampleChunk()) != null )
227                                {
228                                        // Calculate the MFCC for this frame.
229                                        final double[][] calculatedMFCC =
230                                                        this.mfcc.calculateMFCC( sc.getSampleBuffer() );
231
232                                        // Create an annotated object that says that this sample chunk
233                                        // either does or doesn't represent speech
234                                        final AnnotatedObject<DoubleFV,String> o =
235                                                        new AnnotatedObject<DoubleFV, String>(
236                                                                        new DoubleFV(calculatedMFCC[0]),
237                                                                        speechFiles?"Speech":"Non-Speech" );
238
239                                        // Now train on that data. Training will involve extracting
240                                        // a feature and inserting it into some feature space.
241                                        t.train( Collections.singleton( o ) );
242
243                                        // Show a little counter thingy
244                                        if( fileChunkCount % 1000 == 0 )
245                                                System.out.print( fileChunkCount+"..." );
246
247                                        // count how many items we've trained on
248                                        fileChunkCount++;
249                                }
250
251                                chunkCount += fileChunkCount;
252                                this.numProcessed++;
253                        }
254                        catch( final Exception e )
255                        {
256                                e.printStackTrace();
257                        }
258                        finally
259                        {
260                        }
261                }
262
263                System.out.println( "Trained on "+chunkCount+" sample frames.");
264        }
265
266        /**
267         *      Returns the trainer used for this trainer
268         *      @return The trainer
269         */
270        public AudioAnnotatorType getTrainer()
271        {
272                return this.options.trainer;
273        }
274
275
276
277        // ======================================================================
278        /**
279         *      Parses the command line arguments to create an options object.
280         *      @param args
281         */
282        private static Options parseArgs( final String args[] )
283        {
284                final Options o = new Options();
285                final CmdLineParser p = new CmdLineParser( o );
286                try
287                {
288                        p.parseArgument( args );
289                }
290                catch( final CmdLineException e )
291                {
292                System.err.println( e.getMessage() );
293                System.err.println( "java SpeechDetectorTrainer INPUT-DIR CLASSIFIER-OUTPUT-FILE");
294                p.printUsage( System.err );
295                System.exit(1);
296                }
297
298                return o;
299        }
300
301        /**
302         *
303         *      @param args
304         */
305        public static void main( final String[] args )
306        {
307                try
308                {
309                        final Options options = SpeechDetectorTrainer.parseArgs( args );
310                        new SpeechDetectorTrainer( options );
311                }
312                catch( final FileNotFoundException e )
313                {
314                        e.printStackTrace();
315                }
316        }
317}