001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030/**
031 *
032 */
033package org.openimaj.demos.sandbox.audio;
034
035import java.io.File;
036import java.io.FileFilter;
037import java.io.FileNotFoundException;
038import java.util.ArrayList;
039import java.util.Arrays;
040import java.util.List;
041
042import org.apache.commons.io.filefilter.WildcardFileFilter;
043import org.kohsuke.args4j.Argument;
044import org.kohsuke.args4j.CmdLineException;
045import org.kohsuke.args4j.CmdLineParser;
046import org.kohsuke.args4j.Option;
047import org.openimaj.audio.AudioAnnotator;
048import org.openimaj.audio.AudioAnnotator.AudioAnnotatorType;
049import org.openimaj.audio.SampleChunk;
050import org.openimaj.audio.conversion.MultichannelToMonoProcessor;
051import org.openimaj.audio.features.MFCC;
052import org.openimaj.data.dataset.ListBackedDataset;
053import org.openimaj.data.dataset.ListDataset;
054import org.openimaj.data.dataset.MapBackedDataset;
055import org.openimaj.feature.DoubleFV;
056import org.openimaj.video.xuggle.XuggleAudio;
057
058/**
059 *
060 *
061 *      @author David Dupplaw (dpd@ecs.soton.ac.uk)
062 *  @created 12 Mar 2013
063 */
064public class SpeechDetector
065{
066        /**
067         *      Options for the speech detector
068         *
069         *      @author David Dupplaw (dpd@ecs.soton.ac.uk)
070         *  @created 6 Dec 2012
071         *      @version $Author$, $Revision$, $Date$
072         */
073        protected static class SpeechDetectorOptions
074        {
075                /** Directory containing input files */
076                @Argument(required=true,usage="Directory containing positive examples",metaVar="INPUT-DIR")
077                public File speechDir;
078
079                /** Output filename for the audioAnnotator */
080                @Argument(required=true,usage="Classifier output file",index=1,metaVar="CLASSIFIER-OUTPUT-FILE")
081                public File outputFile;
082
083                // ---------------------------------------------------------------- //
084
085                /** Directory containing negative input files */
086                @Option(name="--negative",aliases="-n",usage="Directory containing negative examples")
087                public File nonSpeechDir;
088
089                /** The filename filter to use */
090                @Option(name="--filter",aliases="-f",usage="Filename filter (default:*.wav,*.mp3)")
091                public List<String> filePattern = new ArrayList<String>(
092                                Arrays.asList( new String[]{"*.wav","*.mp3"}) );
093
094                /** Whether to recurse subdirectories */
095                @Option(name="--recurse",aliases="-R",usage="Recurse subdirectories" )
096                public boolean recurseSubdirectories = false;
097
098                /** The maximum number of files to read from (or -1 for all) */
099                @Option(name="--limit",aliases="-l",usage="Limit number of example files")
100                public int limitNumber = -1;
101
102                /** The annotator type : default KNN */
103                @Option(name="--annotator",aliases="-a",usage="Annotator type (default: KNN)")
104                public AudioAnnotatorType audioAnnotatorType = AudioAnnotatorType.KNN;
105        }
106
107        /** The options for this detector */
108        private SpeechDetectorOptions options;
109
110        /** The number of files processed so far by this class */
111        // Note this means this class should only be used to process
112        // one directory at a time (don't call process from multiple Threads)
113        private int numProcessed;
114
115        /** The classifier/annotator we're going to use */
116        private AudioAnnotator audioAnnotator;
117
118        /**
119         *      Returns a new instance of the audio annotator
120         *      that we're going to use to extract features.
121         *      @return The audio annotator
122         */
123        public AudioAnnotator getNewAnnotator()
124        {
125                final MFCC a = new MFCC();
126//              a.setAnnotator( this.options.audioAnnotatorType );
127                return null;
128        }
129
130        /**
131         *      Generates a dataset based on this speech detector's options that
132         *      contains both positive and negative examples.
133         *
134         *      @return A {@link MapBackedDataset} containing both positive
135         *              and negative classes
136         */
137        public MapBackedDataset<String,ListDataset<DoubleFV>,DoubleFV>
138                generateDataset()
139        {
140                try
141                {
142                        // Create a dataset from the input directories.. first the positive
143                        System.out.println( "----------- POSITIVE EXAMPLES ------------");
144                        MapBackedDataset<String, ListDataset<DoubleFV>, DoubleFV> ds =
145                                        this.generateDataset( this.options.speechDir,
146                                                        true, this.options.limitNumber );
147
148                        // Update the dataset with the negative examples
149                        System.out.println( "----------- NEGATIVE EXAMPLES ------------");
150                        ds = this.generateDataset( this.options.nonSpeechDir,
151                                        false, this.options.limitNumber, ds );
152
153                        return ds;
154                }
155                catch( final FileNotFoundException e )
156                {
157                        e.printStackTrace();
158                }
159
160                return null;
161        }
162
163        /**
164         *      Processes the input directory as a set of files whose speech content
165         *      is given by the boolean parameter and creates a {@link GroupedDataset}.
166         *
167         *      @param inputDir The input directory
168         *      @param speechFiles Whether the files contain speech
169         *      @param limit The number of files to limit to
170         *      @throws FileNotFoundException if the input directory cannot be found
171         */
172        private MapBackedDataset<String, ListDataset<DoubleFV>, DoubleFV>
173                generateDataset( final File inputDir, final boolean speechFiles, final int limit )
174                        throws FileNotFoundException
175        {
176                // This will be the dataset we'll return
177                final MapBackedDataset<String, ListDataset<DoubleFV>, DoubleFV> newDataset
178                         = new MapBackedDataset<String, ListDataset<DoubleFV>, DoubleFV>();
179
180                // This is the recursive call
181                this.generateDataset( inputDir, speechFiles, limit, newDataset );
182
183                return newDataset;
184        }
185
186        /**
187         *      Processes the input directory as a set of files whose speech content
188         *      is given by the boolean parameter and creates a {@link GroupedDataset}.
189         *
190         *      @param inputDir The input directory
191         *      @param speechFiles Whether the files contain speech
192         *      @param limit The number of files to limit to
193         *      @param dataset The dataset object to fill with data
194         *      @throws FileNotFoundException if the input directory cannot be found
195         */
196        private MapBackedDataset<String,ListDataset<DoubleFV>, DoubleFV>
197                generateDataset( final File inputDir, final boolean speechFiles, final int limit,
198                                final MapBackedDataset<String,ListDataset<DoubleFV>,DoubleFV> dataset )
199                                                throws FileNotFoundException
200        {
201                if( !inputDir.exists() )
202                        throw new FileNotFoundException( inputDir+" does not exist." );
203
204                System.out.println( "Entering directory "+inputDir );
205
206                // Instantiate our filename filter
207                final FileFilter fileFilter = new WildcardFileFilter(
208                                this.options.filePattern );
209
210                // The name of the group  TODO: maybe pass this in?
211                final String groupName = (speechFiles? "Speech" : "Non-Speech" );
212
213                // This will be the dataset for this group
214                ListBackedDataset<DoubleFV> lbds = null;
215                if( (lbds = (ListBackedDataset<DoubleFV>)dataset.getMap().get( groupName ) ) == null )
216                        dataset.getMap().put( groupName, lbds = new ListBackedDataset<DoubleFV>() );
217
218                // Go through all the files in the directory
219                int chunkCount = 0;
220                final File[] files = inputDir.listFiles( fileFilter );
221                this.numProcessed = 0;
222                for( final File file: files )
223                {
224                        if( limit > 0 && this.numProcessed >= limit )
225                                break;
226
227                        if( file.isDirectory() )
228                        {
229                                if( this.options.recurseSubdirectories )
230                                        this.generateDataset( file, speechFiles, limit, dataset );
231
232                                continue;
233                        }
234
235                        System.out.println( "Processing "+file );
236
237                        try
238                        {
239                                int fileChunkCount = 0;
240
241                                // Create an audio object for the input file
242                                final XuggleAudio xa = new XuggleAudio( file );
243                                final MultichannelToMonoProcessor mtm =
244                                                new MultichannelToMonoProcessor( xa );
245
246                                // Loop through all the chunks in the audio file
247                                SampleChunk sc = null;
248                                while( (sc = mtm.nextSampleChunk()) != null )
249                                {
250                                        // Calculate the MFCC for this frame.
251                                        final DoubleFV mfcc = this.audioAnnotator.extractFeature( sc );
252
253                                        // We know there's only one channel (we're using a multichannel
254                                        // to mono processor), so we can just take the first array element.
255                                        if( mfcc != null )
256                                                        lbds.add( mfcc );
257                                        else    System.out.println( "WARNING: Null MFCC at "+fileChunkCount );
258
259                                        // Show a little counter thingy
260                                        if( fileChunkCount % 1000 == 0 )
261                                                System.out.print( fileChunkCount+"..." );
262
263                                        // count how many items we've trained on
264                                        fileChunkCount++;
265                                }
266
267                                System.out.println( fileChunkCount+". " );
268                                chunkCount += fileChunkCount;
269                                this.numProcessed++;
270                        }
271                        catch( final Exception e )
272                        {
273                                e.printStackTrace();
274                        }
275                        finally
276                        {
277                        }
278                }
279
280                System.out.println( "Loaded "+chunkCount+" sample frames.");
281
282                return dataset;
283        }
284
285        // ======================================================================
286        /**
287         *      Parses the command line arguments to create an options object.
288         *      @param args The arguments from the command-line
289         *      @return The options that were parsed from the command-line
290         */
291        public SpeechDetectorOptions parseArgs( final String args[] )
292        {
293                final SpeechDetectorOptions o = new SpeechDetectorOptions();
294                final CmdLineParser p = new CmdLineParser( o );
295                try
296                {
297                        p.parseArgument( args );
298                }
299                catch( final CmdLineException e )
300                {
301                System.err.println( e.getMessage() );
302                System.err.println( "java SpeechDetectorTrainer INPUT-DIR CLASSIFIER-OUTPUT-FILE");
303                p.printUsage( System.err );
304                System.exit(1);
305                }
306
307                this.options = o;
308                this.audioAnnotator = this.getNewAnnotator();
309                return o;
310        }
311}