001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030/**
031 * 
032 */
033package org.openimaj.demos.sandbox.audio;
034
035import java.awt.Font;
036import java.io.File;
037import java.io.IOException;
038import java.net.URL;
039import java.util.ArrayList;
040import java.util.List;
041import java.util.regex.Matcher;
042import java.util.regex.Pattern;
043
044import org.openimaj.audio.AudioFormat;
045import org.openimaj.audio.AudioStream;
046import org.openimaj.audio.conversion.BitDepthConverter;
047import org.openimaj.audio.conversion.BitDepthConverter.BitDepthConversionAlgorithm;
048import org.openimaj.audio.conversion.MultichannelToMonoProcessor;
049import org.openimaj.audio.conversion.SampleRateConverter;
050import org.openimaj.audio.conversion.SampleRateConverter.SampleRateConversionAlgorithm;
051import org.openimaj.audio.filters.EQFilter;
052import org.openimaj.audio.filters.EQFilter.EQType;
053import org.openimaj.image.DisplayUtilities;
054import org.openimaj.image.MBFImage;
055import org.openimaj.image.colour.RGBColour;
056import org.openimaj.image.typography.FontStyle;
057import org.openimaj.image.typography.general.GeneralFont;
058import org.openimaj.math.geometry.shape.Rectangle;
059import org.openimaj.video.xuggle.XuggleAudio;
060import org.openimaj.vis.audio.AudioOverviewVisualisation;
061
062import edu.cmu.sphinx.recognizer.Recognizer;
063import edu.cmu.sphinx.result.Result;
064import edu.cmu.sphinx.util.props.ConfigurationManager;
065import edu.cmu.sphinx.util.props.PropertyException;
066
067/**
068 * Basic Sphinx demo (from their webpage). Uses the OpenIMAJ audio file data
069 * source to link OpenIMAJ audio engine to Sphinx.
070 * 
071 * @author David Dupplaw (dpd@ecs.soton.ac.uk)
072 * 
073 * @created 23 May 2012
074 */
075public class SpeechRecognition
076{
077        /**
078         *      Returns the affected audio stream.
079         *      @param as The audio stream to affect
080         *      @return The affected audio stream
081         */
082        public static AudioStream getStream( final AudioStream as )
083        {
084                // Effect chain:
085                //
086                //              -> Mono
087                //              -> Band-pass filter (LPF + HPF)
088                //              -> Sample rate to 16KHz
089                //              -> Bit rate to 8-bit
090                //
091
092                final MultichannelToMonoProcessor m2m2 = new MultichannelToMonoProcessor( as );
093
094                final double fc = 1000; // mid-point 1000Hz
095                final double q = 1600;  // HPF @ 200Hz, LPF @ 1800Hz
096                final EQFilter lpf = new EQFilter( m2m2, EQType.LPF, fc+q/2 );
097                final EQFilter hpf = new EQFilter( lpf, EQType.HPF, fc-q/2 );
098
099                final SampleRateConverter src2 = new SampleRateConverter( hpf,
100                                SampleRateConversionAlgorithm.LINEAR_INTERPOLATION,
101                                new AudioFormat( m2m2.getFormat().getNBits(),
102                                                16, m2m2.getFormat().getNumChannels() ) );
103
104                final BitDepthConverter xa2 = new BitDepthConverter( src2,
105                                BitDepthConversionAlgorithm.NEAREST,
106                                new AudioFormat( 8, src2.getFormat().getSampleRateKHz(),
107                                                src2.getFormat().getNumChannels() ) );
108
109                return xa2;
110        }
111
112        /**
113         * @param args
114         * @throws PropertyException
115         * @throws IOException
116         * @throws InstantiationException
117         * @throws InterruptedException
118         */
119        public static void main( final String[] args ) throws IOException,
120        PropertyException, InstantiationException, InterruptedException
121        {
122                final URL configFile = SpeechRecognition.class
123                                .getResource( "/org/openimaj/demos/sandbox/audio/sphinx-config-hub4.xml" );
124
125                // Check the configuration file exists
126                if( configFile == null )
127                {
128                        System.err.println( "Cannot find config file" );
129                        System.exit( 1 );
130                }
131
132                // Get the audio file input
133                // URL audioFileURL = new URL( "http://www.moviewavs.com/0058349934/WAVS/Movies/Juno/experimenting.wav" );
134                final File audioFileURL = new File( "videoplayback.mp4" );
135
136                try
137                {
138                        final List<Rectangle> boundingBoxes = new ArrayList<Rectangle>();
139
140                        System.out.println( audioFileURL );
141
142                        // Get a display of the audio waveform
143                        final XuggleAudio xuggle = new XuggleAudio( audioFileURL );
144                        final AudioOverviewVisualisation awp = new AudioOverviewVisualisation( SpeechRecognition.getStream( xuggle ) );
145                        final MBFImage awi = awp.plotAudioWaveformImage( 1000, 300,
146                                        new Float[]
147                                                        { 0f, 0f, 0f, 1f }, new Float[]
148                                                                        { 1f, 1f, 1f, 1f } );
149
150                        System.out.println( awp.millisecondsInView );
151
152                        final MBFImage img = new MBFImage( 1000, 400, 3 );
153                        img.drawImage( awi, 0, 0 );
154                        DisplayUtilities.displayName( img, "waveform" );
155
156                        // Load the configuration
157                        final ConfigurationManager cm = new ConfigurationManager( configFile );
158
159                        // Allocate the recognizer
160                        System.out.println( "Loading..." );
161                        final Recognizer recognizer = (Recognizer)cm.lookup( "recognizer" );
162                        recognizer.allocate();
163
164                        // Configure the audio input for the recognizer
165                        final OpenIMAJAudioFileDataSource dataSource = (OpenIMAJAudioFileDataSource)cm
166                                        .lookup( "audioFileDataSource" );
167                        final XuggleAudio xuggle2 = new XuggleAudio( audioFileURL );
168                        dataSource.setAudioStream( SpeechRecognition.getStream( xuggle2 ) );
169
170                        // Play the audio
171                        //                      XuggleAudio xuggleToPlay = new XuggleAudio( audioFileURL );
172                        //                      AudioPlayer ap = AudioPlayer.createAudioPlayer( getStream( xuggleToPlay ) );
173                        //                      ap.run();
174
175                        // The font to plot the words
176                        final GeneralFont font = new GeneralFont("Courier", Font.PLAIN );
177                        final FontStyle<Float[]> fontStyle = font.createStyle( awi.createRenderer() );
178
179                        // Start recognising words from the audio file
180                        final Pattern p = Pattern.compile( "([A-Za-z0-9'_]+)\\(([0-9.]+),([0-9.]+)\\)" );
181                        Result result = null;
182                        final StringBuffer sb = new StringBuffer();
183                        while( (result = recognizer.recognize()) != null )
184                        {
185                                final String resultText = result.getTimedBestResult( false, true );
186                                System.out.println( resultText );
187
188                                final Matcher matcher = p.matcher( resultText );
189                                while( matcher.find() )
190                                {
191                                        System.out.println( "Word:  " + matcher.group( 1 ) );
192                                        System.out.println( "Start: " + matcher.group( 2 ) );
193                                        System.out.println( "End:   " + matcher.group( 3 ) );
194
195                                        // Parse the word and timings from the result
196                                        final String word = matcher.group(1);
197                                        final double s = Double.parseDouble( matcher.group(2) ) * 1000;
198                                        final double e = Double.parseDouble( matcher.group(3) ) * 1000;
199                                        sb.append( word+" " );
200
201                                        // Get the bounds of the word polygon
202                                        final Rectangle bounds = font.getRenderer(
203                                                        awi.createRenderer() ).getSize(
204                                                                        word, fontStyle );
205
206                                        // Determine the pixel coordinate of the start and end times
207                                        final int startX = (int)(s/awp.millisecondsInView*1000);
208                                        final int endX   = (int)(e/awp.millisecondsInView*1000);
209
210                                        // Draw bars showing the range of the word
211                                        img.drawLine( startX, 320, endX, 320, RGBColour.YELLOW );
212                                        img.drawLine( startX, 318, startX, 322, RGBColour.GREEN );
213                                        img.drawLine( endX, 318, endX, 322, RGBColour.RED );
214
215                                        int y = 350;
216                                        bounds.translate( startX, y );
217                                        boolean noIntersection = true;
218                                        do
219                                        {
220                                                noIntersection = true;
221                                                for( final Rectangle r : boundingBoxes )
222                                                        if( r.isOverlapping( bounds ) )
223                                                        { noIntersection = false; break; }
224
225                                                if( !noIntersection )
226                                                        bounds.translate( 0, bounds.height );
227                                        } while( !noIntersection );
228                                        y = (int)bounds.y;
229
230                                        // Draw the word
231                                        img.drawLine( startX, 322, startX, (int)(y+bounds.height),
232                                                        new Float[]{0.4f,0.4f,0.4f} );
233                                        img.drawLine( startX, (int)(y+bounds.height), startX+8,
234                                                        (int)(y+bounds.height), new Float[]{0.4f,0.4f,0.4f} );
235                                        img.drawText( word, startX, y, font, 24, RGBColour.WHITE  );
236
237                                        // Store the bounding box
238                                        boundingBoxes.add( bounds );
239                                }
240                        }
241
242                        DisplayUtilities.displayName( img, "waveform" );
243                        System.out.println( "=======================================" );
244                        System.out.println( "Text: \n"+sb.toString() );
245                        System.out.println( "=======================================" );
246                }
247                catch( final NumberFormatException e )
248                {
249                        e.printStackTrace();
250                }
251                catch( final IllegalStateException e )
252                {
253                        e.printStackTrace();
254                }
255        }
256}