001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030/** 031 * 032 */ 033package org.openimaj.demos.sandbox.audio; 034 035import java.awt.Font; 036import java.io.File; 037import java.io.IOException; 038import java.net.URL; 039import java.util.ArrayList; 040import java.util.List; 041import java.util.regex.Matcher; 042import java.util.regex.Pattern; 043 044import org.openimaj.audio.AudioFormat; 045import org.openimaj.audio.AudioStream; 046import org.openimaj.audio.conversion.BitDepthConverter; 047import org.openimaj.audio.conversion.BitDepthConverter.BitDepthConversionAlgorithm; 048import org.openimaj.audio.conversion.MultichannelToMonoProcessor; 049import org.openimaj.audio.conversion.SampleRateConverter; 050import org.openimaj.audio.conversion.SampleRateConverter.SampleRateConversionAlgorithm; 051import org.openimaj.audio.filters.EQFilter; 052import org.openimaj.audio.filters.EQFilter.EQType; 053import org.openimaj.image.DisplayUtilities; 054import org.openimaj.image.MBFImage; 055import org.openimaj.image.colour.RGBColour; 056import org.openimaj.image.typography.FontStyle; 057import org.openimaj.image.typography.general.GeneralFont; 058import org.openimaj.math.geometry.shape.Rectangle; 059import org.openimaj.video.xuggle.XuggleAudio; 060import org.openimaj.vis.audio.AudioOverviewVisualisation; 061 062import edu.cmu.sphinx.recognizer.Recognizer; 063import edu.cmu.sphinx.result.Result; 064import edu.cmu.sphinx.util.props.ConfigurationManager; 065import edu.cmu.sphinx.util.props.PropertyException; 066 067/** 068 * Basic Sphinx demo (from their webpage). Uses the OpenIMAJ audio file data 069 * source to link OpenIMAJ audio engine to Sphinx. 070 * 071 * @author David Dupplaw (dpd@ecs.soton.ac.uk) 072 * 073 * @created 23 May 2012 074 */ 075public class SpeechRecognition 076{ 077 /** 078 * Returns the affected audio stream. 079 * @param as The audio stream to affect 080 * @return The affected audio stream 081 */ 082 public static AudioStream getStream( final AudioStream as ) 083 { 084 // Effect chain: 085 // 086 // -> Mono 087 // -> Band-pass filter (LPF + HPF) 088 // -> Sample rate to 16KHz 089 // -> Bit rate to 8-bit 090 // 091 092 final MultichannelToMonoProcessor m2m2 = new MultichannelToMonoProcessor( as ); 093 094 final double fc = 1000; // mid-point 1000Hz 095 final double q = 1600; // HPF @ 200Hz, LPF @ 1800Hz 096 final EQFilter lpf = new EQFilter( m2m2, EQType.LPF, fc+q/2 ); 097 final EQFilter hpf = new EQFilter( lpf, EQType.HPF, fc-q/2 ); 098 099 final SampleRateConverter src2 = new SampleRateConverter( hpf, 100 SampleRateConversionAlgorithm.LINEAR_INTERPOLATION, 101 new AudioFormat( m2m2.getFormat().getNBits(), 102 16, m2m2.getFormat().getNumChannels() ) ); 103 104 final BitDepthConverter xa2 = new BitDepthConverter( src2, 105 BitDepthConversionAlgorithm.NEAREST, 106 new AudioFormat( 8, src2.getFormat().getSampleRateKHz(), 107 src2.getFormat().getNumChannels() ) ); 108 109 return xa2; 110 } 111 112 /** 113 * @param args 114 * @throws PropertyException 115 * @throws IOException 116 * @throws InstantiationException 117 * @throws InterruptedException 118 */ 119 public static void main( final String[] args ) throws IOException, 120 PropertyException, InstantiationException, InterruptedException 121 { 122 final URL configFile = SpeechRecognition.class 123 .getResource( "/org/openimaj/demos/sandbox/audio/sphinx-config-hub4.xml" ); 124 125 // Check the configuration file exists 126 if( configFile == null ) 127 { 128 System.err.println( "Cannot find config file" ); 129 System.exit( 1 ); 130 } 131 132 // Get the audio file input 133 // URL audioFileURL = new URL( "http://www.moviewavs.com/0058349934/WAVS/Movies/Juno/experimenting.wav" ); 134 final File audioFileURL = new File( "videoplayback.mp4" ); 135 136 try 137 { 138 final List<Rectangle> boundingBoxes = new ArrayList<Rectangle>(); 139 140 System.out.println( audioFileURL ); 141 142 // Get a display of the audio waveform 143 final XuggleAudio xuggle = new XuggleAudio( audioFileURL ); 144 final AudioOverviewVisualisation awp = new AudioOverviewVisualisation( SpeechRecognition.getStream( xuggle ) ); 145 final MBFImage awi = awp.plotAudioWaveformImage( 1000, 300, 146 new Float[] 147 { 0f, 0f, 0f, 1f }, new Float[] 148 { 1f, 1f, 1f, 1f } ); 149 150 System.out.println( awp.millisecondsInView ); 151 152 final MBFImage img = new MBFImage( 1000, 400, 3 ); 153 img.drawImage( awi, 0, 0 ); 154 DisplayUtilities.displayName( img, "waveform" ); 155 156 // Load the configuration 157 final ConfigurationManager cm = new ConfigurationManager( configFile ); 158 159 // Allocate the recognizer 160 System.out.println( "Loading..." ); 161 final Recognizer recognizer = (Recognizer)cm.lookup( "recognizer" ); 162 recognizer.allocate(); 163 164 // Configure the audio input for the recognizer 165 final OpenIMAJAudioFileDataSource dataSource = (OpenIMAJAudioFileDataSource)cm 166 .lookup( "audioFileDataSource" ); 167 final XuggleAudio xuggle2 = new XuggleAudio( audioFileURL ); 168 dataSource.setAudioStream( SpeechRecognition.getStream( xuggle2 ) ); 169 170 // Play the audio 171 // XuggleAudio xuggleToPlay = new XuggleAudio( audioFileURL ); 172 // AudioPlayer ap = AudioPlayer.createAudioPlayer( getStream( xuggleToPlay ) ); 173 // ap.run(); 174 175 // The font to plot the words 176 final GeneralFont font = new GeneralFont("Courier", Font.PLAIN ); 177 final FontStyle<Float[]> fontStyle = font.createStyle( awi.createRenderer() ); 178 179 // Start recognising words from the audio file 180 final Pattern p = Pattern.compile( "([A-Za-z0-9'_]+)\\(([0-9.]+),([0-9.]+)\\)" ); 181 Result result = null; 182 final StringBuffer sb = new StringBuffer(); 183 while( (result = recognizer.recognize()) != null ) 184 { 185 final String resultText = result.getTimedBestResult( false, true ); 186 System.out.println( resultText ); 187 188 final Matcher matcher = p.matcher( resultText ); 189 while( matcher.find() ) 190 { 191 System.out.println( "Word: " + matcher.group( 1 ) ); 192 System.out.println( "Start: " + matcher.group( 2 ) ); 193 System.out.println( "End: " + matcher.group( 3 ) ); 194 195 // Parse the word and timings from the result 196 final String word = matcher.group(1); 197 final double s = Double.parseDouble( matcher.group(2) ) * 1000; 198 final double e = Double.parseDouble( matcher.group(3) ) * 1000; 199 sb.append( word+" " ); 200 201 // Get the bounds of the word polygon 202 final Rectangle bounds = font.getRenderer( 203 awi.createRenderer() ).getSize( 204 word, fontStyle ); 205 206 // Determine the pixel coordinate of the start and end times 207 final int startX = (int)(s/awp.millisecondsInView*1000); 208 final int endX = (int)(e/awp.millisecondsInView*1000); 209 210 // Draw bars showing the range of the word 211 img.drawLine( startX, 320, endX, 320, RGBColour.YELLOW ); 212 img.drawLine( startX, 318, startX, 322, RGBColour.GREEN ); 213 img.drawLine( endX, 318, endX, 322, RGBColour.RED ); 214 215 int y = 350; 216 bounds.translate( startX, y ); 217 boolean noIntersection = true; 218 do 219 { 220 noIntersection = true; 221 for( final Rectangle r : boundingBoxes ) 222 if( r.isOverlapping( bounds ) ) 223 { noIntersection = false; break; } 224 225 if( !noIntersection ) 226 bounds.translate( 0, bounds.height ); 227 } while( !noIntersection ); 228 y = (int)bounds.y; 229 230 // Draw the word 231 img.drawLine( startX, 322, startX, (int)(y+bounds.height), 232 new Float[]{0.4f,0.4f,0.4f} ); 233 img.drawLine( startX, (int)(y+bounds.height), startX+8, 234 (int)(y+bounds.height), new Float[]{0.4f,0.4f,0.4f} ); 235 img.drawText( word, startX, y, font, 24, RGBColour.WHITE ); 236 237 // Store the bounding box 238 boundingBoxes.add( bounds ); 239 } 240 } 241 242 DisplayUtilities.displayName( img, "waveform" ); 243 System.out.println( "=======================================" ); 244 System.out.println( "Text: \n"+sb.toString() ); 245 System.out.println( "=======================================" ); 246 } 247 catch( final NumberFormatException e ) 248 { 249 e.printStackTrace(); 250 } 251 catch( final IllegalStateException e ) 252 { 253 e.printStackTrace(); 254 } 255 } 256}