001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.hadoop.tools.twitter;
031
032import java.io.IOException;
033
034import org.apache.hadoop.fs.Path;
035import org.kohsuke.args4j.CmdLineException;
036import org.kohsuke.args4j.Option;
037import org.kohsuke.args4j.ProxyOptionHandler;
038import org.openimaj.hadoop.sequencefile.SequenceFileUtility;
039import org.openimaj.hadoop.tools.HadoopToolsUtil;
040import org.openimaj.tools.twitter.options.AbstractTwitterPreprocessingToolOptions;
041
042/**
043 * Hadoop specific options for twitter preprocessing
044 * 
045 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
046 * 
047 */
048public class HadoopTwitterPreprocessingToolOptions extends AbstractTwitterPreprocessingToolOptions {
049
050        private boolean beforeMaps;
051
052        /**
053         * The hadoop options, assume these are the options before mapping
054         * 
055         * @param args
056         * @throws CmdLineException
057         */
058        public HadoopTwitterPreprocessingToolOptions(String[] args) throws CmdLineException {
059                this(args, false);
060        }
061
062        /**
063         * The hadoop twitter preprocessing options
064         * 
065         * @param args
066         *            command line optios
067         * @param beforeMaps
068         *            if true, the output location is removed if the option to do so
069         *            is set
070         * @throws CmdLineException
071         */
072        public HadoopTwitterPreprocessingToolOptions(String[] args, boolean beforeMaps) throws CmdLineException {
073                super(args, false); // don't prepare using the superclass
074                this.beforeMaps = beforeMaps;
075        }
076
077        /*
078         * IO args
079         */
080        @Option(
081                        name = "--mapper-mode",
082                        aliases = "-mm",
083                        required = false,
084                        usage = "Choose a mapper mode.",
085                        handler = ProxyOptionHandler.class)
086        MapperMode mapperMode = MapperMode.STANDARD;
087        MapperMode.Mode mapperModeOp;
088
089        @Option(name = "--reudcer-mode", aliases = "-redm", required = false, usage = "Choose a reducer mode mode.")
090        ReducerModeOption reducerMode = ReducerModeOption.NULL;
091
092        @Option(
093                        name = "--return-immediately",
094                        aliases = "-ri",
095                        required = false,
096                        usage = "If set, the job is submitted to the cluster and this returns immediately")
097        boolean returnImmediately = false;
098
099        @Option(
100                        name = "--lzo-compress",
101                        aliases = "-lzoc",
102                        required = false,
103                        usage = "If set, compress the output of the preprocessing pipeline as LZO")
104        boolean lzoCompress = false;
105
106        @Override
107        public boolean validate() throws CmdLineException {
108                if (this.beforeMaps) {
109                        HadoopToolsUtil.validateInput(this);
110                        HadoopToolsUtil.validateOutput(this);
111                }
112                return true;
113        }
114
115        /**
116         * @return the list of input files
117         * @throws IOException
118         */
119        public Path[] getInputPaths() throws IOException {
120                final Path[] sequenceFiles = SequenceFileUtility.getFilePaths(this.getAllInputs(), "part");
121                return sequenceFiles;
122        }
123
124        /**
125         * @return the output path
126         */
127        public Path getOutputPath() {
128                return new Path(this.getOutput());
129        }
130
131}