001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.stream.provider;
031
032import java.io.IOException;
033import java.net.URL;
034import java.util.HashMap;
035import java.util.Map;
036import java.util.regex.Matcher;
037import java.util.regex.Pattern;
038
039import org.openimaj.stream.provider.WikipediaEditsDataset.WikipediaEdit;
040import org.openimaj.stream.provider.irc.AbstractIRCStreamDataset;
041import org.openimaj.util.concurrent.ArrayBlockingDroppingQueue;
042import org.openimaj.util.concurrent.BlockingDroppingQueue;
043
044/**
045 * Streaming dataset based on the Wikipedia/Wikimedia edits published in
046 * real-time on the wikimedia IRC channels.
047 * 
048 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
049 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
050 * 
051 */
052public class WikipediaEditsDataset extends AbstractIRCStreamDataset<WikipediaEdit> {
053        private static final String RCPMTPA_REGEX = "" +
054                        "\\x0314\\[\\[\\x0307(.+?)\\x0314\\]\\]" +
055                        "\\x034 (.*?)" +
056                        "\\x0310.*" +
057                        "\\x0302(.*?)" +
058                        "\\x03.+" +
059                        "\\x0303(.+?)" +
060                        "\\x03.+" +
061                        "\\x03 [(](.*)[)] " +
062                        "\\x0310(.*)\\u0003.*";
063        private static Map<String, String> languageChannels;
064
065        static {
066                languageChannels = new HashMap<String, String>();
067                languageChannels.put("en", "#en.wikipedia");
068        }
069
070        private static Pattern regex = Pattern.compile(RCPMTPA_REGEX);
071
072        /**
073         * Construct the edit stream with the given buffer and language.
074         * 
075         * @param buffer
076         *            the buffer
077         * @param language
078         *            the language id; currently only English "en" is supported
079         * @throws IOException
080         *             if there is a problem connecting
081         */
082        public WikipediaEditsDataset(BlockingDroppingQueue<WikipediaEdit> buffer, String language)
083                        throws IOException
084        {
085                super(buffer, "irc.wikimedia.org", languageChannels.get(language));
086        }
087
088        /**
089         * Construct the edit stream with an {@link ArrayBlockingDroppingQueue} of
090         * capacity 1.
091         * 
092         * @param lang
093         *            the language id; currently only English "en" is supported
094         * @throws IOException
095         *             if there is a problem connecting
096         */
097        public WikipediaEditsDataset(String lang) throws IOException {
098                this(new ArrayBlockingDroppingQueue<WikipediaEdit>(1), lang);
099        }
100
101        /**
102         * An edit
103         * 
104         * @author Sina Samangooei (ss@ecs.soton.ac.uk)
105         * 
106         */
107        public static class WikipediaEdit {
108
109                /** The change ID **/
110                public int change;
111                /** The user who made the change **/
112                public String user;
113                /** Was the edit anonymous? **/
114                public boolean anon;
115                /** The raw flags **/
116                public String flag;
117                /** Was it a robot that made the change? **/
118                public boolean isRobot;
119                /** Was a new page created? **/
120                public boolean isNewPage;
121                /** Is the edit unpatrolled? **/
122                public boolean isUnpatrolled;
123                /** The page that was edited **/
124                public String page;
125                /** The URL **/
126                public URL wikipedia;
127                /** The URL as a string **/
128                public String wikipediaUrl;
129                /** The URL of the page **/
130                public URL pageUrl;
131                /** The URL of the user **/
132                public URL userUrl;
133                /** The edit comment **/
134                public String comment;
135
136                /**
137                 * Default constructor
138                 * 
139                 * @param message
140                 *            the raw message string
141                 * @throws IOException
142                 *             if an error occurs during parsing
143                 */
144                protected WikipediaEdit(String message) throws IOException {
145                        final Matcher m = regex.matcher(message);
146                        if (!m.matches())
147                                throw new IOException("Wikipedia message not parseable");
148                        final String group1 = m.group(1);
149                        final String group2 = m.group(2);
150                        final String group3 = m.group(3);
151                        final String group4 = m.group(4);
152                        final String group5 = m.group(5).replace("+", "").replace("-", "");
153                        final int neg = m.group(5).contains("-") ? -1 : 1;
154                        final String group6 = m.group(6);
155                        change = neg * Integer.parseInt(group5);
156
157                        user = group4;
158                        anon = Pattern.matches("\\d+.\\d+.\\d+.\\d+", user);
159                        flag = group2;
160                        isRobot = flag.contains("M");
161                        isNewPage = flag.contains("N");
162                        isUnpatrolled = flag.contains("!");
163                        page = group1;
164                        wikipedia = new URL(group3);
165                        wikipediaUrl = "http://" + wikipedia.getHost();
166                        pageUrl = new URL(wikipediaUrl + "/wiki/" + page.replace(" ", "_"));
167                        if (!anon)
168                                userUrl = new URL(wikipediaUrl + "/wiki/User:" + user.replace(" ", "_"));
169                        else
170                                userUrl = null;
171                        comment = group6;
172                }
173
174                @Override
175                public String toString() {
176                        return String.format("User: %s, Change: %d", user, change);
177                }
178
179        }
180
181        @Override
182        public WikipediaEdit construct(String channel, String sender, String login, String hostname, String message) {
183                if (!sender.equals("rc-pmtpa"))
184                        return null;
185
186                try {
187                        return new WikipediaEdit(message);
188                } catch (final Exception e) {
189                        return null;
190                }
191        }
192}