001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.stream.provider; 031 032import java.io.IOException; 033import java.net.URL; 034import java.util.HashMap; 035import java.util.Map; 036import java.util.regex.Matcher; 037import java.util.regex.Pattern; 038 039import org.openimaj.stream.provider.WikipediaEditsDataset.WikipediaEdit; 040import org.openimaj.stream.provider.irc.AbstractIRCStreamDataset; 041import org.openimaj.util.concurrent.ArrayBlockingDroppingQueue; 042import org.openimaj.util.concurrent.BlockingDroppingQueue; 043 044/** 045 * Streaming dataset based on the Wikipedia/Wikimedia edits published in 046 * real-time on the wikimedia IRC channels. 047 * 048 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 049 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 050 * 051 */ 052public class WikipediaEditsDataset extends AbstractIRCStreamDataset<WikipediaEdit> { 053 private static final String RCPMTPA_REGEX = "" + 054 "\\x0314\\[\\[\\x0307(.+?)\\x0314\\]\\]" + 055 "\\x034 (.*?)" + 056 "\\x0310.*" + 057 "\\x0302(.*?)" + 058 "\\x03.+" + 059 "\\x0303(.+?)" + 060 "\\x03.+" + 061 "\\x03 [(](.*)[)] " + 062 "\\x0310(.*)\\u0003.*"; 063 private static Map<String, String> languageChannels; 064 065 static { 066 languageChannels = new HashMap<String, String>(); 067 languageChannels.put("en", "#en.wikipedia"); 068 } 069 070 private static Pattern regex = Pattern.compile(RCPMTPA_REGEX); 071 072 /** 073 * Construct the edit stream with the given buffer and language. 074 * 075 * @param buffer 076 * the buffer 077 * @param language 078 * the language id; currently only English "en" is supported 079 * @throws IOException 080 * if there is a problem connecting 081 */ 082 public WikipediaEditsDataset(BlockingDroppingQueue<WikipediaEdit> buffer, String language) 083 throws IOException 084 { 085 super(buffer, "irc.wikimedia.org", languageChannels.get(language)); 086 } 087 088 /** 089 * Construct the edit stream with an {@link ArrayBlockingDroppingQueue} of 090 * capacity 1. 091 * 092 * @param lang 093 * the language id; currently only English "en" is supported 094 * @throws IOException 095 * if there is a problem connecting 096 */ 097 public WikipediaEditsDataset(String lang) throws IOException { 098 this(new ArrayBlockingDroppingQueue<WikipediaEdit>(1), lang); 099 } 100 101 /** 102 * An edit 103 * 104 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 105 * 106 */ 107 public static class WikipediaEdit { 108 109 /** The change ID **/ 110 public int change; 111 /** The user who made the change **/ 112 public String user; 113 /** Was the edit anonymous? **/ 114 public boolean anon; 115 /** The raw flags **/ 116 public String flag; 117 /** Was it a robot that made the change? **/ 118 public boolean isRobot; 119 /** Was a new page created? **/ 120 public boolean isNewPage; 121 /** Is the edit unpatrolled? **/ 122 public boolean isUnpatrolled; 123 /** The page that was edited **/ 124 public String page; 125 /** The URL **/ 126 public URL wikipedia; 127 /** The URL as a string **/ 128 public String wikipediaUrl; 129 /** The URL of the page **/ 130 public URL pageUrl; 131 /** The URL of the user **/ 132 public URL userUrl; 133 /** The edit comment **/ 134 public String comment; 135 136 /** 137 * Default constructor 138 * 139 * @param message 140 * the raw message string 141 * @throws IOException 142 * if an error occurs during parsing 143 */ 144 protected WikipediaEdit(String message) throws IOException { 145 final Matcher m = regex.matcher(message); 146 if (!m.matches()) 147 throw new IOException("Wikipedia message not parseable"); 148 final String group1 = m.group(1); 149 final String group2 = m.group(2); 150 final String group3 = m.group(3); 151 final String group4 = m.group(4); 152 final String group5 = m.group(5).replace("+", "").replace("-", ""); 153 final int neg = m.group(5).contains("-") ? -1 : 1; 154 final String group6 = m.group(6); 155 change = neg * Integer.parseInt(group5); 156 157 user = group4; 158 anon = Pattern.matches("\\d+.\\d+.\\d+.\\d+", user); 159 flag = group2; 160 isRobot = flag.contains("M"); 161 isNewPage = flag.contains("N"); 162 isUnpatrolled = flag.contains("!"); 163 page = group1; 164 wikipedia = new URL(group3); 165 wikipediaUrl = "http://" + wikipedia.getHost(); 166 pageUrl = new URL(wikipediaUrl + "/wiki/" + page.replace(" ", "_")); 167 if (!anon) 168 userUrl = new URL(wikipediaUrl + "/wiki/User:" + user.replace(" ", "_")); 169 else 170 userUrl = null; 171 comment = group6; 172 } 173 174 @Override 175 public String toString() { 176 return String.format("User: %s, Change: %d", user, change); 177 } 178 179 } 180 181 @Override 182 public WikipediaEdit construct(String channel, String sender, String login, String hostname, String message) { 183 if (!sender.equals("rc-pmtpa")) 184 return null; 185 186 try { 187 return new WikipediaEdit(message); 188 } catch (final Exception e) { 189 return null; 190 } 191 } 192}