001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.text.nlp.patterns; 031 032import java.util.regex.Pattern; 033 034/** 035 * Various kinds of URL Pattern 036 * 037 * @author Sina Samangooei (ss@ecs.soton.ac.uk) 038 * 039 */ 040public class URLPatternProvider extends PatternProvider { 041 042 /** 043 * 044 * Implementation of the URL regex from 045 * http://daringfireball.net/2010/07/improved_regex_for_matching_urls 046 * 047 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk), Sina Samangooei 048 * <ss@ecs.soton.ac.uk> 049 * 050 */ 051 public static class DFURLPatternProvider extends URLPatternProvider { 052 /** 053 * 054 */ 055 public DFURLPatternProvider() { 056 Url = "\\b" + "(" + // Capture 1: entire matched URL 057 "(?:" + "https?://" + // http or https protocol 058 "|" + // or 059 "www\\d{0,3}[.]" + // "www.", "www1.", "www2." ... "www999." 060 "|" + // or 061 // "([\\S]+[.])+[a-z]{2,4}/" + // looks like domain 062 // name followed by a slash 063 "[A-Za-z0-9.\\-]+[.][a-z]{2,4}/" + // looks 064 // like 065 // domain 066 // name 067 // followed 068 // by a 069 // slash 070 ")" + "(?:" + // One or more: 071 "[^\\s()<>]+" + // Run of non-space, non-()<> 072 "|" + // or 073 "\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)" + // balanced 074 // parens, 075 // up to 2 076 // levels 077 ")+" + "(?:" + // End with: 078 "\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)" + // balanced 079 // parens, 080 // up to 2 081 // levels 082 "|" + // or 083 "[^\\s`!()\\[\\]{};:'\".,<>?\u00AB\u00BB\u201C\u201D\u2018\u2019]" + // not 084 // a 085 // space 086 // or 087 // one 088 // of 089 // these 090 // punct 091 // chars 092 ")" + ")"; 093 } 094 } 095 096 protected String Url; 097 098 /** 099 * @param punctuation 100 * @param entity 101 */ 102 public URLPatternProvider(PunctuationPatternProvider punctuation, EntityPatternProvider entity) { 103 // final String validLettersAndNumbers = "[a-z\\u00a1-\\uffff0-9]"; 104 final String validLettersAndNumbersAndDots = "[a-z\\u00a1-\\uffff0-9\\-.]"; 105 /* 106 * final String hostNamePart = "(?:" + validLettersAndNumbers + "+-?)*" 107 * + validLettersAndNumbers + "+"; // something // or // 108 * something-something // but // never // just // something- 109 */Url = "\\b" + 110 // protocol identifier 111 "(?:(?:https?://|ftp://|www\\d{0,3}[.]))" + 112 // user:pass authentication 113 "(?:\\S+(?::\\S*)?@)?" + "(?:" + 114 // IP address exclusion 115 // private & local networks 116 "(?!10(?:\\.\\d{1,3}){3})" 117 // 118 + "(?!127(?:\\.\\d{1,3}){3})" 119 // 120 + "(?!169\\.254(?:\\.\\d{1,3}){2})" 121 // 122 + "(?!192\\.168(?:\\.\\d{1,3}){2})" 123 // 124 + "(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})" 125 // 126 + 127 // IP address dotted notation octets 128 // excludes loopback network 0.0.0.0 129 // excludes reserved space >= 224.0.0.0 130 // excludes network & broacast addresses 131 // (first & last IP address of each class) 132 "(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])" 133 // 134 + "(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}" 135 // 136 + "(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))" + "|" 137 // 138 + 139 // // host name 140 // "(?:"+hostNamePart +")" 141 // + 142 // // domain name 143 // "(?:\\."+hostNamePart+")*" 144 // + 145 "(?:" + validLettersAndNumbersAndDots + ")+" + 146 // TLD identifier 147 "(?:\\.(?:[a-z\\u00a1-\\uffff]{2,4})" + ")" + ")" + 148 // port number 149 "(?::\\d{2,5})?" + 150 // resource path 151 "(?:/[^\\s]*)?"; 152 } 153 154 /** 155 * 156 */ 157 public URLPatternProvider() { 158 this(new PunctuationPatternProvider(), new EntityPatternProvider()); 159 } 160 161 @Override 162 public String patternString() { 163 return String.format("(%s)", Url); 164 } 165 166 @Override 167 public Pattern pattern() { 168 return Pattern.compile(patternString(), Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE); 169 } 170 171}