/**
 * Copyright (c) 2011, The University of Southampton and the individual contributors.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 *   *  Redistributions of source code must retain the above copyright notice,
 *      this list of conditions and the following disclaimer.
 *
 *   *  Redistributions in binary form must reproduce the above copyright notice,
 *      this list of conditions and the following disclaimer in the documentation
 *      and/or other materials provided with the distribution.
 *
 *   *  Neither the name of the University of Southampton nor the names of its
 *      contributors may be used to endorse or promote products derived from this
 *      software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
030package org.openimaj.text.nlp.patterns;
031
032import java.util.regex.Pattern;
033
034/**
035 * Various kinds of URL Pattern
036 * 
037 * @author Sina Samangooei (ss@ecs.soton.ac.uk)
038 * 
039 */
040public class URLPatternProvider extends PatternProvider {
041
042        /**
043         * 
044         * Implementation of the URL regex from
045         * http://daringfireball.net/2010/07/improved_regex_for_matching_urls
046         * 
047         * @author Jonathon Hare (jsh2@ecs.soton.ac.uk), Sina Samangooei
048         *         <ss@ecs.soton.ac.uk>
049         * 
050         */
051        public static class DFURLPatternProvider extends URLPatternProvider {
052                /**
053                 * 
054                 */
055                public DFURLPatternProvider() {
056                        Url = "\\b" + "(" + // Capture 1: entire matched URL
057                                        "(?:" + "https?://" + // http or https protocol
058                                        "|" + // or
059                                        "www\\d{0,3}[.]" + // "www.", "www1.", "www2." ... "www999."
060                                        "|" + // or
061                                        // "([\\S]+[.])+[a-z]{2,4}/" + // looks like domain
062                                        // name followed by a slash
063                                        "[A-Za-z0-9.\\-]+[.][a-z]{2,4}/" + // looks
064                                                                                                                // like
065                                                                                                                // domain
066                                                                                                                // name
067                                                                                                                // followed
068                                                                                                                // by a
069                                                                                                                // slash
070                                        ")" + "(?:" + // One or more:
071                                        "[^\\s()<>]+" + // Run of non-space, non-()<>
072                                        "|" + // or
073                                        "\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)" + // balanced
074                                                                                                                                        // parens,
075                                                                                                                                        // up to 2
076                                                                                                                                        // levels
077                                        ")+" + "(?:" + // End with:
078                                        "\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)" + // balanced
079                                                                                                                                        // parens,
080                                                                                                                                        // up to 2
081                                                                                                                                        // levels
082                                        "|" + // or
083                                        "[^\\s`!()\\[\\]{};:'\".,<>?\u00AB\u00BB\u201C\u201D\u2018\u2019]" + // not
084                                                                                                                                                                                        // a
085                                                                                                                                                                                        // space
086                                                                                                                                                                                        // or
087                                                                                                                                                                                        // one
088                                                                                                                                                                                        // of
089                                                                                                                                                                                        // these
090                                                                                                                                                                                        // punct
091                                                                                                                                                                                        // chars
092                                        ")" + ")";
093                }
094        }
095
096        protected String Url;
097
098        /**
099         * @param punctuation
100         * @param entity
101         */
102        public URLPatternProvider(PunctuationPatternProvider punctuation, EntityPatternProvider entity) {
103                // final String validLettersAndNumbers = "[a-z\\u00a1-\\uffff0-9]";
104                final String validLettersAndNumbersAndDots = "[a-z\\u00a1-\\uffff0-9\\-.]";
105                /*
106                 * final String hostNamePart = "(?:" + validLettersAndNumbers + "+-?)*"
107                 * + validLettersAndNumbers + "+"; // something // or //
108                 * something-something // but // never // just // something-
109                 */Url = "\\b" +
110                // protocol identifier
111                                "(?:(?:https?://|ftp://|www\\d{0,3}[.]))" +
112                                // user:pass authentication
113                                "(?:\\S+(?::\\S*)?@)?" + "(?:" +
114                                // IP address exclusion
115                                // private & local networks
116                                "(?!10(?:\\.\\d{1,3}){3})"
117                                //
118                                + "(?!127(?:\\.\\d{1,3}){3})"
119                                //
120                                + "(?!169\\.254(?:\\.\\d{1,3}){2})"
121                                //
122                                + "(?!192\\.168(?:\\.\\d{1,3}){2})"
123                                //
124                                + "(?!172\\.(?:1[6-9]|2\\d|3[0-1])(?:\\.\\d{1,3}){2})"
125                                //
126                                +
127                                // IP address dotted notation octets
128                                // excludes loopback network 0.0.0.0
129                                // excludes reserved space >= 224.0.0.0
130                                // excludes network & broacast addresses
131                                // (first & last IP address of each class)
132                                "(?:[1-9]\\d?|1\\d\\d|2[01]\\d|22[0-3])"
133                                //
134                                + "(?:\\.(?:1?\\d{1,2}|2[0-4]\\d|25[0-5])){2}"
135                                //
136                                + "(?:\\.(?:[1-9]\\d?|1\\d\\d|2[0-4]\\d|25[0-4]))" + "|"
137                                //
138                                +
139                                // // host name
140                                // "(?:"+hostNamePart +")"
141                                // +
142                                // // domain name
143                                // "(?:\\."+hostNamePart+")*"
144                                // +
145                                "(?:" + validLettersAndNumbersAndDots + ")+" +
146                                // TLD identifier
147                                "(?:\\.(?:[a-z\\u00a1-\\uffff]{2,4})" + ")" + ")" +
148                                // port number
149                                "(?::\\d{2,5})?" +
150                                // resource path
151                                "(?:/[^\\s]*)?";
152        }
153
154        /**
155         * 
156         */
157        public URLPatternProvider() {
158                this(new PunctuationPatternProvider(), new EntityPatternProvider());
159        }
160
161        @Override
162        public String patternString() {
163                return String.format("(%s)", Url);
164        }
165
166        @Override
167        public Pattern pattern() {
168                return Pattern.compile(patternString(), Pattern.UNICODE_CASE | Pattern.CASE_INSENSITIVE);
169        }
170
171}