001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.tools.web;
031
032import java.io.FileInputStream;
033import java.io.IOException;
034import java.io.InputStreamReader;
035import java.util.ArrayList;
036import java.util.HashMap;
037import java.util.HashSet;
038import java.util.List;
039import java.util.Map;
040import java.util.Set;
041
042import javax.xml.parsers.ParserConfigurationException;
043import javax.xml.parsers.SAXParserFactory;
044
045import org.xml.sax.Attributes;
046import org.xml.sax.InputSource;
047import org.xml.sax.SAXException;
048import org.xml.sax.XMLReader;
049import org.xml.sax.helpers.DefaultHandler;
050
/**
 * A script for reading the RDF dump from DMOZ and flattening it to
 * CSV format.
 * <p>
 * Every file named on the command line is parsed with a SAX parser; once all
 * files have been read, one CSV row is written to standard output for each
 * <code>link1</code> and <code>link</code> entry of every retained topic.
 *
 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
 *
 */
public class Dmoz2CSV {
    /**
     * A <code>Topic</code> element together with the resources it links to.
     */
    static class Topic {
        /** Value of the topic's <code>r:id</code> attribute. */
        String name;
        /** <code>r:resource</code> values of the nested <code>link</code> elements. */
        List<String> link = new ArrayList<String>();
        /** <code>r:resource</code> values of the nested <code>link1</code> elements. */
        List<String> link1 = new ArrayList<String>();
    }

    /**
     * An <code>ExternalPage</code> element with its title and description.
     */
    static class ExternalPage {
        /** Value of the <code>about</code> attribute; used as the lookup key. */
        String about;
        /** Text of the nested <code>d:Title</code> element, or null if absent. */
        String title;
        /** Text of the nested <code>d:Description</code> element, or null if absent. */
        String description;
    }

    /**
     * SAX handler that collects {@link Topic}s and {@link ExternalPage}s.
     * State accumulates across successive parses with the same instance.
     */
    static class RecordsHandler extends DefaultHandler {
        // Insertion-ordered so that rows come out in document order on every
        // run; Topic uses identity hashing, which made a plain HashSet's
        // iteration order vary between runs. Fully qualified to avoid
        // touching the file's import block.
        Set<Topic> topics = new java.util.LinkedHashSet<Topic>();
        Map<String, ExternalPage> resources = new HashMap<String, ExternalPage>();
        Topic currentTopic = null;
        ExternalPage currentResource = null;
        boolean isTitle = false;
        boolean isDescription = false;

        /** Text collected so far for the currently open title/description element. */
        private final StringBuilder text = new StringBuilder();

        /**
         * Opens a topic or external page, records a link on the open topic,
         * or starts collecting text for a title/description.
         *
         * @param ns        namespace URI (unused)
         * @param localName local name (unused)
         * @param qName     qualified element name used for dispatch
         * @param atts      attributes of the element
         */
        @Override
        public void startElement(String ns, String localName, String qName, Attributes atts) {
            if (qName.equals("Topic")) {
                currentTopic = new Topic();
                currentTopic.name = atts.getValue("r:id");
            } else if (qName.equals("link")) {
                // A link outside any Topic has nothing to attach to; skip it
                // rather than fail with a NullPointerException.
                if (currentTopic != null) {
                    currentTopic.link.add(atts.getValue("r:resource"));
                }
            } else if (qName.equals("link1")) {
                if (currentTopic != null) {
                    currentTopic.link1.add(atts.getValue("r:resource"));
                }
            } else if (qName.equals("ExternalPage")) {
                currentResource = new ExternalPage();
                currentResource.about = atts.getValue("about");
            } else if (qName.equals("d:Title")) {
                isTitle = true;
                text.setLength(0);
            } else if (qName.equals("d:Description")) {
                isDescription = true;
                text.setLength(0);
            }
        }

        /**
         * Buffers character data while a title or description is open.
         * A SAX parser may deliver one text node in several chunks, so the
         * value is only assigned once the element is closed.
         *
         * @param chars  character buffer supplied by the parser
         * @param offset start of the chunk within <code>chars</code>
         * @param length number of characters in the chunk
         */
        @Override
        public void characters(char[] chars, int offset, int length) {
            if (isTitle || isDescription) {
                text.append(chars, offset, length);
            }
        }

        /**
         * Closes the current element: stores buffered title/description text
         * on the open external page, registers a finished external page, or
         * retains a finished topic if it has at least one <code>link</code>
         * and at least one <code>link1</code>.
         *
         * @param ns        namespace URI (unused)
         * @param localName local name (unused)
         * @param qName     qualified element name used for dispatch
         */
        @Override
        public void endElement(String ns, String localName, String qName) {
            if (qName.equals("Topic")) {
                if (currentTopic != null && !currentTopic.link.isEmpty() && !currentTopic.link1.isEmpty()) {
                    topics.add(currentTopic);
                }
                currentTopic = null;
            } else if (qName.equals("ExternalPage")) {
                if (currentResource != null) {
                    resources.put(currentResource.about, currentResource);
                }
                currentResource = null;
            } else if (qName.equals("d:Title")) {
                // Titles that appear outside an ExternalPage are ignored.
                if (isTitle && currentResource != null) {
                    currentResource.title = text.toString();
                }
                isTitle = false;
            } else if (qName.equals("d:Description")) {
                if (isDescription && currentResource != null) {
                    currentResource.description = text.toString();
                }
                isDescription = false;
            }
        }
    }

    /**
     * Returns a field value escaped for special characters. Line feeds and
     * carriage returns are each replaced by a space; a value that contains a
     * comma or a double quote, or that has leading/trailing whitespace, is
     * wrapped in double quotes with embedded quotes doubled.
     *
     * @param input A String to be evaluated; null is treated as empty
     * @return A properly formatted String
     */
    static String escape(String input) {
        if (input == null) {
            return "";
        }

        // Literal replacement; no regular expression is needed here.
        String value = input.replace('\n', ' ').replace('\r', ' ');

        boolean needsQuoting = value.contains(",") || value.contains("\"") || !value.trim().equals(value);
        if (!needsQuoting) {
            return value;
        }
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }

    /**
     * Formats a row of values as a single CSV line.
     *
     * @param values The values of the row; a null value yields an empty field
     * @return the escaped values joined by commas and terminated by CRLF
     */
    static String toCSV(Object... values) {
        StringBuilder content = new StringBuilder();
        for (int i = 0; i < values.length; i++) {
            if (i > 0) {
                content.append(",");
            }
            Object value = values[i];
            content.append(escape(value == null ? null : value.toString()));
        }
        content.append("\r\n");
        return content.toString();
    }

    /**
     * Writes one CSV row per URL to standard output.
     *
     * @param topic     the topic the URLs belong to
     * @param type      the value of the row's type column
     * @param urls      the resource URLs to write
     * @param resources parsed external pages keyed by URL
     */
    private static void printLinks(Topic topic, String type, List<String> urls, Map<String, ExternalPage> resources) {
        for (String url : urls) {
            // A link may reference a page that was never described; emit the
            // row with empty title/description instead of failing.
            ExternalPage page = resources.get(url);
            String title = page == null ? null : page.title;
            String description = page == null ? null : page.description;

            // toCSV already terminates the row with CRLF, so use print rather
            // than println to avoid an empty line after every record.
            System.out.print(toCSV(topic.name, type, url, title, description));
        }
    }

    /**
     * Main method. Parses every file given on the command line as UTF-8 XML
     * and then writes the collected links as CSV to standard output.
     *
     * @param args paths of the RDF files to read
     * @throws SAXException if a file cannot be parsed
     * @throws ParserConfigurationException if no SAX parser can be created
     * @throws IOException if a file cannot be read
     */
    public static void main(String[] args) throws SAXException, ParserConfigurationException, IOException {
        RecordsHandler handler = new RecordsHandler();
        XMLReader reader = SAXParserFactory.newInstance().newSAXParser().getXMLReader();
        reader.setContentHandler(handler);

        for (String file : args) {
            FileInputStream stream = new FileInputStream(file);
            try {
                InputSource is = new InputSource(new InputStreamReader(stream, "UTF-8"));
                is.setEncoding("UTF-8");
                reader.parse(is);
            } finally {
                stream.close();
            }
        }

        // Rows are written once, after every file has been read: the handler
        // accumulates state across files, so writing inside the loop above
        // would repeat the topics of earlier files.
        for (Topic top : handler.topics) {
            printLinks(top, "LINK1", top.link1, handler.resources);
            printLinks(top, "LINK", top.link, handler.resources);
        }
    }
}