001/** 002 * Copyright (c) 2011, The University of Southampton and the individual contributors. 003 * All rights reserved. 004 * 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * 008 * * Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * 011 * * Redistributions in binary form must reproduce the above copyright notice, 012 * this list of conditions and the following disclaimer in the documentation 013 * and/or other materials provided with the distribution. 014 * 015 * * Neither the name of the University of Southampton nor the names of its 016 * contributors may be used to endorse or promote products derived from this 017 * software without specific prior written permission. 018 * 019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 029 */ 030package org.openimaj.tools.web; 031 032import java.io.FileInputStream; 033import java.io.IOException; 034import java.io.InputStreamReader; 035import java.util.ArrayList; 036import java.util.HashMap; 037import java.util.HashSet; 038import java.util.List; 039import java.util.Map; 040import java.util.Set; 041 042import javax.xml.parsers.ParserConfigurationException; 043import javax.xml.parsers.SAXParserFactory; 044 045import org.xml.sax.Attributes; 046import org.xml.sax.InputSource; 047import org.xml.sax.SAXException; 048import org.xml.sax.XMLReader; 049import org.xml.sax.helpers.DefaultHandler; 050 051/** 052 * A script for reading the RDF dump from DMOZ and flattening it to 053 * CSV format 054 * 055 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk) 056 * 057 */ 058public class Dmoz2CSV { 059 static class Topic { 060 String name; 061 List<String> link = new ArrayList<String>(); 062 List<String> link1 = new ArrayList<String>(); 063 } 064 065 static class ExternalPage { 066 String about; 067 String title; 068 String description; 069 } 070 071 static class RecordsHandler extends DefaultHandler { 072 Set<Topic> topics = new HashSet<Topic>(); 073 Map<String, ExternalPage> resources = new HashMap<String, ExternalPage>(); 074 Topic currentTopic = null; 075 ExternalPage currentResource = null; 076 boolean isTitle = false; 077 boolean isDescription = false; 078 079 @Override 080 public void startElement(String ns, String localName, String qName, Attributes atts) { 081 if (qName.equals("Topic")) { 082 currentTopic = new Topic(); 083 currentTopic.name = atts.getValue("r:id"); 084 } else if (qName.equals("link")) { 085 currentTopic.link.add(atts.getValue("r:resource")); 086 } else if (qName.equals("link1")) { 087 currentTopic.link1.add(atts.getValue("r:resource")); 088 } else if (qName.equals("ExternalPage")) { 089 currentResource = new ExternalPage(); 090 currentResource.about = atts.getValue("about"); 091 } else if (qName.equals("d:Title")) { 092 isTitle = true; 093 } else if (qName.equals("d:Description")) { 094 isDescription = true; 095 } 096 } 097 098 @Override 099 public void characters(char[] chars, int offset, int length) { 100 if (isDescription) 101 currentResource.description = new String(chars, offset, length); 102 if (isTitle) 103 currentResource.title = new String(chars, offset, length); 104 105 isTitle = false; 106 isDescription = false; 107 } 108 109 @Override 110 public void endElement(String ns, String localName, String qName) { 111 if (qName.equals("Topic")) { 112 if (currentTopic.link.size() > 0 && currentTopic.link1.size() > 0) 113 topics.add(currentTopic); 114 } else if (qName.equals("ExternalPage")) { 115 resources.put(currentResource.about, currentResource); 116 } 117 } 118 } 119 120 /** 121 * Returns a field value escaped for special characters 122 * @param input A String to be evaluated 123 * @return A properly formatted String 124 */ 125 static String escape(String input) { 126 input = input.replaceAll("\n", " "); 127 input = input.replaceAll("\r", " "); 128 129 if (input.contains(",") || input.contains("\"") || (!input.trim().equals(input))) { 130 return '"' + input.replaceAll("\"", "\"\"") + '"'; 131 } else { 132 return input; 133 } 134 } 135 136 /** 137 * Appends a row of values to the output 138 * @param values A list of values 139 * @return this CsvBuffer instance 140 */ 141 static String toCSV(Object... values) { 142 List<String> escapedValues = new ArrayList<String>(); 143 for (Object o : values) escapedValues.add(escape(o.toString())); 144 145 StringBuilder content = new StringBuilder(); 146 for (int i=0; i<escapedValues.size(); i++) { 147 content.append(escapedValues.get(i)); 148 if (i < escapedValues.size()-1) content.append(","); 149 } 150 content.append("\r\n"); 151 return content.toString(); 152 } 153 154 /** 155 * Main method. 156 * @param args 157 * @throws SAXException 158 * @throws ParserConfigurationException 159 * @throws IOException 160 */ 161 public static void main(String[] args) throws SAXException, ParserConfigurationException, IOException { 162 RecordsHandler handler = new RecordsHandler(); 163 XMLReader reader = SAXParserFactory.newInstance().newSAXParser().getXMLReader(); 164 reader.setContentHandler(handler); 165 166 for (String file : args) { 167 InputSource is = new InputSource(new InputStreamReader(new FileInputStream(file), "UTF-8")); 168 169 is.setEncoding("UTF-8"); 170 reader.parse(is); 171 172 for (Topic top : handler.topics) { 173 for (String it : top.link1) { 174 System.out.println(toCSV(top.name, "LINK1", it, handler.resources.get(it).title, handler.resources.get(it).description)); 175 } 176 177 for (String it : top.link) { 178 System.out.println(toCSV(top.name, "LINK", it, handler.resources.get(it).title, handler.resources.get(it).description)); 179 } 180 } 181 } 182 } 183}