001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030package org.openimaj.data.dataset;
031
032import java.util.Arrays;
033import java.util.Comparator;
034import java.util.LinkedHashMap;
035import java.util.Map;
036import java.util.Set;
037
038import org.apache.commons.vfs2.FileObject;
039import org.apache.commons.vfs2.FileSystemException;
040import org.apache.commons.vfs2.FileSystemManager;
041import org.apache.commons.vfs2.FileType;
042import org.apache.commons.vfs2.FileTypeSelector;
043import org.apache.commons.vfs2.VFS;
044import org.openimaj.data.identity.Identifiable;
045import org.openimaj.io.InputStreamObjectReader;
046import org.openimaj.io.ObjectReader;
047
048/**
049 * A {@link GroupedDataset} of {@link VFSListDataset}s backed by directories of
050 * items (either locally or remotely), or items stored in a hierarchical
051 * structure within a compressed archive.
052 * <p>
053 * This implementation only supports a basic grouped dataset with {@link String}
054 * keys created from the names of directories, and {@link VFSListDataset} values
055 * from all the readable files within each directory.
056 * <p>
057 * As an example, this class can be used to easily create a
058 * {@link GroupedDataset} from a directory containing directories of images:
059 * 
060 * <pre>
061 * GroupedDataset&lt;String, VFSListDataset&lt;FImage&gt;, FImage&gt; dataset = new VFSGroupDataset&lt;FImage&gt;(
062 *              &quot;/path/to/directory/of/images&quot;,
063 *              ImageUtilities.FIMAGE_READER);
064 * </pre>
065 * 
066 * a zip file of directories of images:
067 * 
068 * <pre>
069 * GroupedDataset&lt;String, VFSListDataset&lt;FImage&gt;, FImage&gt; dataset = new VFSGroupDataset&lt;FImage&gt;(
070 *              &quot;zip:file:/path/to/images.zip&quot;, ImageUtilities.FIMAGE_READER);
071 * </pre>
072 * 
073 * or even a remote zip of directories of images hosted via http:
074 * 
075 * <pre>
076 * GroupedDataset&lt;String, VFSListDataset&lt;FImage&gt;, FImage&gt; dataset = new VFSGroupDataset&lt;FImage&gt;(
 *              &quot;zip:http://localhost/~jsh2/thumbnails.zip&quot;, ImageUtilities.FIMAGE_READER);
078 * </pre>
079 * 
080 * @author Jonathon Hare (jsh2@ecs.soton.ac.uk)
081 * 
082 * @param <INSTANCE>
083 *            The type of instance in the dataset
084 */
085public class VFSGroupDataset<INSTANCE>
086                extends
087                ReadableGroupDataset<String, VFSListDataset<INSTANCE>, INSTANCE, FileObject>
088                implements
089                Identifiable
090{
091        private Map<String, VFSListDataset<INSTANCE>> files = new LinkedHashMap<String, VFSListDataset<INSTANCE>>();
092        private Map<String, FileObject> directoryInfo = new LinkedHashMap<String, FileObject>();
093        private FileObject base;
094
095        /**
096         * Construct a grouped dataset from any virtual file system source (local
097         * directory, remote zip file, etc). Only the child directories under the
098         * given path will be used to create groups; the contents of any
099         * sub-directories will be merged automatically. Only directories with
100         * readable items as children will be included in the resultant dataset.
101         * 
102         * @see "http://commons.apache.org/proper/commons-vfs/filesystems.html"
103         * @param path
104         *            the file system path or uri. See the Apache Commons VFS2
105         *            documentation for all the details.
106         * @param reader
107         *            the {@link InputStreamObjectReader} that reads the data from
108         *            the VFS
109         * @throws FileSystemException
110         *             if an error occurs accessing the VFS
111         */
112        public VFSGroupDataset(final String path, final InputStreamObjectReader<INSTANCE> reader) throws FileSystemException {
113                this(path, new VFSListDataset.FileObjectISReader<INSTANCE>(reader));
114        }
115
116        /**
117         * Construct a grouped dataset from any virtual file system source (local
118         * directory, remote zip file, etc). Only the child directories under the
119         * given path will be used to create groups; the contents of any
120         * sub-directories will be merged automatically. Only directories with
121         * readable items as children will be included in the resultant dataset.
122         * 
123         * @see "http://commons.apache.org/proper/commons-vfs/filesystems.html"
124         * @param path
125         *            the file system path or uri. See the Apache Commons VFS2
126         *            documentation for all the details.
127         * @param reader
128         *            the {@link InputStreamObjectReader} that reads the data from
129         *            the VFS
130         * @throws FileSystemException
131         *             if an error occurs accessing the VFS
132         */
133        public VFSGroupDataset(final String path, final ObjectReader<INSTANCE, FileObject> reader) throws FileSystemException
134        {
135                super(reader);
136
137                final FileSystemManager fsManager = VFS.getManager();
138                base = fsManager.resolveFile(path);
139
140                final FileObject[] folders = base.findFiles(new FileTypeSelector(FileType.FOLDER));
141
142                Arrays.sort(folders, new Comparator<FileObject>() {
143                        @Override
144                        public int compare(FileObject o1, FileObject o2) {
145                                return o1.getName().toString().compareToIgnoreCase(o2.getName().toString());
146                        }
147                });
148
149                for (final FileObject folder : folders) {
150                        if (folder.equals(base))
151                                continue;
152
153                        directoryInfo.put(folder.getName().getBaseName(), folder);
154                        final VFSListDataset<INSTANCE> list = new VFSListDataset<INSTANCE>(folder.getName().getURI(), reader);
155
156                        if (list.size() > 0)
157                                files.put(folder.getName().getBaseName(), list);
158                }
159        }
160
161        /**
162         * Get the underlying file descriptors of the directories that form the
163         * groups of the dataset
164         * 
165         * @return the array of file objects
166         */
167        public Map<String, FileObject> getGroupDirectories() {
168                return directoryInfo;
169        }
170
171        /**
172         * Get the underlying file descriptor for a particular group in the dataset.
173         * 
174         * @param key
175         *            key of the group
176         * 
177         * @return the file object corresponding to the instance
178         */
179        public FileObject getFileObject(String key) {
180                return directoryInfo.get(key);
181        }
182
183        @Override
184        public String toString() {
185                return String.format("%s(%d groups with a total of %d instances)", this.getClass().getName(), this.size(),
186                                this.numInstances());
187        }
188
189        @Override
190        public Set<Entry<String, VFSListDataset<INSTANCE>>> entrySet() {
191                return files.entrySet();
192        }
193
194        @Override
195        public String getID() {
196                return base.getName().getBaseName();
197        }
198}