001/**
002 * Copyright (c) 2011, The University of Southampton and the individual contributors.
003 * All rights reserved.
004 *
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 *
008 *   *  Redistributions of source code must retain the above copyright notice,
009 *      this list of conditions and the following disclaimer.
010 *
011 *   *  Redistributions in binary form must reproduce the above copyright notice,
012 *      this list of conditions and the following disclaimer in the documentation
013 *      and/or other materials provided with the distribution.
014 *
015 *   *  Neither the name of the University of Southampton nor the names of its
016 *      contributors may be used to endorse or promote products derived from this
017 *      software without specific prior written permission.
018 *
019 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
020 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
021 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
022 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
023 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
024 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
026 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
027 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
028 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
029 */
030/**
031 *
032 */
033package org.openimaj.image.annotation.evaluation.datasets;
034
035import java.io.BufferedReader;
036import java.io.File;
037import java.io.FileNotFoundException;
038import java.io.FileReader;
039import java.io.IOException;
040import java.net.MalformedURLException;
041import java.net.URL;
042import java.util.ArrayList;
043import java.util.Arrays;
044import java.util.HashMap;
045import java.util.List;
046import java.util.Map;
047
048import org.openimaj.citation.annotation.Reference;
049import org.openimaj.citation.annotation.ReferenceType;
050import org.openimaj.data.dataset.GroupedDataset;
051import org.openimaj.data.dataset.ListBackedDataset;
052import org.openimaj.data.dataset.ListDataset;
053import org.openimaj.data.dataset.MapBackedDataset;
054import org.openimaj.experiment.annotations.DatasetDescription;
055import org.openimaj.experiment.evaluation.agreement.CohensKappaInterraterAgreement;
056import org.openimaj.experiment.evaluation.agreement.MajorityVoting;
057import org.openimaj.ml.annotation.ScoredAnnotation;
058import org.openimaj.util.iterator.TextLineIterable;
059import org.openimaj.util.pair.ObjectFloatPair;
060import org.openimaj.web.flickr.FlickrImage;
061
062/**
063 * A wrapper dataset for the MMSys2013 Fashion-Focussed Creative Commons social
064 * dataset (Loni, et.al).
065 * 
066 * TODO: Need to add the citation here. From
067 * http://dl.acm.org/citation.cfm?id=2483984
068 * 
069 * @author David Dupplaw (dpd@ecs.soton.ac.uk)
070 * @created 12 Aug 2013
071 * @version $Author$, $Revision$, $Date$
072 */
073@Reference(
074                type = ReferenceType.Inproceedings,
075                author = { "Loni, Babak", "Menendez, Maria", "Georgescu, Mihai", "Galli, Luca", "Massari, Claudio", "Altingovde, Ismail Sengor", "Martinenghi, Davide", "Melenhorst, Mark", "Vliegendhart, Raynor", "Larson, Martha" },
076                title = "Fashion-focused creative commons social dataset",
077                year = "2013",
078                booktitle = "Proceedings of the 4th ACM Multimedia Systems Conference",
079                pages = { "72", "", "77" },
080                url = "http://doi.acm.org/10.1145/2483977.2483984",
081                publisher = "ACM",
082                series = "MMSys '13",
083                customData = {
084                                "isbn", "978-1-4503-1894-5",
085                                "location", "Oslo, Norway",
086                                "numpages", "6",
087                                "doi", "10.1145/2483977.2483984",
088                                "acmid", "2483984",
089                                "address", "New York, NY, USA",
090                                "keywords", "crowdsourcing, dataset, fashion, multimedia content analysis"
091                })
092@DatasetDescription(
093                name = "Fashion-Focused Creative Commons Social Dataset",
094                description = "a fashion-focused Creative Commons dataset, which is "
095                                + "designed to contain a mix of general images as well as a large "
096                                + "component of images that are focused on fashion (i.e., relevant "
097                                + "to particular clothing items or fashion accessories)",
098                creator = "Babak Loni, Maria Menendez, Mihai Georgescu, Luca Galli, "
099                                + "Claudio Massari, Ismail Sengor Altingovde, Davide Martinenghi, "
100                                + "Mark Melenhorst, Raynor Vliegendhart, Martha Larson",
101                downloadUrls = {
102                                "http://skuld.cs.umass.edu/traces/mmsys/2013/fashion/Fashion Dataset.zip" })
103public class MMSys2013
104{
105        /**
106         * Allowable types of answer for each question.
107         * 
108         * @author David Dupplaw (dpd@ecs.soton.ac.uk)
109         * @created 12 Aug 2013
110         * @version $Author$, $Revision$, $Date$
111         */
112        public static enum QuestionResponse
113        {
114                /** No */
115                NO,
116                /** Yes */
117                YES,
118                /** Not sure */
119                NOT_SURE,
120                /** Question was unanswered */
121                UNANSWERED;
122        }
123
124        /**
125         * A response to a HIT
126         * 
127         * @author David Dupplaw (dpd@ecs.soton.ac.uk)
128         * @created 12 Aug 2013
129         * @version $Author$, $Revision$, $Date$
130         */
131        public static class Response
132        {
133                /** Whether the image contains a depicition of the category subject */
134                public QuestionResponse containsCategoryDepiction;
135
136                /** Whether the image is in the correct category */
137                public QuestionResponse isInCorrectCategory;
138
139                /** How familiar is the responder with the category */
140                public int familiarityWithCategory;
141
142                /**
143                 * Constructor
144                 * 
145                 * @param r1
146                 *            contains category depiction
147                 * @param r2
148                 *            is in correct category
149                 * @param familiarity
150                 *            familiarity with subject
151                 */
152                public Response(final QuestionResponse r1, final QuestionResponse r2, final int familiarity)
153                {
154                        this.containsCategoryDepiction = r1;
155                        this.isInCorrectCategory = r2;
156                        this.familiarityWithCategory = familiarity;
157                }
158
159                @Override
160                public String toString()
161                {
162                        return "{" + this.containsCategoryDepiction + "," +
163                                        this.isInCorrectCategory + "," + this.familiarityWithCategory + "}";
164                }
165        }
166
167        /**
168         * A record in the Fashion 10,000 dataset.
169         * 
170         * @author David Dupplaw (dpd@ecs.soton.ac.uk)
171         * @created 12 Aug 2013
172         * @version $Author$, $Revision$, $Date$
173         */
174        protected static class Record
175        {
176                /** The Flickr Photo */
177                public FlickrImage image;
178
179                /** The category in which the image was found */
180                public String category;
181
182                /** A set of responses for this image */
183                public Response[] annotations;
184
185                @Override
186                public String toString()
187                {
188                        return this.image.getId() + ":" + this.category + "[" +
189                                        Arrays.toString(this.annotations) + "]";
190                }
191        }
192
193        protected String baseLocation =
194                        "/data/degas/mediaeval/mediaeval-crowdsourcing/MMSys2013/";
195
196        protected String expertDataFile =
197                        "Annotations/Annotation_PerImage_Trusted.csv";
198
199        protected String nonExpertDataFile =
200                        "Annotations/Annotation_PerImage_NonExperts.csv";
201
202        protected String groundTruthFile =
203                        "Annotations/GroundTruth.csv";
204
205        protected String queriesFile =
206                        "Metadata/queries.csv";
207
208        /**
209         * Returns the ground truth set.
210         * 
211         * @return The grouped dataset
212         */
213        public GroupedDataset<String, GroupedDataset<String, ListDataset<Response>, Response>, Response>
214                        getGroundTruth()
215        {
216                final GroupedDataset<String, GroupedDataset<String, ListDataset<Response>, Response>, Response> results = new MapBackedDataset<String, GroupedDataset<
217                                String, ListDataset<Response>, Response>, MMSys2013.Response>();
218
219                // The ground truth dataset doesn't contain categories, sadly - just
220                // the filename and the results. So we need to go and get the categories
221                // for each of the images first. We'll do that from the queries file.
222                final HashMap<Long, String> categoryCache = new HashMap<Long, String>();
223                boolean firstLine = true;
224                for (final String line : new TextLineIterable(new File(this.baseLocation, this.queriesFile)))
225                {
226                        if (!firstLine)
227                        {
228                                final String[] parts = line.split(",", -1);
229
230                                // The substrings remove the quotes either side of the value
231                                categoryCache.put(
232                                                Long.parseLong(parts[3].substring(1).substring(0, parts[3].length() - 2)),
233                                                parts[0].substring(1).substring(0, parts[0].length() - 2));
234                        }
235
236                        firstLine = false;
237                }
238
239                firstLine = true;
240                for (final String line : new TextLineIterable(new File(this.baseLocation, this.groundTruthFile)))
241                {
242                        if (!firstLine)
243                        {
244                                try
245                                {
246                                        final String[] parts = line.split(",", -1);
247
248                                        // Get the category for the given image.
249                                        final String url = parts[0];
250                                        final FlickrImage fi = FlickrImage.create(new URL(url));
251                                        final String cat = categoryCache.get(fi.getId());
252
253                                        // Get the category list
254                                        GroupedDataset<String, ListDataset<Response>, Response> gds = results.get(cat);
255
256                                        // Check whether we already have a dataset for
257                                        // the image in this category
258                                        if (gds == null)
259                                        {
260                                                // Create a new dataset for images in this category
261                                                gds = new MapBackedDataset<String,
262                                                                ListDataset<Response>, Response>();
263                                                results.put(cat, gds);
264                                        }
265
266                                        // See if we have any responses for this image already
267                                        ListDataset<Response> ids = gds.get(url);
268
269                                        // If not, create the dataset for this image
270                                        if (ids == null)
271                                        {
272                                                ids = new ListBackedDataset<Response>();
273                                                gds.put(url, ids);
274                                        }
275
276                                        // Get the response for this image and add it
277                                        final Response rr = new Response(
278                                                        this.parseQR(parts[1]),
279                                                        this.parseQR(parts[2]), 1);
280                                        ids.add(rr);
281                                } catch (final MalformedURLException e)
282                                {
283                                        e.printStackTrace();
284                                }
285                        }
286
287                        firstLine = false;
288                }
289
290                return results;
291        }
292
293        /**
294         * Returns the results from the non-expert turkers.
295         * 
296         * @return The grouped dataset
297         */
298        public GroupedDataset<String, GroupedDataset<String,
299                        ListDataset<Response>, Response>, Response> getNonExpertData()
300        {
301                return this.parseMetadata(new File(this.baseLocation, this.nonExpertDataFile));
302        }
303
304        /**
305         * Returns the results from the expert turkers.
306         * 
307         * @return The grouped dataset
308         */
309        public GroupedDataset<String, GroupedDataset<String,
310                        ListDataset<Response>, Response>, Response> getExpertData()
311        {
312                return this.parseMetadata(new File(this.baseLocation, this.expertDataFile));
313        }
314
315        /**
316         * @param metadataFile
317         * @return A grouped dataset
318         */
319        public GroupedDataset<String, GroupedDataset<String,
320                        ListDataset<Response>, Response>, Response> parseMetadata(
321                                        final File metadataFile)
322        {
323                final GroupedDataset<String, GroupedDataset<String, ListDataset<Response>, Response>, Response> results = new MapBackedDataset<String, GroupedDataset<
324                                String, ListDataset<Response>, Response>, MMSys2013.Response>();
325
326                BufferedReader br = null;
327                try
328                {
329                        br = new BufferedReader(new FileReader(metadataFile));
330                        String line;
331                        boolean firstLine = true;
332                        int count = 1;
333                        while ((line = br.readLine()) != null)
334                        {
335                                if (!firstLine)
336                                {
337                                        try
338                                        {
339                                                final String[] parts = line.split(",", -1);
340
341                                                final Response[] r = new Response[3];
342                                                r[0] = new Response(this.parseQR(parts[3]),
343                                                                this.parseQR(parts[6]), this.parseF(parts[9]));
344                                                r[1] = new Response(this.parseQR(parts[4]),
345                                                                this.parseQR(parts[7]), this.parseF(parts[10]));
346                                                r[2] = new Response(this.parseQR(parts[5]),
347                                                                this.parseQR(parts[8]), parts.length > 11 ?
348                                                                                this.parseF(parts[11]) : -1);
349
350                                                GroupedDataset<String, ListDataset<Response>, Response> gds = results.get(parts[2]);
351
352                                                // Check whether we already have a dataset for
353                                                // the image in this category
354                                                if (gds == null)
355                                                {
356                                                        // Create a new dataset for images in this category
357                                                        gds = new MapBackedDataset<String,
358                                                                        ListDataset<Response>, Response>();
359                                                        results.put(parts[2], gds);
360                                                }
361
362                                                // See if we have any responses for this image already
363                                                ListDataset<Response> ids = gds.get(parts[1]);
364
365                                                // If not, create the dataset for this image
366                                                if (ids == null)
367                                                {
368                                                        ids = new ListBackedDataset<Response>();
369                                                        gds.put(parts[1], ids);
370                                                }
371
372                                                // Add the each response for this image
373                                                for (final Response rr : r)
374                                                        ids.add(rr);
375                                        } catch (final Exception e)
376                                        {
377                                                System.err.println("Error on line " + count);
378                                                e.printStackTrace();
379                                        }
380                                }
381                                firstLine = false;
382                                count++;
383                        }
384                        br.close();
385                } catch (final FileNotFoundException e)
386                {
387                        e.printStackTrace();
388                } catch (final IOException e)
389                {
390                        e.printStackTrace();
391                } finally
392                {
393                        if (br != null)
394                                try
395                                {
396                                        br.close();
397                                } catch (final IOException e)
398                                {
399                                        e.printStackTrace();
400                                }
401                }
402
403                return results;
404        }
405
406        /**
407         * Given a string returns a question response.
408         * 
409         * @param qr
410         *            The string
411         * @return A {@link QuestionResponse}
412         */
413        protected QuestionResponse parseQR(final String qr)
414        {
415                if (qr.toLowerCase().equals("yes"))
416                        return QuestionResponse.YES;
417                if (qr.toLowerCase().equals("no"))
418                        return QuestionResponse.NO;
419                if (qr.toLowerCase().equals("notsure"))
420                        return QuestionResponse.NOT_SURE;
421                return QuestionResponse.UNANSWERED;
422        }
423
424        protected int parseF(final String f)
425        {
426                try
427                {
428                        return Integer.parseInt(f);
429                } catch (final NumberFormatException e)
430                {
431                        return -1;
432                }
433        }
434
435        /**
436         * For a given {@link GroupedDataset} that represents the results from a
437         * single category, returns a list of scored annotations for each group, for
438         * question 1 (contains depication of category).
439         * 
440         * @param data
441         *            The data
442         * @return a list of {@link ScoredAnnotation} linked to image URL
443         */
444        public static Map<String, List<ScoredAnnotation<QuestionResponse>>>
445                        getAnnotationsQ1(
446                                        final GroupedDataset<String, ListDataset<Response>, Response> data)
447        {
448                final Map<String, List<ScoredAnnotation<QuestionResponse>>> r =
449                                new HashMap<String, List<ScoredAnnotation<QuestionResponse>>>();
450
451                // Loop through the images in this dataset
452                for (final String imgUrl : data.getGroups())
453                {
454                        final ListDataset<Response> l = data.get(imgUrl);
455
456                        final List<ScoredAnnotation<QuestionResponse>> l2 =
457                                        new ArrayList<ScoredAnnotation<QuestionResponse>>();
458                        r.put(imgUrl, l2);
459
460                        // Loop through the responses for this image
461                        for (final Response rr : l)
462                                l2.add(new ScoredAnnotation<QuestionResponse>(
463                                                rr.containsCategoryDepiction, rr.familiarityWithCategory));
464                }
465
466                return r;
467        }
468
469        /**
470         * For a given {@link GroupedDataset} that represents the results from a
471         * single category, returns a list of scored annotations for each group, for
472         * question 2 (is in category).
473         * 
474         * @param data
475         *            The group name to retrieve
476         * @return a list of {@link ScoredAnnotation} linked to image URL
477         */
478        public static Map<String, List<ScoredAnnotation<QuestionResponse>>>
479                        getAnnotationsQ2(
480                                        final GroupedDataset<String, ListDataset<Response>, Response> data)
481        {
482                final Map<String, List<ScoredAnnotation<QuestionResponse>>> r =
483                                new HashMap<String, List<ScoredAnnotation<QuestionResponse>>>();
484
485                // Loop through the images in this dataset
486                for (final String imgUrl : data.getGroups())
487                {
488                        final ListDataset<Response> l = data.get(imgUrl);
489
490                        final List<ScoredAnnotation<QuestionResponse>> l2 =
491                                        new ArrayList<ScoredAnnotation<QuestionResponse>>();
492                        r.put(imgUrl, l2);
493
494                        // Loop through the responses for this image
495                        for (final Response rr : l)
496                                l2.add(new ScoredAnnotation<QuestionResponse>(
497                                                rr.isInCorrectCategory, rr.familiarityWithCategory));
498                }
499
500                return r;
501        }
502
503        /**
504         * @param args
505         */
506        public static void main(final String[] args)
507        {
508                System.out.println();
509
510                // Expert annotations for Q1 and Q2
511                final Map<String, List<ScoredAnnotation<QuestionResponse>>> q1r1 =
512                                MMSys2013.getAnnotationsQ1(new MMSys2013().getExpertData().get("Cowboy hat"));
513                final Map<String, List<ScoredAnnotation<QuestionResponse>>> q2r1 =
514                                MMSys2013.getAnnotationsQ2(new MMSys2013().getExpertData().get("Cowboy hat"));
515
516                // Non expert annotations for Q1 and Q2
517                final Map<String, List<ScoredAnnotation<QuestionResponse>>> q1r2 =
518                                MMSys2013.getAnnotationsQ1(new MMSys2013().getNonExpertData().get("Cowboy hat"));
519                final Map<String, List<ScoredAnnotation<QuestionResponse>>> q2r2 =
520                                MMSys2013.getAnnotationsQ2(new MMSys2013().getNonExpertData().get("Cowboy hat"));
521
522                // Ground truth data for Q1 and Q2
523                final Map<String, List<ScoredAnnotation<QuestionResponse>>> q1gt =
524                                MMSys2013.getAnnotationsQ1(new MMSys2013().getGroundTruth().get("Cowboy hat"));
525                final Map<String, List<ScoredAnnotation<QuestionResponse>>> q2gt =
526                                MMSys2013.getAnnotationsQ2(new MMSys2013().getGroundTruth().get("Cowboy hat"));
527
528                // Majority voting on the data sets
529                final Map<String, ObjectFloatPair<ScoredAnnotation<QuestionResponse>>> q1r1mv =
530                                MajorityVoting.calculateBasicMajorityVote(q1r1);
531                final Map<String, ObjectFloatPair<ScoredAnnotation<QuestionResponse>>> q2r1mv =
532                                MajorityVoting.calculateBasicMajorityVote(q2r1);
533                final Map<String, ObjectFloatPair<ScoredAnnotation<QuestionResponse>>> q1r2mv =
534                                MajorityVoting.calculateBasicMajorityVote(q1r2);
535                final Map<String, ObjectFloatPair<ScoredAnnotation<QuestionResponse>>> q2r2mv =
536                                MajorityVoting.calculateBasicMajorityVote(q2r2);
537                final Map<String, ObjectFloatPair<ScoredAnnotation<QuestionResponse>>> q1gtmv =
538                                MajorityVoting.calculateBasicMajorityVote(q1gt);
539                final Map<String, ObjectFloatPair<ScoredAnnotation<QuestionResponse>>> q2gtmv =
540                                MajorityVoting.calculateBasicMajorityVote(q2gt);
541
542                // Agreement output
543                System.out.println("Question 1 agreement between raters 1 and 2: " +
544                                CohensKappaInterraterAgreement.calculate(q1r1mv, q1r2mv));
545                System.out.println("Question 1 agreement between rater 1 and GT: " +
546                                CohensKappaInterraterAgreement.calculate(q1r1mv, q1gtmv));
547                System.out.println("Question 1 agreement between rater 2 and GT: " +
548                                CohensKappaInterraterAgreement.calculate(q1r2mv, q1gtmv));
549
550                System.out.println("Question 2 agreement between raters 1 and 2: " +
551                                CohensKappaInterraterAgreement.calculate(q2r1mv, q2r2mv));
552                System.out.println("Question 2 agreement between rater 1 and GT: " +
553                                CohensKappaInterraterAgreement.calculate(q2r1mv, q2gtmv));
554                System.out.println("Question 2 agreement between rater 2 and GT: " +
555                                CohensKappaInterraterAgreement.calculate(q2r2mv, q2gtmv));
556        }
557}