Merge pull request #61 from SuperKogito/add-more-datasets
Add more datasets
SuperKogito authored Jun 23, 2024
2 parents d636434 + 7459c59 commit 54706b7
Showing 2 changed files with 255 additions and 47 deletions.
8 changes: 3 additions & 5 deletions CONTRIBUTING.md
@@ -8,22 +8,20 @@ please feel free to add it.

* Read the criteria below
* Add one dataset per Pull Request.
* Add the dataset and its information as a json entry.
* Make sure to maintain the json chronological order.
* Keep descriptions in the content and emotion fields concise.
* Check your spelling and grammar.
* Send a Pull Request with a short explanation of why it belongs to this list.
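An entry follows the field layout used throughout `src/ser-datasets.json`; the field names below match the existing file, while the values are placeholders to replace with your dataset's information (use `--` when a field is unknown):

```json
"MyDataset": {
  "Year": 2024,
  "Content": "Short description of the recordings.",
  "Emotions": "Emotion labels or dimensions covered.",
  "Format": "Audio",
  "Size": "--",
  "Language": "--",
  "Paper": "--",
  "Access": "Open",
  "License": "--",
  "Dataset-link": "--",
  "Paper-link": "--",
  "License-link": "--"
}
```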

## Criteria

* The dataset must be related to the field of *Speech Emotion Recognition*.
* The dataset should not be a duplicate.
* The dataset should not be provided in an active PR.
* The dataset should be available for researchers for free.
* The information about the dataset must be accessible for verification.

## How to contribute
First go to `src/` using `cd src`. Then add the dictionary / partial JSON data of the contributed dataset to `src/ser-datasets.json`.
Make sure the JSON is valid, then run `python generate_files.py` to update the reStructuredText file, the CSV file, and the README.
That's it, congrats, and thank you for your contribution! Now open a PR with your changes. I will review it and then publish the results :)
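Before opening the PR, a quick sanity check can confirm that the file parses as valid JSON and that the reverse-chronological order (newest `Year` first) is preserved. The helper below is a hypothetical sketch, not part of the repository:

```python
import json

def years_newest_first(entries: dict) -> bool:
    """Check that dataset entries are ordered by Year, newest first."""
    years = [entry["Year"] for entry in entries.values()]
    return all(a >= b for a, b in zip(years, years[1:]))

# A minimal illustrative slice of the data; in practice, load the real
# file instead, e.g. entries = json.load(open("ser-datasets.json")).
sample = json.loads(
    '{"nEmo": {"Year": 2024}, "EMNS": {"Year": 2023}, "Dusha": {"Year": 2022}}'
)

print(years_newest_first(sample))  # True
```

`json.loads` preserves key order in Python 3.7+, so iterating over the parsed dict reflects the order of entries in the file.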
294 changes: 252 additions & 42 deletions src/ser-datasets.json
@@ -1,4 +1,60 @@
{
"nEmo": {
"Year": 2024,
"Content": "3 hours of samples recorded with the participation of nine actors.",
"Emotions": "6 emotions: anger, fear, happiness, sadness, surprised, and neutral.",
"Format": "Audio",
"Size": "0.434 GB",
"Language": "Polish",
"Paper": "nEMO: Dataset of Emotional Speech in Polish",
"Access": "Open",
"License": "CC BY 4.0",
"Dataset-link": "https://github.com/amu-cai/nEMO",
"Paper-link": "https://arxiv.org/abs/2404.06292",
"License-link": "https://creativecommons.org/licenses/by/4.0/"
},
"EMOVOME": {
"Year": 2024,
"Content": "999 spontaneous voice messages from 100 Spanish speakers, collected from real conversations on a messaging app.",
"Emotions": "Valence & arrousal dimensions and 7 emotions: happiness, disgust, anger, surprise, fear, sadness, and neutral.",
"Format": "Audio",
"Size": "--",
"Language": "Spanish",
"Paper": "EMOVOME Database: Advancing Emotion Recognition in Speech Beyond Staged Scenarios",
"Access": "Partially open",
"License": "CC BY 4.0",
"Dataset-link": "https://zenodo.org/records/10694370",
"Paper-link": "https://arxiv.org/abs/2403.02167",
"License-link": "https://creativecommons.org/licenses/by/4.0/"
},
"EMNS": {
"Year": 2023,
"Content": "1206 high quality labeled utterances by one female speaker (2-3 hours).",
"Emotions": "Anger, excitement, disgust, happiness, surprise, sadness, and neutral (plus sarcasm)",
"Format": "Audio",
"Size": "0.042 GB",
"Language": "English (British)",
"Paper": "EMNS /Imz/ Corpus: An emotive single-speaker dataset for narrative storytelling in games, television and graphic novels",
"Access": "Open",
"License": "Apache 2.0",
"Dataset-link": "http://www.openslr.org/136/",
"Paper-link": "https://arxiv.org/abs/2305.13137",
"License-link": "https://apache.org/licenses/LICENSE-2.0"
},
"CAVES": {
"Year": 2023,
"Content": "Full hd visual recordings of 10 native cantonese speakers uttering 50 sentences.",
"Emotions": "Anger, happiness, sadness, surprise, fear, disgust and neutral",
"Format": "Audio",
"Size": "47 GB",
"Language": "Chinese (cantonese)",
"Paper": "A Cantonese Audio-Visual Emotional Speech (CAVES) dataset",
"Access": "Open",
"License": "Available for research purposes only",
"Dataset-link": "https://rds.westernsydney.edu.au/Institutes/MARCS/2024/Christopher_Davis/",
"Paper-link": "https://link.springer.com/article/10.3758/s13428-023-02270-7",
"License-link": "--"
},
"BANSpEmo": {
"Year": 2023,
"Content": "792 utterance recordings from 22 unprofessional speakers (11 males and 11 females) of six basic emotional reactions of two sets of sentences.",
@@ -68,6 +124,20 @@
"Dataset-link": "https://data.mendeley.com/datasets/t9h6p943xy/5",
"Paper-link": "https://www.sciencedirect.com/science/article/pii/S235234092200302X",
"License-link": "https://creativecommons.org/licenses/by/4.0/"
},
"B-SER": {
"Year": 2022,
"Content": "1224 speech-audio recordings by 34 non-professional participating actors (17 male and 17 female) from diverse age groups between 19 and 47 years.",
"Emotions": "angry, happy, sad and surprise",
"Format": "Audio",
"Size": "0.363 GB",
"Language": "Bangla",
"Paper": "--",
"Access": "Open",
"License": "CC BY 4.0",
"Dataset-link": "https://data.mendeley.com/datasets/t9h6p943xy/3",
"Paper-link": "--",
"License-link": "https://creativecommons.org/licenses/by/4.0/"
},
"Kannada": {
"Year": 2022,
@@ -83,6 +153,104 @@
"Paper-link": "--",
"License-link": "https://creativecommons.org/licenses/by/4.0/"
},
"Quechua-SER": {
"Year": 2022,
"Content": "12420 audio recordings (~15 hours) and their transcriptions by 7 native speakers.",
"Emotions": "Emotional labels using dimensions: valence, arousal, and dominance.",
"Format": "Audio",
"Size": "3.53 GB",
"Language": "Quechua Collao",
"Paper": "A speech corpus of Quechua Collao for automatic dimensional emotion recognition",
"Access": "Open",
"License": "CC BY 4.0",
"Dataset-link": "https://figshare.com/articles/media/Quechua_Collao_for_Speech_Emotion_Recognition/20292516",
"Paper-link": "https://www.nature.com/articles/s41597-022-01855-9",
"License-link": "https://creativecommons.org/licenses/by/4.0/"
},
"MESD": {
"Year": 2022,
"Content": "864 audio files of single-word emotional utterances with Mexican cultural shaping.",
"Emotions": "6 emotions provides single-word utterances for anger, disgust, fear, happiness, neutral, and sadness.",
"Format": "Audio",
"Size": "0.097 GB",
"Language": "Spanish (Mexican)",
"Paper": "The Mexican Emotional Speech Database (MESD): elaboration and assessment based on machine learning",
"Access": "Open",
"License": "CC BY 4.0",
"Dataset-link": "https://data.mendeley.com/datasets/cy34mh68j9/5",
"Paper-link": "https://pubmed.ncbi.nlm.nih.gov/34891601/",
"License-link": "https://creativecommons.org/licenses/by/4.0/"
},
"SyntAct": {
"Year": 2022,
"Content": "Synthesized database with 997 utterances of three basic emotions and neutral expression based on rule-based manipulation for a diphone synthesizer which we release to the public ",
"Emotions": "6 emotions: angry, bored, happy, neutral, sad and scared",
"Format": "Audio",
"Size": "0.941 GB",
"Language": "German",
"Paper": "SyntAct: A Synthesized Database of Basic Emotions",
"Access": "Open",
"License": "CC BY-SA 4.0",
"Dataset-link": "https://zenodo.org/record/6573016#.ZAjy_9LMJpj",
"Paper-link": "http://felix.syntheticspeech.de/publications/synthetic_database.pdf",
"License-link": "https://creativecommons.org/licenses/by/4.0"
},
"BEAT": {
"Year": 2022,
"Content": "76-Hour and 30-Speaker of 4 different languages: English (60h), Chinese (12h), Spanish (2h) and Japanese (2h).",
"Emotions": "8 emotions: happiness, anger, disgust, sadness, contempt, surprise, fear, and neutral",
"Format": "Audio, Video",
"Size": "42 GB",
"Language": "English, Chinese, Spanish, Japanese",
"Paper": "A Large-Scale Semantic and Emotional Multi-Modal Dataset for Conversational Gestures Synthesis",
"Access": "Open",
"License": "Non-commercial license",
"Dataset-link": "https://drive.google.com/drive/folders/1EKuWH8q178QOtFUYaNohdkZbBHQYAmhL",
"Paper-link": "https://www.ecva.net/papers/eccv_2022/papers_ECCV/papers/136670605.pdf",
"License-link": "--"
},
"Dusha": {
"Year": 2022,
"Content": " 300 000 audio recordings (~350 hours) of Russian speech, their transcripts and emotiomal labels. The dataset has two subsets: acted and real-life",
"Emotions": "4 emotions: angry, happy, sad and neutral. Arousal and valence metrics are also available.",
"Format": "Audio",
"Size": "58 GB",
"Language": "Russian",
"Paper": "Large Raw Emotional Dataset with Aggregation Mechanism",
"Access": "Open",
"License": "Public license with attribution and conditions reserved",
"Dataset-link": "https://github.com/salute-developers/golos/tree/master/dusha",
"Paper-link": "https://arxiv.org/abs/2212.12266",
"License-link": "https://github.com/salute-developers/golos/blob/master/license/en_us.pdf"
},
"MAFW": {
"Year": 2022,
"Content": "10045 video-audio clips in the wild.",
"Emotions": "11 single-label emotion categories (anger, disgust, fear, happiness, neutral, sadness, surprise, contempt, anxiety, helplessness, and disappointment) and 32 multi-label emotion categories.",
"Format": "Audio, Video",
"Size": "--",
"Language": "--",
"Paper": "MAFW: A Large-scale, Multi-modal, Compound Affective Database for Dynamic Facial Expression Recognition in the Wild",
"Access": "Restricted",
"License": "Non-commercial research purposes",
"Dataset-link": "https://mafw-database.github.io/MAFW/",
"Paper-link": "https://arxiv.org/abs/2208.00847",
"License-link": "--"
},
"EMOVIE": {
"Year": 2021,
"Content": "9724 samples with audio files and its emotion human-labeled annotation.",
"Emotions": "Polarity metrics (positive:+1, negative:-1)",
"Format": "Audio",
"Size": "0.572 GB",
"Language": "Chinese (Mandarin)",
"Paper": "EMOVIE: A Mandarin Emotion Speech Dataset with a Simple Emotional Text-to-Speech Model",
"Access": "Open",
"License": "CC BY-NC-SA 2.0",
"Dataset-link": "https://viem-ccy.github.io/EMOVIE/dataset_release.html",
"Paper-link": "https://arxiv.org/abs/2106.09317",
"License-link": "https://creativecommons.org/licenses/by-nc-sa/2.0/legalcode"
},
"emoUERJ": {
"Year": 2021,
"Content": "Ten sentences from eight actors, equally divided between genders, and they were free to choose the phrases for record audios in four emotions (377 audios). ",
@@ -97,6 +265,34 @@
"Paper-link": "--",
"License-link": "https://creativecommons.org/licenses/by/4.0/"
},
"Thorsten-Voice Dataset 2021.06 emotional": {
"Year": 2021,
"Content": "2400 normalized mono recordings by one person (Thorsten Müller) representing 300 sentences. ",
"Emotions": "Amusement, Disgust Anger, Suprise and Neutral (plus drunk, whispering and sleepy states)",
"Format": "Audio",
"Size": "0.399 GB",
"Language": "German",
"Paper": "--",
"Access": "Open",
"License": "CC0: Public Domain",
"Dataset-link": "https://zenodo.org/records/5525023",
"Paper-link": "--",
"License-link": "https://creativecommons.org/publicdomain/zero/1.0/"
},
"ASED": {
"Year": 2021,
"Content": "2474 recordings by 65 participants (25 females and 40 males)). Recordings were judged and rejected according to the opionion of eight judges.",
"Emotions": "Five emotions: anger, happiness, fear, sadness and neutral",
"Format": "Audio",
"Size": "0.135 GB",
"Language": "Amharic",
"Paper": "A New Amharic Speech Emotion Dataset and Classification Benchmark",
"Access": "Open",
"License": "--",
"Dataset-link": "https://github.com/Ethio2021/ASED_V1",
"Paper-link": "https://dl.acm.org/doi/10.1145/3529759",
"License-link": "--"
},
"ESCorpus-PE": {
"Year": 2021,
"Content": "Spanish peruvian speech gathered from Spanish interviews, TV reports, political debate and testimonials. It contains 3749 utterances, 80 speakers (44 male and 36 female), created from Youtube audios",
@@ -139,48 +335,6 @@
"Paper-link": "--",
"License-link": "https://creativecommons.org/publicdomain/zero/1.0/"
},
"Quechua-SER": {
"Year": 2022,
"Content": "12420 audio recordings (~15 hours) and their transcriptions by 7 native speakers.",
"Emotions": "Emotional labels using dimensions: valence, arousal, and dominance.",
"Format": "Audio",
"Size": "3.53 GB",
"Language": "Quechua Collao",
"Paper": "A speech corpus of Quechua Collao for automatic dimensional emotion recognition",
"Access": "Open",
"License": "CC BY 4.0",
"Dataset-link": "https://figshare.com/articles/media/Quechua_Collao_for_Speech_Emotion_Recognition/20292516",
"Paper-link": "https://www.nature.com/articles/s41597-022-01855-9",
"License-link": "https://creativecommons.org/licenses/by/4.0/"
},
"MESD": {
"Year": 2022,
"Content": "864 audio files of single-word emotional utterances with Mexican cultural shaping.",
"Emotions": "6 emotions provides single-word utterances for anger, disgust, fear, happiness, neutral, and sadness.",
"Format": "Audio",
"Size": "0.097 GB",
"Language": "Spanish (Mexican)",
"Paper": "The Mexican Emotional Speech Database (MESD): elaboration and assessment based on machine learning",
"Access": "Open",
"License": "CC BY 4.0",
"Dataset-link": "https://data.mendeley.com/datasets/cy34mh68j9/5",
"Paper-link": "https://pubmed.ncbi.nlm.nih.gov/34891601/",
"License-link": "https://creativecommons.org/licenses/by/4.0/"
},
"SyntAct": {
"Year": 2022,
"Content": "Synthesized database of three basic emotions and neutral expression based on rule-based manipulation for a diphone synthesizer which we release to the public ",
"Emotions": "997 utterances including 6 emotions: angry, bored, happy, neutral, sad and scared",
"Format": "Audio",
"Size": "0.941 GB",
"Language": "German",
"Paper": "SyntAct: A Synthesized Database of Basic Emotions",
"Access": "Open",
"License": "CC BY-SA 4.0",
"Dataset-link": "https://zenodo.org/record/6573016#.ZAjy_9LMJpj",
"Paper-link": "http://felix.syntheticspeech.de/publications/synthetic_database.pdf",
"License-link": "https://creativecommons.org/licenses/by/4.0"
},
"LSSED": {
"Year": 2021,
"Content": "LSSED: A Large-Scale Dataset and Benchmark for Speech Emotion Recognition",
@@ -307,6 +461,20 @@
"Paper-link": "http://www.interspeech2020.org/index.php?m=content&c=index&a=show&catid=290&id=684",
"License-link": "Academic License & Commercial License"
},
"AISHELL-3": {
"Year": 2020,
"Content": "Roughly 85 hours of emotion-neutral recordings spoken by 218 native Chinese mandarin speakers and total 88035 utterances.",
"Emotions": "Neutral",
"Format": "Audio",
"Size": "19 GB",
"Language": "Chinese (Mandarin)",
"Paper": "AISHELL-3: A Multi-speaker Mandarin TTS Corpus and the Baselines",
"Access": "Open",
"License": "Apache 2.0",
"Dataset-link": "https://www.openslr.org/93/",
"Paper-link": "https://arxiv.org/abs/2010.11567",
"License-link": "https://apache.org/licenses/LICENSE-2.0"
},
"BEASC": {
"Year": 2020,
"Content": "Bangla Emotional Audio-Speech Corpus",
@@ -377,6 +545,34 @@
"Paper-link": "http://dx.doi.org/10.1037/emo0001048",
"License-link": "https://creativecommons.org/licenses/by-nc-sa/4.0/"
},
"VESUS": {
"Year": 2019,
"Content": "252 distinct phrases, each read by 10 actors totalling 6 hours of speech.",
"Emotions": "5 emotions: anger, happiness, sadness, fear and neutral.",
"Format": "Audio",
"Size": "--",
"Language": "English",
"Paper": "VESUS: A Crowd-Annotated Database to Study Emotion Production and Perception in Spoken English",
"Access": "Restricted",
"License": "Academic EULA",
"Dataset-link": "https://engineering.jhu.edu/nsa/vesus/",
"Paper-link": "https://engineering.jhu.edu/nsa/wp-content/uploads/2019/10/IS191413.pdf",
"License-link": "Academic EULA"
},
"Morgan Emotional Speech Set": {
"Year": 2019,
"Content": "999 spontaneous voice messages from 100 Spanish speakers, collected from real conversations on a messaging app.",
"Emotions": "Valence & arrousal dimensions and 4 emotions: happiness, anger, sadness, and calmness.",
"Format": "Audio",
"Size": "0.192 GB",
"Language": "English",
"Paper": "Categorical and Dimensional Ratings of Emotional Speech: Behavioral Findings From the Morgan Emotional Speech Set",
"Access": "Open",
"License": "CC BY 4.0",
"Dataset-link": "https://arxiv.org/abs/2403.02167",
"Paper-link": "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC7203525/",
"License-link": "https://creativecommons.org/licenses/by/4.0/"
},
"PMEmo": {
"Year": 2019,
"Content": "Dataset containing emotion annotations of 794 songs as well as the simultaneous electrodermal activity (EDA) signals. A Music Emotion Experiment was well-designed for collecting the affective-annotated music corpus of high quality, which recruited 457 subjects.",
@@ -475,6 +671,20 @@
"Paper-link": "https://arxiv.org/pdf/1806.09514.pdf",
"License-link": "--"
},
"OMG Emotion": {
"Year": 2018,
"Content": "420 relatively long emotion videos with an average length of 1 minute, collected from a variety of Youtube channels.",
"Emotions": "7 emotions:anger, disgust, fear, happy, sad, surprise and neutral. Plus valence, arousal.",
"Format": "Audio, Video",
"Size": "--",
"Language": "English",
"Paper": "The OMG-Emotion Behavior Dataset",
"Access": "Open",
"License": "CC BY-NC-SA 3.0",
"Dataset-link": "https://www2.informatik.uni-hamburg.de/wtm/OMG-EmotionChallenge/",
"Paper-link": "https://arxiv.org/abs/1803.05434",
"License-link": "https://creativecommons.org/licenses/by-nc-sa/3.0/"
},
"RAVDESS": {
"Year": 2018,
"Content": "7356 recordings by 24 actors.",
