@inproceedings{Laurier2008,
abstract = {In this paper we present a study on music mood classification using audio and lyrics information. The mood of a song is expressed by means of musical features but a relevant part also seems to be conveyed by the lyrics. We evaluate each factor independently and explore the possibility to combine both, using Natural Language Processing and Music Information Retrieval techniques. We show that standard distance-based methods and Latent Semantic Analysis are able to classify the lyrics significantly better than random, but the performance is still quite inferior to that of audio-based techniques. We then introduce a method based on differences between language models that gives performances closer to audio-based classifiers. Moreover, integrating this in a multimodal system (audio+text) allows an improvement in the overall performance. We demonstrate that lyrics and audio information are complementary, and can be combined to improve a classification system.},
author = {Laurier, Cyril and Grivolla, Jens and Herrera, Perfecto},
booktitle = {2008 Seventh International Conference on Machine Learning and Applications},
link = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.182.426&rep=rep1&type=pdf},
pages = {688--693},
task = {mood classification},
title = {Multimodal Music Mood Classification using Audio and Lyrics},
type = {Audio-Text},
year = {2008}
}
@inproceedings{Turnbull2009,
abstract = {When attempting to annotate music, it is important to consider both acoustic content and social context. This paper explores techniques for collecting and combining multiple sources of such information for the purpose of building a query-by-text music retrieval system. We consider two representations of the acoustic content (related to timbre and harmony) and two social sources (social tags and web documents). We then compare three algorithms that combine these information sources: calibrated score averaging (CSA), RankBoost, and kernel combination support vector machines (KC-SVM). We demonstrate empirically that each of these algorithms is superior to algorithms that use individual information sources. Copyright 2009 ACM.},
author = {Turnbull, Douglas R. and Barrington, Luke and Lanckriet, Gert and Yazdani, Mehrdad},
booktitle = {32nd Annual International ACM SIGIR Conference on Research and Development in Information Retrieval},
doi = {10.1145/1571941.1572009},
isbn = {9781605584836},
keywords = {Calibrated score averaging,Combining data sources,Kernel combination SVM,Music IR,RankBoost},
link = {https://www.cs.swarthmore.edu/~turnbull/Papers/Turnbull_CombineMusicTags_SIGIR09.pdf},
pages = {387--394},
task = {music retrieval},
title = {Combining audio content and social context for semantic music discovery},
year = {2009}
}
@inproceedings{Liem2011,
author = {Liem, Cynthia CS and M{\"u}ller, Meinard and Eck, Douglas and Tzanetakis, George and Hanjalic, Alan},
booktitle = {Proceedings of the 1st international ACM workshop on Music information retrieval with user-centered and multimodal strategies},
link = {https://dl.acm.org/doi/pdf/10.1145/2072529.2072531},
pages = {1--6},
title = {The need for music information retrieval with user-centered and multimodal strategies},
year = {2011}
}
@inproceedings{Orio2011,
abstract = {This work presents the rationale, tasks and procedures of MusiCLEF, a novel benchmarking activity that has been developed along with the Cross-Language Evaluation Forum (CLEF). The main goal of MusiCLEF is to promote the development of new methodologies for music access and retrieval on real public music collections, which can combine content-based information, automatically extracted from music files, with contextual information, provided by users via tags, comments, or reviews. Moreover, MusiCLEF aims at maintaining a tight connection with real application scenarios, focusing on issues on music access and retrieval that are faced by professional users. To this end, this year's evaluation campaign focused on two main tasks: automatic categorization of music to be used as soundtrack of TV shows and automatic identification of the digitized material of a music digital library. {\textcopyright} 2011 International Society for Music Information Retrieval.},
author = {Orio, Nicola and Rizo, David and Miotto, Riccardo and Montecchio, Nicola and Schedl, Markus and Lartillot, Olivier},
booktitle = {Proceedings of the 12th International Society for Music Information Retrieval Conference, ISMIR 2011},
isbn = {9780615548654},
link = {https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.449.4173&rep=rep1&type=pdf},
title = {MusiCLEF: A benchmark activity in multimodal music information retrieval},
year = {2011}
}
@incollection{Barthet2013,
abstract = {The striking ability of music to elicit emotions assures its prominent status in human culture and every day life. Music is often enjoyed and sought for its ability to induce or convey emotions, which may manifest in anything from a slight variation in mood, to changes in our physical condition and actions. Consequently, research on how we might associate musical pieces with emotions and, more generally, how music brings about an emotional response is attracting ever increasing attention. First, this paper provides a thorough review of studies on the relation of music and emotions from different disciplines. We then propose new insights to enhance automated music emotion recognition models using recent results from psychology, musicology, affective computing, semantic technologies and music information retrieval. {\textcopyright} 2013 Springer-Verlag.},
author = {Barthet, Mathieu and Fazekas, Gy{\"{o}}rgy and Sandler, Mark},
booktitle = {From Sounds to Music and Emotions. CMMR 2012. Lecture Notes in Computer Science},
doi = {10.1007/978-3-642-41248-6_13},
isbn = {9783642412479},
issn = {03029743},
keywords = {appraisal,arousal,metadata,model,mood,multi-modal,music emotion,ontology,recognition,retrieval,review,state of the art,valence},
link = {https://qmro.qmul.ac.uk/xmlui/bitstream/handle/123456789/31911/Fazekas%20Music%20Emotion%20Recognition%202012%20Accepted.pdf;jsessionid=76AE783B989ED4CDBFB8B9C5CE013CE4?sequence=1},
pages = {228--252},
task = {emotion recognition},
title = {Music emotion recognition: From content- to context-based models},
volume = {7900},
year = {2013}
}
@inproceedings{Fried,
author = {Fried, Ohad and Fiebrink, Rebecca},
booktitle = {Proceedings of the International Conference on New Interfaces for Musical Expression (NIME)},
keywords = {Deep learning,feature learning,gestural control,mapping},
link = {https://www.ohadf.com/papers/FriedFiebrink_NIME2013.pdf},
title = {Cross-modal Sound Mapping Using Deep Learning},
year = {2013}
}
@unpublished{Choi2016,
abstract = {Descriptions are often provided along with recommendations to help users' discovery. Recommending automatically generated music playlists (e.g. personalised playlists) introduces the problem of generating descriptions. In this paper, we propose a method for generating music playlist descriptions, which is called as music captioning. In the proposed method, audio content analysis and natural language processing are adopted to utilise the information of each track.},
archiveprefix = {arXiv},
arxivid = {1608.04868v2},
author = {Choi, Keunwoo and Fazekas, Gy{\"{o}}rgy and Sandler, Mark and Mcfee, Brian and Cho, Kyunghyun},
eprint = {1608.04868v2},
link = {https://arxiv.org/pdf/1608.04868.pdf},
task = {music captioning},
title = {Towards Music Captioning: Generating Music Playlist Descriptions},
type = {Audio-Text},
year = {2016}
}
@inproceedings{Oramas2016,
abstract = {In this paper, we explore a large multimodal dataset of about 65k albums constructed from a combination of Amazon customer reviews, MusicBrainz metadata and AcousticBrainz audio descriptors. Review texts are further enriched with named entity disambiguation along with polarity information derived from an aspect-based sentiment analysis framework. This dataset constitutes the cornerstone of two main contributions: First, we perform experiments on music genre classification, exploring a variety of feature types, including semantic, sentimental and acoustic features. These experiments show that modeling semantic information contributes to outperforming strong bag-of-words baselines. Second, we provide a diachronic study of the criticism of music genres via a quantitative analysis of the polarity associated to musical aspects over time. Our analysis hints at a potential correlation between key cultural and geopolitical events and the language and evolving sentiments found in music reviews.},
author = {Oramas, Sergio and Espinosa-Anke, Luis and Lawlor, Aonghus and Serra, Xavier and Saggion, Horacio},
booktitle = {Proceedings of the 17th International Society for Music Information Retrieval Conference, ISMIR 2016},
isbn = {9780692755068},
link = {https://repositori.upf.edu/bitstream/handle/10230/33063/Oramas_ISMIR2016_expl.pdf?sequence=1&isAllowed=y},
task = {genre classification},
title = {Exploring customer reviews for music genre classification and evolutionary studies},
type = {Audio-Text},
year = {2016}
}
@inproceedings{Jeon2017,
abstract = {Music emotion recognition (MER) is a key issue in user context-aware recommendation. Many existing methods require hand-crafted features on audio and lyrics. Here we propose a new end-to-end method for recognizing emotions of tracks from their acoustic signals and lyrics via multimodal deep neural networks. We evaluate our method on about 7,000 K-pop tracks labeled as positive or negative emotion. The proposed method is compared to end-to-end unimodal models using audio signals or lyrics only. The experimental results show that our multimodal model achieves the best accuracy as 80{\%}, and we discuss the reasons of these results.},
author = {Jeon, Byungsoo and Kim, Chanju and Kim, Adrian and Kim, Dongwon and Park, Jangyeon and Ha, Jung Woo},
booktitle = {CEUR Workshop Proceedings},
issn = {16130073},
keywords = {Multimodal neural network,Music emotion recognition,Music recommendation},
link = {http://ceur-ws.org/Vol-1905/recsys2017_poster18.pdf},
task = {emotion recognition},
title = {Music emotion recognition via end-To-end multimodal neural networks},
year = {2017}
}
@article{Kiela2017,
abstract = {Multi-modal semantics, which aims to ground semantic representations in perception, has relied on feature norms or raw image data for perceptual input. In this paper we examine grounding semantic representations in raw auditory data, using standard evaluations for multi-modal semantics. After having shown the quality of such auditorily grounded representations, we show how they can be applied to tasks where auditory perception is relevant, including two unsupervised categorization experiments, and provide further analysis. We find that features transfered from deep neural networks outperform bag of audio words approaches. To our knowledge, this is the first work to construct multi-modal models from a combination of textual information and auditory information extracted from deep neural networks, and the first work to evaluate the performance of tri-modal (textual, visual and auditory) semantic models.},
author = {Kiela, Douwe and Clark, Stephen},
doi = {10.1613/jair.5665},
issn = {10769757},
journal = {Journal of Artificial Intelligence Research},
link = {https://www.jair.org/index.php/jair/article/view/11101/26292},
pages = {1003--1030},
title = {Learning neural audio embeddings for grounding semantics in auditory perception},
volume = {60},
year = {2017}
}
@inproceedings{Oramas2017,
author = {Oramas, Sergio and Nieto, Oriol and Sordo, Mohamed and Serra, Xavier},
booktitle = {Proceedings of the 2nd Workshop on Deep Learning for Recommender Systems},
code = {https://github.com/sergiooramas/tartarus},
doi = {10.1145/3125486.3125492},
isbn = {9781450353533},
keywords = {Deep learning,Multimodal,Music,Recommender systems,Semantics},
link = {https://dl.acm.org/doi/pdf/10.1145/3125486.3125492},
month = {aug},
pages = {32--37},
publisher = {Association for Computing Machinery},
task = {music recommendation},
title = {A deep multimodal approach for cold-start music recommendation},
type = {Audio-User},
year = {2017}
}
@inproceedings{Delbouys2018,
abstract = {We consider the task of multimodal music mood prediction based on the audio signal and the lyrics of a track. We reproduce the implementation of traditional feature engineering based approaches and propose a new model based on deep learning. We compare the performance of both approaches on a database containing 18,000 tracks with associated valence and arousal values and show that our approach outperforms classical models on the arousal detection task, and that both approaches perform equally on the valence prediction task. We also compare the a posteriori fusion with fusion of modalities optimized simultaneously with each unimodal model, and observe a significant improvement of valence prediction. We release part of our database for comparison purposes.},
archiveprefix = {arXiv},
arxivid = {1809.07276},
author = {Delbouys, R{\'{e}}mi and Hennequin, Romain and Piccoli, Francesco and Royo-Letelier, Jimena and Moussallam, Manuel},
booktitle = {Proceedings of the 19th International Society for Music Information Retrieval Conference, ISMIR 2018},
eprint = {1809.07276},
isbn = {9782954035123},
link = {https://arxiv.org/pdf/1809.07276.pdf},
task = {mood classification},
title = {Music mood detection based on audio and lyrics with deep neural net},
type = {Audio-Text},
year = {2018}
}
@inproceedings{hong2018cbvmr,
author = {Hong, Sungeun and Im, Woobin and Yang, Hyun S},
booktitle = {Proceedings of the 2018 ACM on international conference on multimedia retrieval},
code = {https://github.com/csehong/VM-NET},
link = {https://dl.acm.org/doi/abs/10.1145/3206025.3206046},
pages = {353--361},
task = {music retrieval},
title = {CBVMR: Content-based video-music retrieval using soft intra-modal structure constraint},
type = {Audio-Video},
year = {2018}
}
@inproceedings{Liang2018,
abstract = {Learning social media content is the basis of many real-world applications, including information retrieval and recommendation systems, among others. In contrast with previous works that focus mainly on single modal or bi-modal learning, we propose to learn social media content by fusing jointly textual, acoustic, and visual information (JTAV). Effective strategies are proposed to extract fine-grained features of each modality, that is, attBiGRU and DCRNN. We also introduce cross-modal fusion and attentive pooling techniques to integrate multi-modal information comprehensively. Extensive experimental evaluation conducted on real-world datasets demonstrate our proposed model outperforms the state-of-the-art approaches by a large margin.},
address = {Santa Fe, New Mexico, USA},
author = {Liang, Hongru and Wang, Haozheng and Wang, Jun and You, Shaodi and Sun, Zhe and Wei, Jin-Mao and Yang, Zhenglu},
booktitle = {Proceedings of the 27th International Conference on Computational Linguistics},
code = {https://github.com/mengshor/JTAV},
link = {http://arxiv.org/abs/1806.01483},
pages = {1269--1280},
publisher = {Association for Computational Linguistics},
title = {JTAV: Jointly Learning Social Media Content Representation by Fusing Textual, Acoustic, and Visual Features},
year = {2018}
}
@article{Oramas2018,
author = {Oramas, Sergio and Barbieri, Francesco and Nieto, Oriol and Serra, Xavier},
code = {https://github.com/fvancesco/music_resnet_classification},
doi = {10.5334/tismir.10},
journal = {Transactions of the International Society for Music Information Retrieval},
link = {https://transactions.ismir.net/articles/10.5334/tismir.10/},
month = {sep},
number = {1},
pages = {4--21},
publisher = {Ubiquity Press, Ltd.},
task = {genre classification},
title = {Multimodal Deep Learning for Music Genre Classification},
volume = {1},
year = {2018}
}
@inproceedings{Qiu2018,
abstract = {Recently, the development of music appreciation device has made it possible to listen to various kinds of music regardless of location. On the other hand, if it is possible to express visual contents that best matches music, we can expect a more expressive music appreciation experience by not only "listening to" the music but also "watching" the music. In this paper, we address the problems below: (1) learning a correlation between music data and images; (2) generating images from music data automatically. The experiments show that our proposed method can effectively generate proper images from music data.},
author = {Qiu, Yue and Kataoka, Hirokatsu},
booktitle = {IEEE Conference on Computer Vision and Pattern Recognition Workshops},
link = {https://openaccess.thecvf.com/content_cvpr_2018_workshops/papers/w49/Qiu_Image_Generation_Associated_CVPR_2018_paper.pdf},
pages = {2510--2513},
task = {image generation},
title = {Image generation associated with music data},
type = {Audio-Image},
year = {2018}
}
@inproceedings{Zhao2018,
abstract = {We introduce PixelPlayer, a system that, by leveraging large amounts of unlabeled videos, learns to locate image regions which produce sounds and separate the input sounds into a set of components that represents the sound from each pixel. Our approach capitalizes on the natural synchronization of the visual and audio modalities to learn models that jointly parse sounds and images, without requiring additional manual supervision. Experimental results on a newly collected MUSIC dataset show that our proposed Mix-and-Separate framework outperforms several baselines on source separation. Qualitative results suggest our model learns to ground sounds in vision, enabling applications such as independently adjusting the volume of sound sources.},
author = {Zhao, Hang and Gan, Chuang and Rouditchenko, Andrew and Vondrick, Carl and Mcdermott, Josh and Torralba, Antonio},
booktitle = {The European Conference on Computer Vision (ECCV)},
code = {https://github.com/hangzhaomit/Sound-of-Pixels},
keywords = {cross-modal learning,sound separation and localization},
link = {https://arxiv.org/pdf/1804.03160.pdf},
pages = {570--586},
task = {source separation},
title = {The Sound of Pixels},
type = {Audio-Image},
year = {2018}
}
@inproceedings{Li2019a,
abstract = {Cross-modal retrieval learns the relationship between the two types of data in a common space so that an input from one modality can retrieve data from a different modality. We focus on modeling the relationship between two highly diverse data, music and real-world videos. We learn cross-modal embeddings using a two-stream network trained with music-video pairs. Each branch takes one modality as the input and it is constrained with emotion tags. Then the constraints allow the cross-modal embeddings to be learned with significantly fewer music-video pairs. To retrieve music for an input video, the trained model ranks tracks in the music database by cross-modal distances to the query video. Quantitative evaluations show high accuracy of audio/video emotion tagging when evaluated on each branch independently and high performance for cross-modal music retrieval. We also present cross-modal music retrieval experiments on Spotify music using user-generated videos from Instagram and Youtube as queries, and subjective evaluations show that the proposed model can retrieve relevant music. We present the music retrieval results at: http://www.ece.rochester.edu/{\~{}}bli23/projects/query.html.},
author = {Li, Bochen and Kumar, Aparna},
booktitle = {Proceedings of the 20th International Society for Music Information Retrieval Conference},
task = {music retrieval},
title = {Query by Video: Cross-Modal Music Retrieval},
type = {Audio-Video},
year = {2019}
}
@article{Li2019,
abstract = {We introduce a dataset for facilitating audio-visual analysis of music performances. The dataset comprises 44 simple multi-instrument classical music pieces assembled from coordinated but separately recorded performances of individual tracks. For each piece, we provide the musical score in MIDI format, the audio recordings of the individual tracks, the audio and video recording of the assembled mixture, and ground-truth annotation files including frame-level and note-level transcriptions. We describe our methodology for the creation of the dataset, particularly highlighting our approaches to address the challenges involved in maintaining synchronization and expressiveness. We demonstrate the high quality of synchronization achieved with our proposed approach by comparing the dataset with existing widely used music audio datasets. We anticipate that the dataset will be useful for the development and evaluation of existing music information retrieval (MIR) tasks, as well as for novel multimodal tasks. We benchmark two existing MIR tasks (multipitch analysis and score-informed source separation) on the dataset and compare them with other existing music audio datasets. In addition, we consider two novel multimodal MIR tasks (visually informed multipitch analysis and polyphonic vibrato analysis) enabled by the dataset and provide evaluation measurements and baseline systems for future comparisons (from our recent work). Finally, we propose several emerging research directions that the dataset enables.},
archiveprefix = {arXiv},
arxivid = {1612.08727},
author = {Li, Bochen and Liu, Xinzhao and Dinesh, Karthik and Duan, Zhiyao and Sharma, Gaurav},
doi = {10.1109/TMM.2018.2856090},
eprint = {1612.08727},
issn = {15209210},
journal = {IEEE Transactions on Multimedia},
keywords = {Multimodal music dataset,audio-visual analysis,music performance,synchronization},
link = {https://arxiv.org/pdf/1612.08727.pdf},
month = {feb},
number = {2},
pages = {522--535},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
title = {Creating a Multitrack Classical Music Performance Dataset for Multimodal Music Analysis: Challenges, Insights, and Applications},
volume = {21},
year = {2019}
}
@article{Muller2019,
abstract = {There has been a rapid growth of digitally available music data, including audio recordings, digitized images of sheet music, album covers and liner notes, and video clips. This huge amount of data calls for retrieval strategies that allow users to explore large music collections in a convenient way. More precisely, there is a need for cross-modal retrieval algorithms that, given a query in one modality (e.g., a short audio excerpt), find corresponding information and entities in other modalities (e.g., the name of the piece and the sheet music). This goes beyond exact audio identification and subsequent retrieval of metainformation as performed by commercial applications like Shazam [1].},
author = {M{\"{u}}ller, Meinard and Arzt, Andreas and Balke, Stefan and Dorfer, Matthias and Widmer, Gerhard},
doi = {10.1109/MSP.2018.2868887},
issn = {15580792},
journal = {IEEE Signal Processing Magazine},
link = {https://arxiv.org/pdf/1902.04397.pdf},
month = {jan},
number = {1},
pages = {52--62},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
task = {music retrieval},
title = {Cross-Modal Music Retrieval and Applications: An Overview of Key Methodologies},
volume = {36},
year = {2019}
}
@inproceedings{Simonetta2019,
abstract = {Towards improving the performance in various music information processing tasks, recent studies exploit different modalities able to capture diverse aspects of music. Such modalities include audio recordings, symbolic music scores, mid-level representations, motion and gestural data, video recordings, editorial or cultural tags, lyrics and album cover arts. This paper critically reviews the various approaches adopted in Music Information Processing and Retrieval, and highlights how multimodal algorithms can help Music Computing applications. First, we categorize the related literature based on the application they address. Subsequently, we analyze existing information fusion approaches, and we conclude with the set of challenges that Music Information Retrieval and Sound and Music Computing research communities should focus in the next years.},
author = {Simonetta, Federico and Ntalampiras, Stavros and Avanzini, Federico},
booktitle = {Proceedings - 2019 International Workshop on Multilayer Music Representation and Processing, MMRP 2019},
doi = {10.1109/MMRP.2019.8665366},
isbn = {9781728116495},
keywords = {Information fusion,Multimodal music processing,Music description systems,Music information retrieval},
link = {https://arxiv.org/pdf/1902.05347.pdf},
title = {Multimodal music information processing and retrieval: Survey and future challenges},
year = {2019}
}
@inproceedings{Verma2019,
abstract = {We introduce the problem of learning affective correspondence between audio (music) and visual data (images). For this task, a music clip and an image are considered similar (having true correspondence) if they have similar emotion content. In order to estimate this crossmodal, emotion-centric similarity, we propose a deep neural network architecture that learns to project the data from the two modalities to a common representation space, and performs a binary classification task of predicting the affective correspondence (true or false). To facilitate the current study, we construct a large scale database containing more than 3,500 music clips and 85,000 images with three emotion classes (positive, neutral, negative). The proposed approach achieves 61.67{\%} accuracy for the affective correspondence prediction task on this database, outperforming two relevant and competitive baselines. We also demonstrate that our network learns modality-specific representations of emotion (without explicitly being trained with emotion labels), which are useful for emotion recognition in individual modalities.},
archiveprefix = {arXiv},
arxivid = {1904.00150},
author = {Verma, Gaurav and Dhekane, Eeshan Gunesh and Guha, Tanaya},
booktitle = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings},
doi = {10.1109/ICASSP.2019.8683133},
eprint = {1904.00150},
isbn = {9781479981311},
issn = {15206149},
keywords = {correspondence learning,crossmodal,deep learning,emotion recognition},
link = {https://arxiv.org/pdf/1904.00150.pdf},
month = {may},
pages = {3975--3979},
publisher = {Institute of Electrical and Electronics Engineers Inc.},
title = {Learning Affective Correspondence between Music and Image},
type = {Audio-Image},
volume = {2019-May},
year = {2019}
}
@inproceedings{Watanabe2019,
abstract = {This paper presents Query-by-Blending, a novel music exploration system that enables users to find unfamiliar music content by flexibly combining three musical aspects: lyric word, song audio, and artist. Although there are various systems for music retrieval based on the similarity between songs or artists and for music browsing based on visualized songs, it is still difficult to explore unfamiliar content by flexibly combining multiple musical aspects. Query-by-Blending overcomes this difficulty by representing each of the aspects as a latent vector representation (called a "flavor" in this paper) that is a distinctive quality felt to be characteristic of a given word/song/artist. By giving a lyric word as a query, for example, a user can find songs and artists whose flavors are similar to the flavor of the query word. Moreover, by giving a query combining (blending) lyric-word and song-audio flavors, the user can interactively explore unfamiliar content containing the blended flavor. This multi-aspect blending was achieved by constructing a novel vector space model into which all of the lyric words, song audio tracks, and artist IDs of a collection can be embedded. In our experiments, we embedded 14,505 lyric words, 433,936 songs, and 44,696 artists into the same shared vector space and found that the system can appropriately calculate similarities between different aspects and blend flavors to find related lyric words, songs, and artists.},
author = {Watanabe, Kento and Goto, Masataka},
booktitle = {Proceedings of the 20th International Society for Music Information Retrieval Conference},
link = {https://archives.ismir.net/ismir2019/paper/000015.pdf},
task = {music retrieval},
title = {Query-by-Blending: a Music Exploration System Blending Latent Vector Representations of Lyric Word, Song Audio, and Artist},
year = {2019}
}
@article{Yu2019,
archiveprefix = {arXiv},
arxivid = {1711.08976},
author = {Yu, Yi and Tang, Suhua and Raposo, Francisco and Chen, Lei},
doi = {10.1145/3281746},
eprint = {1711.08976},
issn = {15516865},
journal = {ACM Transactions on Multimedia Computing, Communications and Applications},
keywords = {Convolutional neural networks,Correlation learning between audio and lyrics,Cross-modal music retrieval,Deep cross-modal models,Music knowledge discovery},
link = {https://arxiv.org/pdf/1711.08976.pdf},
month = {feb},
number = {1},
publisher = {Association for Computing Machinery},
task = {music retrieval},
title = {Deep cross-modal correlation learning for audio and lyrics in music retrieval},
type = {Audio-Text},
volume = {15},
year = {2019}
}
@inproceedings{Zeng2019,
archiveprefix = {arXiv},
arxivid = {1908.03744},
author = {Zeng, Donghuo and Yu, Yi and Oyama, Keizo},
booktitle = {Proceedings - 2018 IEEE International Symposium on Multimedia, ISM 2018},
doi = {10.1109/ISM.2018.00-21},
eprint = {1908.03744},
isbn = {9781538668573},
keywords = {Cross-modal retrieval,Deep CCA,Deep learning},
link = {https://arxiv.org/pdf/1908.03744.pdf},
pages = {143--150},
publisher = {IEEE},
task = {music video retrieval},
title = {Audio-visual embedding for cross-modal music video retrieval through supervised deep CCA},
type = {Audio-Video},
year = {2019}
}
@inproceedings{Cai2020,
author = {Cai, Tian and Mandel, Michael I and He, Di},
booktitle = {Proceedings of the 1st Workshop on NLP for Music and Audio (NLP4MusA)},
link = {https://www.aclweb.org/anthology/2020.nlp4musa-1.14},
pages = {67--72},
publisher = {Association for Computational Linguistics},
task = {music captioning},
title = {Music autotagging as captioning},
type = {Audio-Text},
year = {2020}
}
@inproceedings{doh2020musical,
author = {Doh, Seungheon and Lee, Jongpil and Park, Tae Hong and Nam, Juhan},
booktitle = {Machine Learning for Media Discovery Workshop, International Conference on Machine Learning (ICML)},
link = {https://arxiv.org/abs/2008.01190},
title = {Musical word embedding: Bridging the gap between listening contexts and music},
year = {2020}
}
@unpublished{Favory2020a,
abstract = {Self-supervised audio representation learning offers an attractive alternative for obtaining generic audio embeddings, capable to be employed into various downstream tasks. Published approaches that consider both audio and words/tags associated with audio do not employ text processing models that are capable to generalize to tags unknown during training. In this work we propose a method for learning audio representations using an audio autoencoder (AAE), a general word embeddings model (WEM), and a multi-head self-attention (MHA) mechanism. MHA attends on the output of the WEM, providing a contextualized representation of the tags associated with the audio, and we align the output of MHA with the output of the encoder of AAE using a contrastive loss. We jointly optimize AAE and MHA and we evaluate the audio representations (i.e. the output of the encoder of AAE) by utilizing them in three different downstream tasks, namely sound, music genre, and music instrument classification. Our results show that employing multi-head self-attention with multiple heads in the tag-based network can induce better learned audio representations.},
archiveprefix = {arXiv},
arxivid = {2010.14171v1},
author = {Favory, Xavier and Drossos, Konstantinos and Virtanen, Tuomas and Serra, Xavier},
code = {https://github.com/xavierfav/ae-w2v-attention},
eprint = {2010.14171v1},
keywords = {representation learning,audio classification,multimodal contrastive learning},
link = {https://arxiv.org/pdf/2010.14171.pdf},
title = {Learning Contextual Tag Embeddings for Cross-Modal Alignment of Audio and Tags},
type = {Audio-User},
year = {2020}
}
@inproceedings{gan2020foley,
author = {Gan, Chuang and Huang, Deng and Chen, Peihao and Tenenbaum, Joshua B and Torralba, Antonio},
booktitle = {European Conference on Computer Vision},
link = {https://www.ecva.net/papers/eccv_2020/papers_ECCV/papers/123560732.pdf},
organization = {Springer},
pages = {758--775},
task = {music generation},
title = {Foley music: Learning to generate music from videos},
year = {2020}
}
@inproceedings{gan2020music,
author = {Gan, Chuang and Huang, Deng and Zhao, Hang and Tenenbaum, Joshua B and Torralba, Antonio},
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition},
link = {https://openaccess.thecvf.com/content_CVPR_2020/papers/Gan_Music_Gesture_for_Visual_Sound_Separation_CVPR_2020_paper.pdf},
pages = {10478--10487},
task = {source separation},
title = {Music gesture for visual sound separation},
year = {2020}
}
@inproceedings{Huang2020,
author = {Huang, Qingqing and Jansen, Aren and Zhang, Li and Ellis, Daniel P. W. and Saurous, Rif A. and Anderson, John},
booktitle = {ICASSP 2020-2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
doi = {10.1109/icassp40776.2020.9053240},
keywords = {audio tagging},
link = {https://ieeexplore.ieee.org/abstract/document/9053240},
task = {music recommendation},
title = {Large-Scale Weakly-Supervised Content Embeddings for Music Recommendation and Tagging},
year = {2020}
}
@inproceedings{Jeong2021,
author = {Jeong, Dasaem and Doh, Seungheon and Kwon, Taegyun},
booktitle = {Workshop on Machine Learning for Creativity and Design, Neural Information Processing Systems (NeurIPS)},
code = {https://github.com/jdasam/traeumerAI},
link = {https://arxiv.org/abs/2102.04680},
task = {music-to-image synthesis},
title = {Tr{\"a}umerAI: Dreaming Music with StyleGAN},
type = {Audio-Image},
year = {2020}
}
@inproceedings{Rossetto2020,
address = {Online},
author = {Rossetto, Federico and Dalton, Jeff},
booktitle = {Proceedings of the 1st Workshop on NLP for Music and Audio (NLP4MusA)},
link = {https://www.aclweb.org/anthology/2020.nlp4musa-1.13},
pages = {64--66},
publisher = {Association for Computational Linguistics},
title = {MusicBERT - learning multi-modal representations for music and text},
type = {Audio-Text},
year = {2020}
}
@inproceedings{doh2021music,
author = {Doh, SeungHeon and Lee, Junwon and Nam, Juhan},
booktitle = {Proceedings of the 2nd Workshop on NLP for Music and Spoken Audio (NLP4MuSA)},
code = {https://github.com/SeungHeonDoh/ply_title_gen},
link = {https://arxiv.org/abs/2110.07354},
task = {playlist captioning},
title = {Music Playlist Title Generation: A Machine-Translation Approach},
type = {Audio-Text},
year = {2021}
}
@article{ferraro2021enriched,
author = {Ferraro, Andres and Favory, Xavier and Drossos, Konstantinos and Kim, Yuntae and Bogdanov, Dmitry},
code = {https://github.com/andrebola/contrastive-mir-learning},
journal = {IEEE Signal Processing Letters},
link = {https://arxiv.org/abs/2104.00437},
pages = {733--737},
publisher = {IEEE},
title = {Enriched music representations with multiple cross-modal contrastive learning},
volume = {28},
year = {2021}
}
@inproceedings{Manco2021,
author = {Manco, Ilaria and Benetos, Emmanouil and Quinton, Elio and Fazekas, Gyorgy},
booktitle = {2021 International Joint Conference on Neural Networks (IJCNN)},
code = {https://github.com/ilaria-manco/muscaps},
link = {https://arxiv.org/abs/2104.11984},
publisher = {IEEE},
task = {music captioning},
title = {MusCaps: Generating Captions for Music Audio},
type = {Audio-Text},
year = {2021}
}
@inproceedings{won2021multimodal,
author = {Won, Minz and Oramas, Sergio and Nieto, Oriol and Gouyon, Fabien and Serra, Xavier},
booktitle = {ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
code = {https://github.com/minzwon/tag-based-music-retrieval},
link = {https://arxiv.org/pdf/2010.16030.pdf},
organization = {IEEE},
pages = {591--595},
task = {music retrieval},
title = {Multimodal metric learning for tag-based music retrieval},
year = {2021}
}
@article{doh2022toward,
author = {Doh, SeungHeon and Won, Minz and Choi, Keunwoo and Nam, Juhan},
code = {https://github.com/SeungHeonDoh/music-text-representation},
journal = {arXiv preprint arXiv:2211.14558},
link = {https://arxiv.org/abs/2211.14558},
title = {Toward Universal Text-to-Music Retrieval},
type = {Audio-Text},
year = {2022}
}
@article{elizalde2022clap,
author = {Elizalde, Benjamin and Deshmukh, Soham and Ismail, Mahmoud Al and Wang, Huaming},
code = {https://github.com/microsoft/CLAP},
journal = {arXiv preprint arXiv:2206.04769},
link = {https://arxiv.org/abs/2206.04769},
title = {CLAP: Learning audio concepts from natural language supervision},
type = {Audio-Text},
year = {2022}
}
@inproceedings{Gabbolini2022,
author = {Gabbolini, Giovanni and Hennequin, Romain and Epure, Elena},
booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
code = {https://github.com/deezer/playntell},
link = {https://preview.aclanthology.org/emnlp-22-ingestion/2022.emnlp-main.784},
task = {playlist captioning},
title = {Data-Efficient Playlist Captioning With Musical and Linguistic Knowledge},
type = {Audio-Text},
year = {2022}
}
@article{he2022recap,
author = {He, Zihao and Hao, Weituo and Song, Xuchen},
journal = {arXiv preprint arXiv:2212.10901},
link = {https://arxiv.org/abs/2212.10901v1},
task = {music captioning},
title = {RECAP: Retrieval Augmented Music Captioner},
type = {Audio-Text},
year = {2022}
}
@inproceedings{huang2022mulan,
author = {Huang, Qingqing and Jansen, Aren and Lee, Joonseok and Ganti, Ravi and Li, Judith Yue and Ellis, Daniel PW},
booktitle = {Proceedings of the 23rd International Society for Music Information Retrieval Conference (ISMIR)},
link = {https://arxiv.org/abs/2208.12415},
title = {MuLan: A joint embedding of music audio and natural language},
type = {Audio-Text},
year = {2022}
}
@inproceedings{manco2022learning,
author = {Manco, Ilaria and Benetos, Emmanouil and Quinton, Elio and Fazekas, Gy{\"o}rgy},
booktitle = {ICASSP 2022-2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
code = {https://github.com/ilaria-manco/mulap},
link = {https://arxiv.org/abs/2112.04214},
organization = {IEEE},
pages = {456--460},
title = {Learning music audio representations via weak language supervision},
type = {Audio-Text},
year = {2022}
}
@inproceedings{manco2022contrastive,
author = {Manco, Ilaria and Benetos, Emmanouil and Quinton, Elio and Fazekas, Gy{\"o}rgy},
booktitle = {Proceedings of the 23rd International Society for Music Information Retrieval Conference (ISMIR)},
code = {https://github.com/ilaria-manco/muscall},
link = {https://arxiv.org/abs/2208.12208},
task = {music retrieval},
title = {Contrastive audio-language learning for music},
type = {Audio-Text},
year = {2022}
}
@inproceedings{leszczynski2022conversational,
author = {Leszczynski, Megan Eileen and Ganti, Ravi and Zhang, Shu and Balog, Krisztian and Radlinski, Filip and Pereira, Fernando and Chaganty, Arun Tejasvi},
booktitle = {Second Workshop on Interactive Learning for Natural Language Processing at NeurIPS 2022},
link = {https://research.google/pubs/pub51943/},
title = {Conversational Music Retrieval with Synthetic Data},
type = {Audio-Text},
year = {2022}
}
@inproceedings{suris2022s,
author = {Sur{\'\i}s, D{\'\i}dac and Vondrick, Carl and Russell, Bryan and Salamon, Justin},
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
link = {https://arxiv.org/abs/2206.07148},
pages = {10564--10574},
task = {music-to-video retrieval},
title = {It's Time for Artistic Correspondence in Music and Video},
type = {Audio-Video},
year = {2022}
}
@inproceedings{zhang2022interpreting,
author = {Zhang, Yixiao and Jiang, Junyan and Xia, Gus and Dixon, Simon},
booktitle = {Proceedings of the 23rd International Society for Music Information Retrieval Conference (ISMIR)},
link = {https://arxiv.org/abs/2208.11671},
task = {music retrieval},
title = {Interpreting Song Lyrics with an Audio-Informed Pre-trained Language Model},
type = {Audio-Text},
year = {2022}
}