Add the JMRD (Japanese Movie Recommendation Dialogue Dataset) loader.

Files touched by this patch:
  README.md               lists JMRD among the available datasets
  example/jmrd.rb         prints the first dialogue of the train split
  lib/datasets/jmrd.rb    Datasets::JMRD and its Struct record types
  lib/datasets/lazy.rb    registers :JMRD with the lazy loader
  sig/datasets/jmrd.rbs   RBS signatures for Datasets::JMRD
  test/test-jmrd.rb       tests for the three splits and the metadata

Review notes:
  * Datasets::JMRD#open_data reads the cached JSON with an explicit UTF-8
    encoding so that parsing does not depend on the default external
    encoding of the running process.
  * The example uses map(&:type) to collect the knowledge types.
  * NOTE(review): the post-image blob ids on the "index" lines of
    example/jmrd.rb and lib/datasets/jmrd.rb were not recomputed after
    the two changes above.
  * NOTE(review): indentation inside the hunks was reconstructed as
    two-space Ruby style; confirm against the original commit.

diff --git a/README.md b/README.md
index a6120bb2..6788a1f0 100644
--- a/README.md
+++ b/README.md
@@ -32,6 +32,7 @@ You can use datasets easily because you can access each dataset with multiple wa
 * House of Councillors of Japan
 * House of Representatives of Japan
 * Iris Dataset
+* JMRD (Japanese Movie Recommendation Dialogue Dataset)
 * Libsvm
 * MNIST database
 * Mushroom
diff --git a/example/jmrd.rb b/example/jmrd.rb
new file mode 100755
index 00000000..daf63a7f
--- /dev/null
+++ b/example/jmrd.rb
@@ -0,0 +1,36 @@
+#!/usr/bin/env ruby
+
+require 'datasets'
+
+jmrd = Datasets::JMRD.new(type: :train)
+
+jmrd.each do |dialogue|
+  puts "=" * 80
+  puts "Dialogue ID: #{dialogue.dialog_id}"
+  puts "Movie: #{dialogue.movie_title}"
+  puts "First Speaker: #{dialogue.first_speaker}"
+  puts
+
+  if dialogue.knowledge
+    puts "Knowledge:"
+    puts " Title: #{dialogue.knowledge.title}"
+    puts " Year: #{dialogue.knowledge.year}"
+    puts " Director: #{dialogue.knowledge.director_name}"
+    puts " Genres: #{dialogue.knowledge.genres.join(', ')}" if dialogue.knowledge.genres
+    puts
+  end
+
+  puts "Dialogue:"
+  dialogue.utterances.each do |utterance|
+    speaker_label = utterance.speaker == "recommender" ? "[R]" : "[S]"
+    puts " #{speaker_label} #{utterance.text}"
+
+    if utterance.checked_knowledge && !utterance.checked_knowledge.empty?
+      knowledge_types = utterance.checked_knowledge.map(&:type).join(", ")
+      puts " (knowledge: #{knowledge_types})"
+    end
+  end
+
+  # Show only first dialogue as example
+  break
+end
diff --git a/lib/datasets/jmrd.rb b/lib/datasets/jmrd.rb
new file mode 100644
index 00000000..cae2822b
--- /dev/null
+++ b/lib/datasets/jmrd.rb
@@ -0,0 +1,180 @@
+require "json"
+
+require_relative "dataset"
+
+module Datasets
+  class JMRD < Dataset
+    Dialogue = Struct.new(
+      :dialog_id,
+      :movie_title,
+      :first_speaker,
+      :questionnaire,
+      :knowledge,
+      :utterances
+    )
+
+    Questionnaire = Struct.new(
+      :recommender,
+      :seeker
+    )
+
+    QuestionnaireAnswers = Struct.new(
+      :q1,
+      :q2,
+      :q3,
+      :q4,
+      :q5
+    )
+
+    Knowledge = Struct.new(
+      :title,
+      :year,
+      :director_name,
+      :director_description,
+      :cast_names,
+      :cast_descriptions,
+      :genres,
+      :reviews,
+      :synopsis
+    )
+
+    Utterance = Struct.new(
+      :utterance_id,
+      :speaker,
+      :text,
+      :checked_knowledge
+    )
+
+    CheckedKnowledge = Struct.new(
+      :type,
+      :content
+    )
+
+    def initialize(type: :train)
+      super()
+      @metadata.id = "jmrd"
+      @metadata.name = "Japanese Movie Recommendation Dialogue Dataset (JMRD)"
+      @metadata.url = "https://github.com/ku-nlp/JMRD"
+      @metadata.licenses = ["CC-BY-SA-4.0"]
+      @metadata.description = <<~DESCRIPTION
+        JMRD (Japanese Movie Recommendation Dialogue Dataset) is a Japanese
+        knowledge-grounded dialogue dataset consisting of annotated movie
+        recommendation dialogues between humans. Every recommender's utterance
+        is associated with movie information as external knowledge.
+
+        The dataset consists of about 5,000 dialogues between crowdworkers,
+        each of which consists of 23 utterances on average. All dialogues in
+        this dataset are divided into the train (4,575 dialogues), valid
+        (200 dialogues), and test sets (300 dialogues).
+
+        Published in the 2nd DialDoc Workshop on Document-grounded Dialogue
+        and Conversational Question Answering, 2022.
+
+        Reference:
+        Takashi Kodama, Ribeka Tanaka, and Sadao Kurohashi.
+        "Construction of Hierarchical Structured Knowledge-based Recommendation
+        Dialogue Dataset and Dialogue System."
+      DESCRIPTION
+
+      unless [:train, :valid, :test].include?(type)
+        raise ArgumentError, ":type must be one of [:train, :valid, :test]: #{type.inspect}"
+      end
+      @type = type
+    end
+
+    def each
+      return to_enum(__method__) unless block_given?
+
+      open_data do |json_data|
+        json_data.each do |dialogue_data|
+          yield parse_dialogue(dialogue_data)
+        end
+      end
+    end
+
+    private
+
+    def open_data
+      data_path = cache_dir_path + "#{@type}.json"
+      data_url = "https://raw.githubusercontent.com/ku-nlp/JMRD/main/data/#{@type}.json"
+      download(data_path, data_url)
+
+      # Read as UTF-8 explicitly: the data contains Japanese text and must
+      # not depend on the process's default external encoding.
+      json_data = JSON.parse(File.read(data_path, encoding: "UTF-8"))
+      yield json_data
+    end
+
+    def parse_dialogue(data)
+      Dialogue.new(
+        data["dialog_id"],
+        data["movie_title"],
+        data["first_speaker"],
+        parse_questionnaire(data["questionnaire"]),
+        parse_knowledge(data["knowledge"]),
+        parse_utterances(data["dialog"])
+      )
+    end
+
+    def parse_questionnaire(data)
+      return nil if data.nil?
+
+      Questionnaire.new(
+        parse_questionnaire_answers(data["recommender"]),
+        parse_questionnaire_answers(data["seeker"])
+      )
+    end
+
+    def parse_questionnaire_answers(data)
+      return nil if data.nil?
+
+      QuestionnaireAnswers.new(
+        data["Q1"],
+        data["Q2"],
+        data["Q3"],
+        data["Q4"],
+        data["Q5"]
+      )
+    end
+
+    def parse_knowledge(data)
+      return nil if data.nil?
+
+      Knowledge.new(
+        data["タイトル"],
+        data["製作年度"],
+        data["監督名"],
+        data["監督説明"],
+        data["キャスト名"],
+        data["キャスト説明"],
+        data["ジャンル"],
+        data["レビュー"],
+        data["あらすじ"]
+      )
+    end
+
+    def parse_utterances(data)
+      return [] if data.nil?
+
+      data.map do |utterance_data|
+        parse_utterance(utterance_data)
+      end
+    end
+
+    def parse_utterance(data)
+      checked_knowledge = nil
+      if data["checked_knowledge"]
+        checked_knowledge = data["checked_knowledge"].map do |ck|
+          CheckedKnowledge.new(ck["type"], ck["content"])
+        end
+      end
+
+      Utterance.new(
+        data["utterance_id"],
+        data["speaker"],
+        data["text"],
+        checked_knowledge
+      )
+    end
+  end
+end
diff --git a/lib/datasets/lazy.rb b/lib/datasets/lazy.rb
index 204a240b..27351b96 100644
--- a/lib/datasets/lazy.rb
+++ b/lib/datasets/lazy.rb
@@ -61,6 +61,7 @@ def const_missing(name)
   LAZY_LOADER.register(:HouseOfRepresentative, "datasets/house-of-representative")
   LAZY_LOADER.register(:Iris, "datasets/iris")
   LAZY_LOADER.register(:ITACorpus, "datasets/ita-corpus")
+  LAZY_LOADER.register(:JMRD, "datasets/jmrd")
   LAZY_LOADER.register(:KuzushijiMNIST, "datasets/kuzushiji-mnist")
   LAZY_LOADER.register(:LIBSVM, "datasets/libsvm")
   LAZY_LOADER.register(:LIBSVMDatasetList, "datasets/libsvm-dataset-list")
diff --git a/sig/datasets/jmrd.rbs b/sig/datasets/jmrd.rbs
new file mode 100644
index 00000000..22fb64b3
--- /dev/null
+++ b/sig/datasets/jmrd.rbs
@@ -0,0 +1,116 @@
+module Datasets
+  class JMRD < Dataset
+    class CheckedKnowledge < Struct[untyped]
+      attr_accessor type(): String?
+      attr_accessor content(): String?
+
+      def initialize: (?String? type, ?String? content) -> void
+    end
+
+    class Utterance < Struct[untyped]
+      attr_accessor utterance_id(): String?
+      attr_accessor speaker(): String?
+      attr_accessor text(): String?
+      attr_accessor checked_knowledge(): Array[CheckedKnowledge]?
+
+      def initialize: (
+        ?String? utterance_id,
+        ?String? speaker,
+        ?String? text,
+        ?Array[CheckedKnowledge]? checked_knowledge
+      ) -> void
+    end
+
+    class Knowledge < Struct[untyped]
+      attr_accessor title(): String?
+      attr_accessor year(): String?
+      attr_accessor director_name(): String?
+      attr_accessor director_description(): String?
+      attr_accessor cast_names(): Array[String]?
+      attr_accessor cast_descriptions(): Array[String]?
+      attr_accessor genres(): Array[String]?
+      attr_accessor reviews(): Array[String]?
+      attr_accessor synopsis(): Array[String]?
+
+      def initialize: (
+        ?String? title,
+        ?String? year,
+        ?String? director_name,
+        ?String? director_description,
+        ?Array[String]? cast_names,
+        ?Array[String]? cast_descriptions,
+        ?Array[String]? genres,
+        ?Array[String]? reviews,
+        ?Array[String]? synopsis
+      ) -> void
+    end
+
+    class QuestionnaireAnswers < Struct[untyped]
+      attr_accessor q1(): Integer?
+      attr_accessor q2(): Integer?
+      attr_accessor q3(): Integer?
+      attr_accessor q4(): Integer?
+      attr_accessor q5(): Integer?
+
+      def initialize: (
+        ?Integer? q1,
+        ?Integer? q2,
+        ?Integer? q3,
+        ?Integer? q4,
+        ?Integer? q5
+      ) -> void
+    end
+
+    class Questionnaire < Struct[untyped]
+      attr_accessor recommender(): QuestionnaireAnswers?
+      attr_accessor seeker(): QuestionnaireAnswers?
+
+      def initialize: (
+        ?QuestionnaireAnswers? recommender,
+        ?QuestionnaireAnswers? seeker
+      ) -> void
+    end
+
+    class Dialogue < Struct[untyped]
+      attr_accessor dialog_id(): String?
+      attr_accessor movie_title(): String?
+      attr_accessor first_speaker(): String?
+      attr_accessor questionnaire(): Questionnaire?
+      attr_accessor knowledge(): Knowledge?
+      attr_accessor utterances(): Array[Utterance]
+
+      def initialize: (
+        ?String? dialog_id,
+        ?String? movie_title,
+        ?String? first_speaker,
+        ?Questionnaire? questionnaire,
+        ?Knowledge? knowledge,
+        ?Array[Utterance] utterances
+      ) -> void
+    end
+
+    @type: Symbol
+    @metadata: Metadata
+
+    def initialize: (?type: Symbol) -> void
+
+    def each: () { (Dialogue) -> void } -> void
+            | () -> Enumerator[Dialogue, void]
+
+    private
+
+    def open_data: () { (Array[Hash[String, untyped]]) -> void } -> void
+
+    def parse_dialogue: (Hash[String, untyped] data) -> Dialogue
+
+    def parse_questionnaire: (Hash[String, untyped]? data) -> Questionnaire?
+
+    def parse_questionnaire_answers: (Hash[String, untyped]? data) -> QuestionnaireAnswers?
+
+    def parse_knowledge: (Hash[String, untyped]? data) -> Knowledge?
+
+    def parse_utterances: (Array[Hash[String, untyped]]? data) -> Array[Utterance]
+
+    def parse_utterance: (Hash[String, untyped] data) -> Utterance
+  end
+end
diff --git a/test/test-jmrd.rb b/test/test-jmrd.rb
new file mode 100644
index 00000000..0023636b
--- /dev/null
+++ b/test/test-jmrd.rb
@@ -0,0 +1,104 @@
+class JMRDTest < Test::Unit::TestCase
+  sub_test_case("type") do
+    test("train") do
+      dataset = Datasets::JMRD.new(type: :train)
+      dialogues = dataset.to_a
+
+      assert_equal(4575, dialogues.size)
+
+      first_dialogue = dialogues[0]
+      assert_equal("01884", first_dialogue.dialog_id)
+      assert_equal("時をかける少女", first_dialogue.movie_title)
+      assert_equal("recommender", first_dialogue.first_speaker)
+
+      # Check questionnaire
+      assert_not_nil(first_dialogue.questionnaire)
+      assert_equal(5, first_dialogue.questionnaire.recommender.q1)
+      assert_equal(4, first_dialogue.questionnaire.seeker.q1)
+
+      # Check knowledge
+      assert_not_nil(first_dialogue.knowledge)
+      assert_equal("時をかける少女", first_dialogue.knowledge.title)
+      assert_equal("2006年", first_dialogue.knowledge.year)
+      assert_equal("細田守", first_dialogue.knowledge.director_name)
+
+      # Check utterances
+      assert_equal(26, first_dialogue.utterances.size)
+      assert_equal("01884_00", first_dialogue.utterances[0].utterance_id)
+      assert_equal("recommender", first_dialogue.utterances[0].speaker)
+      assert_equal("こんにちは", first_dialogue.utterances[0].text)
+      assert_not_nil(first_dialogue.utterances[0].checked_knowledge)
+      assert_equal(1, first_dialogue.utterances[0].checked_knowledge.size)
+      assert_equal("[知識なし]", first_dialogue.utterances[0].checked_knowledge[0].type)
+    end
+
+    test("valid") do
+      dataset = Datasets::JMRD.new(type: :valid)
+      dialogues = dataset.to_a
+
+      assert_equal(200, dialogues.size)
+
+      first_dialogue = dialogues[0]
+      assert_not_nil(first_dialogue.dialog_id)
+      assert_not_nil(first_dialogue.movie_title)
+      assert(["recommender", "seeker"].include?(first_dialogue.first_speaker))
+    end
+
+    test("test") do
+      dataset = Datasets::JMRD.new(type: :test)
+      dialogues = dataset.to_a
+
+      assert_equal(300, dialogues.size)
+
+      first_dialogue = dialogues[0]
+      assert_not_nil(first_dialogue.dialog_id)
+      assert_not_nil(first_dialogue.movie_title)
+      assert(["recommender", "seeker"].include?(first_dialogue.first_speaker))
+    end
+
+    test("invalid") do
+      message = ":type must be one of [:train, :valid, :test]: :invalid"
+      assert_raise(ArgumentError.new(message)) do
+        Datasets::JMRD.new(type: :invalid)
+      end
+    end
+  end
+
+  sub_test_case("#metadata") do
+    test("#id") do
+      dataset = Datasets::JMRD.new(type: :train)
+      assert_equal("jmrd", dataset.metadata.id)
+    end
+
+    test("#name") do
+      dataset = Datasets::JMRD.new(type: :train)
+      assert_equal("Japanese Movie Recommendation Dialogue Dataset (JMRD)",
+                   dataset.metadata.name)
+    end
+
+    test("#url") do
+      dataset = Datasets::JMRD.new(type: :train)
+      assert_equal("https://github.com/ku-nlp/JMRD", dataset.metadata.url)
+    end
+
+    test("#licenses") do
+      dataset = Datasets::JMRD.new(type: :train)
+      assert_equal([Datasets::License.new("CC-BY-SA-4.0")],
+                   dataset.metadata.licenses)
+    end
+
+    test("#description") do
+      dataset = Datasets::JMRD.new(type: :train)
+      description = dataset.metadata.description
+      assert do
+        description.include?("Japanese Movie Recommendation Dialogue Dataset")
+      end
+      assert do
+        description.include?("5,000 dialogues")
+      end
+      assert do
+        description.include?("knowledge-grounded")
+      end
+    end
+  end
+end