-
Notifications
You must be signed in to change notification settings - Fork 27
Add support for JMRD #256
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Add support for JMRD #256
Changes from all commits
e6ae201
27913d3
a4f72fa
762411f
a0742a6
9a59b2c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,36 @@ | ||
#!/usr/bin/env ruby

require 'datasets'

# Example: print the first dialogue of the JMRD training split, including
# the movie knowledge attached to it and the knowledge types referenced by
# each utterance.
jmrd = Datasets::JMRD.new(type: :train)

jmrd.each do |dialogue|
  puts "=" * 80
  puts "Dialogue ID: #{dialogue.dialog_id}"
  puts "Movie: #{dialogue.movie_title}"
  puts "First Speaker: #{dialogue.first_speaker}"
  puts

  # The knowledge record may be nil, so only print it when present.
  knowledge = dialogue.knowledge
  if knowledge
    puts "Knowledge:"
    puts " Title: #{knowledge.title}"
    puts " Year: #{knowledge.year}"
    puts " Director: #{knowledge.director_name}"
    puts " Genres: #{knowledge.genres.join(', ')}" if knowledge.genres
    puts
  end

  puts "Dialogue:"
  dialogue.utterances.each do |utterance|
    # Anything that is not the recommender is shown as the seeker ([S]).
    speaker_label = utterance.speaker == "recommender" ? "[R]" : "[S]"
    puts " #{speaker_label} #{utterance.text}"

    # checked_knowledge can be nil or an empty list; skip the line in both cases.
    checked_knowledge = utterance.checked_knowledge
    if checked_knowledge && !checked_knowledge.empty?
      knowledge_types = checked_knowledge.map(&:type).join(", ")
      puts " (knowledge: #{knowledge_types})"
    end
  end

  # Show only first dialogue as example
  break
end
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,178 @@ | ||||||
| require "json" | ||||||
|
|
||||||
| require_relative "dataset" | ||||||
|
|
||||||
module Datasets
  # Reader for the Japanese Movie Recommendation Dialogue Dataset (JMRD).
  #
  # Each split (:train, :valid or :test) is a JSON file; #each yields one
  # Dialogue struct per entry of that file.
  class JMRD < Dataset
    # One recommendation dialogue with its questionnaire, movie knowledge
    # and list of utterances.
    Dialogue = Struct.new(
      :dialog_id,
      :movie_title,
      :first_speaker,
      :questionnaire,
      :knowledge,
      :utterances
    )

    # Questionnaire answers of both participants (QuestionnaireAnswers or nil).
    Questionnaire = Struct.new(
      :recommender,
      :seeker
    )

    # Answers to the "Q1".."Q5" entries of one participant's questionnaire.
    QuestionnaireAnswers = Struct.new(
      :q1,
      :q2,
      :q3,
      :q4,
      :q5
    )

    # Movie information attached to a dialogue. The members are filled from
    # the Japanese keys of the JSON "knowledge" object (see #parse_knowledge).
    Knowledge = Struct.new(
      :title,
      :year,
      :director_name,
      :director_description,
      :cast_names,
      :cast_descriptions,
      :genres,
      :reviews,
      :synopsis
    )

    # A single turn of a dialogue.
    Utterance = Struct.new(
      :utterance_id,
      :speaker,
      :text,
      :checked_knowledge
    )

    # One piece of knowledge referenced by an utterance.
    CheckedKnowledge = Struct.new(
      :type,
      :content
    )

    # @param type [Symbol] which split to read: :train, :valid or :test.
    # @raise [ArgumentError] if type is not one of the supported splits.
    def initialize(type: :train)
      # Validation does not touch @metadata, so reject a bad argument before
      # doing any of the setup below.
      unless [:train, :valid, :test].include?(type)
        raise ArgumentError, ":type must be one of [:train, :valid, :test]: #{type.inspect}"
      end

      super()
      @metadata.id = "jmrd"
      @metadata.name = "Japanese Movie Recommendation Dialogue Dataset (JMRD)"
      @metadata.url = "https://github.com/ku-nlp/JMRD"
      @metadata.licenses = ["CC-BY-SA-4.0"]
      @metadata.description = <<~DESCRIPTION
        JMRD (Japanese Movie Recommendation Dialogue Dataset) is a Japanese
        knowledge-grounded dialogue dataset consisting of annotated movie
        recommendation dialogues between humans. Every recommender's utterance
        is associated with movie information as external knowledge.

        The dataset consists of about 5,000 dialogues between crowdworkers,
        each of which consists of 23 utterances on average. All dialogues in
        this dataset are divided into the train (4,575 dialogues), valid
        (200 dialogues), and test sets (300 dialogues).

        Published in the 2nd DialDoc Workshop on Document-grounded Dialogue
        and Conversational Question Answering, 2022.

        Reference:
        Takashi Kodama, Ribeka Tanaka, and Sadao Kurohashi.
        "Construction of Hierarchical Structured Knowledge-based Recommendation
        Dialogue Dataset and Dialogue System."
      DESCRIPTION

      @type = type
    end

    # Yields every dialogue of the selected split as a Dialogue struct.
    #
    # @yieldparam dialogue [Dialogue]
    # @return [Enumerator] when no block is given.
    def each
      return to_enum(__method__) unless block_given?

      open_data do |json_data|
        json_data.each do |dialogue_data|
          yield parse_dialogue(dialogue_data)
        end
      end
    end

    private

    # Fetches the JSON file of the selected split via #download, parses it
    # and yields the parsed document.
    def open_data
      data_path = cache_dir_path + "#{@type}.json"
      data_url = "https://raw.githubusercontent.com/ku-nlp/JMRD/main/data/#{@type}.json"
      download(data_path, data_url)

      # The data contains Japanese text; read it as UTF-8 explicitly so the
      # result does not depend on the process's default external encoding.
      json_data = JSON.parse(File.read(data_path, encoding: "UTF-8"))
      yield json_data
    end

    # Builds a Dialogue from one entry of the JSON document.
    def parse_dialogue(data)
      Dialogue.new(
        data["dialog_id"],
        data["movie_title"],
        data["first_speaker"],
        parse_questionnaire(data["questionnaire"]),
        parse_knowledge(data["knowledge"]),
        parse_utterances(data["dialog"])
      )
    end

    # Returns a Questionnaire, or nil when the entry has none.
    def parse_questionnaire(data)
      return if data.nil?

      Questionnaire.new(
        parse_questionnaire_answers(data["recommender"]),
        parse_questionnaire_answers(data["seeker"])
      )
    end

    # Returns the Q1..Q5 answers of one participant, or nil when absent.
    def parse_questionnaire_answers(data)
      return if data.nil?

      QuestionnaireAnswers.new(
        data["Q1"],
        data["Q2"],
        data["Q3"],
        data["Q4"],
        data["Q5"]
      )
    end

    # Maps the Japanese keys of the "knowledge" object onto Knowledge members.
    # Returns nil when the entry has no knowledge.
    def parse_knowledge(data)
      return if data.nil?

      Knowledge.new(
        data["タイトル"],
        data["製作年度"],
        data["監督名"],
        data["監督説明"],
        data["キャスト名"],
        data["キャスト説明"],
        data["ジャンル"],
        data["レビュー"],
        data["あらすじ"]
      )
    end

    # Returns the list of Utterance structs; an absent list becomes [].
    def parse_utterances(data)
      return [] if data.nil?

      data.map { |utterance_data| parse_utterance(utterance_data) }
    end

    # Builds one Utterance. checked_knowledge stays nil when the key is
    # missing, otherwise it becomes an array of CheckedKnowledge structs.
    def parse_utterance(data)
      checked_knowledge = data["checked_knowledge"]&.map do |ck|
        CheckedKnowledge.new(ck["type"], ck["content"])
      end

      Utterance.new(
        data["utterance_id"],
        data["speaker"],
        data["text"],
        checked_knowledge
      )
    end
  end
end
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,116 @@ | ||
# Type signatures for Datasets::JMRD and the Struct classes it yields.
module Datasets
  class JMRD < Dataset
    # One piece of knowledge referenced by an utterance.
    class CheckedKnowledge < Struct[untyped]
      attr_accessor type(): String?
      attr_accessor content(): String?

      def initialize: (?String? type, ?String? content) -> void
    end

    # A single turn of a dialogue.
    class Utterance < Struct[untyped]
      attr_accessor utterance_id(): String?
      attr_accessor speaker(): String?
      attr_accessor text(): String?
      attr_accessor checked_knowledge(): Array[CheckedKnowledge]?

      def initialize: (
        ?String? utterance_id,
        ?String? speaker,
        ?String? text,
        ?Array[CheckedKnowledge]? checked_knowledge
      ) -> void
    end

    # Movie information attached to a dialogue.
    class Knowledge < Struct[untyped]
      attr_accessor title(): String?
      attr_accessor year(): String?
      attr_accessor director_name(): String?
      attr_accessor director_description(): String?
      attr_accessor cast_names(): Array[String]?
      attr_accessor cast_descriptions(): Array[String]?
      attr_accessor genres(): Array[String]?
      attr_accessor reviews(): Array[String]?
      attr_accessor synopsis(): Array[String]?

      def initialize: (
        ?String? title,
        ?String? year,
        ?String? director_name,
        ?String? director_description,
        ?Array[String]? cast_names,
        ?Array[String]? cast_descriptions,
        ?Array[String]? genres,
        ?Array[String]? reviews,
        ?Array[String]? synopsis
      ) -> void
    end

    # Answers to the five questionnaire items of one participant.
    class QuestionnaireAnswers < Struct[untyped]
      attr_accessor q1(): Integer?
      attr_accessor q2(): Integer?
      attr_accessor q3(): Integer?
      attr_accessor q4(): Integer?
      attr_accessor q5(): Integer?

      def initialize: (
        ?Integer? q1,
        ?Integer? q2,
        ?Integer? q3,
        ?Integer? q4,
        ?Integer? q5
      ) -> void
    end

    # Questionnaire answers of both participants.
    class Questionnaire < Struct[untyped]
      attr_accessor recommender(): QuestionnaireAnswers?
      attr_accessor seeker(): QuestionnaireAnswers?

      def initialize: (
        ?QuestionnaireAnswers? recommender,
        ?QuestionnaireAnswers? seeker
      ) -> void
    end

    # One recommendation dialogue as yielded by JMRD#each.
    class Dialogue < Struct[untyped]
      attr_accessor dialog_id(): String?
      attr_accessor movie_title(): String?
      attr_accessor first_speaker(): String?
      attr_accessor questionnaire(): Questionnaire?
      attr_accessor knowledge(): Knowledge?
      attr_accessor utterances(): Array[Utterance]

      def initialize: (
        ?String? dialog_id,
        ?String? movie_title,
        ?String? first_speaker,
        ?Questionnaire? questionnaire,
        ?Knowledge? knowledge,
        ?Array[Utterance] utterances
      ) -> void
    end

    @type: Symbol
    @metadata: Metadata

    def initialize: (?type: Symbol) -> void

    # With a block: yields each Dialogue. Without: returns an Enumerator.
    def each: () { (Dialogue) -> void } -> void
            | () -> Enumerator[Dialogue, void]

    private

    def open_data: () { (Array[Hash[String, untyped]]) -> void } -> void

    def parse_dialogue: (Hash[String, untyped] data) -> Dialogue

    def parse_questionnaire: (Hash[String, untyped]? data) -> Questionnaire?

    def parse_questionnaire_answers: (Hash[String, untyped]? data) -> QuestionnaireAnswers?

    def parse_knowledge: (Hash[String, untyped]? data) -> Knowledge?

    def parse_utterances: (Array[Hash[String, untyped]]? data) -> Array[Utterance]

    def parse_utterance: (Hash[String, untyped] data) -> Utterance
  end
end
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can simplify this: