Preview the data by searching by keyword below:
+ + + { if @state.fetching_keyword +No matches yet for "{@state.searched_query.keyword}"
+ + else if @state.results.length > 0 +Found {@state.results[0].getMeta('total')} matches
+ +Sorry, but public data exports are not enabled for this project yet.
+Participants have made {@state.project.classification_count.toLocaleString()} contributions to {@state.project.title} to date. This project periodically builds a merged, anonymized dump of that data, which is made public here.
+ } + + { @renderSearch() } + +Sorry, but public data exports are not enabled for this project yet.
+Participants have made {@state.project.classification_count.toLocaleString()} contributions to {@state.project.title} to date. This project periodically builds a merged, anonymized dump of that data, which is made public here.
+ + + +For help interpreting the data, see Scribe WIKI on Data Exports.
+ +To browse past releases and/or to be notified when new releases are made, you may wish to subscribe to the ATOM Feed of Data Releases
+ +These data points represent numerous individual classifications that have been merged and lightly cleaned up to adhere to {@state.project.title}'s data model.
+ + { for field,i in @state.set.export_document.export_fields + if field.assertion_ids + assertion = subject = null + for s in @state.set.subjects + for a in s.assertions + if field.assertion_ids.indexOf(a.id) >= 0 + assertion = a + subject = s + if assertion && subject +These data points represent all distinct assertions made upon this {@props.project.term('subject set')} - without cleanup. Each assertion may represent several distinct contributions.
+This metadata was imported alongside the source images at the beginning of the project and may include high res source URIs and processing details.
+ ++ Currently, there are no {@props.project.term('subject')}s for you to {@props.workflowName}. + Try {next_workflow.name.capitalize()} instead! +
+ + else +There's nothing more to transcribe in {@props.project.title}!! 🎉 🎉 🎉 +
+Thank you to all the amazing volunteers who worked on this project.
+ + { if @props.project.downloadable_data +The {@props.project.root_subjects_count.toLocaleString()} records can be explored via the Data tab.
+ } +Should the public be able to download the latest from /data/latest and subscribe to the data updates ATOM feed?
+ + + +<% end %> diff --git a/app/views/final_data_exports/index.atom.builder b/app/views/final_data_exports/index.atom.builder new file mode 100644 index 000000000..2b22ca158 --- /dev/null +++ b/app/views/final_data_exports/index.atom.builder @@ -0,0 +1,11 @@ +atom_feed do |feed| + + feed.title("#{Project.current.title} Data Exports") + feed.updated(@exports[0].created_at) if @exports.length > 0 + + @exports.each do |export| + feed.entry(export) do |entry| + entry.title("#{export.updated_at.strftime('%c')}: #{export.num_final_subject_sets} subjects") + end + end +end diff --git a/config/initializers/register_project_static_routes.rb b/config/initializers/register_project_static_routes.rb index 05cb4a14b..9c61b0ad3 100644 --- a/config/initializers/register_project_static_routes.rb +++ b/config/initializers/register_project_static_routes.rb @@ -1,7 +1,11 @@ API::Application.configure do - if Project.current - project_assets_path = "./project/#{Project.current.key}/assets" - puts "Routing static assets from #{project_assets_path}" - Rails.application.config.middleware.insert_after ActionDispatch::Static, ActionDispatch::Static, project_assets_path + begin + if Project.current + project_assets_path = "./project/#{Project.current.key}/assets" + puts "Routing static assets from #{project_assets_path}" + Rails.application.config.middleware.insert_after ActionDispatch::Static, ActionDispatch::Static, project_assets_path + end + rescue + puts "FAILED to register static routing" end end diff --git a/config/routes.rb b/config/routes.rb index b6b9f7781..eb8162aae 100644 --- a/config/routes.rb +++ b/config/routes.rb @@ -8,6 +8,7 @@ get '/projects', to: 'projects#index', defaults: { format: 'json' } + get '/projects/current', to: 'projects#current', defaults: { format: 'json' } get '/workflows', to: 'workflow#index', defaults: { format: 'json' } get '/workflows/:id', to: 'workflow#show', defaults: { format: 'json' } @@ -35,10 +36,16 @@ resources :groups, only: [:show, :index], 
:defaults => { :format => 'json' } + # Final data: + resources :final_subject_sets, only: [:show, :index], :defaults => { :format => 'json' } + get '/data/latest', to: 'final_data_exports#latest' + resources :final_data_exports, only: [:show, :index], path: "/data" + namespace :admin do resources :subject_sets, :subjects, :classifications, :users get 'dashboard' => 'dashboard#index' get 'data' => 'data#index' + post 'data' => 'data#index' get 'data/download' => 'data#download' get 'signin' => 'auth#signin' post 'stats/recalculate' => 'dashboard#recalculate_stats' diff --git a/lib/tasks/bot.rake b/lib/tasks/bot.rake new file mode 100644 index 000000000..e52cb053e --- /dev/null +++ b/lib/tasks/bot.rake @@ -0,0 +1,46 @@ +namespace :bot do + + desc "Create Bot with name, printing out token to use in HTTP_BOT_AUTH" + task :create, [:name] => :environment do |task, args| + args.with_defaults name: 'ScribeBot' + + ret = BotUser.create args[:name] + + if ! ret[:token].blank? + puts "Created #{ret[:user].name}. Use HTTP header to authenticate:" + puts " #{BotUser::AUTH_HEADER}=#{BotUser::pack_auth_header(ret[:user].id, ret[:token])}" + else + puts "#{ret[:user].name} already exists, so token can not be read but may be reset. Use bot:reset to reset token." + end + end + + desc "Reset Bot token with name, printing out token to use in HTTP_ROBOT_AUTH" + task :reset, [:name] => :environment do |task, args| + args.with_defaults name: 'ScribeBot' + + user = BotUser.find_by name: args[:name] + token = user.reset_token! + + if token + puts "Reset #{user.name}. Use HTTP header to authenticate:" + puts " #{BotUser::AUTH_HEADER}=#{BotUser::pack_auth_header(user.id, token)}" + end + end + + desc "Delete Bot by name" + task :delete, [:name] => :environment do |task, args| + if args[:name].blank? + puts "No name given. Aborting." 
+ exit + end + + user = BotUser.find_by name: args[:name] + if user + user.destroy + puts "Removed #{user.name}" + else + puts "Bot user #{args[:name]} could not be found" + end + end + +end diff --git a/lib/tasks/project.rake b/lib/tasks/project.rake index 6bf1f6ed7..faeb4898a 100644 --- a/lib/tasks/project.rake +++ b/lib/tasks/project.rake @@ -126,70 +126,34 @@ namespace :project do # load project_file_path project = Project.find_or_create_by key: project_key - # Establish some defaults so that if they're not set in the project hash, we overwrite the old value with the null default - project_defaults = { - background: nil, - logo: nil, - favicon: nil, - terms_map: {}, - team_emails: [], - team: [], - organizations: [], - analytics: nil, - forum: nil, - menus: {}, - partials: {} - } + load_export_specs(project, project_hash['export_specs']) if project_hash['export_specs'] + # Set all valid fields from hash: - project_hash = project_hash.inject(project_defaults) { |h, (k,v)| h[k] = v if Project.fields.keys.include?(k.to_s); h } + project_hash = project_hash.inject({}) { |h, (k,v)| h[k] = v if Project.fields.keys.include?(k.to_s); h } project.update project_hash - puts "Created project: #{project.title}" - # Load pages from content/*: content_path = Rails.root.join('project', project_key, 'content') puts "Loading pages from #{content_path}:" - prev_pages = project.pages project.pages = [] - Dir.foreach(content_path).each do |file| - path = Rails.root.join content_path, file - next if File.directory? path - next if ! ['.html','.erb','.md'].include? path.extname - ext = path.extname - page_key = file.split('.').first - name = page_key.capitalize - content = File.read path - - puts " Loading page: \"#{name}\" (#{content.size}b)" - if page_key == 'home' - project.home_page_content = content - + # Dir.foreach(content_path).each do |file| + # path = Rails.root.join content_path, file + # next if File.directory? path + # next if ! ['.html','.erb','.md'].include? 
path.extname + + # Load legacy pages from content folder directly: + Dir.glob("#{content_path}/*.{erb,html,md}").each do |path| + load_page project, path + end + + # Also load anything inside content/pages: + Dir.glob("#{content_path}/pages/*").each do |path| + if File.directory?(path) + load_page_group project, path else - # Set updated at if content changed: - updated_at = Time.now - if ! prev_pages.nil? && ! prev_pages.empty? - previous_page = prev_pages.select { |p| p[:key] == page_key } - if ! previous_page.empty? && (previous_page = previous_page.first) - updated_at = ! previous_page[:updated_at].nil? && previous_page[:content] == content ? previous_page[:updated_at] : Time.now - end - end - - # Check if we should include group browser content - group_match = //.match(content) - group_browser = '' - if group_match && !group_match.captures.empty? - group_browser = group_match.captures[0] - end - - project.pages << { - key: page_key, - name: name, - content: content, - updated_at: updated_at, - group_browser: group_browser - } + load_page project, path end end @@ -216,6 +180,72 @@ namespace :project do project end + def load_page_group(project, path) + base_key = File.basename path + + nav_content = nil + nav_path = File.join(path, "_nav.md") + if File.exist?(nav_path) + nav_content = File.read nav_path + puts "got nav: #{nav_content}" + end + + Dir.glob("#{path}/*.{erb,html,md}").each do |path| + load_page project, path, {base_key: base_key, nav: nav_content} unless File.basename(path).match(/^_/) + end + end + + def load_page(project, path, options = {}) + filename = File.basename path + + page_key = filename.split('.').first + name = page_key.capitalize + name = "#{options[:base_key].capitalize} | #{name}" if options[:base_key] + content = File.read path + + if page_key == 'home' + project.home_page_content = content + + else + # Set updated at if content changed: + updated_at = Time.now + if ! project.pages.nil? && ! project.pages.empty? 
+ previous_page = project.pages.select { |p| p[:key] == page_key } + if ! previous_page.empty? && (previous_page = previous_page.first) + updated_at = ! previous_page[:updated_at].nil? && previous_page[:content] == content ? previous_page[:updated_at] : Time.now + end + end + + # PB 20160219 deprecating this cause doesn't appear in use + # Check if we should include group browser content + # group_match = //.match(content) + # group_browser = '' + # if group_match && !group_match.captures.empty? + # group_browser = group_match.captures[0] + # end + + # Place page nav in special page_navs hash by base key: + project.page_navs = {} if options[:nav] + project.page_navs[options[:base_key]] = options[:nav] if options[:nav] + + project.pages << { + key: ( options[:base_key].nil? ? '' : "#{options[:base_key]}/" ) + page_key, + name: name, + content: content, + updated_at: updated_at + # group_browser: group_browser + } + end + puts " Loaded page: \"#{options[:base_key]}/#{name}\" (#{content.size}b)" + + end + + def load_export_specs(project, config) + project.export_document_specs = config.map do |h| + Export::Spec::Document.from_hash h, project + end + end + def load_styles(project) load_images(project.key) @@ -400,8 +430,148 @@ namespace :project do end + desc "Build final_subject* data in database" + task :build_final_data, [:project_key, :rebuild, :start, :limit] => :environment do |task, args| + args.with_defaults rebuild: true, start: 0, limit: Float::INFINITY + rebuild = args[:rebuild] != 'false' + start = args[:start].to_i + limit = args[:limit].to_f + + project = project_by_key args[:project_key] + + start_time = Time.now + count = project.subject_sets.count + last_index = [count, start + limit - 1].min + step = [100, limit].min + built = 0 + + # puts "set: #{SubjectSet.find("5637a11432623300030a0100").inspect}" + # FinalSubjectSet.assert_for_set SubjectSet.find("56b115677061755afb539701"), rebuild + # FinalSubjectSet.assert_for_set 
FinalSubjectSet.find('56b118e07061755afbfcd801').subject_set, rebuild + # exit + + # Do any of this project's workflow tasks have configured export_names? If not, warn: + has_export_names = ! project.workflows.map { |w| w.tasks }.flatten.select { |t| ! t.export_name.blank? }.empty? + puts "WARNING: No export_names found in workflow configuration. This may make it tricky to interpret the field-level data. See `export_name` documentation in https://github.com/zooniverse/scribeAPI/wiki/Project-Workflows#tasks" if ! has_export_names + + if project.export_document_specs.blank? + puts "No export_spec configured; Add one before building" + exit + end + + # Rebuild indexes + FinalSubjectSet.rebuild_indexes Project.current + + (start..last_index).step(step).each do |offset| + sets = project.subject_sets.offset(offset).limit(step).each_with_index do |set, i| + + final_set = FinalSubjectSet.assert_for_set set, rebuild + built += 1 + + ellapsed = Time.now - start_time + per_set = ellapsed / built + remaining = per_set * (count - (offset + i+1)) / 60 / 60 + complete = (offset + i+1).to_f / count * 100 + $stderr.print "\r#{'%.8f' % complete}% complete. #{'%.1f' % remaining}h remaining. Built item #{offset +i+1} of #{count}" + end + end + + end + + desc "Using data in final_subject* collections, generate a series of JSON exports and attempt to create a downloadable ZIP" + task :export_final_data, [:project_key] => :environment do |task, args| + project = project_by_key args[:project_key] + # Make sure user has run build_final_data first: + if project.final_subject_sets.empty? + puts "No FinalSubjectSets found." + exit + end + + missing_env_keys = ['S3_EXPORT_BUCKET','S3_EXPORT_PATH','AWS_REGION','AWS_ACCESS_KEY_ID','AWS_SECRET_ACCESS_KEY'].select { |k| ENV[k].nil? } + if ! missing_env_keys.empty? 
+ puts "Can not export data without setting #{missing_env_keys.join ", "}" + exit + end + s3client = Aws::S3::Client.new + + local_export_base = "#{Rails.root}/tmp/export/#{project.key}" + + # Remove previous: + # `rm -rf #{local_export_base}` if File.exists?(local_export_base) + + FileUtils.mkdir_p(local_export_base) unless File.exists?(local_export_base) + start = Time.now + built = 0 + limit = 100 + count = FinalSubjectSet.count + + (0..count).step(limit).each do |offset| + project.final_subject_sets.offset(offset).limit(limit).each_with_index do |set, i| + path = "#{local_export_base}/#{set.subject_set_id}.json" + content = FinalSubjectSetSerializer.new(set, root:false).to_json + File.open path, "w" do |f| + f << content + end + built += 1 + + # puts "Wrote #{i+1} of #{count}: #{content.size}b to #{path}" + ellapsed = Time.now - start + per_set = ellapsed / built + remaining = per_set * (count - (offset + i+1)) / 60 + complete = (offset + i+1).to_f / count * 100 + $stderr.print "\r#{'%.8f' % complete}% complete. #{'%.1f' % remaining}m remaining. 
Built #{offset +i+1} of #{count}" + end + end + + # Generate timestamped filename with random suffix so it can't be guessed: + rand_suffix = (('a'..'z').to_a + (0..9).to_a).shuffle[0,16].join + max_updated = project.final_subject_sets.max(:updated_at) + filename = "scribe-#{project.key}-#{max_updated.strftime("%F")}-#{rand_suffix}.tar.gz" + + # Zip it up + Rails.logger.info "Rake Complete, Begin GZIP, Go to S3" + sh %{cd #{local_export_base}; tar cfvz #{filename} --exclude '*.gz' .;} + Rails.logger.info "Tar-ing Complete" + + # Upload it to S3 + s3client = Aws::S3::Client.new + local_path = "#{local_export_base}/#{filename}" + remote_path = "#{ENV['S3_EXPORT_PATH']}/#{filename}" + + Rails.logger.info "Uploading #{local_path} to #{ENV['S3_EXPORT_BUCKET']}#{remote_path}" + s3client.put_object({ + acl: 'public-read', + bucket: ENV['S3_EXPORT_BUCKET'], + key: remote_path, + body: File.read(local_path) + }) + + # Remove local temp files + sh %{rm -rf #{local_export_base};} + + # Create the final-data-export record so it appears on /#/data/exports + s3_url = "http://#{ENV['S3_EXPORT_BUCKET']}/#{remote_path}" + FinalDataExport.create path: s3_url, num_final_subject_sets: count, project: project + + puts "Finished building exports. Download at: #{s3_url}" + + end + + desc "Convenience method that, in one call, builds all data JSONs and zips them up into a single ZIP release" + task :build_and_export_final_data, [:project_key, :rebuild, :ensure_day_of_week_is] => :environment do |task, args| + # If ensure_day_of_week_is given, proceed with execution only if weekday matches value + # (Important for heroku scheduler, which can schedule daily but not weekly) + if ! args[:ensure_day_of_week_is].blank? 
+ if Date.today.strftime("%A").downcase != args[:ensure_day_of_week_is].downcase + puts "Aborting because today is not #{args[:ensure_day_of_week_is]}" + exit + end + end + Rake::Task['project:build_final_data'].invoke(args[:project_key], args[:rebuild]) + Rake::Task['project:export_final_data'].invoke(args[:project_key]) + end def translate_pick_one_tool_config(task_hash) config = task_hash[:tool_config] || {} @@ -487,5 +657,10 @@ namespace :project do end end + def project_by_key(key, default=Project.current) + p = Project.find_by key: key + p = default if ! p + p + end end diff --git a/package.json b/package.json index e08165df2..060c3cebf 100644 --- a/package.json +++ b/package.json @@ -15,6 +15,7 @@ "coffee-reactify": "^4.0.0", "coffee-script": "^1.9.3", "json-api-client": "^0.4.4", + "marked": "^0.3.5", "normalize-styl": "^3.0.3", "normalize.css": "^3.0.3", "react": "^0.13.3", diff --git a/project/emigrant/assets/css/styles.css b/project/emigrant/assets/css/styles.css index 44c2582c3..597f47159 100644 --- a/project/emigrant/assets/css/styles.css +++ b/project/emigrant/assets/css/styles.css @@ -98,6 +98,7 @@ html, body, width: 960px; margin: 20px auto; box-sizing: border-box; + padding: 0px; } @media screen and (max-width: 999px) { .page-content { @@ -111,7 +112,7 @@ html, body, background: rgba(255,255,255,0.85); padding: 20px 40px; } -.page-content.custom-page > div:nth-child(2) > *:first-child { +.page-content.custom-page h2 { color: #fff; background: #3f5765; margin-top: -20px; @@ -120,7 +121,21 @@ html, body, padding: 40px 0; border-top-left-radius: 6px; border-top-right-radius: 6px; + text-align: center; +} + +.page-content.custom-page .with-nav { + padding-left: 20px; } + +.page-content.custom-page .with-nav h2 { + background: transparent; + color: #3f5765; + text-align: left; + margin: 0; + padding: 10px 0; +} + .page-content > div.updated-at { border-top-left-radius: 0; border-top-right-radius: 0; @@ -137,7 +152,6 @@ html, body, .page-content h2 { 
font-size: 36px; font-weight: 400; - text-align: center; } .page-content h3 { font-size: 28px; diff --git a/project/emigrant/assets/images/pageuri.png b/project/emigrant/assets/images/pageuri.png new file mode 100644 index 000000000..fa75f2334 Binary files /dev/null and b/project/emigrant/assets/images/pageuri.png differ diff --git a/project/emigrant/assets/images/viewrecorddc.mp4 b/project/emigrant/assets/images/viewrecorddc.mp4 new file mode 100644 index 000000000..bb252a545 Binary files /dev/null and b/project/emigrant/assets/images/viewrecorddc.mp4 differ diff --git a/project/emigrant/bot-example.rb b/project/emigrant/bot-example.rb new file mode 100644 index 000000000..dcdd14512 --- /dev/null +++ b/project/emigrant/bot-example.rb @@ -0,0 +1,151 @@ + +require 'open-uri' +require 'json' +require 'cgi' + +# Useful extension to Hash to create query strings: +class Hash + def to_params + params = '' + stack = [] + + each do |k, v| + if v.is_a?(Hash) + stack << [k,v] + elsif v.is_a?(Array) + stack << [k,Hash.from_array(v)] + else + params << "#{k}=#{v}&" + end + end + + stack.each do |parent, hash| + hash.each do |k, v| + if v.is_a?(Hash) + stack << ["#{parent}[#{k}]", v] + else + params << "#{parent}[#{k}]=#{v}&" + end + end + end + + params.chop! 
+ params + end + + def self.from_array(array = []) + h = Hash.new + array.size.times do |t| + h[t] = array[t] + end + h + end + +end + +# Example Scribe bot class: +class ScribeBot + + def initialize(scribe_endpoint) + @classifications_endpoint = scribe_endpoint + end + + # Post classification for a known subject_id + def classify_subject_by_id(subject_id, workflow_name, task_key, data) + params = { + workflow: { + name: workflow_name + }, + classifications: { + annotation: data, + task_key: task_key, + subject_id: subject_id + } + } + + submit_classification params + end + + # Post classification for subject specified by URL: + def classify_subject_by_url(subject_url, workflow_name, task_key, data) + params = { + subject: { + location: { + standard: CGI::escape(subject_url) + } + }, + workflow: { + name: workflow_name + }, + classifications: { + annotation: data, + task_key: task_key + } + } + + submit_classification params + end + + # Posts params as-is to classifications endpoint: + def submit_classification(params) + + require 'uri' + require "net/http" + + uri = URI(@classifications_endpoint) + + req = Net::HTTP::Post.new(uri.path, {'BOT_AUTH' => ENV['SCRIBE_BOT_TOKEN']}) + req.body = params.to_params + http = Net::HTTP.new(uri.host, uri.port) + + response = http.start {|http| http.request(req) } + + begin + JSON.parse response.body + rescue + nil + end + end +end + +# This simple script demonstrates use of the Scribe Classifications endpoint to generate data +# +# Useage: +# ruby bot-example.rb [-scribe-endpoint="http://localhost:3000"] +# + +options = Hash[ ARGV.join(' ').scan(/--?([^=\s]+)(?:=(\S+))?/) ] +options["scribe-endpoint"] = "http://localhost:3000/classifications" if ! options["scribe-endpoint"] + +args = ARGV.select { |a| ! 
a.match /^-/ } + +bot = ScribeBot.new options["scribe-endpoint"] + +# The following generates generates two classfiications: One mark classification +# and one transcription classification (applied to the subject generated by the +# mark classification). + +# Specify subject by standard URL (since this is a bot classification, it will be created automatically if it doesn't exist) +image_uri = "https://s3.amazonaws.com/scribe.nypl.org/emigrant-s4/full/619aed10-23fd-0133-16de-58d385a7bbd0.right-bottom.jpg" + +# Must manually specify workflow name ('mark'), and task_key ('mark_primary') +classification = bot.classify_subject_by_url( image_uri, "mark", "mark_primary", { + x: 100, + y: 200, + width: 300, + height: 200, + subToolIndex: 0 # Must specify subToolIndex (integer index into the tools array configured for workflow task) +})['classification'] + +# Response should contain a classification with a nested child_subject: +puts "Created classification: #{classification.to_json}" + +# Assuming above was successful, use the returned, generated subject_id to create next classification: +mark_id = classification['child_subject']['id'] +# Subjects generated in Mark tend to have `type`s that correspond to Transcribe task keys: +transcribe_task_key = classification['child_subject']['type'] +# Create transcription classification: +classification = bot.classify_subject_by_id( mark_id, "transcribe", transcribe_task_key, { value: 'foo' }) + +# Response should contain a classification with a nested verify subject (or orphaned subject if there is no Verify workflow) +puts "Created transcription classification: #{classification.to_json}" diff --git a/project/emigrant/content/about.html.erb b/project/emigrant/content/about.html.erb index cfc8ca848..de3e4191f 100644 --- a/project/emigrant/content/about.html.erb +++ b/project/emigrant/content/about.html.erb @@ -3,6 +3,9 @@ ### Contact information Questions? Comments? 
Contact us at emigrantcity@nypl.org or reach out to us on [Twitter](https://twitter.com/nypl_labs). +### Data +This is an active project and we’re continuing to gather data from the records. Every two weeks, we build a merged, anonymized dump of that data. You can browse or download the entire data set on the Data page. + ### About Emigrant Bank [Emigrant Bank](https://www.emigrant.com/Information/aboutus/AboutUs.jsp) was founded in 1850 by members of the Irish Emigrant society to serve the needs of the Irish immigrant community in New York. In its early history, the bank grew to become the seventh largest bank in the nation, and it made major investments in the growth of New York City by underwriting loans for such important initiatives as the construction of St. Patrick’s Cathedral and a public works project that ultimately became Central Park. In 1995, Emigrant Bank generously donated to The New York Public Library [extensive archival records](http://archives.nypl.org/mss/925) that are valuable historical and genealogical resources documenting the lives of immigrant families. The Library microfilmed the entire collection and compiled a [detailed finding aid](http://archives.nypl.org/uploads/documents/documentation/collection_1837_mss925-extra.pdf) that outlines the full scope of the Emigrant Savings Bank records. This heavily trafficked collection is housed in the [Manuscripts and Archives Division](http://www.nypl.org/locations/divisions/manuscripts-division). Though, users primarily encounter it through the [Irma and Paul Milstein Division of United States History, Local History and Genealogy](http://www.nypl.org/locations/divisions/milstein). 
diff --git a/project/emigrant/content/data.html.erb b/project/emigrant/content/pages/data.md similarity index 77% rename from project/emigrant/content/data.html.erb rename to project/emigrant/content/pages/data.md index d711efd30..b0d4a9733 100644 --- a/project/emigrant/content/data.html.erb +++ b/project/emigrant/content/pages/data.md @@ -1,6 +1,6 @@ ## Data exports -With help from volunteers like yourself, we are extracting structured, building-level information from about 6,400 mortgages contained in the Emigrant Savings Bank Records at The New York Public Library. Of course, the ultimate goal is to make this data publicly available. Having a keyword-searchable and structured index of names and mortgage details will of great use to genealogists, historians, digital humanities researchers, and others interested in exploring historical data sources. +Participants have made {{project.classification_count}} contributions to Emigrant City to date. This project periodically builds a merged, anonymized dump of that data, which is made public here. This is an active project and we’re continuing to gather data from the records. (You can [help](/#/intro)!) The data made available here is refreshed weekly. After enough transcriptions are made, our team will determine the best way to get the data to you in an easy and accessible way. In the meantime, feel free to explore the collection assets themselves on the Library's [Digital Collections](http://digitalcollections.nypl.org/collections/emigrant-savings-bank-records) website. In this project, we're working with the subset of these digitized materials, the volumes containing Mortgage and Bond records. The volumes appearing in Emigrant City are: * [Bond and Mortgage Record Book 1 (1 to 1,555)](http://digitalcollections.nypl.org/items/df712aa0-00b1-0133-fbd7-58d385a7bbd0) @@ -15,6 +15,6 @@ After enough transcriptions are made, our team will determine the best way to ge * [Real Estate Loans No. 
13](http://digitalcollections.nypl.org/items/3edf3050-24cd-0133-e6df-58d385a7b928) -You may also be interested to explore how we’ve been experimenting with opening up data from other crowdsourcing projects: +You may also be interested to explore how we’ve been experimenting with opening up data from other participatory projects: * [What's on the Menu?](http://menus.nypl.org/data) * [Building Inspector](http://buildinginspector.nypl.org/data) diff --git a/project/emigrant/content/pages/data/_nav.md b/project/emigrant/content/pages/data/_nav.md new file mode 100644 index 000000000..e8b747d38 --- /dev/null +++ b/project/emigrant/content/pages/data/_nav.md @@ -0,0 +1,4 @@ + * [About](/#/data) + * [Browse](/#/data/browse) + * [Download](/#/data/download) + * [Tips & Tricks](/#/data/tips) diff --git a/project/emigrant/content/pages/data/download.md b/project/emigrant/content/pages/data/download.md new file mode 100644 index 000000000..8ee57ac6c --- /dev/null +++ b/project/emigrant/content/pages/data/download.md @@ -0,0 +1,11 @@ +## Download + +Participants have made {{project.classification_count}} contributions to {{project.title}} to date. This project periodically builds a merged, anonymized dump of that data, which is made public here. + +This is a large dataset in json format containing all the assertions, confidence ratings, and fields for the {{project.root_subjects_count}} records in {{project.title}}. We are actively gathering contributions to the data set and it is refreshed weekly. The last dump was {{project.latest_export.created_at}}. + +Download Latest Raw Data + +For help interpretting the data, see Scribe WIKI on Data Exports. 
+ +To browse past releases and/or to be notified when new releases are made, you may wish to subscribe to the ATOM Feed of Data Releases diff --git a/project/emigrant/content/pages/data/tips.md b/project/emigrant/content/pages/data/tips.md new file mode 100644 index 000000000..22b42e93f --- /dev/null +++ b/project/emigrant/content/pages/data/tips.md @@ -0,0 +1,48 @@ +## Tips & Tricks + +### Search +* **Use quotation marks to search for full phrases.** +[Search results](http://emigrantcity.nypl.org/?#/data/browse?keyword=margaret%20armstrong) for Margaret Armstrong will display records with Margaret and/or Armstrong in the record. However, [search results](http://emigrantcity.nypl.org/?#/data/browse?keyword=%22margaret%20armstrong%22) for "Margaret Armstrong" display only records where the full name "Margaret Armstrong" appear. + +* **Capitalization does not matter in search.** +"MARGARET," "margaret," and "Margaret" entered into the search box, will yield the same results. + +* **Consider abbreviations.** +We asked users to transcribe exactly what they see written on the records including abbreviations. The word Brooklyn may appear as “BKLYN” in one record and “Brooklyn” in another. + +### Reading a Record +The data in these records have been communally created through the Emigrant City participatory project. See the Intro for more information on the steps for publicly and collaboratively creating this resource. As you'll see below, we've tried to be transparent about the collaborative nature of this resource with notations about confidence and status. + +* **Best Data versus All Data** +A record's best data consolidates and lightly cleans all the fields for that record. During the transcription process, the same field may have been marked multiple times and made it all the way through the Scribe work-flow resulting in duplications of that unique field. Best data consolidates these duplications. There is also minimal formatting cleanup. 
For instance, in Best Data Amount Loaned is represented as a dollar amount rather than just a number. + +* **What is the Source Metadata?** +This page contains technical details related to the transcription interface and includes a link to the high res version of the record page. + +* **What is the confidence field?** +Record fields were created through contributions from many users. As a result, we can gauge how confident we are about each field's accuracy. Fields with a 100% confidence rating are fields for which every transcription was the same. Lower confidence ratings mean that there was disagreement on how best to transcribe a field. + +* **What is a field's status?** +The status of each field is displayed. This corresponds with where the field's transcription is in the work-flow of the project be it Mark, Transcribe, or Verify. + +* **What are distinct transcriptions?** +Each field is annotated with the number of distinct transcriptions it has received during the project's run. A distinct transcription of 1 means everyone transcribed the same thing for that field. + +* **High res images of the records** +Emigrant City is a project and resource to browse historic mortgage records from the Emigrant Savings Bank. You may view and/or download high resolution images of these records on our Digital Collections site. If you would like to view the high resolution version of a record, navigate to the Source Metadata tab and click on the Page Uri Link. A high resolution image of the record's full page will open in a new tab. + + + + + +* **Which fields appear?** +This project is intended to create an index to enable further discovery. With this project, the goal is to create an index of the bond and mortgage records to enable further discovery and use. 
We’ve worked with experts on the material to scope the data collected to fields which appear with greater regularity across documents: Record Date, Record Number, Mortgager Name, Street Address, Amount Loaned, Valuation, and Dimension & Description. Collecting and verifying these fields from the records creates a resource that opens up the search of these materials. However, the data being gathered is not exhaustive. Rather than creating a complete transcription of each record, the transcribed fields create an index to give some toeholds for future reference and querying of these rich materials which previously were largely invisible and difficult to search. + +* **How often is the data refreshed?** +Contributions are still actively being made to Emigrant City. The records available in browse and download will be refreshed weekly with these new contributions. (You can still contribute to the project by transcribing and verifying data.) + + + diff --git a/project/emigrant/content/pages/data_new.md b/project/emigrant/content/pages/data_new.md new file mode 100644 index 000000000..d7320981f --- /dev/null +++ b/project/emigrant/content/pages/data_new.md @@ -0,0 +1,21 @@ +## Data exports + +Participants have made {{project.classification_count}} contributions to Emigrant City to date. This project periodically builds a merged, anonymized dump of that data, which is made public here. This is an active project and we’re continuing to gather data from the records. The data made available here is refreshed weekly. + +## Source Assets +Feel free to explore the collection assets themselves on the Library's [Digital Collections](http://digitalcollections.nypl.org/collections/emigrant-savings-bank-records) website. In this project, we're working with a subset of these digitized materials: the volumes containing Mortgage and Bond records. 
The volumes appearing in Emigrant City are: +* [Bond and Mortgage Record Book 1 (1 to 1,555)](http://digitalcollections.nypl.org/items/df712aa0-00b1-0133-fbd7-58d385a7bbd0) +* [Bond and Mortgage Record Book 2 (1,556 to 2,721)](http://digitalcollections.nypl.org/items/c0c38370-015a-0133-065e-58d385a7bbd0) +* [Bond and Mortgage Record Book 3 (2,722 to 3,699)](http://digitalcollections.nypl.org/items/5bb969d0-0241-0133-f196-58d385a7b928) +* [Bond and Mortgage Record Book 4 (3,700 to 4,499)](http://digitalcollections.nypl.org/items/109c0900-02e7-0133-03cf-58d385a7bbd0) +* [Bond and Mortgage Record Book 5 (4,500 to 5,499)](http://digitalcollections.nypl.org/items/e53b4fe0-02fc-0133-0e0d-58d385a7bbd0) +* [Bond and Mortgage Record Book 6 (5,500 to 6,403)](http://digitalcollections.nypl.org/items/20aa00a0-0311-0133-9d30-58d385a7bbd0) +* [Real Estate Loans No. 9](http://digitalcollections.nypl.org/items/6cf0ed60-23ef-0133-6b54-58d385a7b928) +* [Real Estate Loans No. 10](http://digitalcollections.nypl.org/items/59b0a100-23fd-0133-b24f-58d385a7bbd0) +* [Real Estate Loans No. 11](http://digitalcollections.nypl.org/items/cf0c3ee0-24bd-0133-5e2d-58d385a7b928) +* [Real Estate Loans No. 
13](http://digitalcollections.nypl.org/items/3edf3050-24cd-0133-e6df-58d385a7b928) + + +You may also be interested in exploring how we’ve been experimenting with opening up data from other crowdsourcing projects: +* [What's on the Menu?](http://menus.nypl.org/data) +* [Building Inspector](http://buildinginspector.nypl.org/data) diff --git a/project/emigrant/content/pages/data_new/_nav.md b/project/emigrant/content/pages/data_new/_nav.md new file mode 100644 index 000000000..c7568b707 --- /dev/null +++ b/project/emigrant/content/pages/data_new/_nav.md @@ -0,0 +1,4 @@ + * [About](/#/data_new) + * [Browse](/#/data_new/browse) + * [Download](/#/data_new/download) + * [Tips & Tricks](/#/data_new/tips) diff --git a/project/emigrant/content/pages/data_new/download.md b/project/emigrant/content/pages/data_new/download.md new file mode 100644 index 000000000..8ee57ac6c --- /dev/null +++ b/project/emigrant/content/pages/data_new/download.md @@ -0,0 +1,11 @@ +## Download + +Participants have made {{project.classification_count}} contributions to {{project.title}} to date. This project periodically builds a merged, anonymized dump of that data, which is made public here. + +This is a large dataset in JSON format containing all the assertions, confidence ratings, and fields for the {{project.root_subjects_count}} records in {{project.title}}. We are actively gathering contributions to the dataset and it is refreshed weekly. The last dump was {{project.latest_export.created_at}}. + +Download Latest Raw Data + +For help interpreting the data, see Scribe WIKI on Data Exports. 
+ +To browse past releases and/or to be notified when new releases are made, you may wish to subscribe to the ATOM Feed of Data Releases diff --git a/project/emigrant/content/pages/data_new/tips.md b/project/emigrant/content/pages/data_new/tips.md new file mode 100644 index 000000000..22b42e93f --- /dev/null +++ b/project/emigrant/content/pages/data_new/tips.md @@ -0,0 +1,48 @@ +## Tips & Tricks + +### Search +* **Use quotation marks to search for full phrases.** +[Search results](http://emigrantcity.nypl.org/?#/data/browse?keyword=margaret%20armstrong) for Margaret Armstrong will display records with Margaret and/or Armstrong in the record. However, [search results](http://emigrantcity.nypl.org/?#/data/browse?keyword=%22margaret%20armstrong%22) for "Margaret Armstrong" display only records where the full name "Margaret Armstrong" appears. + +* **Capitalization does not matter in search.** +"MARGARET," "margaret," and "Margaret" entered into the search box will yield the same results. + +* **Consider abbreviations.** +We asked users to transcribe exactly what they see written on the records, including abbreviations. The word Brooklyn may appear as “BKLYN” in one record and “Brooklyn” in another. + +### Reading a Record +The data in these records have been communally created through the Emigrant City participatory project. See the Intro for more information on the steps for publicly and collaboratively creating this resource. As you'll see below, we've tried to be transparent about the collaborative nature of this resource with notations about confidence and status. + +* **Best Data versus All Data** +A record's best data consolidates and lightly cleans all the fields for that record. During the transcription process, the same field may have been marked multiple times and made it all the way through the Scribe work-flow, resulting in duplications of that unique field. Best data consolidates these duplications. There is also minimal formatting cleanup. 
For instance, in Best Data, Amount Loaned is represented as a dollar amount rather than just a number. + +* **What is the Source Metadata?** +This page contains technical details related to the transcription interface and includes a link to the high res version of the record page. + +* **What is the confidence field?** +Record fields were created through contributions from many users. As a result, we can gauge how confident we are about each field's accuracy. Fields with a 100% confidence rating are fields for which every transcription was the same. Lower confidence ratings mean that there was disagreement on how best to transcribe a field. + +* **What is a field's status?** +The status of each field is displayed. This corresponds with where the field's transcription is in the work-flow of the project, be it Mark, Transcribe, or Verify. + +* **What are distinct transcriptions?** +Each field is annotated with the number of distinct transcriptions it has received during the project's run. A distinct transcription of 1 means everyone transcribed the same thing for that field. + +* **High res images of the records** +Emigrant City is a project and resource to browse historic mortgage records from the Emigrant Savings Bank. You may view and/or download high resolution images of these records on our Digital Collections site. If you would like to view the high resolution version of a record, navigate to the Source Metadata tab and click on the Page Uri Link. A high resolution image of the record's full page will open in a new tab. + + + + + +* **Which fields appear?** +This project is intended to create an index to enable further discovery. With this project, the goal is to create an index of the bond and mortgage records to enable further discovery and use. 
We’ve worked with experts on the material to scope the data collected to fields which appear with greater regularity across documents: Record Date, Record Number, Mortgager Name, Street Address, Amount Loaned, Valuation, and Dimension & Description. Collecting and verifying these fields from the records creates a resource that opens up the search of these materials. However, the data being gathered is not exhaustive. Rather than creating a complete transcription of each record, the transcribed fields create an index to give some toeholds for future reference and querying of these rich materials which previously were largely invisible and difficult to search. + +* **How often is the data refreshed?** +Contributions are still actively being made to Emigrant City. The records available in browse and download will be refreshed weekly with these new contributions. (You can still contribute to the project by transcribing and verifying data.) + + + diff --git a/project/emigrant/project.json b/project/emigrant/project.json index 9a4f45bf8..ba0284c87 100644 --- a/project/emigrant/project.json +++ b/project/emigrant/project.json @@ -46,6 +46,8 @@ "google_analytics_client_id": "UA-69673163-1" }, + "discuss_url": "http://forum.emigrantcity.nypl.org", + "forum": { "type": "discourse", "base_url": "http://forum.emigrantcity.nypl.org" @@ -58,5 +60,43 @@ {"label": "About", "page": "about"} ], "footer": [] - } + }, + + "export_specs": [ + { + "name": "Record", + "spec_fields": [ + {"name": "Mortgager", "repeats": false}, + {"name": "Street Address", "format": "address", "repeats": false}, + {"name": "Record Date", "format": "date", "format_options": {"range": [1850,1950]}, "repeats": false}, + { + "name": "Land & Building Dimensions", + "format": { + "em_survey_land_dimensions": "dimensions", + "em_survey_building_dimensions": "dimensions" + }, + "repeats": false + }, + {"name": "Amount Loaned", "format": "monetary", "repeats": false}, + {"name": "Stories & Materials", "format": 
{"em_record_stories": "numeric"}, "repeats": false}, + { + "name": "Valuation", + "select": "Total Value", + "repeats": true, + "sub_fields": [ + {"name": "Total Value", "format": "monetary", "repeats": false}, + {"name": "Date", "select": "Valuation Date", "format": "date", "repeats": false}, + { + "name": "Land & Building Value", + "select": "Land & Building Value", + "format": {"em_valuation_ground": "monetary", "em_valuation_building": "monetary"}, + "repeats": false + } + ] + }, + {"name": "Record Number", "format": "numeric", "repeats": false}, + {"name": "Additional Info", "repeats": true} + ] + } + ] } diff --git a/project/emigrant/scripts/query_subjects.rb b/project/emigrant/scripts/query_subjects.rb index a4a8340d7..589ca7ce3 100644 --- a/project/emigrant/scripts/query_subjects.rb +++ b/project/emigrant/scripts/query_subjects.rb @@ -11,7 +11,8 @@ client = NyplRepo::Client.new ENV['DC_API_KEY'] item_uuids = [ - "be6d6300-ecf4-0132-456e-58d385a7b928", # Book 1 (1 to 1,555) http://digitalcollections.nypl.org/items/df712aa0-00b1-0133-fbd7-58d385a7bbd0 +=begin +"be6d6300-ecf4-0132-456e-58d385a7b928", # Book 1 (1 to 1,555) http://digitalcollections.nypl.org/items/df712aa0-00b1-0133-fbd7-58d385a7bbd0 "bf0c1890-ecf4-0132-faa2-58d385a7b928", # Book 2 (1,556 to 2, 721) http://digitalcollections.nypl.org/items/c0c38370-015a-0133-065e-58d385a7bbd0 "bfe9fbe0-ecf4-0132-7e52-58d385a7b928", # Book 3 (2,722 to 3,699) http://digitalcollections.nypl.org/items/5bb969d0-0241-0133-f196-58d385a7b928 "c0921750-ecf4-0132-1737-58d385a7b928", # Book 4 (3,700 to 4,499) http://digitalcollections.nypl.org/items/109c0900-02e7-0133-03cf-58d385a7bbd0 @@ -20,6 +21,8 @@ "c53c32d0-ecf4-0132-b51f-58d385a7b928", # Real Estate Loans No. 9 http://digitalcollections.nypl.org/items/6cf0ed60-23ef-0133-6b54-58d385a7b928 "c5d23760-ecf4-0132-8bed-58d385a7b928", # Real Estate Loans No. 
10 http://digitalcollections.nypl.org/items/59b0a100-23fd-0133-b24f-58d385a7bbd0 "c6697fe0-ecf4-0132-b1fc-58d385a7b928", # Real Estate Loans No. 11 http://digitalcollections.nypl.org/items/cf0c3ee0-24bd-0133-5e2d-58d385a7b928 +=end + "c7d4b670-ecf4-0132-5854-58d385a7b928", # Real Estate Loans No. 11 http://digitalcollections.nypl.org/items/01af2f60-8701-0133-b22c-00505686a51c ] @@ -45,7 +48,7 @@ end end -out_path = "#{File.dirname(File.dirname(__FILE__))}/subjects/subjects_from_api.building.csv" +out_path = "#{File.dirname(File.dirname(__FILE__))}/subjects/subjects_from_api.book14.csv" CSV.open(out_path, "wb") do |csv| csv << subjects.first.keys diff --git a/project/emigrant/workflows/transcribe.json b/project/emigrant/workflows/transcribe.json index 2bb1f9f0d..ffdc47d1b 100644 --- a/project/emigrant/workflows/transcribe.json +++ b/project/emigrant/workflows/transcribe.json @@ -19,7 +19,8 @@ "help": { "file": "t_record_date" }, - "generates_subject_type": "em_transcribed_date" + "generates_subject_type": "em_transcribed_date", + "export_name": "Record Date" }, "em_record_number": { @@ -30,7 +31,8 @@ "help": { "file": "t_record_number" }, - "generates_subject_type": "em_transcribed_record_number" + "generates_subject_type": "em_transcribed_record_number", + "export_name": "Record Number" }, "em_record_mortgager": { @@ -40,7 +42,8 @@ "generates_subject_type": "em_transcribed_mortgager", "help": { "file": "t_record_mortgager" - } + }, + "export_name": "Mortgager" }, "em_record_street_address": { @@ -51,7 +54,8 @@ "generates_subject_type": "em_transcribed_address", "help": { "file": "t_record_street_address" - } + }, + "export_name": "Street Address" }, "em_record_amount_loaned": { @@ -62,7 +66,8 @@ "generates_subject_type": "em_transcribed_amount_loaned", "help": { "file": "t_record_amount_loaned" - } + }, + "export_name": "Amount Loaned" }, "em_record_valuation": { @@ -74,7 +79,8 @@ "file": "t_record_valuation" }, "generates_subject_type": 
"em_transcribed_valuation_date", - "next_task": "em_record_valuation_ground_building" + "next_task": "em_record_valuation_ground_building", + "export_name": "Valuation Date" }, "em_record_valuation_ground_building": { @@ -101,7 +107,8 @@ }, "generates_subject_type": "em_transcribed_valuation_itemized", "instruction": "Sometimes valuations include itemized dollar values for \"ground\" and \"building\". Enter these amounts if you can find them. In the next screen, you'll enter the total valuation.", - "next_task": "em_record_valuation_total" + "next_task": "em_record_valuation_total", + "export_name": "Land & Building Value" }, "em_record_valuation_total": { @@ -113,7 +120,8 @@ "file": "t_record_valuation" }, "generates_subject_type": "em_transcribed_valuation_total", - "next_task": null + "next_task": null, + "export_name": "Total Value" }, "em_record_survey": { @@ -138,7 +146,8 @@ }, "generates_subject_type": "em_transcribed_survey", "instruction": "Enter, as they appear, any land and building dimensions that were recorded. In the next screen, you'll enter the descriptive information.", - "next_task": "em_record_survey_stories_materials" + "next_task": "em_record_survey_stories_materials", + "export_name": "Land & Building Dimensions" }, "em_record_survey_stories_materials": { @@ -163,7 +172,8 @@ }, "generates_subject_type": "em_transcribed_stories_materials", "instruction": "Enter, as they appear, the number of stories and the building materials.", - "next_task": "em_record_survey_additional_info" + "next_task": "em_record_survey_additional_info", + "export_name": "Stories & Materials" }, "em_record_survey_additional_info": { @@ -175,7 +185,8 @@ "file": "t_record_additional_info" }, "generates_subject_type": null, - "next_task": null + "next_task": null, + "export_name": "Additional Info" } } }