diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 16024ced91..bcd753729a 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -1,7 +1,7 @@ Our [technical contributions wiki](https://github.com/programminghistorian/jekyll/wiki/Making-technical-contributions) describes best practices for modifying the code that generates _The Programming Historian_ itself. -If you are interested in making a _content_ contribution like a new lesson, please see our [pages on contributing as an author or an editor](http://programminghistorian.org/contribute). +If you are interested in making a _content_ contribution like a new lesson, please see our [pages on contributing as an author or an editor](https://programminghistorian.org/contribute). ## Anti-harassment Policy -The *Programming Historian* is dedicated to providing an open scholarly environment that offers community participants the freedom to thoroughly scrutinize ideas, to ask questions, make suggestions, or to requests for clarification, but also provides a harassment-free space for all contributors to the project, regardless of gender, gender identity and expression, sexual orientation, disability, physical appearance, body size, race, age or religion, or technical experience. We do not tolerate harassment or ad hominem attacks of community participants in any form. Participants violating these rules may be expelled from the community at the discretion of the editorial board. If anyone witnesses or feels they have been the victim of the above described activity, please contact our ombudspeople (see the contact for each language on our Project Team page: <http://programminghistorian.org/en/project-team>). Thank you for helping us to create a safe space. 
+The *Programming Historian* is dedicated to providing an open scholarly environment that offers community participants the freedom to thoroughly scrutinize ideas, to ask questions, make suggestions, or to request clarification, but also provides a harassment-free space for all contributors to the project, regardless of gender, gender identity and expression, sexual orientation, disability, physical appearance, body size, race, age or religion, or technical experience. We do not tolerate harassment or ad hominem attacks of community participants in any form. Participants violating these rules may be expelled from the community at the discretion of the editorial board. If anyone witnesses or feels they have been the victim of the above described activity, please contact our ombudspeople (see the contact for each language on our Project Team page: <https://programminghistorian.org/en/project-team>). Thank you for helping us to create a safe space. diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 917cf03e3c..6077e8d94b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,7 +19,7 @@ jobs: if : github.event.action != 'closed' uses: ruby/setup-ruby@v1 with: - ruby-version: 2.6 + ruby-version: 3.2.2 bundler-cache: true # runs 'bundle install' and caches installed gems automatically # We then check if we can build our Jekyll site (all this logic is built-in with Jekyll and this prevents us from merging in any syntax errors). - name: Jekyll build @@ -29,13 +29,17 @@ jobs: bundle exec jekyll build # Next we check all the links in our site to make sure we aren't pushing up broken links. 
- name: Check HTML - uses: zoeleblanc/htmlproofer@master - if : github.event.action != 'closed' - with: - directory: "./_site" - # The directory to scan - arguments: --assume-extension --empty-alt-ignore --alt-ignore "/.*/" --file-ignore "/assets/,/retired/,/retirada/,/retrait/,/posts/" --timeframe "30d" --only-4xx --http-status-ignore 429,403,400 --url-ignore "/http://www.gutenberg.org/*/,/https://github.com/programminghistorian/*/,/https://github.com/orgs/programminghistorian/*/,/\#/,/espanol/,/deprecated/,/collection.britishmuseum.org/,/analytics.hathitrust.org/,/fr.wikipedia.org/wiki/,https://web.archive.org/web/20180831094856/http://www.dlsi.ua.es/~borja/riilua/6.TopicModeling_v02.pdf" --allow-hash-href - # The arguments to pass to HTMLProofer + if: github.event.action != 'closed' + run: | + bundle exec htmlproofer ./_site \ + --assume-extension .html \ + --ignore-missing-alt \ + --ignore-empty-alt \ + --only-4xx \ + --ignore-status-codes 429,403,400,415 \ + --ignore-files "/assets/,/en/lessons/retired/,/es/lecciones/retirada/,/fr/lecons/retrait/,/pt/licoes/retiradas/,/posts/,/blog/" \ + --ignore-urls "/github\.com\/programminghistorian/,/gutenberg\.org/,/espanol/,/deprecated/,/collection\.britishmuseum\.org/,/analytics\.hathitrust\.org/,/fr\.wikipedia\.org\/wiki/,/docnow\.io/,/doxygen\.nl/,/doi\.org\/10\.34190\/JEL\.17\.3\.002/,/doi\.org\/10\.22134\/trace\.82\.2022\.819/,/rubenalcaraz\.es\/manual-omeka\/?/,/web\.archive\.org\/web\/20180831094856\/http:\/\/www\.dlsi\.ua\.es\/~borja\/riilua\/6\.TopicModeling_v02\.pdf/" \ + --allow-hash-href # Finally if we are successful in building, we trigger a rebuild of our search index in the search-index repository - name: Trigger next workflow if: success() && github.event.action == 'closed' @@ -45,4 +49,3 @@ jobs: repository: programminghistorian/search-index event-type: trigger-search-build client-payload: '{"ref": "${{ github.ref }}", "sha": "${{ github.sha }}"}' - diff --git a/.htmlproofer.yml b/.htmlproofer.yml new 
file mode 100644 index 0000000000..74ae9cafd3 --- /dev/null +++ b/.htmlproofer.yml @@ -0,0 +1,47 @@ +assume_extension: true +only_4xx: true +ignore_missing_alt: true +ignore_empty_alt: true +allow_hash_href: true +check_external_hash: false +http_status_ignore: + - 429 + - 403 + - 400 +url_ignore: + - /\/\/www.gutenberg.org\/.*/ + - /https:\/\/github.com\/programminghistorian\/.*/ + - /https:\/\/github.com\/orgs\/programminghistorian\/.*/ + - /\#/ + - /espanol/ + - /deprecated/ + - /collection.britishmuseum.org/ + - /analytics.hathitrust.org/ + - /fr.wikipedia.org\/wiki/ + - /https:\/\/web.archive.org\/web\/20180831094856\/http:\/\/www.dlsi.ua.es\/~borja\/riilua\/6.TopicModeling_v02.pdf/ + - https://github.com/programminghistorian/jekyll/commits/gh-pages + - https://github.com/programminghistorian/jekyll/commits/gh-pages/.* + - /\/images\/intro-a-google-maps-y-google-earth\/geo-es\d+\.png/ + - /http:\/\/humanidadesdigitaleshispanicas\.es\/.*/ + - /http:\/\/dhawards\.org\/.*/ + - /\/en\/lessons\/building-static-sites-with-jekyll-github-pages#(writing-pages-and-posts|hosting-on-github-pages|where-and-what-is-everything)/ + - /\/es\/guia-para-autores#paso-1-proponer-una-nueva-lección/ + - /\/fr\/consignes-auteurs#étape-1-proposer-une-nouvelle-leçon/ + - /\/pt\/directrizes-autor#etapa-1-propor-uma-nova-lição/ +ignore_files: + - /_site\/assets\/.*/ + - /_site\/retired\/.*/ + - /_site\/retirada\/.*/ + - /_site\/retrait\/.*/ + - /_site\/posts\/.*/ + - /_site\/blog\/.*/ +ignore_elements: + - pre + - code + - script +typhoeus: + connecttimeout: 30 + timeout: 60 +hydra: + max_concurrency: 2 +log_level: debug \ No newline at end of file diff --git a/.ruby-version b/.ruby-version index d5724cd41b..acf9bf09db 100644 --- a/.ruby-version +++ b/.ruby-version @@ -1 +1 @@ -2.6.2 \ No newline at end of file +3.2.2 \ No newline at end of file diff --git a/FixingLinksNotebook.ipynb b/FixingLinksNotebook.ipynb new file mode 100644 index 0000000000..2e3fe2485d --- /dev/null +++ 
b/FixingLinksNotebook.ipynb @@ -0,0 +1,444 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "61f40e40", + "metadata": {}, + "source": [ + "# Fixing Links Notebook" + ] + }, + { + "cell_type": "markdown", + "id": "fac6fe6c", + "metadata": {}, + "source": [ + "Before running this notebook, make sure to run the following command in the terminal to install the required packages:\n", + "\n", + "```bash\n", + "bundle install\n", + "make all\n", + "ruby parse_htmlproofer_log.rb \n", + "```\n", + "\n", + "Each command should be run separately and the final two commands create files for all the htmlproofer errors and warnings. This notebook loads the final csv file to help you see what links exists. You will also need to install the `pandas` library if you haven't already. You can do this by running:\n", + "\n", + "```bash\n", + "pip install pandas\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "ae22f5c9", + "metadata": {}, + "source": [ + "## Load Libraries and Data" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "7106f439", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "6f0bb84a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of errors: 10\n" + ] + } + ], + "source": [ + "df = pd.read_csv(\"htmlproofer-report.csv\")\n", + "# Lower case the column names\n", + "df.columns = df.columns.str.lower()\n", + "print(f\"Number of errors: {len(df)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f7e9ea41", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of unique error messages: 4\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
messagecount
0External link https://central.github.com/mac/l...4
1External link https://medici2023.sciencesconf....4
\n", + "
" + ], + "text/plain": [ + " message count\n", + "0 External link https://central.github.com/mac/l... 4\n", + "1 External link https://medici2023.sciencesconf.... 4" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "message_counts = df.message.value_counts().reset_index()\n", + "print(f\"Number of unique error messages: {len(message_counts)}\")\n", + "message_counts[(message_counts['count']>1)]" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e07121b3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of external link errors: 9\n", + "Number of internal link errors: 0\n" + ] + } + ], + "source": [ + "external_links = df[df['message'].str.contains(\"External link\")].copy()\n", + "internal_links = df[df['message'].str.contains(\"internally linking\")].copy()\n", + "print(f\"Number of external link errors: {len(external_links)}\")\n", + "print(f\"Number of internal link errors: {len(internal_links)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e1c9cab2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of unique files with errors: 8\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filecount
0_site/en/lessons/retired/getting-started-with-...2
1_site/es/lecciones/retirada/introduccion-contr...2
\n", + "
" + ], + "text/plain": [ + " file count\n", + "0 _site/en/lessons/retired/getting-started-with-... 2\n", + "1 _site/es/lecciones/retirada/introduccion-contr... 2" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "file_counts = df.file.value_counts().reset_index()\n", + "print(f\"Number of unique files with errors: {len(file_counts)}\")\n", + "file_counts[file_counts['count']>1]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "7fbda16b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filecountcount_index
0_site/en/lessons/retired/getting-started-with-...20
1_site/es/lecciones/retirada/introduccion-contr...21
2_site/en/lessons/retired/OCR-with-Tesseract-an...12
3_site/en/lessons/retired/intro-to-augmented-re...13
4_site/en/research/index.html14
5_site/es/investigacion/index.html15
6_site/fr/recherche/index.html16
7_site/pt/pesquisa/index.html17
\n", + "
" + ], + "text/plain": [ + " file count count_index\n", + "0 _site/en/lessons/retired/getting-started-with-... 2 0\n", + "1 _site/es/lecciones/retirada/introduccion-contr... 2 1\n", + "2 _site/en/lessons/retired/OCR-with-Tesseract-an... 1 2\n", + "3 _site/en/lessons/retired/intro-to-augmented-re... 1 3\n", + "4 _site/en/research/index.html 1 4\n", + "5 _site/es/investigacion/index.html 1 5\n", + "6 _site/fr/recherche/index.html 1 6\n", + "7 _site/pt/pesquisa/index.html 1 7" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "file_counts_df = df.file.value_counts().reset_index()\n", + "file_counts_df['count_index'] = file_counts_df.index\n", + "\n", + "file_counts_df" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6352df82", + "metadata": {}, + "outputs": [], + "source": [ + "merged_df = df.merge(file_counts_df, on='file', how='outer').sort_values(by=\"count_index\", ascending=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "49d471d2", + "metadata": {}, + "outputs": [], + "source": [ + "merged_df.to_clipboard()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "95f87012", + "metadata": {}, + "outputs": [], + "source": [ + "# import os\n", + "# import re\n", + "\n", + "# EXTENSIONS = (\".yml\")\n", + "\n", + "# def replace_links_preserving_code_blocks(file_path):\n", + "# with open(file_path, \"r\", encoding=\"utf-8\") as f:\n", + "# content = f.read()\n", + "\n", + "# # Match code blocks (triple backticks) and inline code (`...`)\n", + "# code_blocks = list(re.finditer(r\"(```.*?```|`[^`]*`)\", content, re.DOTALL))\n", + "# modified = content\n", + "# offset = 0\n", + "\n", + "# for match in code_blocks:\n", + "# start, end = match.span()\n", + "# segment = content[start:end]\n", + "\n", + "# # Temporarily mark this section to skip\n", + "# placeholder = f\"%%CODEBLOCK{start}%%\"\n", + "# modified = modified[:start + offset] + placeholder + 
modified[end + offset:]\n", + "# offset += len(placeholder) - (end - start)\n", + "\n", + "# # Replace all http:// with https://\n", + "# modified = re.sub(r\"http://\", \"https://\", modified)\n", + "\n", + "# # Restore code blocks untouched\n", + "# for match in code_blocks:\n", + "# start = match.start()\n", + "# placeholder = f\"%%CODEBLOCK{start}%%\"\n", + "# modified = modified.replace(placeholder, match.group(0))\n", + "\n", + "# if content != modified:\n", + "# print(f\"✅ Updated: {file_path}\")\n", + "# with open(file_path, \"w\", encoding=\"utf-8\") as f:\n", + "# f.write(modified)\n", + "\n", + "# def process_all_files(root=\".\"):\n", + "# for dirpath, _, filenames in os.walk(root):\n", + "# for fname in filenames:\n", + "# if fname.endswith(EXTENSIONS) and \"ph_authors\" in fname:\n", + "# replace_links_preserving_code_blocks(os.path.join(dirpath, fname))\n", + "\n", + "# process_all_files()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "66f11201", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "spring-2026-env", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/Gemfile b/Gemfile index 4779aebab5..4bbb9cb5f6 100644 --- a/Gemfile +++ b/Gemfile @@ -2,4 +2,4 @@ source 'https://rubygems.org' gem 'github-pages' gem 'jekyll-redirect-from' gem 'jekyll-paginate' -gem 'html-proofer', "~> 3.0", "< 3.18" \ No newline at end of file +gem 'html-proofer' \ No newline at end of file diff --git a/Gemfile.lock b/Gemfile.lock index 4484dd0b56..e7eb21dff4 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,47 +1,81 @@ GEM remote: https://rubygems.org/ specs: - activesupport 
(6.0.5.1) - concurrent-ruby (~> 1.0, >= 1.0.2) - i18n (>= 0.7, < 2) - minitest (~> 5.1) - tzinfo (~> 1.1) - zeitwerk (~> 2.2, >= 2.2.2) - addressable (2.8.0) - public_suffix (>= 2.0.2, < 5.0) + Ascii85 (2.0.1) + activesupport (8.1.3) + base64 + bigdecimal + concurrent-ruby (~> 1.0, >= 1.3.1) + connection_pool (>= 2.2.5) + drb + i18n (>= 1.6, < 2) + json + logger (>= 1.4.2) + minitest (>= 5.1) + securerandom (>= 0.3) + tzinfo (~> 2.0, >= 2.0.5) + uri (>= 0.13.1) + addressable (2.9.0) + public_suffix (>= 2.0.2, < 8.0) + afm (1.0.0) + async (2.37.0) + console (~> 1.29) + fiber-annotation + io-event (~> 1.11) + metrics (~> 0.12) + traces (~> 0.18) + base64 (0.3.0) + benchmark (0.5.0) + bigdecimal (3.3.1) coffee-script (2.4.1) coffee-script-source execjs - coffee-script-source (1.11.1) + coffee-script-source (1.12.2) colorator (1.1.0) - commonmarker (0.23.5) - concurrent-ruby (1.1.10) - dnsruby (1.61.9) - simpleidn (~> 0.1) + commonmarker (0.23.12) + concurrent-ruby (1.3.6) + connection_pool (3.0.2) + console (1.34.3) + fiber-annotation + fiber-local (~> 1.1) + json + csv (3.3.5) + dnsruby (1.73.1) + base64 (>= 0.2) + logger (~> 1.6) + simpleidn (~> 0.2.1) + drb (2.2.3) em-websocket (0.5.3) eventmachine (>= 0.12.9) http_parser.rb (~> 0) - ethon (0.15.0) + ethon (0.18.0) ffi (>= 1.15.0) + logger eventmachine (1.2.7) - execjs (2.8.1) - faraday (2.3.0) - faraday-net_http (~> 2.0) - ruby2_keywords (>= 0.0.4) - faraday-net_http (2.0.3) - ffi (1.15.5) + execjs (2.10.1) + faraday (2.14.1) + faraday-net_http (>= 2.0, < 3.5) + json + logger + faraday-net_http (3.4.2) + net-http (~> 0.5) + ffi (1.17.4-arm64-darwin) + fiber-annotation (0.2.0) + fiber-local (1.1.0) + fiber-storage + fiber-storage (1.0.1) forwardable-extended (2.6.0) - gemoji (3.0.1) - github-pages (227) - github-pages-health-check (= 1.17.9) - jekyll (= 3.9.2) - jekyll-avatar (= 0.7.0) - jekyll-coffeescript (= 1.1.1) - jekyll-commonmark-ghpages (= 0.2.0) - jekyll-default-layout (= 0.1.4) - jekyll-feed (= 0.15.1) + 
gemoji (4.1.0) + github-pages (232) + github-pages-health-check (= 1.18.2) + jekyll (= 3.10.0) + jekyll-avatar (= 0.8.0) + jekyll-coffeescript (= 1.2.2) + jekyll-commonmark-ghpages (= 0.5.1) + jekyll-default-layout (= 0.1.5) + jekyll-feed (= 0.17.0) jekyll-gist (= 1.5.0) - jekyll-github-metadata (= 2.13.0) + jekyll-github-metadata (= 2.16.1) jekyll-include-cache (= 0.2.1) jekyll-mentions (= 1.6.0) jekyll-optional-front-matter (= 0.3.2) @@ -68,40 +102,46 @@ GEM jekyll-theme-tactile (= 0.2.0) jekyll-theme-time-machine (= 0.2.0) jekyll-titles-from-headings (= 0.5.3) - jemoji (= 0.12.0) - kramdown (= 2.3.2) + jemoji (= 0.13.0) + kramdown (= 2.4.0) kramdown-parser-gfm (= 1.1.0) - liquid (= 4.0.3) + liquid (= 4.0.4) mercenary (~> 0.3) minima (= 2.5.1) - nokogiri (>= 1.13.6, < 2.0) - rouge (= 3.26.0) + nokogiri (>= 1.16.2, < 2.0) + rouge (= 3.30.0) terminal-table (~> 1.4) - github-pages-health-check (1.17.9) + webrick (~> 1.8) + github-pages-health-check (1.18.2) addressable (~> 2.3) dnsruby (~> 1.60) - octokit (~> 4.0) - public_suffix (>= 3.0, < 5.0) + octokit (>= 4, < 8) + public_suffix (>= 3.0, < 6.0) typhoeus (~> 1.3) - html-pipeline (2.14.2) + hashery (2.1.2) + html-pipeline (2.14.3) activesupport (>= 2) nokogiri (>= 1.4) - html-proofer (3.17.4) + html-proofer (5.2.1) addressable (~> 2.3) - mercenary (~> 0.3) - nokogumbo (~> 2.0) - parallel (~> 1.3) + async (~> 2.1) + benchmark (~> 0.5) + nokogiri (~> 1.13) + pdf-reader (~> 2.11) rainbow (~> 3.0) typhoeus (~> 1.3) yell (~> 2.0) - http_parser.rb (0.8.0) - i18n (0.9.5) + zeitwerk (~> 2.5) + http_parser.rb (0.8.1) + i18n (1.14.8) concurrent-ruby (~> 1.0) - jekyll (3.9.2) + io-event (1.11.2) + jekyll (3.10.0) addressable (~> 2.4) colorator (~> 1.0) + csv (~> 3.0) em-websocket (~> 0.5) - i18n (~> 0.7) + i18n (>= 0.7, < 2) jekyll-sass-converter (~> 1.0) jekyll-watch (~> 2.0) kramdown (>= 1.17, < 3) @@ -110,27 +150,28 @@ GEM pathutil (~> 0.9) rouge (>= 1.7, < 4) safe_yaml (~> 1.0) - jekyll-avatar (0.7.0) + webrick (>= 1.0) 
+ jekyll-avatar (0.8.0) jekyll (>= 3.0, < 5.0) - jekyll-coffeescript (1.1.1) + jekyll-coffeescript (1.2.2) coffee-script (~> 2.2) - coffee-script-source (~> 1.11.1) + coffee-script-source (~> 1.12) jekyll-commonmark (1.4.0) commonmarker (~> 0.22) - jekyll-commonmark-ghpages (0.2.0) - commonmarker (~> 0.23.4) - jekyll (~> 3.9.0) + jekyll-commonmark-ghpages (0.5.1) + commonmarker (>= 0.23.7, < 1.1.0) + jekyll (>= 3.9, < 4.0) jekyll-commonmark (~> 1.4.0) - rouge (>= 2.0, < 4.0) - jekyll-default-layout (0.1.4) - jekyll (~> 3.0) - jekyll-feed (0.15.1) + rouge (>= 2.0, < 5.0) + jekyll-default-layout (0.1.5) + jekyll (>= 3.0, < 5.0) + jekyll-feed (0.17.0) jekyll (>= 3.7, < 5.0) jekyll-gist (1.5.0) octokit (~> 4.2) - jekyll-github-metadata (2.13.0) + jekyll-github-metadata (2.16.1) jekyll (>= 3.4, < 5.0) - octokit (~> 4.0, != 4.4.0) + octokit (>= 4, < 7, != 4.4.0) jekyll-include-cache (0.2.1) jekyll (>= 3.7, < 5.0) jekyll-mentions (1.6.0) @@ -201,79 +242,91 @@ GEM jekyll (>= 3.3, < 5.0) jekyll-watch (2.2.1) listen (~> 3.0) - jemoji (0.12.0) - gemoji (~> 3.0) + jemoji (0.13.0) + gemoji (>= 3, < 5) html-pipeline (~> 2.2) jekyll (>= 3.0, < 5.0) - kramdown (2.3.2) + json (2.19.3) + kramdown (2.4.0) rexml kramdown-parser-gfm (1.1.0) kramdown (~> 2.0) - liquid (4.0.3) - listen (3.7.1) + liquid (4.0.4) + listen (3.10.0) + logger rb-fsevent (~> 0.10, >= 0.10.3) rb-inotify (~> 0.9, >= 0.9.10) + logger (1.7.0) mercenary (0.3.6) - mini_portile2 (2.8.0) + metrics (0.15.0) minima (2.5.1) jekyll (>= 3.5, < 5.0) jekyll-feed (~> 0.9) jekyll-seo-tag (~> 2.1) - minitest (5.16.2) - nokogiri (1.13.8) - mini_portile2 (~> 2.8.0) + minitest (6.0.3) + drb (~> 2.0) + prism (~> 1.5) + net-http (0.9.1) + uri (>= 0.11.1) + nokogiri (1.19.2-arm64-darwin) racc (~> 1.4) - nokogumbo (2.0.5) - nokogiri (~> 1.8, >= 1.8.4) octokit (4.25.1) faraday (>= 1, < 3) sawyer (~> 0.9) - parallel (1.22.1) pathutil (0.16.2) forwardable-extended (~> 2.6) - public_suffix (4.0.7) - racc (1.6.0) + pdf-reader (2.15.1) + 
Ascii85 (>= 1.0, < 3.0, != 2.0.0) + afm (>= 0.2.1, < 2) + hashery (~> 2.0) + ruby-rc4 + ttfunk + prism (1.9.0) + public_suffix (5.1.1) + racc (1.8.1) rainbow (3.1.1) - rb-fsevent (0.11.1) - rb-inotify (0.10.1) + rb-fsevent (0.11.2) + rb-inotify (0.11.1) ffi (~> 1.0) - rexml (3.2.5) - rouge (3.26.0) - ruby2_keywords (0.0.5) - rubyzip (2.3.2) + rexml (3.4.4) + rouge (3.30.0) + ruby-rc4 (0.1.5) + rubyzip (2.4.1) safe_yaml (1.0.5) sass (3.7.4) sass-listen (~> 4.0.0) sass-listen (4.0.0) rb-fsevent (~> 0.9, >= 0.9.4) rb-inotify (~> 0.9, >= 0.9.7) - sawyer (0.9.2) + sawyer (0.9.3) addressable (>= 2.3.5) faraday (>= 0.17.3, < 3) - simpleidn (0.2.1) - unf (~> 0.1.4) + securerandom (0.4.1) + simpleidn (0.2.3) terminal-table (1.8.0) unicode-display_width (~> 1.1, >= 1.1.1) - thread_safe (0.3.6) - typhoeus (1.4.0) - ethon (>= 0.9.0) - tzinfo (1.2.10) - thread_safe (~> 0.1) - unf (0.1.4) - unf_ext - unf_ext (0.0.8.2) + traces (0.18.2) + ttfunk (1.8.0) + bigdecimal (~> 3.1) + typhoeus (1.6.0) + ethon (>= 0.18.0) + tzinfo (2.0.6) + concurrent-ruby (~> 1.0) unicode-display_width (1.8.0) + uri (1.1.1) + webrick (1.9.2) yell (2.2.2) - zeitwerk (2.6.0) + zeitwerk (2.7.5) PLATFORMS - ruby + arm64-darwin-24 + x86_64-linux DEPENDENCIES github-pages - html-proofer (~> 3.0, < 3.18) + html-proofer jekyll-paginate jekyll-redirect-from BUNDLED WITH - 2.1.4 + 2.4.22 diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000..40eb3aa194 --- /dev/null +++ b/Makefile @@ -0,0 +1,30 @@ +# Makefile to build Jekyll and run HTMLProofer without YAML + +SITE_DIR := _site +LOG_FILE := htmlproofer-output.txt +BUNDLE ?= bundle + +all: clean build check + +build: + @echo "🔨 Building Jekyll site..." + $(BUNDLE) exec jekyll build + +check: + @echo "⏱️ Checking HTML links in $(SITE_DIR)..." 
+ @start=$$(date +%s); \ + $(BUNDLE) exec htmlproofer $(SITE_DIR) \ + --assume-extension .html \ + --ignore-missing-alt \ + --ignore-empty-alt \ + --only-4xx \ + --ignore-status-codes "429,403,400,415" \ + --ignore-urls "/github\.com\/programminghistorian/,/gutenberg\.org/,/espanol/,/deprecated/,/collection\.britishmuseum\.org/,/analytics\.hathitrust\.org/,/docnow\.io/,/doxygen\.nl/,/doi\.org\/10\.34190\/JEL\.17\.3\.002/,/doi\.org\/10\.22134\/trace\.82\.2022\.819/,/rubenalcaraz\.es\/manual-omeka\/?/,/web\.archive\.org\/web\/20180831094856\/http:\/\/www\.dlsi\.ua\.es\/~borja\/riilua\/6\.TopicModeling_v02\.pdf/" \ + --ignore-files "/_site\/assets\//,/_site\/blog\//,/_site\/posts\//,/_site\/en\/lessons\/retired\//,/_site\/es\/lecciones\/retirada\//,/_site\/fr\/lecons\/retrait\//,/_site\/pt\/licoes\/retiradas\//" \ + > $(LOG_FILE) 2>&1 \ + || echo "❌ HTMLProofer found issues. See $(LOG_FILE)"; \ + end=$$(date +%s); \ + echo "✅ Finished in $$((end-start)) seconds" + +clean: + rm -rf $(SITE_DIR) $(LOG_FILE) diff --git a/README.md b/README.md index fd752c8cff..edfaf5601f 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,13 @@ [![Current Build Status](https://img.shields.io/github/workflow/status/programminghistorian/jekyll/build_programming_historian)](https://github.com/programminghistorian/jekyll/actions/) [![Netlify Status](https://api.netlify.com/api/v1/badges/806a84e7-ac1c-4722-b9bb-a8d519e8bf47/deploy-status)](https://app.netlify.com/sites/ph-preview/deploys) [![DOI](https://zenodo.org/badge/12707869.svg)](https://zenodo.org/badge/latestdoi/12707869) --- -This is the main repository for the _Programming Historian_ (<http://programminghistorian.org>), where we keep the files for the live website. +This is the main repository for the _Programming Historian_ (<https://programminghistorian.org>), where we keep the files for the live website. For tutorials in submission, please see: [_Programming Historian Submissions_](https://github.com/programminghistorian/ph-submissions). 
If you have suggestions for the site or project, please click on the [Issues Tab](https://github.com/programminghistorian/jekyll/issues) above, and click [New Issue](https://github.com/programminghistorian/jekyll/issues/new) to describe your idea. Please note this will be public. If you would like to correspond with us privately, please contact [one of the current managing editors listed on our Project Team page](https://programminghistorian.org/en/project-team). -If you would like to contribute to the project, you can find detailed instructions for authors, reviewers, and editors on the [contributions page](http://programminghistorian.org/contribute) of the website. +If you would like to contribute to the project, you can find detailed instructions for authors, reviewers, and editors on the [contributions page](https://programminghistorian.org/contribute) of the website. For technical contributions to the project or to read about our policies for new language sub teams interested in creating a translation, please read our [Project Wiki](https://github.com/programminghistorian/jekyll/wiki). 
diff --git a/_build/build.sh b/_build/build.sh index 3ddf87a961..7821e7984c 100755 --- a/_build/build.sh +++ b/_build/build.sh @@ -2,7 +2,16 @@ # Build site, and then run htmlproofer to check for broken internal and external links -bundle exec jekyll build && htmlproofer _site --assume-extension --empty-alt-ignore --alt-ignore '/.*/' --file-ignore "/assets/,/retired/,/retirada/,/retrait/,/posts/" --timeframe '30d' --only-4xx --http-status-ignore 429,403,400 --url-ignore "/http://www.gutenberg.org/*/,/https://github.com/programminghistorian/jekyll/(commits|blob)/*/,/\#/,/espanol/,/deprecated/,/collection.britishmuseum.org/,/analytics.hathitrust.org/,/fr.wikipedia.org/wiki/,https://web.archive.org/web/20180831094856/http://www.dlsi.ua.es/~borja/riilua/6.TopicModeling_v02.pdf" --allow-hash-href +# bundle exec jekyll build && htmlproofer _site --assume-extension --empty-alt-ignore --alt-ignore '/.*/' --file-ignore "/assets/,/retired/,/retirada/,/retrait/,/posts/" --timeframe '30d' --only-4xx --http-status-ignore 429,403,400 --url-ignore "/http://www.gutenberg.org/*/,/https://github.com/programminghistorian/jekyll/(commits|blob)/*/,/\#/,/espanol/,/deprecated/,/collection.britishmuseum.org/,/analytics.hathitrust.org/,/fr.wikipedia.org/wiki/,https://web.archive.org/web/20180831094856/http://www.dlsi.ua.es/~borja/riilua/6.TopicModeling_v02.pdf" --allow-hash-href + +# html-proofer 5.x CLI: options are named --ignore-* and every list is a single comma-separated argument (same flag names as .github/workflows/build.yml and the Makefile). 
+bundle exec jekyll build && bundle exec htmlproofer _site \ + --assume-extension .html \ + --ignore-missing-alt --ignore-empty-alt \ + --only-4xx \ + --ignore-status-codes "429,403,400,415" \ + --ignore-files "/assets/,/en/lessons/retired/,/es/lecciones/retirada/,/fr/lecons/retrait/,/pt/licoes/retiradas/,/posts/,/blog/" \ + --ignore-urls "/https?:\\/\\/www\\.gutenberg\\.org\\/.*?/,/https://github.com/programminghistorian/.*/,/https://github.com/orgs/programminghistorian/.*/,/#/,/espanol/,/deprecated/,/collection.britishmuseum.org/,/analytics.hathitrust.org/,/fr.wikipedia.org/wiki/,/docnow.io/,/doxygen.nl/,/doi.org\\/10.34190\\/JEL.17.3.002/,/doi.org\\/10.22134\\/trace.82.2022.819/,/rubenalcaraz.es\\/manual-omeka\\/?/,/https://web.archive.org/web/20180831094856/http://www.dlsi.ua.es/~borja/riilua/6.TopicModeling_v02.pdf/"
 ## Updated version of htmlproofer commands (internal linking still doesn't work for some reason). # htmlproofer ./_site \ diff --git a/_config.yml b/_config.yml index 637451c402..2d4a9aa27b 100755 --- a/_config.yml +++ b/_config.yml @@ -2,7 +2,7 @@ name: Programming Historian description: Introductory and intermediate programming lessons for humanists # Used to insert the live url of the site in places that require a full link such as suggested citations and XML feed liveurl: https://programminghistorian.org -permalink: :categories/:title +# permalink: :categories/:title exclude: - deprecated - vendor @@ -19,70 +19,80 @@ plugins: paginate: 7 paginate_path: "/blog/page:num/" defaults: - - scope: - path: "" - values: - lang: "en" - scope: path: "en" values: lang: "en" + permalink: /en/:basename/ - scope: path: "es" values: lang: "es" + permalink: /es/:basename/ - scope: path: "fr" values: lang: "fr" + permalink: /fr/:basename/ - scope: path: "pt" values: lang: "pt" + permalink: /pt/:basename/ - scope: path: "en/lessons" values: lesson: true + permalink: /en/lessons/:basename/ - scope: path: "en/lessons/retired" values: lesson: false retired: true + permalink: /en/lessons/retired/:basename/ - scope: - path: "es/lecciones/retirada" - values: - lesson: false - retired: true - - scope: - path: "fr/lecons/retrait" + path: "es/lecciones" values: - lesson: false - retired: true + lesson: true + permalink: /es/lecciones/:basename/ - scope: - path: "pt/licoes/retirado" + path: "es/lecciones/retirada" values: lesson: false retired: true + permalink: /es/lecciones/retirada/:basename/ - scope: - path: "es/lecciones" + path: "fr/lecons" values: lesson: true + permalink: /fr/lecons/:basename/ - scope: - path: "fr/lecons" + path: 
"fr/lecons/retrait" values: - lesson: true + lesson: false + retired: true + permalink: /fr/lecons/retrait/:basename/ - scope: path: "pt/licoes" values: lesson: true + permalink: /pt/licoes/:basename/ - scope: - path: "fr/lecons" + path: "pt/licoes/retirado" values: - lesson: true + lesson: false + retired: true + permalink: /pt/licoes/retirado/:basename/ - scope: path: _posts values: category: posts + permalink: /:categories/:title/ + - scope: + path: "" + type: "pages" + values: + permalink: /:path/ # Flags to enable/disable "Support PH" alerts on lessons, and on sitewide banner above the navigation bar lesson_donation_alerts: true diff --git a/_data/ph_authors.yml b/_data/ph_authors.yml index 771759d5a4..d24e0ca4f2 100644 --- a/_data/ph_authors.yml +++ b/_data/ph_authors.yml @@ -21,8 +21,7 @@ - name: Adam Crymble github: acrymble email: a.crymble@ucl.ac.uk - twitter: Adam_Crymble - url: "http://adamcrymble.org" + url: "https://adamcrymble.org" team: true team_start: 2011 institution: University College London, United Kingdom @@ -111,9 +110,8 @@ Fred Gibbs é professor assistente de História na University of New Mexico. - name: Maria José Afanador-Llach - url: "http://www.mariajoseafanador.com" + url: "https://www.mariajoseafanador.com" email: mj.afanador28@uniandes.edu.co - twitter: mariajoafana github: mariajoafana orcid: 0000-0003-2652-5799 team: false @@ -143,7 +141,6 @@ - name: Víctor Gayol url: "https://victorgayol.net" email: vgayol@colmich.edu.mx - twitter: victor_gayol github: vgayolrs team: false team_start: 2016 @@ -176,9 +173,8 @@ status: volunteer - name: Antonio Rojas Castro - url: "http://antoniorojascastro.com" + url: "https://antoniorojascastro.com" email: rojas.castro.antonio@gmail.com - twitter: RojasCastroA github: arojascastro team: false team_start: 2016 @@ -232,7 +228,6 @@ Evan Taparata é doutoranda em História pela University of Minnesota. 
- name: Taryn Dewar - twitter: dtdewar team: false bio: en: | @@ -324,7 +319,6 @@ institution: University of Southampton github: drjwbaker email: j.w.baker@soton.ac.uk - twitter: j_w_baker url: "https://cradledincaricature.com/" sortname: Baker orcid: 0000-0002-2682-6922 @@ -541,11 +535,11 @@ team: false bio: en: | - Marten Düring is a historian, works as researcher in the Digital Humanities Lab at CVCE Luxembourg, runs http://historicalnetworkresearch.org and regularly teaches workshops on network analysis. + Marten Düring is a historian, works as researcher in the Digital Humanities Lab at CVCE Luxembourg, runs https://historicalnetworkresearch.org and regularly teaches workshops on network analysis. es: | - Marten Düring es historiador y trabaja como investigador en el Laboratorio de Humanidades Digitales del CVCE Luxembourg. Dirige http://historicalnetworkresearch.org y ofrece talleres sobre análisis de redes. + Marten Düring es historiador y trabaja como investigador en el Laboratorio de Humanidades Digitales del CVCE Luxembourg. Dirige https://historicalnetworkresearch.org y ofrece talleres sobre análisis de redes. pt: | - Marten Düring é historiador e trabalha como pesquisador no Laboratório de Humanidades Digitais do CVCE Luxemburgo. Dirige http://historicalnetworkresearch.org e regularmente leciona workshops sobre análise de redes. + Marten Düring é historiador e trabalha como pesquisador no Laboratório de Humanidades Digitais do CVCE Luxemburgo. Dirige https://historicalnetworkresearch.org e regularmente leciona workshops sobre análise de redes. 
- name: Max De Wilde team: false @@ -656,8 +650,7 @@ - name: Sarah Melton github: svmelton email: sarah.melton1@gmail.com - twitter: worldcatlady - url: "http://sarahvmelton.com" + url: "https://sarahvmelton.com" team: false team_start: 2019 team_end: 2022 @@ -732,9 +725,8 @@ - name: Brandon Walsh github: walshbr - twitter: walshbr email: walsh.brandon.michael@gmail.com - url: "http://walshbr.com/" + url: "https://walshbr.com/" team: false team_start: 2017 team_end: 2021 @@ -838,8 +830,7 @@ - name: Jessica Parr github: JMParr email: jparr1129@gmail.com - twitter: ProvAtlantic - url: "http://jessicaparr.org/" + url: "https://jessicaparr.org/" team: false team_start: 2017 team_end: 2025 @@ -932,7 +923,6 @@ - name: Nabeel Siddiqui github: nabsiddiqui orcid: 0000-0002-6126-5833 - twitter: nabsiddiqui email: siddiqui@susqu.edu url: "https://www.nabeelsiddiqui.net/" team: true @@ -988,7 +978,6 @@ - name: Anna-Maria Sichani github: amsichani email: amsichani@gmail.com - twitter: amsichani url: "https://amsichani.github.io" team: true team_start: 2018 @@ -1102,8 +1091,7 @@ Zoë Wilkinson Saldaña é licencianda pela University of Michigan School of Information, onde se foca na interseção de literacias críticas de dados, bibliotecas académicas e aprendizagem com informação de dados. 
- name: José Antonio Motilla - url: "http://www.bibliografiapotosina.mx/about/" - twitter: jamotilla + url: "https://www.bibliografiapotosina.mx/about/" email: jamotilla@gmail.com github: jamotilla team: false @@ -1136,7 +1124,6 @@ - name: Jennifer Isasi url: "https://jenniferisasi.github.io/" - twitter: jenniferisve email: espanol@programminghistorian.org github: jenniferisasi team: true @@ -1186,8 +1173,7 @@ - name: François Dominic Laramée github: fdlaramee - twitter: fdlaramee - url: "http://www.francoisdominiclaramee.com" + url: "https://www.francoisdominiclaramee.com" email: fdl@francoisdominiclaramee.com team: false team_start: 2018 @@ -1224,9 +1210,8 @@ - name: Martin Grandjean email: martin.grandjean@unil.ch - twitter: grandjeanmartin github: grandjeanmartin - url: "http://www.martingrandjean.ch" + url: "https://www.martingrandjean.ch" team: false team_start: 2019 team_end: 2021 @@ -1258,7 +1243,6 @@ - name: Sofia Papastamkou email: sofia.papastamkou@uni.lu - twitter: s_papastamkou github: spapastamkou team: false team_start: 2018 @@ -1307,7 +1291,6 @@ - name: Marie-Christine Boucher email: marie.c.boucher@ggk.uni-giessen.de - twitter: marie_c_boucher github: mariechristineb team: false team_start: 2020 @@ -1378,9 +1361,8 @@ - name: Zoe LeBlanc github: ZoeLeBlanc - twitter: zoe_leblanc email: zgleblanc@gmail.com - url: "http://zoeleblanc.com" + url: "https://zoeleblanc.com" team: false team_start: 2018 team_end: 2025 @@ -1428,7 +1410,6 @@ Stephen Krewson é doutorando em Inglês na Yale University. As suas pesquisas exploram a interação entre teorias progressivas de educação e a imprensa no início do século XIX. Tem um mestrado em Ciências da Computação (também em Yale) e começou a trabalhar em métodos eficientes para filtrar e recuperar tarefas nas bibliotecas digitais de larga escala. 
- name: Riva Quiroga - twitter: rivaquiroga email: rivaquiroga@gmail.com github: rivaquiroga url: "https://rivaquiroga.cl/" @@ -1458,7 +1439,6 @@ status: volunteer - name: Joshua G. Ortiz Baco - twitter: jgob email: joshuaortizbaco@gmail.com github: joshuagob orcid: 0000-0002-9723-4262 @@ -1493,7 +1473,6 @@ - name: Silvia Gutiérrez De la Torre github: silviaegt - twitter: espejolento email: silviaegt@gmail.com url: "https://sgutierrez.seewes.de/" orcid: 0000-0001-8717-2291 @@ -1619,7 +1598,6 @@ Jairo Melo é doutorado em História pelo El Colegio de Michoacán. É pesquisador independente em História Digital e Humanidades Digitais. - name: Daniel Alves - twitter: danielalvesfcsh email: dra@fcsh.unl.pt github: DanielAlvesLABDH orcid: 0000-0002-3541-8197 @@ -1697,7 +1675,6 @@ Miguel Cuadros is a historian from the Industrial University of Santander and Master of Arts in History, State University of New York (United States) and Professor at the Industrial University of Santander. He has teaching and research experience in topics related to cartography and geographical knowledge. 
- name: Aracele Torres - twitter: araceletorres email: aracele@protonmail.com url: "https://cibermundi.wordpress.com/" github: aracele @@ -1725,7 +1702,6 @@ - name: Hélène Huet email: hhuet@ufl.edu - twitter: superHH url: "https://helenehuet.org" orcid: 0000-0002-2907-1887 team: true @@ -1757,9 +1733,8 @@ - name: Alex Wermer-Colan github: hawc2 - twitter: alexwermercolan email: english@programminghistorian.org - url: "http://www.alexwermercolan.com/" + url: "https://www.alexwermercolan.com/" orcid: 0000-0001-7030-6070 team: true team_start: 2020 @@ -1789,7 +1764,6 @@ status: volunteer - name: Danielle Sanches - twitter: dani_sanches email: daniellesanches@fcsh.unl.pt url: "https://dhlab.unl.pt/" github: daniellesanchesDH @@ -1822,7 +1796,6 @@ status: volunteer - name: Joana Vieira Paulino - twitter: email: jpaulino@fcsh.unl.pt orcid: 0000-0002-9433-2799 github: joanacvp @@ -1925,7 +1898,6 @@ Andrew Akhlaghi é candidato a doutoramento no Departamento de História da University of Texas at Austin, especializado em Médio Oriente e humanidades digitais. - name: Jimmy Medeiros - twitter: jimmy_medeiros email: jimmy.medeiros@fgv.br url: "https://cpdoc.fgv.br/equipe/JimmyMedeiros" github: JimmyMedeiros82 @@ -1954,7 +1926,6 @@ status: volunteer - name: Matthias Gille Levenson - twitter: email: matthias.gille-levenson@ens-lyon.fr url: "https://cv.hal.science/matthias-gille-levenson" orcid: 0000-0001-9488-5986 @@ -1982,7 +1953,6 @@ - name: Gwenaëlle Patat email: gwenaelle.patat@gmail.com - twitter: InsSalom5 github: InesSalome orcid: 0000-0002-8520-3485 team: false @@ -2014,7 +1984,6 @@ Gwenaëlle Patat é engenheira na Maison des Sciences de l'Homme na Bretanha (MSHB) e correspondente local da infraestrutura de pesquisa TGIR Huma-Num. Mais precisamente, ela está encarregada da edição do corpus digital. Sua principal missão é fornecer suporte técnico a projetos de pesquisa rotulados pela MSHB e aconselhar sobre as boas práticas em gerenciamento de dados. 
- name: Antoine Henry - twitter: "@i_colab" email: antoine.henry@univ-lille.fr url: "https://pro.univ-lille.fr/antoine-henry/" orcid: 0000-0003-4722-4610 @@ -2048,7 +2017,6 @@ status: volunteer - name: Célian Ringwald - twitter: ringwald_c email: celian.ringwald@hotmail.fr github: datalogism team: false @@ -2143,7 +2111,6 @@ - name: Anisa Hawes github: anisa-hawes email: admin@programminghistorian.org - twitter: AnisaHawes url: "https://anisahawes.github.io/about/" team: true team_start: 2021 @@ -2210,7 +2177,6 @@ institution: Programming Historian github: tiagosousagarcia email: tiagosousagarcia@gmail.com - twitter: tiagosousagarci url: "https://tiagosousagarcia.co.uk/" sortname: Sousa Garcia orcid: 0000-0001-5694-7285 @@ -2239,7 +2205,6 @@ - name: Isabelle Gribomont orcid: 0000-0001-7443-5849 email: isabelle.gribomont@uclouvain.be - twitter: IsaGribomont github: isag91 team: true team_start: 2021 @@ -2266,7 +2231,6 @@ - name: Nicolás Llano Linares email: llano.nicolas@gmail.com - twitter: enetreseles github: nllano orcid: 0000-0001-7040-0673 team: true @@ -2337,7 +2301,6 @@ - name: Eric Brasil - twitter: ericbrasiln email: portugues@programminghistorian.org url: "https://ericbrasiln.github.io" github: ericbrasiln @@ -2393,7 +2356,6 @@ institution: University of Texas at Austin github: lizfischer email: lizzy.m.fischer@gmail.com - twitter: lizfischer0 url: "https://www.lizmfischer.com" sortname: Fischer orcid: 0000-0003-3749-094X @@ -2427,7 +2389,6 @@ institution: University of North Carolina at Chapel Hill github: rolrodr email: rolando@ad.unc.edu - twitter: bibliothiccario url: "https://rolandorodriguez.dev/" sortname: Rodriguez affiliation: @@ -2461,7 +2422,6 @@ institution: University of Helsinki github: yann-ryan email: yann.ryan@gmail.com - twitter: lievesofgrass url: "https://yann-ryan.github.io/" sortname: Ryan orcid: 0000-0003-1878-4838 @@ -2496,7 +2456,6 @@ institution: California State University, Northridge github: scottkleinman email: 
scott.kleinman@csun.edu - twitter: sekleinman url: https://scottkleinman.com/ sortname: Kleinman orcid: 0000-0001-7477-1308 @@ -2530,7 +2489,6 @@ institution: Denison University github: jrladd email: hello@jrladd.com - twitter: johnrladd url: "https://jrladd.com/" sortname: Ladd orcid: 0000-0002-5440-062X @@ -2564,7 +2522,6 @@ institution: Northeastern University github: giuliataurino email: g.taurino@northeastern.edu - twitter: giulia_taurino url: "https://ai.northeastern.edu/our-people/giulia-taurino" sortname: Taurino orcid: 0000-0002-1065-840X @@ -2873,7 +2830,6 @@ - name: David Valentine email: david.valentine@umontreal.ca github: davvalent - twitter: davvalent mastodon: mas.to/@davvalent orcid: 0000-0003-3410-7677 team: false @@ -2903,8 +2859,6 @@ - name: Émilien Schultz email: emilien.schultz@ensae.fr github: emilienschultz - twitter: emilienschultz - mastodon: orcid: 0000-0002-6215-3606 team: true team_start: 2023 @@ -3030,7 +2984,6 @@ orcid: 0000-0002-2461-9946 github: digitalkosovski email: acosovschi@gmail.com - twitter: cosovschi team: true team_start: 2024 institution: Inštitut za novejšo zgodovino @@ -3059,7 +3012,6 @@ orcid: 0000-0003-2208-5108 github: carlonim email: massimiliano.carloni@oeaw.ac.at - twitter: maxcarlons team: true team_start: 2024 institution: Austrian Centre for Digital Humanities and Cultural Heritage (ACDH-CH), Österreichische Akademie der Wissenschaften diff --git a/_data/snippets.yml b/_data/snippets.yml index 365018bad8..e4a25dec9d 100644 --- a/_data/snippets.yml +++ b/_data/snippets.yml @@ -209,13 +209,13 @@ menu-contribute-support-donate: link: /en/individual es: title: Apóyanos - Donaciones - link: /es/apoyanos#donaciones + link: /es/donaciones fr: title: Dons individuels - link: /fr/nous-soutenir#dons + link: /fr/dons pt: title: Apoie-nos - Doações - link: /pt/apoie-nos#doacoes + link: /pt/doacoes menu-lessons: en: title: Lessons diff --git a/_includes/analytics.html b/_includes/analytics.html index 80e43f8f6f..4c510e8f57 
100644 --- a/_includes/analytics.html +++ b/_includes/analytics.html @@ -2,7 +2,7 @@ Enables google-analytics site-wide {% endcomment %} - --> diff --git a/_includes/contact-info.html b/_includes/contact-info.html index 547f8fef79..4285b52918 100644 --- a/_includes/contact-info.html +++ b/_includes/contact-info.html @@ -49,10 +49,10 @@ {{ member.email }} {% endif %} {% if member.twitter %} - {{ member.twitter }} + {{ member.twitter }} {% endif %} {% if member.github %} - {{ member.github }} + {{ member.github }} {% endif %} {% if member.orcid %} {% include orcid.html author=member %} diff --git a/_includes/figure.html b/_includes/figure.html index eeec53ce09..9d60547990 100644 --- a/_includes/figure.html +++ b/_includes/figure.html @@ -1,5 +1,5 @@ {% comment %} -figure tags without plugin: http://stackoverflow.com/questions/19331362/using-an-image-caption-in-markdown-jekyll +figure tags without plugin: https://stackoverflow.com/questions/19331362/using-an-image-caption-in-markdown-jekyll If figure.html is being called from a lesson page, it collects the lesson slug from lesson-slug.html in order to compute the correct path to the image. Otherwise, it just appends the basepath and constructs the figure tag normally. diff --git a/_includes/footer.html b/_includes/footer.html index 0922b24251..51eec4f9f5 100644 --- a/_includes/footer.html +++ b/_includes/footer.html @@ -40,17 +40,17 @@ - + diff --git a/_includes/header.html b/_includes/header.html index fefad5ed55..d128c3c5ce 100644 --- a/_includes/header.html +++ b/_includes/header.html @@ -10,7 +10,7 @@ {% include twitter-card.html %} {% endif %} - + diff --git a/_includes/lesson-index.html b/_includes/lesson-index.html index 2eedd4b9d9..721beac7dc 100644 --- a/_includes/lesson-index.html +++ b/_includes/lesson-index.html @@ -4,7 +4,7 @@ {% endcomment %}
- @@ -86,13 +86,13 @@

{{ site.data.snippets.filtering-results[page.lang] }}:

- - + + {% if page.lang != "en" %} - + {% endif %} diff --git a/_includes/menu.html b/_includes/menu.html index 6c1cc9512d..3c9a57bf8d 100644 --- a/_includes/menu.html +++ b/_includes/menu.html @@ -26,82 +26,82 @@ aria-label="Toggle navigation"> - Programming + Programming Historian
@@ -74,15 +74,15 @@

London's Central Criminal Court, 1674 to 1913

- - -
- -
BENJAMIN BOWSEY, Breaking Peace > riot, 28th June 1780.

Reference Number: t17800628-33
Offence: Breaking Peace > riot
Verdict: Guilty
Punishment: Death
Navigation: < Previous text (trial account) | Next text (trial account) >

324. BENJAMIN BOWSEY (a blackmoor ) was indicted for that he together with five hundred other persons and more, did, unlawfully, riotously, and tumultuously assemble on the 6th of June to the disturbance of the public peace and did begin to demolish and pull down the dwelling house of Richard Akerman , against the form of the statute, &c.

ROSE JENNINGS , Esq. sworn.

Had you any occasion to be in this part of the town, on the 6th of June in the evening? - I dined with my brother who lives opposite Mr. Akerman's house. They attacked Mr. Akerman's house precisely at seven o'clock; they were preceded by a man better dressed than the rest, who went up to Mr. Akerman's door; he rapped three times, and I believe pulled the bell as often. Mr. Akerman had barrocadoed his house. When the man found that no one came, he went down the steps, made his obeisance to the mob, and pointed to the door, and then retired.

Have you any recollection how that man who you say had a better appearance than the rest was dressed? - I think he had on a dark brown coat and a round ha, but I cannot be particular as to that; the mob immediately following in that formidable manner made such an impression upon me, that I did not take notice. The mob approached about thirty in number, three a-breast, some with paving mattocks, others with iron crows and chissels; and then followed an innumerable company with bludgeons; they seemed to be the spokes of coach-wheels; they divided, some went to Mr. Akerman's door with the mattocks, some to the felons door, and some to the debtor's door. I was struck with the formidable appearance and order in which they divided and proceeded to destroy the place, the men threw their sticks up at the windows, which they broke and demolished, yet notwith standing these sticks were coming down in showers, two men with a bar, such as brewers servants carry on their shoulders,

attacked the parlour window to force it open. The window-shutters were exceedingly tough; they at last forced them partly open, but not quite. I then saw a man in a sailor's jacket helped up, he forced himself neck and heels into the window. They found the house-door still difficult to get open; before it was got open the other parlour window was opened and the mob were throwing the goods out at the window; at last the house-door gave way; about the same time some of the goods and furniture having been thrown out into the street, a fire was kindled.

They proceeded immediately to throw the goods out of the house? - Immediately. An equal degree of activity seemed to exhibit itself on the outside as within, one party to burn, the other to throw out the goods of Mr. Akerman. When the conflagration took place I applied my mind to the mob.

Was Mr. Akerman's house on fire then? - No. I was situated in the one-pair-of-stairs room, and could see what happened. I endeavoured to form a distinction between the active and inactive people. I thought I did so; the inactive people seemed to form a circle. I observed a person better dressed than the rest among those within the circle, who did not meddle, but seemed to be exciting and encouraging others. I saw several genteel looking men, and amongst them a black; there was one genteel man in particular, whose conduct I confess excited my indignation, and I took particular notice of him. I went down amongst the mob; I spoke to him; I made myself master of his voice; I believe if I was out of his sight I could swear to his voice; I have never seen that man since. When I first saw the black I turned to a lady and said, this is a motley crew, and of every colour. Mr. Akerman's house had then catched fire; the house in which I was was in extreme danger; my self with some others went down to desire the mob to prevent the houses of innocent people catching fire; and the mob were as active in saving those as in destroying Mr. Akerman's. I had no opportunity of making any remarks till I went to my station again, then I believe it was near nine o'clock; I heard a cry and a gingling of keys in the hands of some person; there were three or four genteel persons, but who had the keys I cannot say. Amongst them was the prisoner at the bar; he was without his hat, and his hands were down. I thought he might have his hat in his hand. The house I think was at that time destroyed; the roof was fallen in. Then those persons of the genteeler description moved off towards Smithfield, and amongst them was the prisoner.

You had observed the black in the mob before you went down? - I had.

Are you able to say who that black was? - No. Seeing this man afterwards I took it for granted it was him; I was certain to him the second time; he had his hat off in the middle of the mob.

Jury. You said his hands were down, did you see any thing in his hands? - No, I did not; I took it for granted he had his hat in his hand, not having it on his head.

Cross Examination.

There were I believe other blacks in the mob? - I never saw but one; I saw a black at first, but did not remark him so as to swear to him.

You could not swear to him I suppose from the difficulty every man has in his mind to swear to any black? - Yes.

There is more difficulty to swear to a black than to a white man? - No. The second time I made my remark too judiciously to err.

When was it you first saw the black? - After the goods were first set on fire, which was about a quarter after seven o'clock.

What dress had the black on? - Something of a dark colour, but my remark was on his face.

What w as remarkable in that man's face more than another black? - The make of his hair was one thing; the curls were out if he had had any; and his hair smooth on his head. His face was so exposed to my view the second time, that I could not be better situated to make any remark on his face.

His hair was the thing by which you knew him? - His hair and his face.

What was particular in his face? - I cannot distinguish it any other than from the weight of the impression it made on me.

Counsel for the Crown. Have you any doubt about him? - No.

ANN WOOD sworn.

I live at Mr. Jennings's, opposite Mr. Akerman's house.

Was you at home on the Tuesday evening when Mr. Akerman's house was attacked? - I was.

Did you in the course of that evening see the prisoner? - I did. It was a little after seven o'clock; I saw him in Mr. Akerman's two-pair-of-stairs room, he stood against the window with something in his hand and looked at me for some time before I observed particularly what he was doing. I looked at him then, and he took up something off the ground and held it up to me; when he held it up, I went down from the window into the dining-room; I came up again, and he was there still. He seemed to be looking in a drawer upon the floor, and seemed to be doing some thing up into a bundle.

You was in the two-pair-of-stairs room opposite him? - No, I was in the three-pair-of-stairs room.

Did you afterwards see him do any thing else? - He got up and looked at me and nodded his head at me; then I went down stairs.

You saw him again in the course of the evening? - Yes, I saw him an hour or two afterwards in the mob.

From the observation you made of his person are you sure that is the man? - That is the man.

Have you any doubt about it? - No, none at all.

Cross Examination.

What makes you so positive that this is the man? - I know his face perfectly again by his standing and looking at me so long.

You recollect him only by his face? - His face and his hair.

Did you see any other black there? - Yes, I did; not in the house but in the mob.

Could you swear to him? - I do not know that I could. I took more notice of this man than I did of any other.

Court. What were the other people doing when the prisoner was in the two-pair-of-stairs room? - Some of the mob were pulling the house down, and some were running in with the fire to set the parlour on fire.

Jury. How many times did you see this prisoner? - Two or three times.

Had he his hat on when you saw him? - Yes.

ANN LESSAR sworn.

Do you know the prisoner? - Yes.

Where do you live? - I lodge in the same lodging, in which the prisoner lodged; I took the lodging of him and the landlady.

Do you remember his coming to you and bringing you any stockings? - He gave me three pair of stockings to mark.

What mark did he bid you put upon them? - Any kind of mark to distinguish them at the washerwoman's. I put BB, the initials of his name upon them.

Had he left a trunk in the room? - Yes, the trunk was found there by the constable when he came; it was locked, he had the key of it.

Who had the key of the room? - I had; nobody could get at the box without my knowledge.

PERCIVAL PHILLIPS sworn.

I am a constable. I searched the lodging of the prisoner last Tuesday-week.

Did you find a trunk there? - I did.

Did you find any thing in that trunk? - Yes; these stockings, this pocket book, and a handkerchief. (producing them.)

Any thing else? - This key (producing it) was upon the shelf in the lodging.

Mr. RICHARD AKERMAN sworn.

This pocket-book, I believe, has been in my possession thirty years; it was, I believe, in one of the drawers belonging to my wife; here are several of my banker's cheques which had my name to them.

Look at the stockings? - Here is a very remarkable pair which I had made for me, and the maker wove the initials of my

name in them in open work; the prisoner has put the initials of his name (B B) over it; they were in the drawers in a one-pair of stairs room. Here are several others that were marked by my sister, they are mine; I believe the handkerchiefs to be mine, but there are no particular marks on them; there are a pair of stockings that were taken off the prisoner's legs, which has the name cut out.

To Phillips. Did you take them off the prisoner's legs? - I did.

To Mr. Akerman. Is the place that is cut out the place where the name was wove? - Yes. This is a remarkable key; it is a key of the Park, it has a crown and my name at length upon it.

To Lessar. Do you know any thing of the key that was found in the lodging? - No, it was on the shelf when he had the lodging?

Was it there when he left the lodging? - I believe it was there; I saw it once or twice; I never knew the meaning of the key.

Prisoner. My Lord, please to ask that woman if she did not wash the handkerchief the things were tied up in? - I washed a blue-and-white silk handkerchief, I cannot swear it was this, it was all over mud. I washed it on the Thursday, the first week that I was in the house.

Was that after the burning of Newgate? - Yes. I was not in town till it was burnt.

Prisoner. I leave my defence to my counsel and my witnesses.

For the prisoner.

Dr. SANDIMAN sworn.

Do you know the prisoner? - Yes, I knew him five years ago, he lived with a relation of mine; he bore an exceeding good character; he used to come backwards and forwards to my house.

ROBERT GATES sworn.

I am footman to Mr. Goodhousen in Golden Square.

Do you know the prisoner? - I do; I have known him perfectly well from the second day after he came to England, which is six years ago; he lived with a person I knew in America, that person gave him an excellent character, and he has always borne a good character since I knew him.

GRACE ROBERTS sworn.

The prisoner lay at our house the night that the prison was burnt.

What time did you see him that night? - I am not positive to the hour he came in, it was from nine to eleven o'clock.

What time did he come home? - I am not positive to the hour, it was a little after nine.

Are you positive of that? - Yes.

Where do you live? - At No. 3, in Berner's-street.

He came home a little after nine? - Yes, I am certain of it; he continued there all that night till six in the morning, and was never out of the house.

What day was that? - The 6th of June.

What day of the week? - I am not certain.

Are you sure it was the night the prison was burnt? - I am.

What prison? - I am not certain what prison, I heard it mentioned in the family that the prison was burnt down.

Cross Examination.

Who bid you to remember the 6th of June? - I remember it by the people being taken up.

When did you talk of its being the 6th of June? - I know he lay at our house on the 6th of June.

Did you take notice of any other night when he lay there? - No.

Did not he lie there on the 7th and 8th of June? - No, only that night.

You are an acquaintance of his? - Yes.

Is he a married man? - I cannot say.

Did he bring any body with him? - No.

Did he lie by himself? - Yes, I gave him a candle to light him to bed.

Did you know he was to lie there that night? - Yes, he told my fellow servant so.

You are a servant, are you? - Yes.

Did your master know that this man was to lie in the house? - I cannot tell.

Do you let such persons lie in the house without your master's knowledge? - He was an old servant, he lay in the servants hall.

Other servants lie there? - Yes, there was a black lay there.

JOHN NORTHINGTON (a Black) sworn.

I am servant to Mr. Wood.

Did the prisoner lie at your house? - Yes, on the night that Holbourn was on fire.

When the house of Mr. Langdale was on fire? - Yes, the man that lives in Holbourn.

Counsel for the Crown. That was on Wednesday night, the 7th?

To Roberts. Where did the prisoner use to sleep at other times? - In the same bed.

That was when he was a servant there? - Yes.

When he was not a servant there where did he sleep? - He never lay at our house when he was not a servant but that night; I cannot be positive to the night nor the day of the week; I say nothing but the truth.

Prisoner to Ann Wood . What dress had I on that night? - A light brownish coat, a round hat, and a red waistcoat.

GUILTY ( Death .)

Tried by the Second London Jury before Mr. Justice NARES.

-
-

View as XML

- -

+ + +
+ +
BENJAMIN BOWSEY, Breaking Peace > riot, 28th June 1780.

Reference Number: t17800628-33
Offence: Breaking Peace > riot
Verdict: Guilty
Punishment: Death
Navigation: < Previous text (trial account) | Next text (trial account) >

324. BENJAMIN BOWSEY (a blackmoor ) was indicted for that he together with five hundred other persons and more, did, unlawfully, riotously, and tumultuously assemble on the 6th of June to the disturbance of the public peace and did begin to demolish and pull down the dwelling house of Richard Akerman , against the form of the statute, &c.

ROSE JENNINGS , Esq. sworn.

Had you any occasion to be in this part of the town, on the 6th of June in the evening? - I dined with my brother who lives opposite Mr. Akerman's house. They attacked Mr. Akerman's house precisely at seven o'clock; they were preceded by a man better dressed than the rest, who went up to Mr. Akerman's door; he rapped three times, and I believe pulled the bell as often. Mr. Akerman had barrocadoed his house. When the man found that no one came, he went down the steps, made his obeisance to the mob, and pointed to the door, and then retired.

Have you any recollection how that man who you say had a better appearance than the rest was dressed? - I think he had on a dark brown coat and a round ha, but I cannot be particular as to that; the mob immediately following in that formidable manner made such an impression upon me, that I did not take notice. The mob approached about thirty in number, three a-breast, some with paving mattocks, others with iron crows and chissels; and then followed an innumerable company with bludgeons; they seemed to be the spokes of coach-wheels; they divided, some went to Mr. Akerman's door with the mattocks, some to the felons door, and some to the debtor's door. I was struck with the formidable appearance and order in which they divided and proceeded to destroy the place, the men threw their sticks up at the windows, which they broke and demolished, yet notwith standing these sticks were coming down in showers, two men with a bar, such as brewers servants carry on their shoulders,

attacked the parlour window to force it open. The window-shutters were exceedingly tough; they at last forced them partly open, but not quite. I then saw a man in a sailor's jacket helped up, he forced himself neck and heels into the window. They found the house-door still difficult to get open; before it was got open the other parlour window was opened and the mob were throwing the goods out at the window; at last the house-door gave way; about the same time some of the goods and furniture having been thrown out into the street, a fire was kindled.

They proceeded immediately to throw the goods out of the house? - Immediately. An equal degree of activity seemed to exhibit itself on the outside as within, one party to burn, the other to throw out the goods of Mr. Akerman. When the conflagration took place I applied my mind to the mob.

Was Mr. Akerman's house on fire then? - No. I was situated in the one-pair-of-stairs room, and could see what happened. I endeavoured to form a distinction between the active and inactive people. I thought I did so; the inactive people seemed to form a circle. I observed a person better dressed than the rest among those within the circle, who did not meddle, but seemed to be exciting and encouraging others. I saw several genteel looking men, and amongst them a black; there was one genteel man in particular, whose conduct I confess excited my indignation, and I took particular notice of him. I went down amongst the mob; I spoke to him; I made myself master of his voice; I believe if I was out of his sight I could swear to his voice; I have never seen that man since. When I first saw the black I turned to a lady and said, this is a motley crew, and of every colour. Mr. Akerman's house had then catched fire; the house in which I was was in extreme danger; my self with some others went down to desire the mob to prevent the houses of innocent people catching fire; and the mob were as active in saving those as in destroying Mr. Akerman's. I had no opportunity of making any remarks till I went to my station again, then I believe it was near nine o'clock; I heard a cry and a gingling of keys in the hands of some person; there were three or four genteel persons, but who had the keys I cannot say. Amongst them was the prisoner at the bar; he was without his hat, and his hands were down. I thought he might have his hat in his hand. The house I think was at that time destroyed; the roof was fallen in. Then those persons of the genteeler description moved off towards Smithfield, and amongst them was the prisoner.

You had observed the black in the mob before you went down? - I had.

Are you able to say who that black was? - No. Seeing this man afterwards I took it for granted it was him; I was certain to him the second time; he had his hat off in the middle of the mob.

Jury. You said his hands were down, did you see any thing in his hands? - No, I did not; I took it for granted he had his hat in his hand, not having it on his head.

Cross Examination.

There were I believe other blacks in the mob? - I never saw but one; I saw a black at first, but did not remark him so as to swear to him.

You could not swear to him I suppose from the difficulty every man has in his mind to swear to any black? - Yes.

There is more difficulty to swear to a black than to a white man? - No. The second time I made my remark too judiciously to err.

When was it you first saw the black? - After the goods were first set on fire, which was about a quarter after seven o'clock.

What dress had the black on? - Something of a dark colour, but my remark was on his face.

What was remarkable in that man's face more than another black? - The make of his hair was one thing; the curls were out if he had had any; and his hair smooth on his head. His face was so exposed to my view the second time, that I could not be better situated to make any remark on his face.

His hair was the thing by which you knew him? - His hair and his face.

What was particular in his face? - I cannot distinguish it any other than from the weight of the impression it made on me.

Counsel for the Crown. Have you any doubt about him? - No.

ANN WOOD sworn.

I live at Mr. Jennings's, opposite Mr. Akerman's house.

Was you at home on the Tuesday evening when Mr. Akerman's house was attacked? - I was.

Did you in the course of that evening see the prisoner? - I did. It was a little after seven o'clock; I saw him in Mr. Akerman's two-pair-of-stairs room, he stood against the window with something in his hand and looked at me for some time before I observed particularly what he was doing. I looked at him then, and he took up something off the ground and held it up to me; when he held it up, I went down from the window into the dining-room; I came up again, and he was there still. He seemed to be looking in a drawer upon the floor, and seemed to be doing some thing up into a bundle.

You was in the two-pair-of-stairs room opposite him? - No, I was in the three-pair-of-stairs room.

Did you afterwards see him do any thing else? - He got up and looked at me and nodded his head at me; then I went down stairs.

You saw him again in the course of the evening? - Yes, I saw him an hour or two afterwards in the mob.

From the observation you made of his person are you sure that is the man? - That is the man.

Have you any doubt about it? - No, none at all.

Cross Examination.

What makes you so positive that this is the man? - I know his face perfectly again by his standing and looking at me so long.

You recollect him only by his face? - His face and his hair.

Did you see any other black there? - Yes, I did; not in the house but in the mob.

Could you swear to him? - I do not know that I could. I took more notice of this man than I did of any other.

Court. What were the other people doing when the prisoner was in the two-pair-of-stairs room? - Some of the mob were pulling the house down, and some were running in with the fire to set the parlour on fire.

Jury. How many times did you see this prisoner? - Two or three times.

Had he his hat on when you saw him? - Yes.

ANN LESSAR sworn.

Do you know the prisoner? - Yes.

Where do you live? - I lodge in the same lodging, in which the prisoner lodged; I took the lodging of him and the landlady.

Do you remember his coming to you and bringing you any stockings? - He gave me three pair of stockings to mark.

What mark did he bid you put upon them? - Any kind of mark to distinguish them at the washerwoman's. I put BB, the initials of his name upon them.

Had he left a trunk in the room? - Yes, the trunk was found there by the constable when he came; it was locked, he had the key of it.

Who had the key of the room? - I had; nobody could get at the box without my knowledge.

PERCIVAL PHILLIPS sworn.

I am a constable. I searched the lodging of the prisoner last Tuesday-week.

Did you find a trunk there? - I did.

Did you find any thing in that trunk? - Yes; these stockings, this pocket book, and a handkerchief. (producing them.)

Any thing else? - This key (producing it) was upon the shelf in the lodging.

Mr. RICHARD AKERMAN sworn.

This pocket-book, I believe, has been in my possession thirty years; it was, I believe, in one of the drawers belonging to my wife; here are several of my banker's cheques which had my name to them.

Look at the stockings? - Here is a very remarkable pair which I had made for me, and the maker wove the initials of my

name in them in open work; the prisoner has put the initials of his name (B B) over it; they were in the drawers in a one-pair of stairs room. Here are several others that were marked by my sister, they are mine; I believe the handkerchiefs to be mine, but there are no particular marks on them; there are a pair of stockings that were taken off the prisoner's legs, which has the name cut out.

To Phillips. Did you take them off the prisoner's legs? - I did.

To Mr. Akerman. Is the place that is cut out the place where the name was wove? - Yes. This is a remarkable key; it is a key of the Park, it has a crown and my name at length upon it.

To Lessar. Do you know any thing of the key that was found in the lodging? - No, it was on the shelf when he had the lodging?

Was it there when he left the lodging? - I believe it was there; I saw it once or twice; I never knew the meaning of the key.

Prisoner. My Lord, please to ask that woman if she did not wash the handkerchief the things were tied up in? - I washed a blue-and-white silk handkerchief, I cannot swear it was this, it was all over mud. I washed it on the Thursday, the first week that I was in the house.

Was that after the burning of Newgate? - Yes. I was not in town till it was burnt.

Prisoner. I leave my defence to my counsel and my witnesses.

For the prisoner.

Dr. SANDIMAN sworn.

Do you know the prisoner? - Yes, I knew him five years ago, he lived with a relation of mine; he bore an exceeding good character; he used to come backwards and forwards to my house.

ROBERT GATES sworn.

I am footman to Mr. Goodhousen in Golden Square.

Do you know the prisoner? - I do; I have known him perfectly well from the second day after he came to England, which is six years ago; he lived with a person I knew in America, that person gave him an excellent character, and he has always borne a good character since I knew him.

GRACE ROBERTS sworn.

The prisoner lay at our house the night that the prison was burnt.

What time did you see him that night? - I am not positive to the hour he came in, it was from nine to eleven o'clock.

What time did he come home? - I am not positive to the hour, it was a little after nine.

Are you positive of that? - Yes.

Where do you live? - At No. 3, in Berner's-street.

He came home a little after nine? - Yes, I am certain of it; he continued there all that night till six in the morning, and was never out of the house.

What day was that? - The 6th of June.

What day of the week? - I am not certain.

Are you sure it was the night the prison was burnt? - I am.

What prison? - I am not certain what prison, I heard it mentioned in the family that the prison was burnt down.

Cross Examination.

Who bid you to remember the 6th of June? - I remember it by the people being taken up.

When did you talk of its being the 6th of June? - I know he lay at our house on the 6th of June.

Did you take notice of any other night when he lay there? - No.

Did not he lie there on the 7th and 8th of June? - No, only that night.

You are an acquaintance of his? - Yes.

Is he a married man? - I cannot say.

Did he bring any body with him? - No.

Did he lie by himself? - Yes, I gave him a candle to light him to bed.

Did you know he was to lie there that night? - Yes, he told my fellow servant so.

You are a servant, are you? - Yes.

Did your master know that this man was to lie in the house? - I cannot tell.

Do you let such persons lie in the house without your master's knowledge? - He was an old servant, he lay in the servants hall.

Other servants lie there? - Yes, there was a black lay there.

JOHN NORTHINGTON (a Black) sworn.

I am servant to Mr. Wood.

Did the prisoner lie at your house? - Yes, on the night that Holbourn was on fire.

When the house of Mr. Langdale was on fire? - Yes, the man that lives in Holbourn.

Counsel for the Crown. That was on Wednesday night, the 7th?

To Roberts. Where did the prisoner use to sleep at other times? - In the same bed.

That was when he was a servant there? - Yes.

When he was not a servant there where did he sleep? - He never lay at our house when he was not a servant but that night; I cannot be positive to the night nor the day of the week; I say nothing but the truth.

Prisoner to Ann Wood . What dress had I on that night? - A light brownish coat, a round hat, and a red waistcoat.

GUILTY ( Death .)

Tried by the Second London Jury before Mr. Justice NARES.

+
+

View as XML

+ +

@@ -195,8 +195,8 @@

Footer

@@ -204,5 +204,5 @@

Footer

- - + + diff --git a/assets/googlemaps-googleearth/UK.Global.Fat.Supply.1894-1896.-.Sheet1.csv.zip b/assets/googlemaps-googleearth/UKGlobalFatSupply1894-1896.csv.zip similarity index 100% rename from assets/googlemaps-googleearth/UK.Global.Fat.Supply.1894-1896.-.Sheet1.csv.zip rename to assets/googlemaps-googleearth/UKGlobalFatSupply1894-1896.csv.zip diff --git a/assets/interactive-text-games-using-twine/First Day in the Office.html b/assets/interactive-text-games-using-twine/First Day in the Office.html deleted file mode 100644 index 10c1ef6e37..0000000000 --- a/assets/interactive-text-games-using-twine/First Day in the Office.html +++ /dev/null @@ -1,195 +0,0 @@ - - - - - -First Day in the Office - - - - - - - - - diff --git a/assets/mapping-with-python-leaflet/exercises/exercise00 - original/mymap.html b/assets/mapping-with-python-leaflet/exercises/exercise00 - original/mymap.html index d4d396c05e..98297c9332 100644 --- a/assets/mapping-with-python-leaflet/exercises/exercise00 - original/mymap.html +++ b/assets/mapping-with-python-leaflet/exercises/exercise00 - original/mymap.html @@ -1,7 +1,7 @@ - - + + diff --git a/assets/mapping-with-python-leaflet/exercises/exercise01/mymap.html b/assets/mapping-with-python-leaflet/exercises/exercise01/mymap.html index 97e0847002..5f6eedcf3b 100644 --- a/assets/mapping-with-python-leaflet/exercises/exercise01/mymap.html +++ b/assets/mapping-with-python-leaflet/exercises/exercise01/mymap.html @@ -1,7 +1,7 @@ - - + + diff --git a/assets/mapping-with-python-leaflet/exercises/exercise02/mymap.html b/assets/mapping-with-python-leaflet/exercises/exercise02/mymap.html index d4d396c05e..98297c9332 100644 --- a/assets/mapping-with-python-leaflet/exercises/exercise02/mymap.html +++ b/assets/mapping-with-python-leaflet/exercises/exercise02/mymap.html @@ -1,7 +1,7 @@ - - + + diff --git a/assets/mapping-with-python-leaflet/exercises/exercise03/mymap.html b/assets/mapping-with-python-leaflet/exercises/exercise03/mymap.html index 
d4d396c05e..98297c9332 100644 --- a/assets/mapping-with-python-leaflet/exercises/exercise03/mymap.html +++ b/assets/mapping-with-python-leaflet/exercises/exercise03/mymap.html @@ -1,7 +1,7 @@ - - + + diff --git a/assets/mapping-with-python-leaflet/exercises/exercise04/mymap.html b/assets/mapping-with-python-leaflet/exercises/exercise04/mymap.html index 9afce75f7b..e6df973e18 100644 --- a/assets/mapping-with-python-leaflet/exercises/exercise04/mymap.html +++ b/assets/mapping-with-python-leaflet/exercises/exercise04/mymap.html @@ -1,7 +1,7 @@ - - + + diff --git a/assets/mapping-with-python-leaflet/exercises/exercise05/mymap.html b/assets/mapping-with-python-leaflet/exercises/exercise05/mymap.html index 86c3b598cd..dd2ba7580e 100644 --- a/assets/mapping-with-python-leaflet/exercises/exercise05/mymap.html +++ b/assets/mapping-with-python-leaflet/exercises/exercise05/mymap.html @@ -1,7 +1,7 @@ - - + + diff --git a/assets/mapping-with-python-leaflet/map/mymap-onepage.html b/assets/mapping-with-python-leaflet/map/mymap-onepage.html index 335a699363..af5a7fa346 100644 --- a/assets/mapping-with-python-leaflet/map/mymap-onepage.html +++ b/assets/mapping-with-python-leaflet/map/mymap-onepage.html @@ -1,7 +1,7 @@ - - + + diff --git a/assets/mapping-with-python-leaflet/map/mymap.html b/assets/mapping-with-python-leaflet/map/mymap.html index d4d396c05e..98297c9332 100644 --- a/assets/mapping-with-python-leaflet/map/mymap.html +++ b/assets/mapping-with-python-leaflet/map/mymap.html @@ -1,7 +1,7 @@ - - + + diff --git a/assets/normaliser-donnees-textuelles-python/obo-t17800628-33.html b/assets/normaliser-donnees-textuelles-python/obo-t17800628-33.html index b24ffcff1d..d4aed0a70a 100644 --- a/assets/normaliser-donnees-textuelles-python/obo-t17800628-33.html +++ b/assets/normaliser-donnees-textuelles-python/obo-t17800628-33.html @@ -1,19 +1,19 @@ - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + Browse - Central Criminal Court @@ -64,7 +64,7 @@

London's Central Criminal Court, 1674 to 1913

>
  • Historical Background
  • The Project
  • Contact

  • @@ -74,15 +74,15 @@

    London's Central Criminal Court, 1674 to 1913

    - - -
    - -
    BENJAMIN BOWSEY, Breaking Peace > riot, 28th June 1780.

    Reference Number: t17800628-33
    Offence: Breaking Peace > riot
    Verdict: Guilty
    Punishment: Death
    Navigation: < Previous text (trial account) | Next text (trial account) >

    324. BENJAMIN BOWSEY (a blackmoor ) was indicted for that he together with five hundred other persons and more, did, unlawfully, riotously, and tumultuously assemble on the 6th of June to the disturbance of the public peace and did begin to demolish and pull down the dwelling house of Richard Akerman , against the form of the statute, &c.

    ROSE JENNINGS , Esq. sworn.

    Had you any occasion to be in this part of the town, on the 6th of June in the evening? - I dined with my brother who lives opposite Mr. Akerman's house. They attacked Mr. Akerman's house precisely at seven o'clock; they were preceded by a man better dressed than the rest, who went up to Mr. Akerman's door; he rapped three times, and I believe pulled the bell as often. Mr. Akerman had barrocadoed his house. When the man found that no one came, he went down the steps, made his obeisance to the mob, and pointed to the door, and then retired.

    Have you any recollection how that man who you say had a better appearance than the rest was dressed? - I think he had on a dark brown coat and a round hat, but I cannot be particular as to that; the mob immediately following in that formidable manner made such an impression upon me, that I did not take notice. The mob approached about thirty in number, three a-breast, some with paving mattocks, others with iron crows and chissels; and then followed an innumerable company with bludgeons; they seemed to be the spokes of coach-wheels; they divided, some went to Mr. Akerman's door with the mattocks, some to the felons door, and some to the debtor's door. I was struck with the formidable appearance and order in which they divided and proceeded to destroy the place, the men threw their sticks up at the windows, which they broke and demolished, yet notwithstanding these sticks were coming down in showers, two men with a bar, such as brewers servants carry on their shoulders,

    attacked the parlour window to force it open. The window-shutters were exceedingly tough; they at last forced them partly open, but not quite. I then saw a man in a sailor's jacket helped up, he forced himself neck and heels into the window. They found the house-door still difficult to get open; before it was got open the other parlour window was opened and the mob were throwing the goods out at the window; at last the house-door gave way; about the same time some of the goods and furniture having been thrown out into the street, a fire was kindled.

    They proceeded immediately to throw the goods out of the house? - Immediately. An equal degree of activity seemed to exhibit itself on the outside as within, one party to burn, the other to throw out the goods of Mr. Akerman. When the conflagration took place I applied my mind to the mob.

    Was Mr. Akerman's house on fire then? - No. I was situated in the one-pair-of-stairs room, and could see what happened. I endeavoured to form a distinction between the active and inactive people. I thought I did so; the inactive people seemed to form a circle. I observed a person better dressed than the rest among those within the circle, who did not meddle, but seemed to be exciting and encouraging others. I saw several genteel looking men, and amongst them a black; there was one genteel man in particular, whose conduct I confess excited my indignation, and I took particular notice of him. I went down amongst the mob; I spoke to him; I made myself master of his voice; I believe if I was out of his sight I could swear to his voice; I have never seen that man since. When I first saw the black I turned to a lady and said, this is a motley crew, and of every colour. Mr. Akerman's house had then catched fire; the house in which I was was in extreme danger; my self with some others went down to desire the mob to prevent the houses of innocent people catching fire; and the mob were as active in saving those as in destroying Mr. Akerman's. I had no opportunity of making any remarks till I went to my station again, then I believe it was near nine o'clock; I heard a cry and a gingling of keys in the hands of some person; there were three or four genteel persons, but who had the keys I cannot say. Amongst them was the prisoner at the bar; he was without his hat, and his hands were down. I thought he might have his hat in his hand. The house I think was at that time destroyed; the roof was fallen in. Then those persons of the genteeler description moved off towards Smithfield, and amongst them was the prisoner.

    You had observed the black in the mob before you went down? - I had.

    Are you able to say who that black was? - No. Seeing this man afterwards I took it for granted it was him; I was certain to him the second time; he had his hat off in the middle of the mob.

    Jury. You said his hands were down, did you see any thing in his hands? - No, I did not; I took it for granted he had his hat in his hand, not having it on his head.

    Cross Examination.

    There were I believe other blacks in the mob? - I never saw but one; I saw a black at first, but did not remark him so as to swear to him.

    You could not swear to him I suppose from the difficulty every man has in his mind to swear to any black? - Yes.

    There is more difficulty to swear to a black than to a white man? - No. The second time I made my remark too judiciously to err.

    When was it you first saw the black? - After the goods were first set on fire, which was about a quarter after seven o'clock.

    What dress had the black on? - Something of a dark colour, but my remark was on his face.

    What was remarkable in that man's face more than another black? - The make of his hair was one thing; the curls were out if he had had any; and his hair smooth on his head. His face was so exposed to my view the second time, that I could not be better situated to make any remark on his face.

    His hair was the thing by which you knew him? - His hair and his face.

    What was particular in his face? - I cannot distinguish it any other than from the weight of the impression it made on me.

    Counsel for the Crown. Have you any doubt about him? - No.

    ANN WOOD sworn.

    I live at Mr. Jennings's, opposite Mr. Akerman's house.

    Was you at home on the Tuesday evening when Mr. Akerman's house was attacked? - I was.

    Did you in the course of that evening see the prisoner? - I did. It was a little after seven o'clock; I saw him in Mr. Akerman's two-pair-of-stairs room, he stood against the window with something in his hand and looked at me for some time before I observed particularly what he was doing. I looked at him then, and he took up something off the ground and held it up to me; when he held it up, I went down from the window into the dining-room; I came up again, and he was there still. He seemed to be looking in a drawer upon the floor, and seemed to be doing some thing up into a bundle.

    You was in the two-pair-of-stairs room opposite him? - No, I was in the three-pair-of-stairs room.

    Did you afterwards see him do any thing else? - He got up and looked at me and nodded his head at me; then I went down stairs.

    You saw him again in the course of the evening? - Yes, I saw him an hour or two afterwards in the mob.

    From the observation you made of his person are you sure that is the man? - That is the man.

    Have you any doubt about it? - No, none at all.

    Cross Examination.

    What makes you so positive that this is the man? - I know his face perfectly again by his standing and looking at me so long.

    You recollect him only by his face? - His face and his hair.

    Did you see any other black there? - Yes, I did; not in the house but in the mob.

    Could you swear to him? - I do not know that I could. I took more notice of this man than I did of any other.

    Court. What were the other people doing when the prisoner was in the two-pair-of-stairs room? - Some of the mob were pulling the house down, and some were running in with the fire to set the parlour on fire.

    Jury. How many times did you see this prisoner? - Two or three times.

    Had he his hat on when you saw him? - Yes.

    ANN LESSAR sworn.

    Do you know the prisoner? - Yes.

    Where do you live? - I lodge in the same lodging, in which the prisoner lodged; I took the lodging of him and the landlady.

    Do you remember his coming to you and bringing you any stockings? - He gave me three pair of stockings to mark.

    What mark did he bid you put upon them? - Any kind of mark to distinguish them at the washerwoman's. I put BB, the initials of his name upon them.

    Had he left a trunk in the room? - Yes, the trunk was found there by the constable when he came; it was locked, he had the key of it.

    Who had the key of the room? - I had; nobody could get at the box without my knowledge.

    PERCIVAL PHILLIPS sworn.

    I am a constable. I searched the lodging of the prisoner last Tuesday-week.

    Did you find a trunk there? - I did.

    Did you find any thing in that trunk? - Yes; these stockings, this pocket book, and a handkerchief. (producing them.)

    Any thing else? - This key (producing it) was upon the shelf in the lodging.

    Mr. RICHARD AKERMAN sworn.

    This pocket-book, I believe, has been in my possession thirty years; it was, I believe, in one of the drawers belonging to my wife; here are several of my banker's cheques which had my name to them.

    Look at the stockings? - Here is a very remarkable pair which I had made for me, and the maker wove the initials of my

    name in them in open work; the prisoner has put the initials of his name (B B) over it; they were in the drawers in a one-pair of stairs room. Here are several others that were marked by my sister, they are mine; I believe the handkerchiefs to be mine, but there are no particular marks on them; there are a pair of stockings that were taken off the prisoner's legs, which has the name cut out.

    To Phillips. Did you take them off the prisoner's legs? - I did.

    To Mr. Akerman. Is the place that is cut out the place where the name was wove? - Yes. This is a remarkable key; it is a key of the Park, it has a crown and my name at length upon it.

    To Lessar. Do you know any thing of the key that was found in the lodging? - No, it was on the shelf when he had the lodging?

    Was it there when he left the lodging? - I believe it was there; I saw it once or twice; I never knew the meaning of the key.

    Prisoner. My Lord, please to ask that woman if she did not wash the handkerchief the things were tied up in? - I washed a blue-and-white silk handkerchief, I cannot swear it was this, it was all over mud. I washed it on the Thursday, the first week that I was in the house.

    Was that after the burning of Newgate? - Yes. I was not in town till it was burnt.

    Prisoner. I leave my defence to my counsel and my witnesses.

    For the prisoner.

    Dr. SANDIMAN sworn.

    Do you know the prisoner? - Yes, I knew him five years ago, he lived with a relation of mine; he bore an exceeding good character; he used to come backwards and forwards to my house.

    ROBERT GATES sworn.

    I am footman to Mr. Goodhousen in Golden Square.

    Do you know the prisoner? - I do; I have known him perfectly well from the second day after he came to England, which is six years ago; he lived with a person I knew in America, that person gave him an excellent character, and he has always borne a good character since I knew him.

    GRACE ROBERTS sworn.

    The prisoner lay at our house the night that the prison was burnt.

    What time did you see him that night? - I am not positive to the hour he came in, it was from nine to eleven o'clock.

    What time did he come home? - I am not positive to the hour, it was a little after nine.

    Are you positive of that? - Yes.

    Where do you live? - At No. 3, in Berner's-street.

    He came home a little after nine? - Yes, I am certain of it; he continued there all that night till six in the morning, and was never out of the house.

    What day was that? - The 6th of June.

    What day of the week? - I am not certain.

    Are you sure it was the night the prison was burnt? - I am.

    What prison? - I am not certain what prison, I heard it mentioned in the family that the prison was burnt down.

    Cross Examination.

    Who bid you to remember the 6th of June? - I remember it by the people being taken up.

    When did you talk of its being the 6th of June? - I know he lay at our house on the 6th of June.

    Did you take notice of any other night when he lay there? - No.

    Did not he lie there on the 7th and 8th of June? - No, only that night.

    You are an acquaintance of his? - Yes.

    Is he a married man? - I cannot say.

    Did he bring any body with him? - No.

    Did he lie by himself? - Yes, I gave him a candle to light him to bed.

    Did you know he was to lie there that night? - Yes, he told my fellow servant so.

    You are a servant, are you? - Yes.

    Did your master know that this man was to lie in the house? - I cannot tell.

    Do you let such persons lie in the house without your master's knowledge? - He was an old servant, he lay in the servants hall.

    Other servants lie there? - Yes, there was a black lay there.

    JOHN NORTHINGTON (a Black) sworn.

    I am servant to Mr. Wood.

    Did the prisoner lie at your house? - Yes, on the night that Holbourn was on fire.

    When the house of Mr. Langdale was on fire? - Yes, the man that lives in Holbourn.

    Counsel for the Crown. That was on Wednesday night, the 7th?

    To Roberts. Where did the prisoner use to sleep at other times? - In the same bed.

    That was when he was a servant there? - Yes.

    When he was not a servant there where did he sleep? - He never lay at our house when he was not a servant but that night; I cannot be positive to the night nor the day of the week; I say nothing but the truth.

    Prisoner to Ann Wood . What dress had I on that night? - A light brownish coat, a round hat, and a red waistcoat.

    GUILTY ( Death .)

    Tried by the Second London Jury before Mr. Justice NARES.

    -
    -

    View as XML

    - -

    + + +
    + +
    BENJAMIN BOWSEY, Breaking Peace > riot, 28th June 1780.

    Reference Number: t17800628-33
    Offence: Breaking Peace > riot
    Verdict: Guilty
    Punishment: Death
    Navigation: < Previous text (trial account) | Next text (trial account) >

    324. BENJAMIN BOWSEY (a blackmoor ) was indicted for that he together with five hundred other persons and more, did, unlawfully, riotously, and tumultuously assemble on the 6th of June to the disturbance of the public peace and did begin to demolish and pull down the dwelling house of Richard Akerman , against the form of the statute, &c.

    ROSE JENNINGS , Esq. sworn.

    Had you any occasion to be in this part of the town, on the 6th of June in the evening? - I dined with my brother who lives opposite Mr. Akerman's house. They attacked Mr. Akerman's house precisely at seven o'clock; they were preceded by a man better dressed than the rest, who went up to Mr. Akerman's door; he rapped three times, and I believe pulled the bell as often. Mr. Akerman had barrocadoed his house. When the man found that no one came, he went down the steps, made his obeisance to the mob, and pointed to the door, and then retired.

    Have you any recollection how that man who you say had a better appearance than the rest was dressed? - I think he had on a dark brown coat and a round hat, but I cannot be particular as to that; the mob immediately following in that formidable manner made such an impression upon me, that I did not take notice. The mob approached about thirty in number, three a-breast, some with paving mattocks, others with iron crows and chissels; and then followed an innumerable company with bludgeons; they seemed to be the spokes of coach-wheels; they divided, some went to Mr. Akerman's door with the mattocks, some to the felons door, and some to the debtor's door. I was struck with the formidable appearance and order in which they divided and proceeded to destroy the place, the men threw their sticks up at the windows, which they broke and demolished, yet notwithstanding these sticks were coming down in showers, two men with a bar, such as brewers servants carry on their shoulders,

    attacked the parlour window to force it open. The window-shutters were exceedingly tough; they at last forced them partly open, but not quite. I then saw a man in a sailor's jacket helped up, he forced himself neck and heels into the window. They found the house-door still difficult to get open; before it was got open the other parlour window was opened and the mob were throwing the goods out at the window; at last the house-door gave way; about the same time some of the goods and furniture having been thrown out into the street, a fire was kindled.

    They proceeded immediately to throw the goods out of the house? - Immediately. An equal degree of activity seemed to exhibit itself on the outside as within, one party to burn, the other to throw out the goods of Mr. Akerman. When the conflagration took place I applied my mind to the mob.

    Was Mr. Akerman's house on fire then? - No. I was situated in the one-pair-of-stairs room, and could see what happened. I endeavoured to form a distinction between the active and inactive people. I thought I did so; the inactive people seemed to form a circle. I observed a person better dressed than the rest among those within the circle, who did not meddle, but seemed to be exciting and encouraging others. I saw several genteel looking men, and amongst them a black; there was one genteel man in particular, whose conduct I confess excited my indignation, and I took particular notice of him. I went down amongst the mob; I spoke to him; I made myself master of his voice; I believe if I was out of his sight I could swear to his voice; I have never seen that man since. When I first saw the black I turned to a lady and said, this is a motley crew, and of every colour. Mr. Akerman's house had then catched fire; the house in which I was was in extreme danger; my self with some others went down to desire the mob to prevent the houses of innocent people catching fire; and the mob were as active in saving those as in destroying Mr. Akerman's. I had no opportunity of making any remarks till I went to my station again, then I believe it was near nine o'clock; I heard a cry and a gingling of keys in the hands of some person; there were three or four genteel persons, but who had the keys I cannot say. Amongst them was the prisoner at the bar; he was without his hat, and his hands were down. I thought he might have his hat in his hand. The house I think was at that time destroyed; the roof was fallen in. Then those persons of the genteeler description moved off towards Smithfield, and amongst them was the prisoner.

    You had observed the black in the mob before you went down? - I had.

    Are you able to say who that black was? - No. Seeing this man afterwards I took it for granted it was him; I was certain to him the second time; he had his hat off in the middle of the mob.

    Jury. You said his hands were down, did you see any thing in his hands? - No, I did not; I took it for granted he had his hat in his hand, not having it on his head.

    Cross Examination.

    There were I believe other blacks in the mob? - I never saw but one; I saw a black at first, but did not remark him so as to swear to him.

    You could not swear to him I suppose from the difficulty every man has in his mind to swear to any black? - Yes.

    There is more difficulty to swear to a black than to a white man? - No. The second time I made my remark too judiciously to err.

    When was it you first saw the black? - After the goods were first set on fire, which was about a quarter after seven o'clock.

    What dress had the black on? - Something of a dark colour, but my remark was on his face.

    What was remarkable in that man's face more than another black? - The make of his hair was one thing; the curls were out if he had had any; and his hair smooth on his head. His face was so exposed to my view the second time, that I could not be better situated to make any remark on his face.

    His hair was the thing by which you knew him? - His hair and his face.

    What was particular in his face? - I cannot distinguish it any other than from the weight of the impression it made on me.

    Counsel for the Crown. Have you any doubt about him? - No.

    ANN WOOD sworn.

    I live at Mr. Jennings's, opposite Mr. Akerman's house.

    Was you at home on the Tuesday evening when Mr. Akerman's house was attacked? - I was.

    Did you in the course of that evening see the prisoner? - I did. It was a little after seven o'clock; I saw him in Mr. Akerman's two-pair-of-stairs room, he stood against the window with something in his hand and looked at me for some time before I observed particularly what he was doing. I looked at him then, and he took up something off the ground and held it up to me; when he held it up, I went down from the window into the dining-room; I came up again, and he was there still. He seemed to be looking in a drawer upon the floor, and seemed to be doing some thing up into a bundle.

    You was in the two-pair-of-stairs room opposite him? - No, I was in the three-pair-of-stairs room.

    Did you afterwards see him do any thing else? - He got up and looked at me and nodded his head at me; then I went down stairs.

    You saw him again in the course of the evening? - Yes, I saw him an hour or two afterwards in the mob.

    From the observation you made of his person are you sure that is the man? - That is the man.

    Have you any doubt about it? - No, none at all.

    Cross Examination.

    What makes you so positive that this is the man? - I know his face perfectly again by his standing and looking at me so long.

    You recollect him only by his face? - His face and his hair.

    Did you see any other black there? - Yes, I did; not in the house but in the mob.

    Could you swear to him? - I do not know that I could. I took more notice of this man than I did of any other.

    Court. What were the other people doing when the prisoner was in the two-pair-of-stairs room? - Some of the mob were pulling the house down, and some were running in with the fire to set the parlour on fire.

    Jury. How many times did you see this prisoner? - Two or three times.

    Had he his hat on when you saw him? - Yes.

    ANN LESSAR sworn.

    Do you know the prisoner? - Yes.

    Where do you live? - I lodge in the same lodging, in which the prisoner lodged; I took the lodging of him and the landlady.

    Do you remember his coming to you and bringing you any stockings? - He gave me three pair of stockings to mark.

    What mark did he bid you put upon them? - Any kind of mark to distinguish them at the washerwoman's. I put BB, the initials of his name upon them.

    Had he left a trunk in the room? - Yes, the trunk was found there by the constable when he came; it was locked, he had the key of it.

    Who had the key of the room? - I had; nobody could get at the box without my knowledge.

    PERCIVAL PHILLIPS sworn.

    I am a constable. I searched the lodging of the prisoner last Tuesday-week.

    Did you find a trunk there? - I did.

    Did you find any thing in that trunk? - Yes; these stockings, this pocket book, and a handkerchief. (producing them.)

    Any thing else? - This key (producing it) was upon the shelf in the lodging.

    Mr. RICHARD AKERMAN sworn.

    This pocket-book, I believe, has been in my possession thirty years; it was, I believe, in one of the drawers belonging to my wife; here are several of my banker's cheques which had my name to them.

    Look at the stockings? - Here is a very remarkable pair which I had made for me, and the maker wove the initials of my name in them in open work; the prisoner has put the initials of his name (B B) over it; they were in the drawers in a one-pair of stairs room. Here are several others that were marked by my sister, they are mine; I believe the handkerchiefs to be mine, but there are no particular marks on them; there are a pair of stockings that were taken off the prisoner's legs, which has the name cut out.

    To Phillips. Did you take them off the prisoner's legs? - I did.

    To Mr. Akerman. Is the place that is cut out the place where the name was wove? - Yes. This is a remarkable key; it is a key of the Park, it has a crown and my name at length upon it.

    To Lessar. Do you know any thing of the key that was found in the lodging? - No, it was on the shelf when he had the lodging?

    Was it there when he left the lodging? - I believe it was there; I saw it once or twice; I never knew the meaning of the key.

    Prisoner. My Lord, please to ask that woman if she did not wash the handkerchief the things were tied up in? - I washed a blue-and-white silk handkerchief, I cannot swear it was this, it was all over mud. I washed it on the Thursday, the first week that I was in the house.

    Was that after the burning of Newgate? - Yes. I was not in town till it was burnt.

    Prisoner. I leave my defence to my counsel and my witnesses.

    For the prisoner.

    Dr. SANDIMAN sworn.

    Do you know the prisoner? - Yes, I knew him five years ago, he lived with a relation of mine; he bore an exceeding good character; he used to come backwards and forwards to my house.

    ROBERT GATES sworn.

    I am footman to Mr. Goodhousen in Golden Square.

    Do you know the prisoner? - I do; I have known him perfectly well from the second day after he came to England, which is six years ago; he lived with a person I knew in America, that person gave him an excellent character, and he has always borne a good character since I knew him.

    GRACE ROBERTS sworn.

    The prisoner lay at our house the night that the prison was burnt.

    What time did you see him that night? - I am not positive to the hour he came in, it was from nine to eleven o'clock.

    What time did he come home? - I am not positive to the hour, it was a little after nine.

    Are you positive of that? - Yes.

    Where do you live? - At No. 3, in Berner's-street.

    He came home a little after nine? - Yes, I am certain of it; he continued there all that night till six in the morning, and was never out of the house.

    What day was that? - The 6th of June.

    What day of the week? - I am not certain.

    Are you sure it was the night the prison was burnt? - I am.

    What prison? - I am not certain what prison, I heard it mentioned in the family that the prison was burnt down.

    Cross Examination.

    Who bid you to remember the 6th of June? - I remember it by the people being taken up.

    When did you talk of its being the 6th of June? - I know he lay at our house on the 6th of June.

    Did you take notice of any other night when he lay there? - No.

    Did not he lie there on the 7th and 8th of June? - No, only that night.

    You are an acquaintance of his? - Yes.

    Is he a married man? - I cannot say.

    Did he bring any body with him? - No.

    Did he lie by himself? - Yes, I gave him a candle to light him to bed.

    Did you know he was to lie there that night? - Yes, he told my fellow servant so.

    You are a servant, are you? - Yes.

    Did your master know that this man was to lie in the house? - I cannot tell.

    Do you let such persons lie in the house without your master's knowledge? - He was an old servant, he lay in the servants hall.

    Other servants lie there? - Yes, there was a black lay there.

    JOHN NORTHINGTON (a Black) sworn.

    I am servant to Mr. Wood.

    Did the prisoner lie at your house? - Yes, on the night that Holbourn was on fire.

    When the house of Mr. Langdale was on fire? - Yes, the man that lives in Holbourn.

    Counsel for the Crown. That was on Wednesday night, the 7th?

    To Roberts. Where did the prisoner use to sleep at other times? - In the same bed.

    That was when he was a servant there? - Yes.

    When he was not a servant there where did he sleep? - He never lay at our house when he was not a servant but that night; I cannot be positive to the night nor the day of the week; I say nothing but the truth.

    Prisoner to Ann Wood. What dress had I on that night? - A light brownish coat, a round hat, and a red waistcoat.

    GUILTY (Death.)

    Tried by the Second London Jury before Mr. Justice NARES.

    +
    +

    View as XML

    + +

    @@ -195,8 +195,8 @@

    Footer

    @@ -204,5 +204,5 @@

    Footer

    - - + + diff --git a/css/style.css b/css/style.css index bb73c23adb..9e20c2ffa7 100644 --- a/css/style.css +++ b/css/style.css @@ -1,220 +1,245 @@ ---- -skip_concordance: true ---- +--- skip_concordance: true permalink: /css/style.css --- /* ============================================================================= Color Styles ========================================================================== */ :root { - --primary-color: #302AE6; - --secondary-color: #536390; - --primary-header-color: #333; - --secondary-header-color: #fff; - --font-color: #666; - --secondary-font-color: #eee; - --bg-color: #fff; - --secondary-bg-color: #194c75; - --tertiary-bg-color: #1a6875; - --heading-color: #292922; - --above-title-color: #757575; - --a-hover-active-color: #555; - --header-lesson-color: #f5f5f5; - --header-helpers-color: #ccc; - --blockquote-color: #eeeeee; + --primary-color: #302AE6; + --secondary-color: #536390; + --primary-header-color: #333; + --secondary-header-color: #fff; + --font-color: #666; + --secondary-font-color: #eee; + --bg-color: #fff; + --secondary-bg-color: #194c75; + --tertiary-bg-color: #1a6875; + --heading-color: #292922; + --above-title-color: #757575; + --a-hover-active-color: #555; + --header-lesson-color: #f5f5f5; + --header-helpers-color: #ccc; + --blockquote-color: #eeeeee; } [data-theme="night"] { - --primary-color: #9A97F3; - --secondary-color: #818cab; - --primary-header-color: #e1e1ff; - --font-color: #fff; - --bg-color: #161625; - --heading-color: #818cab; - --above-title-color: #e1e1ff; - --a-hover-active-color: #9A97F3; - --header-lesson-color: #171f24; - --header-helpers-color: #444444; - --blockquote-color: #535353; + --primary-color: #9A97F3; + --secondary-color: #818cab; + --primary-header-color: #e1e1ff; + --font-color: #fff; + --bg-color: #161625; + --heading-color: #818cab; + --above-title-color: #e1e1ff; + --a-hover-active-color: #9A97F3; + --header-lesson-color: #171f24; + --header-helpers-color: #444444; + 
--blockquote-color: #535353; } @media screen { - body { - font-family: 'Quattrocento', Verdana, sans-serif; - font-size:16px; - background-color: var(--bg-color); - } + body { + font-family: 'Quattrocento', Verdana, sans-serif; + font-size: 16px; + background-color: var(--bg-color); + } - .container { - max-width: 48rem; - overflow: hidden; - text-overflow: ellipsis; - } + .container { + max-width: 48rem; + overflow: hidden; + text-overflow: ellipsis; + } -/* ============================================================================= + /* ============================================================================= Helper classes ========================================================================== */ - .noclear { - clear:none; - } + .noclear { + clear: none; + } - .expanded { - max-width: 58rem; - } + .expanded { + max-width: 58rem; + } - .garnish { - width: 23%; - padding:0; - } + .garnish { + width: 23%; + padding: 0; + } - .full-width { - width:80%; - margin: 0 auto; - text-align:center; - } + .full-width { + width: 80%; + margin: 0 auto; + text-align: center; + } - .float-right { - float:right; - margin-left: 1rem; - margin-bottom: 1rem; - } + .float-right { + float: right; + margin-left: 1rem; + margin-bottom: 1rem; + } - .float-left { - margin-right: 1rem; - margin-bottom: 1rem; - } + .float-left { + margin-right: 1rem; + margin-bottom: 1rem; + } -/* ============================================================================= + /* ============================================================================= Home Page ========================================================================== */ - .home-block { - padding:3rem 0; - color: var(--font-color); - } - - .home-block h2 { - margin:0; - font-size:2.8rem; - color: var(--primary-header-color); - text-align:center; - } - - .home-block p { - margin:0rem; - font-family:'Open Sans'; - font-size:1.2rem; - padding-top:2rem; - text-align:justify; - } - - .home-block a:visited { - color: #38c; - } - - 
.home-stripe-1 { - color: var(--secondary-font-color); - background: var(--secondary-bg-color); - } - - .home-stripe-1 h2, .home-stripe-2 h2 { - color: var(--secondary-header-color); - } - - .home-stripe-1 a:visited, .home-stripe-1 a:link { - color:#72c0ff; - } - - .home-stripe-2 { - color:#fff; - background:var(--tertiary-bg-color); - } - - .home-stripe-2 a:visited, .home-stripe-2 a:link { - color:#74ebff; - } - - .home-image { - width: 75%; - } - - .home-logo img { - width: 200px; - } - - .home-logo a h1 { - color: #fff; - } - - .home-logo { - color: #fff; - } - - .home-logo li { - font-size: 1.2rem; - } - - .en-back { - background-color: {{ site.data.snippets.background_color.en }}; - } - - .es-back { - background-color: {{ site.data.snippets.background_color.es }}; - } - - .fr-back { - background-color: {{ site.data.snippets.background_color.fr }}; - } - - .pt-back { - background-color: {{ site.data.snippets.background_color.pt }}; - } - - - .sitewide-alert { - position: relative; - margin-bottom: 0; - } - -/* ============================================================================= + .home-block { + padding: 3rem 0; + color: var(--font-color); + } + + .home-block h2 { + margin: 0; + font-size: 2.8rem; + color: var(--primary-header-color); + text-align: center; + } + + .home-block p { + margin: 0rem; + font-family: 'Open Sans'; + font-size: 1.2rem; + padding-top: 2rem; + text-align: justify; + } + + .home-block a:visited { + color: #38c; + } + + .home-stripe-1 { + color: var(--secondary-font-color); + background: var(--secondary-bg-color); + } + + .home-stripe-1 h2, + .home-stripe-2 h2 { + color: var(--secondary-header-color); + } + + .home-stripe-1 a:visited, + .home-stripe-1 a:link { + color: #72c0ff; + } + + .home-stripe-2 { + color: #fff; + background: var(--tertiary-bg-color); + } + + .home-stripe-2 a:visited, + .home-stripe-2 a:link { + color: #74ebff; + } + + .home-image { + width: 75%; + } + + .home-logo img { + width: 200px; + } + + .home-logo a h1 
{ + color: #fff; + } + + .home-logo { + color: #fff; + } + + .home-logo li { + font-size: 1.2rem; + } + + .en-back { + background-color: { + { + site.data.snippets.background_color.en + } + } + + ; + } + + .es-back { + background-color: { + { + site.data.snippets.background_color.es + } + } + + ; + } + + .fr-back { + background-color: { + { + site.data.snippets.background_color.fr + } + } + + ; + } + + .pt-back { + background-color: { + { + site.data.snippets.background_color.pt + } + } + + ; + } + + + .sitewide-alert { + position: relative; + margin-bottom: 0; + } + + /* ============================================================================= Lesson Headers ========================================================================== */ - header { - margin:-3rem 0 3rem 0; - padding:0; - font-family:'Roboto', sans-serif; - color:#ccc; - background: var(--bg-color); - border-top:1px solid #333; - border-bottom:1px solid #333; - text-align:left; - } - - header .container-fluid { - margin:0; - padding:1rem; - background: var(--header-lesson-color); - ; - } - - header h1 { - margin:0; - padding:0; - font-size:1.8rem; - text-align:left; - } - - header h2 { - font-family:'Roboto', sans-serif; - font-size:1.2rem; - color: var(--primary-header-color); - margin: 1.5rem 0 1.5rem 0rem; - text-align:left; - } + header { + margin: -3rem 0 3rem 0; + padding: 0; + font-family: 'Roboto', sans-serif; + color: #ccc; + background: var(--bg-color); + border-top: 1px solid #333; + border-bottom: 1px solid #333; + text-align: left; + } + + header .container-fluid { + margin: 0; + padding: 1rem; + background: var(--header-lesson-color); + ; + } + + header h1 { + margin: 0; + padding: 0; + font-size: 1.8rem; + text-align: left; + } + + header h2 { + font-family: 'Roboto', sans-serif; + font-size: 1.2rem; + color: var(--primary-header-color); + margin: 1.5rem 0 1.5rem 0rem; + text-align: left; + } header h3, header h4 { font: .9rem/1.1rem 'Roboto Condensed', sans-serif; @@ -225,806 
+250,971 @@ Lesson Headers margin:.3rem 0 0 0; padding:0; } - - header h4 { - display:inline; - margin:0; - line-height:1.3rem; - } - - header .header-image { - float:left; - border:.2rem solid gray; - margin:0; - padding:0; - max-width: 200px; - } - - header .header-abstract { - font: 1rem/1.4rem 'Roboto', sans-serif; - color: var(--font-color); - margin:1rem 0; - } - - header .header-helpers { - clear:both; - background:var(--header-helpers-color); - color: #fff; - border-top:1px solid #999; - border-bottom:1px solid #999; - } - - header ul { - margin:0; - padding:0; - list-style-type: none; - } - - header li, header .metarow { - font: .9rem/1.1rem 'Roboto Condensed'; - } - - header .metarow { - color:#999; - } - - header .peer-review, header .open-license { - font-size: 0.9rem; - color: var(--font-color); - margin: 0; - } - -/* ============================================================================= + header h3, + header h4 { + font: .9rem/1.1rem 'Roboto Condensed', sans-serif; + text-transform: uppercase; + font-variant: small-caps; + /* letter-spacing:80%; */ + color: #565656; + margin: .3rem 0 0 0; + padding: 0; + } + + header h4 { + display: inline; + margin: 0; + line-height: 1.3rem; + } + + header .header-image { + float: left; + border: .2rem solid gray; + margin: 0; + padding: 0; + max-width: 200px; + } + + header .header-abstract { + font: 1rem/1.4rem 'Roboto', sans-serif; + color: var(--font-color); + margin: 1rem 0; + } + + header .header-helpers { + clear: both; + background: var(--header-helpers-color); + color: #fff; + border-top: 1px solid #999; + border-bottom: 1px solid #999; + } + + header ul { + margin: 0; + padding: 0; + list-style-type: none; + } + + header li, + header .metarow { + font: .9rem/1.1rem 'Roboto Condensed'; + } + + header .metarow { + color: #999; + } + + header .peer-review, + header .open-license { + font-size: 0.9rem; + color: var(--font-color); + margin: 0; + } + + /* 
============================================================================= Lessons Index ========================================================================== */ -/***************** + /***************** FILTER BUTTONS ******************/ - ul.filter, ul.sort-by { - margin: 0 0 1rem 0; - padding: 0px; - text-align:center; - } - - li.filter, - li.sort, - #filter-none { - font: .9rem/1.1rem 'Open Sans', sans-serif; - padding: .4rem .6rem; - border:none; - border-radius: 3px; - display:inline-block; - text-transform:uppercase; - text-decoration: none; - } - - .filter li:hover, - .sort-by li:hover, - #filter-none:hover { - cursor: pointer; - } - - .activities li.current:hover, - .filter li.current:hover, - .sort-by li.current:hover { - cursor:default; - } - - .topic li a { - text-decoration: none; - } - - .activities li { - background-color:#38c; - color:#fff; - } - - .activities li:hover { - background-color:#16a; - } - - .activities li.current { - background-color:#059; - } - - .topics li { - background-color:#eee; - color: #38a; - } - - .topics li:hover { - background-color:#ccc; - } - - .topics li.current { - background-color:#aaa; - color: #333; - } - - - #filter-none { - width:99.5%; - clear:both; - text-align:center; - margin-bottom:1rem; - background-color:#fefefe; - color:#666; - border:1px solid #999; - } - - #filter-none:hover { - background-color:#ededed; - } - - /***************** + ul.filter, + ul.sort-by { + margin: 0 0 1rem 0; + padding: 0px; + text-align: center; + } + + li.filter, + li.sort, + #filter-none { + font: .9rem/1.1rem 'Open Sans', sans-serif; + padding: .4rem .6rem; + border: none; + border-radius: 3px; + display: inline-block; + text-transform: uppercase; + text-decoration: none; + } + + .filter li:hover, + .sort-by li:hover, + #filter-none:hover { + cursor: pointer; + } + + .activities li.current:hover, + .filter li.current:hover, + .sort-by li.current:hover { + cursor: default; + } + + .topic li a { + text-decoration: none; + } + 
+ .activities li { + background-color: #38c; + color: #fff; + } + + .activities li:hover { + background-color: #16a; + } + + .activities li.current { + background-color: #059; + } + + .topics li { + background-color: #eee; + color: #38a; + } + + .topics li:hover { + background-color: #ccc; + } + + .topics li.current { + background-color: #aaa; + color: #333; + } + + + #filter-none { + width: 99.5%; + clear: both; + text-align: center; + margin-bottom: 1rem; + background-color: #fefefe; + color: #666; + border: 1px solid #999; + } + + #filter-none:hover { + background-color: #ededed; + } + + /***************** SEARCH *****************/ - .search-input { - width:55%; - clear:both; - margin-bottom:1rem; - background-color:#fefefe; - color:#666; - border:1px solid #999; - font: .9rem/1.1rem 'Open Sans', - sans-serif; - padding: .4rem .6rem; - border-radius: 3px; - display:inline-block; - text-transform:uppercase; - text-decoration: none; - } - - #search-button, - #enable-search-button { - background-color: #efefef; - color: rgb(153, 143, 143); - width: 35%; - font: .9rem/1.1rem 'Open Sans', - sans-serif; - padding: .4rem .6rem; - border: none; - border-radius: 3px; - display: inline-block; - text-transform: uppercase; - text-decoration: none; - } - - @media only screen and (max-width: 767px) { - /* phones */ - #search-button, - #enable-search-button { - width: 80%; - } - } - - - #search-info-button { - padding: 0.5rem; - color: rgb(153, 143, 143); - } - - #search-info { - display: none; - height:0px; - background:#efefef; - overflow:hidden; - transition:0.5s; - -webkit-transition:0.5s; - width: 100%; - text-align: left; - box-sizing: border-box; - } - - #search-info.visible { - display: block; - height: fit-content; - height: -moz-max-content; - padding: 10px; - margin-top: 10px; - } - - /***************** + .search-input { + width: 55%; + clear: both; + margin-bottom: 1rem; + background-color: #fefefe; + color: #666; + border: 1px solid #999; + font: .9rem/1.1rem 
'Open Sans', + sans-serif; + padding: .4rem .6rem; + border-radius: 3px; + display: inline-block; + text-transform: uppercase; + text-decoration: none; + } + + #search-button, + #enable-search-button { + background-color: #efefef; + color: rgb(153, 143, 143); + width: 35%; + font: .9rem/1.1rem 'Open Sans', + sans-serif; + padding: .4rem .6rem; + border: none; + border-radius: 3px; + display: inline-block; + text-transform: uppercase; + text-decoration: none; + } + + @media only screen and (max-width: 767px) { + + /* phones */ + #search-button, + #enable-search-button { + width: 80%; + } + } + + + #search-info-button { + padding: 0.5rem; + color: rgb(153, 143, 143); + } + + #search-info { + display: none; + height: 0px; + background: #efefef; + overflow: hidden; + transition: 0.5s; + -webkit-transition: 0.5s; + width: 100%; + text-align: left; + box-sizing: border-box; + } + + #search-info.visible { + display: block; + height: fit-content; + height: -moz-max-content; + padding: 10px; + margin-top: 10px; + } + + /***************** SORT BUTTONS *****************/ - li.sort { - background-color: #efefef; - color:#666; - width:49.5%; - } - - li.sort:hover { - text-decoration: none; - background-color:#cecece; - } - - #current-sort { - font-size:75%; - } - - .sort.my-desc:after, .sort-desc:after { - width: 0; - height: 0; - border-left: .4rem solid transparent; - border-right: .4rem solid transparent; - border-top: .4rem solid; - content:""; - position: relative; - top:.75rem; - right:-.3rem; - } - - .sort.my-asc:after, .sort-asc:after { - width: 0; - height: 0; - border-left: .4rem solid transparent; - border-right: .4rem solid transparent; - border-bottom: .4rem solid; - content:""; - position: relative; - bottom:.75rem; - right:-.3rem; - } - - .sort-desc:after { - top:1rem; - } - - .sort-asc:after { - bottom:1rem; - } - - /***************************** + li.sort { + background-color: #efefef; + color: #666; + width: 49.5%; + } + + li.sort:hover { + text-decoration: 
none; + background-color: #cecece; + } + + #current-sort { + font-size: 75%; + } + + .sort.my-desc:after, + .sort-desc:after { + width: 0; + height: 0; + border-left: .4rem solid transparent; + border-right: .4rem solid transparent; + border-top: .4rem solid; + content: ""; + position: relative; + top: .75rem; + right: -.3rem; + } + + .sort.my-asc:after, + .sort-asc:after { + width: 0; + height: 0; + border-left: .4rem solid transparent; + border-right: .4rem solid transparent; + border-bottom: .4rem solid; + content: ""; + position: relative; + bottom: .75rem; + right: -.3rem; + } + + .sort-desc:after { + top: 1rem; + } + + .sort-asc:after { + bottom: 1rem; + } + + /***************************** LESSON INDEX RESULTS LIST *****************************/ - h2.results-title { - margin:1rem 0; - font: 1.6rem/2rem 'Roboto Condensed'; - color: var(--font-color); - text-transform:uppercase; - } - - #results-value { - color:#000; - } - - - #lesson-list .list ul { - margin:0; - padding:0; - } - - #lesson-list .list li { - list-style-type:none; - margin:0; - } - - - .lesson-description { - margin-bottom:2rem; - padding:0rem; - min-height:120px; - text-align:left; - } - - .lesson-description img { - width:100%; - } - - .lesson-image { - width:120px; - float:left; - margin-right:1rem; - } - - .above-title { - margin:0 0 .2rem 0; - font: .8rem/1rem 'Roboto Condensed'; - color:var(--above-title-color); - text-transform:uppercase; - clear:none; - } - - .lesson-description h2.title { - font: 1.2rem/1.3rem 'Crete Round', serif; - margin:0 0 .8rem 0; - clear:none; - } - - .list .date, - .lesson-description .activity, - .lesson-description .topics, - .lesson-description .difficulty { - display: none; - } - - #pre-loader { - visibility: hidden; - display: flex; - justify-content: center; - align-items: center; - height: 100vh; - width: 100%; - position: fixed; - top: 0; - left: 0; - z-index: 9999; - transition: opacity 0.3s linear; - background: rgba(211, 211, 211, 0.8); - } -/* 
============================================================================= + h2.results-title { + margin: 1rem 0; + font: 1.6rem/2rem 'Roboto Condensed'; + color: var(--font-color); + text-transform: uppercase; + } + + #results-value { + color: #000; + } + + + #lesson-list .list ul { + margin: 0; + padding: 0; + } + + #lesson-list .list li { + list-style-type: none; + margin: 0; + } + + + .lesson-description { + margin-bottom: 2rem; + padding: 0rem; + min-height: 120px; + text-align: left; + } + + .lesson-description img { + width: 100%; + } + + .lesson-image { + width: 120px; + float: left; + margin-right: 1rem; + } + + .above-title { + margin: 0 0 .2rem 0; + font: .8rem/1rem 'Roboto Condensed'; + color: var(--above-title-color); + text-transform: uppercase; + clear: none; + } + + .lesson-description h2.title { + font: 1.2rem/1.3rem 'Crete Round', serif; + margin: 0 0 .8rem 0; + clear: none; + } + + .list .date, + .lesson-description .activity, + .lesson-description .topics, + .lesson-description .difficulty { + display: none; + } + + #pre-loader { + visibility: hidden; + display: flex; + justify-content: center; + align-items: center; + height: 100vh; + width: 100%; + position: fixed; + top: 0; + left: 0; + z-index: 9999; + transition: opacity 0.3s linear; + background: rgba(211, 211, 211, 0.8); + } + + /* ============================================================================= Top Navigation Bar ========================================================================== */ - .navbar { - padding: .6rem 1rem; - margin: 0 0 3rem 0; - } - - .navbar-dark .navbar-nav .nav-link { - font-family:'Open Sans'; - text-transform:uppercase; - color:#fff; - font-size:.9rem; - } - - .btn-group > .btn-secondary { - border-color: #333333; - background-color: #6f6f6f; - } - - .lang { - text-transform:lowercase !important; - } - - .navbar-dark .navbar-nav .nav-link:hover, .navbar-dark .navbar-brand:hover { - color:#39a; - } - - .navbar-toggler-icon { - background-image: 
url("data:image/svg+xml;charset=utf8,%3Csvg viewBox='0 0 32 32' xmlns='http://www.w3.org/2000/svg'%3E%3Cpath stroke='rgba(255,255,255, 1)' stroke-width='2' stroke-linecap='round' stroke-miterlimit='10' d='M4 8h24M4 16h24M4 24h24'/%3E%3C/svg%3E"); - } - - .navbar-collapse { - text-align:center; - } - - .navbar-dark .navbar-brand { - font-family:'Crete Round', serif; - color:#fff; - letter-spacing: .02em; - } - - .btn-group > a.btn { - padding-left: 1rem; - padding-right: 1rem; - } - - a.dropdown-item { - border-bottom:1px solid #ccc; - font-family:'Roboto'; - } - - .dropdown-menu { - position: absolute; - background: #fff; - border: 1px solid #ccc; - margin:0; - padding:0; - } - - .dropdown-menu a { - font-size:.8rem; - line-height:2rem; - text-transform:uppercase; - } - - .dropdown-menu a:last-child { - border-bottom:none; - } - - .dropdown-menu:after, .dropdown-menu:before { - bottom: 100%; - left: 20%; - border: solid transparent; - content: " "; - height: 0; - width: 0; - position: absolute; - pointer-events: none; - } - - .dropdown-menu:after { - border-color: rgba(255, 255, 255, 0); - border-bottom-color: #fff; - border-width: 12px; - margin-left: -12px; - } - .dropdown-menu:before { - border-color: rgba(51, 153, 170, 0); - border-bottom-color: #ccc; - border-width: 13px; - margin-left: -13px; - } - - .navbar-dark .navbar-nav .nav-link:focus { - color: #ccc; - } - - .header-link { - position: absolute; - right: 0.6em; - opacity: 0; - -webkit-transition: opacity 0.2s ease-in-out 0.1s; - -moz-transition: opacity 0.2s ease-in-out 0.1s; - -ms-transition: opacity 0.2s ease-in-out 0.1s; - } - - h2:hover .header-link, - h3:hover .header-link, - h4:hover .header-link, - h5:hover .header-link, - h6:hover .header-link { - opacity: 1; - } - -/* ============================================================================= + .navbar { + padding: .6rem 1rem; + margin: 0 0 3rem 0; + } + + .navbar-dark .navbar-nav .nav-link { + font-family: 'Open Sans'; + text-transform: 
uppercase; + color: #fff; + font-size: .9rem; + } + + .btn-group>.btn-secondary { + border-color: #333333; + background-color: #6f6f6f; + } + + .lang { + text-transform: lowercase !important; + } + + .navbar-dark .navbar-nav .nav-link:hover, + .navbar-dark .navbar-brand:hover { + color: #39a; + } + + .navbar-toggler-icon { + background-image: url("data:image/svg+xml;charset=utf8,%3Csvg viewBox='0 0 32 32' xmlns='http://www.w3.org/2000/svg'%3E%3Cpath stroke='rgba(255,255,255, 1)' stroke-width='2' stroke-linecap='round' stroke-miterlimit='10' d='M4 8h24M4 16h24M4 24h24'/%3E%3C/svg%3E"); + } + + .navbar-collapse { + text-align: center; + } + + .navbar-dark .navbar-brand { + font-family: 'Crete Round', serif; + color: #fff; + letter-spacing: .02em; + } + + .btn-group>a.btn { + padding-left: 1rem; + padding-right: 1rem; + } + + a.dropdown-item { + border-bottom: 1px solid #ccc; + font-family: 'Roboto'; + } + + .dropdown-menu { + position: absolute; + background: #fff; + border: 1px solid #ccc; + margin: 0; + padding: 0; + } + + .dropdown-menu a { + font-size: .8rem; + line-height: 2rem; + text-transform: uppercase; + } + + .dropdown-menu a:last-child { + border-bottom: none; + } + + .dropdown-menu:after, + .dropdown-menu:before { + bottom: 100%; + left: 20%; + border: solid transparent; + content: " "; + height: 0; + width: 0; + position: absolute; + pointer-events: none; + } + + .dropdown-menu:after { + border-color: rgba(255, 255, 255, 0); + border-bottom-color: #fff; + border-width: 12px; + margin-left: -12px; + } + + .dropdown-menu:before { + border-color: rgba(51, 153, 170, 0); + border-bottom-color: #ccc; + border-width: 13px; + margin-left: -13px; + } + + .navbar-dark .navbar-nav .nav-link:focus { + color: #ccc; + } + + .header-link { + position: absolute; + right: 0.6em; + opacity: 0; + -webkit-transition: opacity 0.2s ease-in-out 0.1s; + -moz-transition: opacity 0.2s ease-in-out 0.1s; + -ms-transition: opacity 0.2s ease-in-out 0.1s; + } + + h2:hover 
.header-link, + h3:hover .header-link, + h4:hover .header-link, + h5:hover .header-link, + h6:hover .header-link { + opacity: 1; + } + + /* ============================================================================= Lesson Typography ========================================================================== */ - a {text-decoration:none;} - - a:link {color: #2e7ab7;} - a:visited {color: #39a;} - a:hover {color: var(--a-hover-active-color);} - a:active {color: var(--a-hover-active-color);} - - b, strong { font-weight: bold; } - - blockquote { - margin: 1em 2em; - padding: 0 1em 0 1em; - font-style: italic; - border:1px solid #666; - background: var(--blockquote-color); - } - - blockquote p{ - margin-top: 1rem; - } - - hr { - display: block; height: 1px; border: 0; border-top: 1px solid #ccc; margin: 2em 0; padding: 0; } - - img { - max-width:100%; - } - - ins { background: #ff9; color: #000; text-decoration: none; } - - - h1,h2,h3,h4,h5 { - font-family:'Crete Round', serif; - font-weight:normal; - clear:both; - } - - - h1 { - font-size:2rem; - margin-bottom:1.5rem; - letter-spacing:-.03rem; - text-align:center; - color: var(--primary-header-color) - } - - h2 { - font-size:1.6rem; - margin-top:3rem; - letter-spacing:-.02rem; - } - - h3 { - font-size:1.4rem; - margin-top:2.5rem; - } - - h4 { - font-size:1.2rem; - margin-top:1.8rem; - } - - h5 { - font-size:1.0rem; - margin-top:1.4rem; - } - - h1 a, h2 a, h3 a, h4 a, h5 a { - text-decoration:none; - } - - h1 a:link { color: #38c; } - h1 a:visited {color: #39a; } - - - /* select button generated by codeblocks.js */ - .fa-align-left {opacity: 0.2;} - .highlight:hover .fa-align-left {opacity: 1;} - - q { quotes: none; } - q:before, q:after { content: ""; content: none; } - - small { font-size: 85%; } - - /* Position subscript and superscript content without affecting line-height: h5bp.com/k */ - sub, sup { font-size: 75%; line-height: 0; position: relative; vertical-align: baseline; } - sup { top: -0.5em; } - sub { 
bottom: -0.25em; } - - li { - margin-bottom:.5rem; - line-height:1.4rem; - } - - li.nav-item { - margin-bottom:0; - } - - .alert { - font-family: 'Roboto'; - } - - .alert h2, .alert h3, .alert h4 { - margin-top:0; - } - - -/* ============================================================================= + a { + text-decoration: none; + } + + a:link { + color: #2e7ab7; + } + + a:visited { + color: #39a; + } + + a:hover { + color: var(--a-hover-active-color); + } + + a:active { + color: var(--a-hover-active-color); + } + + b, + strong { + font-weight: bold; + } + + blockquote { + margin: 1em 2em; + padding: 0 1em 0 1em; + font-style: italic; + border: 1px solid #666; + background: var(--blockquote-color); + } + + blockquote p { + margin-top: 1rem; + } + + hr { + display: block; + height: 1px; + border: 0; + border-top: 1px solid #ccc; + margin: 2em 0; + padding: 0; + } + + img { + max-width: 100%; + } + + ins { + background: #ff9; + color: #000; + text-decoration: none; + } + + + h1, + h2, + h3, + h4, + h5 { + font-family: 'Crete Round', serif; + font-weight: normal; + clear: both; + } + + + h1 { + font-size: 2rem; + margin-bottom: 1.5rem; + letter-spacing: -.03rem; + text-align: center; + color: var(--primary-header-color) + } + + h2 { + font-size: 1.6rem; + margin-top: 3rem; + letter-spacing: -.02rem; + } + + h3 { + font-size: 1.4rem; + margin-top: 2.5rem; + } + + h4 { + font-size: 1.2rem; + margin-top: 1.8rem; + } + + h5 { + font-size: 1.0rem; + margin-top: 1.4rem; + } + + h1 a, + h2 a, + h3 a, + h4 a, + h5 a { + text-decoration: none; + } + + h1 a:link { + color: #38c; + } + + h1 a:visited { + color: #39a; + } + + + /* select button generated by codeblocks.js */ + .fa-align-left { + opacity: 0.2; + } + + .highlight:hover .fa-align-left { + opacity: 1; + } + + q { + quotes: none; + } + + q:before, + q:after { + content: ""; + content: none; + } + + small { + font-size: 85%; + } + + /* Position subscript and superscript content without affecting line-height: 
h5bp.com/k */ + sub, + sup { + font-size: 75%; + line-height: 0; + position: relative; + vertical-align: baseline; + } + + sup { + top: -0.5em; + } + + sub { + bottom: -0.25em; + } + + li { + margin-bottom: .5rem; + line-height: 1.4rem; + } + + li.nav-item { + margin-bottom: 0; + } + + .alert { + font-family: 'Roboto'; + } + + .alert h2, + .alert h3, + .alert h4 { + margin-top: 0; + } + + + /* ============================================================================= Code Highlighting ========================================================================== */ - code { - font-family: monospace, serif; - font-size:.9rem; - } + code { + font-family: monospace, serif; + font-size: .9rem; + } - .highlight { - margin: 1rem 0 1rem 0; - padding:.5rem .2rem; - font-size:.9rem; - white-space: pre; - word-wrap: normal; - overflow: auto; - border: 1px solid #eee; - background: #fafafa; - } + .highlight { + margin: 1rem 0 1rem 0; + padding: .5rem .2rem; + font-size: .9rem; + white-space: pre; + word-wrap: normal; + overflow: auto; + border: 1px solid #eee; + background: #fafafa; + } -/* ============================================================================= + /* ============================================================================= Figures ========================================================================== */ - figure { - margin: 0 auto .5rem; - text-align: center; - display:table; - } - - figcaption { - margin-top:.5rem; - font-family:'Open Sans'; - font-size:0.8em; - color: var(--font-color); - display:block; - caption-side: bottom; - } - - .author-info, .citation-info { - border-top:1px solid #333; - padding-top:1rem; - margin-top:2rem; - } - - .author-name, .suggested-citation-header { - font-family:'Roboto Condensed'; - font-weight: 600; - font-size:1.2rem; - color: var(--font-color); - text-transform:uppercase; - } - - .author-description p, .suggested-citation-text p { - font-size:0.9rem; - font-family:'Open Sans'; - color: var(--font-color); - } 
- - /* ============================================================================= + figure { + margin: 0 auto .5rem; + text-align: center; + display: table; + } + + figcaption { + margin-top: .5rem; + font-family: 'Open Sans'; + font-size: 0.8em; + color: var(--font-color); + display: block; + caption-side: bottom; + } + + .author-info, + .citation-info { + border-top: 1px solid #333; + padding-top: 1rem; + margin-top: 2rem; + } + + .author-name, + .suggested-citation-header { + font-family: 'Roboto Condensed'; + font-weight: 600; + font-size: 1.2rem; + color: var(--font-color); + text-transform: uppercase; + } + + .author-description p, + .suggested-citation-text p { + font-size: 0.9rem; + font-family: 'Open Sans'; + color: var(--font-color); + } + + /* ============================================================================= Tables ========================================================================== */ - table { - width: 100%; - margin-bottom: 1em; - } - - th, td { - padding: 10px; - text-align: left; - border-bottom: 1px solid #ddd; - } - - thead { - background-color: #535353; - color: #fff; - font-weight: bold; - } - - .table-wrapper { - overflow-x: scroll; - } - - tr:nth-child(even) {background-color: #f2f2f2} - -/* ============================================================================= + table { + width: 100%; + margin-bottom: 1em; + } + + th, + td { + padding: 10px; + text-align: left; + border-bottom: 1px solid #ddd; + } + + thead { + background-color: #535353; + color: #fff; + font-weight: bold; + } + + .table-wrapper { + overflow-x: scroll; + } + + tr:nth-child(even) { + background-color: #f2f2f2 + } + + /* ============================================================================= Blog Index and Layout ========================================================================== */ - .blog-header { - text-align:center; - } - - .blog-header h2 { - margin:0; - line-height: 2rem; - } - - .blog-header h3 { /*author*/ - margin-top:.4rem; - 
color: #666; - font-size:1rem; - } - - .blog-header h4{ - color: #999; - font-size:1rem; - margin-bottom:.2rem; - font-family:'Roboto Condensed'; - text-transform:uppercase; - } - - .blog-header figure { - max-width:80%; - } - - .blog-header figcaption { - text-align: center; - } - - .blog-page-header { - margin-bottom:3rem; - } - -/* ============================================================================= + .blog-header { + text-align: center; + } + + .blog-header h2 { + margin: 0; + line-height: 2rem; + } + + .blog-header h3 { + /*author*/ + margin-top: .4rem; + color: #666; + font-size: 1rem; + } + + .blog-header h4 { + color: #999; + font-size: 1rem; + margin-bottom: .2rem; + font-family: 'Roboto Condensed'; + text-transform: uppercase; + } + + .blog-header figure { + max-width: 80%; + } + + .blog-header figcaption { + text-align: center; + } + + .blog-page-header { + margin-bottom: 3rem; + } + + /* ============================================================================= Project Team ========================================================================== */ - .contact-box { - margin-bottom:3rem; - } + .contact-box { + margin-bottom: 3rem; + } -/* ============================================================================= + /* ============================================================================= Footer ========================================================================== */ - footer[role="contentinfo"] { - margin-top: 2rem; - padding: 2rem 0; - font-family:'Open Sans'; - font-size:.9rem; - color: #fff; - background-color:#666; - text-align:center; - } + footer[role="contentinfo"] { + margin-top: 2rem; + padding: 2rem 0; + font-family: 'Open Sans'; + font-size: .9rem; + color: #fff; + background-color: #666; + text-align: center; + } + + footer a, + footer a:link, + footer a:visited { + color: #fff; + border-bottom: 1px #eee dotted; + } + + footer a:hover { + text-decoration: none; + border-bottom: 1px #fff solid; + } + + footer .fa { 
+ margin: 0 .2rem 0rem 0rem; + } + + .footer-head { + font-size: 1.1rem; + line-height: 1.4rem; + margin-bottom: 1rem; + } - footer a, footer a:link, footer a:visited { - color: #fff; - border-bottom:1px #eee dotted; - } - - footer a:hover { - text-decoration: none; - border-bottom:1px #fff solid; - } - - footer .fa { - margin: 0 .2rem 0rem 0rem; - } - - .footer-head { - font-size:1.1rem; - line-height:1.4rem; - margin-bottom:1rem; - } +} -} /* end screen */ +/* end screen */ @media only screen and (max-width: 768px) { - .garnish { - display:none; - } - .dropdown-menu:after, .dropdown-menu:before { - display:none; - } + .garnish { + display: none; + } + + .dropdown-menu:after, + .dropdown-menu:before { + display: none; + } } /* Print Styling */ @media screen { - /* Class to hide elements only shown when printing */ - .hide-print { - display: none !important; - } + + /* Class to hide elements only shown when printing */ + .hide-print { + display: none !important; + } } @media print { - * { background: transparent !important; color: black !important; box-shadow:none !important; text-shadow: none !important; filter:none !important; -ms-filter: none !important; } /* Black prints faster: h5bp.com/s */ - a, a:visited { text-decoration: underline; } - a[href]:after { content: " (" attr(href) ")"; } - abbr[title]:after { content: " (" attr(title) ")"; } - a[href^="javascript:"]:after, a[href^="#"]:after { content: ""; } /* Don't show links for images, or javascript/internal links */ - pre, blockquote { - border: 1px solid #999; - page-break-inside: avoid; - margin: 0.5cm; - padding: 0.5cm - } - thead { display: table-header-group; } /* h5bp.com/t */ - tr, img { page-break-inside: avoid; } - img { max-width: 100% !important; } - @page { - margin: 1.5cm; - } - - body { font-size: 0.85rem;} - p, h2, h3 { orphans: 3; widows: 3; } - h1, h2, h3 { page-break-after: avoid; } - h1 { font-size: 1.4rem; } - h2 { font-size: 1.1rem; } - h3 { font-size: 1rem; } - h4 { font-size: 0.9rem; 
} - .header-bottom { - margin-bottom: 2rem; - page-break-after: always; - } - .hide-screen { - /* Hide elements that only appear on screen */ - display: none !important; - } - - .print-header { - /* format navbar for print */ - display: block; - z-index:1030; - width: 100%; - height: 3rem; - padding: .6rem 1rem; - margin-bottom: 1rem; - color:#fff; - white-space: nowrap; - font-family: 'Crete Round', serif; - border-bottom: 1px solid lightgrey; - } + * { + background: transparent !important; + color: black !important; + box-shadow: none !important; + text-shadow: none !important; + filter: none !important; + -ms-filter: none !important; + } + + /* Black prints faster: h5bp.com/s */ + a, + a:visited { + text-decoration: underline; + } + + a[href]:after { + content: " (" attr(href) ")"; + } + + abbr[title]:after { + content: " (" attr(title) ")"; + } + + a[href^="javascript:"]:after, + a[href^="#"]:after { + content: ""; + } + + /* Don't show links for images, or javascript/internal links */ + pre, + blockquote { + border: 1px solid #999; + page-break-inside: avoid; + margin: 0.5cm; + padding: 0.5cm + } + + thead { + display: table-header-group; + } + + /* h5bp.com/t */ + tr, + img { + page-break-inside: avoid; + } + + img { + max-width: 100% !important; + } + + @page { + margin: 1.5cm; + } + + body { + font-size: 0.85rem; + } + + p, + h2, + h3 { + orphans: 3; + widows: 3; + } + + h1, + h2, + h3 { + page-break-after: avoid; + } + + h1 { + font-size: 1.4rem; + } + + h2 { + font-size: 1.1rem; + } + + h3 { + font-size: 1rem; + } + + h4 { + font-size: 0.9rem; + } + + .header-bottom { + margin-bottom: 2rem; + page-break-after: always; + } + + .hide-screen { + /* Hide elements that only appear on screen */ + display: none !important; + } + + .print-header { + /* format navbar for print */ + display: block; + z-index: 1030; + width: 100%; + height: 3rem; + padding: .6rem 1rem; + margin-bottom: 1rem; + color: #fff; + white-space: nowrap; + font-family: 'Crete Round', serif; 
+ border-bottom: 1px solid lightgrey; + } } body { - background-color: var(--bg-color); - color: var(--font-color); + background-color: var(--bg-color); + color: var(--font-color); } /* h1 { @@ -1036,190 +1226,202 @@ body { } */ a { - color: var(--primary-color); + color: var(--primary-color); } #results-value { - color: var(--heading-color); + color: var(--heading-color); } /*Simple css to style it like a toggle switch*/ @media (max-width: 991.98px) { - /* phones */ - #low-contrast-button { - margin-left: 0; - margin: auto; - padding: 10px; - } + + /* phones */ + #low-contrast-button { + margin-left: 0; + margin: auto; + padding: 10px; + } } @media (min-width: 991.98px) { - /* screens */ - #low-contrast-button { - margin-left: 1em; - padding: 0; - /* margin:inherit; */ - } + + /* screens */ + #low-contrast-button { + margin-left: 1em; + padding: 0; + /* margin:inherit; */ + } } /*Simple css to style it like a toggle switch*/ .theme-switch-wrapper { - display: flex; - align-items: center; + display: flex; + align-items: center; } .theme-switch-wrapper em { - margin-left: 10px; - font-size: 1rem; + margin-left: 10px; + font-size: 1rem; } .theme-switch { - display: inline-block; - height: 34px; - position: relative; - width: 60px; + display: inline-block; + height: 34px; + position: relative; + width: 60px; } .theme-switch input { - display: none; + display: none; } .slider { - background-color: #ccc; - bottom: 0; - cursor: pointer; - left: 0; - position: absolute; - right: 0; - top: 0; - transition: .4s; + background-color: #ccc; + bottom: 0; + cursor: pointer; + left: 0; + position: absolute; + right: 0; + top: 0; + transition: .4s; } .slider:before { - background-color: #fff; - bottom: 4px; - content: "\f185"; - height: 26px; - left: 4px; - position: absolute; - transition: .4s; - width: 26px; - font-family: "Font Awesome 5 Free"; - padding-top: 1px; - vertical-align: middle; - font-weight: 900; - display: inline-block; - color: #666; + background-color: #fff; + 
bottom: 4px; + content: "\f185"; + height: 26px; + left: 4px; + position: absolute; + transition: .4s; + width: 26px; + font-family: "Font Awesome 5 Free"; + padding-top: 1px; + vertical-align: middle; + font-weight: 900; + display: inline-block; + color: #666; } input:checked+.slider { - background-color: #888888; + background-color: #888888; } input:checked+.slider:before { - transform: translateX(26px); - content: "\f186"; + transform: translateX(26px); + content: "\f186"; } .slider.round { - border-radius: 34px; + border-radius: 34px; } .slider.round:before { - border-radius: 50%; + border-radius: 50%; } /* hidden text for image links and other controls so as to accommodate screen readers*/ .visually-hidden { - clip: rect(0 0 0 0); - clip-path: inset(50%); - height: 1px; - overflow: hidden; - position: absolute; - white-space: nowrap; - width: 1px; + clip: rect(0 0 0 0); + clip-path: inset(50%); + height: 1px; + overflow: hidden; + position: absolute; + white-space: nowrap; + width: 1px; } /* hacking some header stylings because we were cheating around accessible structuring to get the design we wanted */ -div.card div.media-body.p-3 h2{ - font-size:1.2rem; - margin-top:1.8rem; - letter-spacing: normal; +div.card div.media-body.p-3 h2 { + font-size: 1.2rem; + margin-top: 1.8rem; + letter-spacing: normal; } -div.author-info h2.author-name, div.citation-info h2.suggested-citation-header{ - font-family: 'Roboto Condensed'; - font-weight: 600; - font-size: 1.2rem; - color: var(--font-color); - text-transform: uppercase; - margin-top: 1.4rem; - letter-spacing: normal; +div.author-info h2.author-name, +div.citation-info h2.suggested-citation-header { + font-family: 'Roboto Condensed'; + font-weight: 600; + font-size: 1.2rem; + color: var(--font-color); + text-transform: uppercase; + margin-top: 1.4rem; + letter-spacing: normal; } /* modifies the language selector for contrast improvement */ -.btn.disabled, .btn:disabled { - opacity: 1; - text-decoration: underline 
!important; - font-weight: bold; +.btn.disabled, +.btn:disabled { + opacity: 1; + text-decoration: underline !important; + font-weight: bold; } + /* modifies contrast on team page */ -.badge-success{ - background-color: #026d02; +.badge-success { + background-color: #026d02; } /* correcting accessibility issues on lesson pages */ -header div.header-helpers ul li, header div.metarow { - color: #565656; +header div.header-helpers ul li, +header div.metarow { + color: #565656; } -header h3{ - text-decoration: underline; +header h3 { + text-decoration: underline; } -div.alert.alert-success a{ - color: #2b542c; +div.alert.alert-success a { + color: #2b542c; } -div.peer-review a, main header div.open-license a, div.donate a{ - color: #205279; +div.peer-review a, +main header div.open-license a, +div.donate a { + color: #205279; } -div.highlight span.c1{ - color: #56564d; +div.highlight span.c1 { + color: #56564d; } -div.highlight .mi, div.highlight .nb, div.highlight .mf{ - color: #008080; +div.highlight .mi, +div.highlight .nb, +div.highlight .mf { + color: #008080; } /* accessibility corrections on lessons index page */ -.activities li{ - background-color: #2b74af; +.activities li { + background-color: #2b74af; } -.activities li.current{ - background-color: #014c88; +.activities li.current { + background-color: #014c88; } -.topics li{ - color: #014c88; +.topics li { + color: #014c88; } -#search-button, #enable-search-button{ - color: #014c88; +#search-button, +#enable-search-button { + color: #014c88; } /* support us pages contrast fixes*/ -table thead tr th a:link{ -color: #92cfff; -} -body div.container article div.alert.alert-info a{ - color: #1e4f77; +table thead tr th a:link { + color: #92cfff; } + +body div.container article div.alert.alert-info a { + color: #1e4f77; +} \ No newline at end of file diff --git a/en/about.md b/en/about.md index cd8f2bd485..257f5623c0 100755 --- a/en/about.md +++ b/en/about.md @@ -1,7 +1,7 @@ --- layout: blank title: About the 
Programming Historian -redirect_from: /about +redirect_from: /about/ --- # About the _Programming Historian_ @@ -10,11 +10,11 @@ redirect_from: /about ## Peer Review All tutorials at _Programming Historian_ are rigorously peer reviewed, guided through the review process by one of our editors. Review involves a thorough exchange with the lesson editor to ensure the lesson works as intended and that all concepts are explained for a non-specialist reader, before the tutorial is sent to external reviewers. -The review process is an integral component of a collaborative, productive, and sustainable effort for scholars to teach and learn from each other. Once a tutorial slips into our [editorial workflow]({{site.baseurl}}/author-guidelines), we do everything we can to make sure the tutorial becomes as useful as possible and published in a reasonable amount of time. Consult our [Reviewer Guidelines]({{site.baseurl}}/reviewer-guidelines) for more information. +The review process is an integral component of a collaborative, productive, and sustainable effort for scholars to teach and learn from each other. Once a tutorial slips into our [editorial workflow]({{site.baseurl}}/en/author-guidelines), we do everything we can to make sure the tutorial becomes as useful as possible and published in a reasonable amount of time. Consult our [Reviewer Guidelines]({{site.baseurl}}/en/reviewer-guidelines) for more information. ## Open Source -The _Programming Historian_ team is committed to open source values. All contributed lessons make use of open source programming languages and software whenever possible. This policy is meant to minimize costs for all parties, and to allow the greatest possible level of participation. We believe everyone should be able to benefit from these tutorials, not just those with large research budgets for expensive proprietary software. 
Since 2016, a citable version of the _Programming Historian_ project has been deposited on [Zenodo](https://zenodo.org/). The 2022 deposit is available at [doi.org/10.5281/zenodo.7313045](https://doi.org/10.5281/zenodo.7313045). Since 2018, the [UK Web Archive](https://www.webarchive.org.uk/) has made regular crawls of the _Programming Historian_. These are archived and made publicly available [via their website](https://www.webarchive.org.uk/wayback/en/archive/*/http://programminghistorian.org/). +The _Programming Historian_ team is committed to open source values. All contributed lessons make use of open source programming languages and software whenever possible. This policy is meant to minimize costs for all parties, and to allow the greatest possible level of participation. We believe everyone should be able to benefit from these tutorials, not just those with large research budgets for expensive proprietary software. Since 2016, a citable version of the _Programming Historian_ project has been deposited on [Zenodo](https://zenodo.org/). The 2022 deposit is available at [doi.org/10.5281/zenodo.7313045](https://doi.org/10.5281/zenodo.7313045). Since 2018, the [UK Web Archive](https://www.webarchive.org.uk/) has made regular crawls of the _Programming Historian_. These are archived and made publicly available [via their website](https://www.webarchive.org.uk/wayback/en/archive/*/https://programminghistorian.org/). ## Diamond Open Access @@ -25,7 +25,7 @@ We do not charge Article Processing Charges (APCs), nor do we charge library sub The _Programming Historian_ (ISSN {{ site.data.snippets.issn[page.lang] }}) is indexed by the [Directory of Open Access Journals](https://doaj.org/toc/2397-2068). ## Awards -The _Programming Historian_ has won multiple awards which recognise and celebrate our achievements in the spheres of open access publishing and digital scholarship. 
In 2016 our English-language journal was the winner of the [Digital Humanities Awards](http://dhawards.org/dhawards2016/results/) in the Best Series of Posts category, then in the following year, 2017, _Programming Historian en español_ [won that very same accolade](http://dhawards.org/dhawards2017/results/). In 2018, The _Programming Historian en español_, was the winner of 'Mejor iniciativa formativa desarrollada durante el año 2018', [Humanidades Digitales Hispánicas Association](http://humanidadesdigitaleshispanicas.es/). We won the [Canadian Social Knowledge Institute's Open Scholarship Award](https://etcl.uvic.ca/events-activities/open-scholarship-awards/) 2020 and in 2021 we were awarded [Coko Foundation's Open Publishing Award](https://web.archive.org/web/20220408041024/https://openpublishingawards.org/results/2021/index.html) in their Open Content category. In 2022, we won the Best DH Training Materials category of the [Digital Humanities Awards](http://dhawards.org/dhawards2022/results/). +The _Programming Historian_ has won multiple awards which recognise and celebrate our achievements in the spheres of open access publishing and digital scholarship. In 2016 our English-language journal was the winner of the [Digital Humanities Awards](https://dhawards.org/dhawards2016/results/) in the Best Series of Posts category, then in the following year, 2017, _Programming Historian en español_ [won that very same accolade](https://dhawards.org/dhawards2017/results/). In 2018, The _Programming Historian en español_, was the winner of 'Mejor iniciativa formativa desarrollada durante el año 2018', [Humanidades Digitales Hispánicas Association](https://humanidadesdigitaleshispanicas.es/). 
We won the [Canadian Social Knowledge Institute's Open Scholarship Award](https://etcl.uvic.ca/events-activities/open-scholarship-awards/) 2020 and in 2021 we were awarded [Coko Foundation's Open Publishing Award](https://web.archive.org/web/20220408041024/https://openpublishingawards.org/results/2021/index.html) in their Open Content category. In 2022, we won the Best DH Training Materials category of the [Digital Humanities Awards](https://dhawards.org/dhawards2022/results/). ## Diversity Policy @@ -40,4 +40,4 @@ For a list of our funders and supports, see the ['Support Us']({{site.baseurl}}/ ## History of the Project -The _Programming Historian_ was founded in 2008 by William J. Turkel and Alan MacEachern. Turkel published [a blog post](http://digitalhistoryhacks.blogspot.com/2008/01/programming-historian.html) at the time, setting out their intentions for the project. Initially it focused heavily on the Python programming language and was published open access as a *Network in Canadian History & Environment* (NiCHE) ‘Digital Infrastructure’ project. In 2012, _Programming Historian_ expanded its editorial team and launched as an open access peer reviewed scholarly journal of methodology for digital historians. In 2016 we added a Spanish Language publication to the initial English-language publication and in 2017 started publishing translated lessons under the title *[Programming Historian en español]({{site.baseurl}}/es)*. In 2018 we [hosted our first Spanish-language writing workshop](/posts/bogota-workshop-report) and issued a call for [new lessons in Spanish](/posts/convocatoria-de-tutoriales). In the same year we added a French language publication and launched *[Programming Historian en français]({{site.baseurl}}/fr)* in 2019. A year later, we were joined by a Portuguese-speaking team and launched *[Programming Historian em português]({{site.baseurl}}/pt)* in early 2021. +The _Programming Historian_ was founded in 2008 by William J. 
Turkel and Alan MacEachern. Turkel published [a blog post](https://digitalhistoryhacks.blogspot.com/2008/01/programming-historian.html) at the time, setting out their intentions for the project. Initially it focused heavily on the Python programming language and was published open access as a *Network in Canadian History & Environment* (NiCHE) ‘Digital Infrastructure’ project. In 2012, _Programming Historian_ expanded its editorial team and launched as an open access peer reviewed scholarly journal of methodology for digital historians. In 2016 we added a Spanish Language publication to the initial English-language publication and in 2017 started publishing translated lessons under the title *[Programming Historian en español]({{site.baseurl}}/es)*. In 2018 we [hosted our first Spanish-language writing workshop](/posts/bogota-workshop-report) and issued a call for [new lessons in Spanish](/posts/convocatoria-de-tutoriales). In the same year we added a French language publication and launched *[Programming Historian en français]({{site.baseurl}}/fr)* in 2019. A year later, we were joined by a Portuguese-speaking team and launched *[Programming Historian em português]({{site.baseurl}}/pt)* in early 2021. diff --git a/en/author-guidelines.md b/en/author-guidelines.md index 71d396d26a..8d1a8de187 100755 --- a/en/author-guidelines.md +++ b/en/author-guidelines.md @@ -20,7 +20,7 @@ These guidelines have been developed to help you understand the process of creat ## Step 1: Proposing a New Lesson
    -Our English journal is currently seeking proposals for new original lessons or translations to be considered for publication in 2026. Learn more in our our call for proposals (open until 15 February 2026). Submissions to our Spanish, French and Portuguese journals are open year-round. +Our English journal will open its next annual submission window in November 2026. In the meantime, you may find it useful to consult our past call for proposals (closed in February 2026).
    You can get a sense of what we publish by looking through our [published lessons]({{site.baseurl}}/en/lessons), reading our [reviewer guidelines]({{site.baseurl}}/en/reviewer-guidelines) or browsing [lessons in development](https://github.com/programminghistorian/ph-submissions/tree/gh-pages/en/drafts). Please also take a moment to check our [Lesson Concordance document](https://docs.google.com/spreadsheets/d/1vrvZTygZLfQRoQildD667Xcgzhf_reQC8Nq4OD-BRIA/edit#gid=0) to see which methods we have already covered in our published or forthcoming lessons. @@ -321,7 +321,7 @@ The peer review process normally happens in 3 stages: 3) Once your editor and peer reviewers are happy with the piece, the editor will recommend publication to the Managing Editor, who will read the piece to ensure that it meets our Author's Guidelines and standards. In some cases there may be additional revisions or copy editing at this stage to bring the piece in line with our publishing standards. If the Managing Editor is happy with the piece, it will be moved to the live site for publication. Your editor will inform you of any additional information required at this stage. -You may find it helpful to read our [editor guidelines](/editor-guidelines), which detail our editorial process. +You may find it helpful to read our [editor guidelines](/en/editor-guidelines), which detail our editorial process. If at any point you are unsure of your role or what to do next, post a question to the peer review issue. One of our editors will respond as soon as possible. We endeavour to respond to all queries within a few days. 
diff --git a/en/contribute.md b/en/contribute.md index 1f6e42fa29..2c07e464f3 100755 --- a/en/contribute.md +++ b/en/contribute.md @@ -1,7 +1,7 @@ --- title: Contribute to the Programming Historian layout: blank -redirect_from: /contribute +redirect_from: /contribute/ --- # Contribute to The Programming Historian @@ -11,7 +11,7 @@ The _Programming Historian_ runs on the far-from-endless energy of volunteers, a ## Write a new lesson
    -Our English journal is currently seeking proposals for new original lessons or translations to be considered for publication in 2026. Learn more in our our call for proposals (open until 15 February 2026). Submissions to our Spanish, French and Portuguese journals are open year-round. +Our English journal will open its next annual submission window in November 2026. In the meantime, you may find it useful to consult our past call for proposals (closed in February 2026).
    {{ site.data.snippets.write-a-lesson-image-alt[page.lang] }} @@ -31,7 +31,7 @@ If you would like to contribute as a peer reviewer, please take a few minutes to {{ site.data.snippets.editor-guidelines-image-alt[page.lang] }} -Our editorial board members help facilitate peer review and work with authors closely to make improvements to their lessons. Our [guidelines for editors](editor-guidelines) is meant to ensure that everyone, from authors to reviewers to members of the wider community, receive a fair and consistent experience during peer review. +Our editorial board members help facilitate peer review and work with authors closely to make improvements to their lessons. Our [guidelines for editors](/en/editor-guidelines) are meant to ensure that everyone, from authors to reviewers to members of the wider community, receives a fair and consistent experience during peer review. From time to time we may advertise that we are seeking more editors. @@ -41,13 +41,13 @@ From time to time we may advertise that we are seeking more editors. If you are fluent in more than one of our publication languages (French, English, Spanish, Portuguese), you are invited to get in touch with us about translating one of our published Programming Historian lessons from one language to another. This will help us to assist building multilingual digital humanities communities, and to build your language, method, and technological skills. -We are seeking rigorous and readable translations that take into account the Spanish-language, the French-language and the Portuguese-language research contexts and the resources available in our respective communities. If you are interested in collaborating, consult our instructions for authors and translators [in Spanish](/es/guia-para-autores.html), [in French](/fr/consignes-auteurs.html) and [in Portuguese](/pt/directrizes-autor.html). 
+We are seeking rigorous and readable translations that take into account the Spanish-language, the French-language and the Portuguese-language research contexts and the resources available in our respective communities. If you are interested in collaborating, consult our instructions for authors and translators [in Spanish](/es/guia-para-autores), [in French](/fr/consignes-auteurs) and [in Portuguese](/pt/directrizes-autor). ## Provide feedback or report problems {{ site.data.snippets.feedback-image-alt[page.lang] }} -We welcome [feedback](feedback.html) on any aspect of the _Programming Historian_. Let us know what we can do to make the project better! +We welcome [feedback](/en/feedback) on any aspect of the _Programming Historian_. Let us know what we can do to make the project better! We are especially grateful for tips about lessons that seem to be broken. As URLs change and as new versions of software and platforms are released, lessons develop glitches over time. Please help us keep the _Programming Historian_ up to date by letting us know about these when you come across them in the course of your reading. @@ -59,9 +59,9 @@ We are especially grateful for tips about lessons that seem to be broken. As URL This project is our attempt to demonstrate what open access academic publishing can and should be. Please help us spreading the message and providing the widest possible access to this resource by asking your librarian to include the project in your library catalogue. -The _Programming Historian_ has listings in WorldCat ([English](http://www.worldcat.org/title/programming-historian/oclc/951537099), [Spanish](https://www.worldcat.org/title/programming-historian-en-espanol/oclc/1061292935&referer=brief_results), [French](https://uva.worldcat.org/title/programming-historian-en-franais/oclc/1104391842) and [Portuguese](https://search.worldcat.org/title/1332987197)). 
+The _Programming Historian_ has listings in WorldCat ([English](https://www.worldcat.org/title/programming-historian/oclc/951537099), [Spanish](https://www.worldcat.org/title/programming-historian-en-espanol/oclc/1061292935&referer=brief_results), [French](https://uva.worldcat.org/title/programming-historian-en-franais/oclc/1104391842) and [Portuguese](https://search.worldcat.org/title/1332987197)). -With thanks to the [University of Purdue library](http://purdue-primo-prod.hosted.exlibrisgroup.com/primo_library/libweb/action/dlDisplay.do?vid=PURDUE&search_scope=everything&docId=PURDUE_ALMA51671812890001081&fn=permalink) and Amanda Visconti, and to the University of Virginia. +With thanks to the [University of Purdue library](https://purdue-primo-prod.hosted.exlibrisgroup.com/primo_library/libweb/action/dlDisplay.do?vid=PURDUE&search_scope=everything&docId=PURDUE_ALMA51671812890001081&fn=permalink) and Amanda Visconti, and to the University of Virginia. The English edition of the project is indexed by the [Directory of Open Access Journals](https://doaj.org/toc/2397-2068). diff --git a/en/editor-guidelines.md b/en/editor-guidelines.md index a5bbc61ffc..3df647652d 100755 --- a/en/editor-guidelines.md +++ b/en/editor-guidelines.md @@ -19,7 +19,7 @@ Thank you for editing a lesson for the *Programming Historian*. We are extremely -We always encourage prospective authors to pitch their ideas before they start writing. If a piece is not suitable for the *Programming Historian* our job is to tell an author before they have written a full tutorial. We hope this saves everyone time and energy. Once we have spoken to an author and encouraged their idea, our aim is always to support authors until the piece is publishable. Our goal is to help them reach that stage as efficiently as possible with clear guidance. You may find it helpful to familiarise yourself with our [instructions for authors](/author-guidelines). 
+We always encourage prospective authors to pitch their ideas before they start writing. If a piece is not suitable for the *Programming Historian* our job is to tell an author before they have written a full tutorial. We hope this saves everyone time and energy. Once we have spoken to an author and encouraged their idea, our aim is always to support authors until the piece is publishable. Our goal is to help them reach that stage as efficiently as possible with clear guidance. You may find it helpful to familiarise yourself with our [instructions for authors](/en/author-guidelines). ### Safe Spaces The *Programming Historian* is committed to providing a safe space for the exchange of ideas, where everyone can share without fear of harassment or abuse. The editor plays a fundamental role in ensuring that space endures. Your job includes enforcing our anti-harassment policy at all times. If you need help please ask one of the other editors or PH ombudsperson (Dr Ian Milligan - i2milligan@uwaterloo.ca). You can read more about our [commitment to safe spaces](/posts/PH-commitment-to-diversity) on the project blog. @@ -53,7 +53,7 @@ The editor is encouraged to adjust the issue text to reflect any additional goal When the lesson materials are ready for submission, the author will contact their assigned editor, whose job will be to upload them to the [ph-submissions repository](https://github.com/programminghistorian/ph-submissions) after first checking to ensure that there are no major metadata issues. 1. **Uploading the Lesson**: the lesson itself should be uploaded to the appropriate subfolder (depending on whether it is an original lesson or a translation) of the [lessons folder](https://github.com/programminghistorian/ph-submissions/tree/gh-pages/en) within the corresponding language folder in the root of the ph-submissions repository. If you need help, see [GitHub's instructions](https://help.github.com/articles/adding-a-file-to-a-repository/). -2. 
**Uploading Images**: if the lesson includes images, make sure all of the files are named according to the naming conventions specified in the [author guidelines](/author-guidelines). The editor should create a folder for the images in the [images directory](https://github.com/programminghistorian/ph-submissions/tree/gh-pages/images). This folder should have the same name as the lesson filename. Upload the images to this folder. +2. **Uploading Images**: if the lesson includes images, make sure all of the files are named according to the naming conventions specified in the [author guidelines](/en/author-guidelines). The editor should create a folder for the images in the [images directory](https://github.com/programminghistorian/ph-submissions/tree/gh-pages/images). This folder should have the same name as the lesson filename. Upload the images to this folder. 3. **Uploading Data**: if the lesson includes data files, they should be uploaded to a similarly named folder in the [assets directory](https://github.com/programminghistorian/ph-submissions/tree/gh-pages/assets). After uploading, the editor should check the [commit history for the repository](https://github.com/programminghistorian/ph-submissions/commits/gh-pages) to ensure that their upload received a green check mark. If not, something went wrong and the [wiki](https://github.com/programminghistorian/jekyll/wiki/Making-Technical-Contributions#checking-travis-for-errors) should be consulted for troubleshooting the errors. Upon successful submission of the lesson, the editor will create a review ticket for the lesson and close the proposal issue. From here on, the editor should ensure that the author work from the latest version of the lesson in the repository and upload changes directly to GitHub themselves. 
@@ -61,9 +61,9 @@ After uploading, the editor should check the [commit history for the repository] ### Open Peer Review The *Programming Historian* uses a model of open peer review, while we believe this helps maintain civility and the productive sharing of ideas, authors have the right (and we have a requirement to respect that right) to request a closed peer review. There are many reasons why someone might be hesitant to engage in an open review and we encourage authors to always pursue the option with which they are most comfortable. -Before soliciting external reviews, the editor should read and try the tutorial and use their experience with the *Programming Historian* to help the author make initial improvements (if required). The editor is not expected to be a expert in content of the lesson, this is the role of the [reviewers](/reviewer-guidelines). +Before soliciting external reviews, the editor should read and try the tutorial and use their experience with the *Programming Historian* to help the author make initial improvements (if required). The editor is not expected to be a expert in content of the lesson, this is the role of the [reviewers](/en/reviewer-guidelines). -The editor should complete an initial sustainability overview of the submission to ensure that software versions and dependencies are clearly marked, specificities of software like screenshots are limited to those required to complete the lesson, and that the lesson makes use of existing software documentation whenever available and appropriate. Editors should also ensure that lessons try, as much as possible, to avoid software specific directions, such as "Right-click on the _x_ icon to access the _x_ menu," instead favoring general methodological overviews. The Editorial Checklist [contains more details about sustainability practices](#c-sustainability-review) for PH. 
+The editor should complete an initial sustainability overview of the submission to ensure that software versions and dependencies are clearly marked, specificities of software like screenshots are limited to those required to complete the lesson, and that the lesson makes use of existing software documentation whenever available and appropriate. Editors should also ensure that lessons try, as much as possible, to avoid software specific directions, such as "Right-click on the _x_ icon to access the _x_ menu," instead favoring general methodological overviews. The Editorial Checklist [contains more details about sustainability practices](#c-sustainability--internationalization-review) for PH. Often editors need help clarifying the intended audience of a lesson, or identifying jargon that needs further explanation. This initial review helps let the external reviewers focus on improving the piece. This is normally done openly on our submission system (see below), but it can be a closed review at the request of either party. @@ -82,7 +82,7 @@ For each potential reviewer you do contact, regardless of response, please enter Please enter the date using the `mm/dd/yyyy` format. -When inviting reviewers, the editor should provide them with our [reviewer guidelines](/reviewer-guidelines) and give them a deadline for completing their review (usually one month) so that we can ensure the timely publication of the tutorial. +When inviting reviewers, the editor should provide them with our [reviewer guidelines](/en/reviewer-guidelines) and give them a deadline for completing their review (usually one month) so that we can ensure the timely publication of the tutorial. When a lesson has been submitted, the editor will open a new 'issue' on our [Github submissions repository](https://github.com/programminghistorian/ph-submissions/issues) where the open review will take place. This message board allows everyone to keep track of the conversation. 
You will need to sign up for a free Github account if you do not already have one, as will both the author and reviewers. @@ -127,7 +127,7 @@ With your summary of the reviews and any final instructions for the editor, incl ## Technical Processes of Review - Editorial Checklist -Our peer review is conducted on our [Submissions repository](https://github.com/programminghistorian/ph-submissions) on Github. Full instructions for how to upload files, including file formats and formatting guidelines can be found on our [Author Submission Instructions](/author-guidelines) which will always contain the most up to date instructions. Please familiarise yourself with these steps or refer to them as needed. If you need help you are always welcome to [email another editor directly](/project-team). +Our peer review is conducted on our [Submissions repository](https://github.com/programminghistorian/ph-submissions) on Github. Full instructions for how to upload files, including file formats and formatting guidelines can be found on our [Author Submission Instructions](/en/author-guidelines) which will always contain the most up to date instructions. Please familiarise yourself with these steps or refer to them as needed. If you need help you are always welcome to [email another editor directly](/en/project-team). There are a few areas where you should intervene in the process from a technical standpoint. They include: @@ -144,7 +144,7 @@ Once you have chosen a name for the lesson file, use the same name to create a n ### B) Initial Check of Markdown -Authors are responsible for checking that their lesson has rendered properly in Markdown. If they have followed the syntax rules, it should be ok. If you can see any Markdown symbols on the page, something went wrong. Detailed instructions of Markdown syntax are available on our [Author Guidelines](/author-guidelines) +Authors are responsible for checking that their lesson has rendered properly in Markdown. 
If they have followed the syntax rules, it should be ok. If you can see any Markdown symbols on the page, something went wrong. Detailed instructions of Markdown syntax are available on our [Author Guidelines](/en/author-guidelines) You can quickly check that everything looks correct on a lesson submission by looking at the rendered version of the page. It will be found at: @@ -181,7 +181,7 @@ If a lesson does use a sequential image naming system, it is possible that figur Regardless of how the images are named (semantically or sequentially), they should be placed in a subdirectory within the `images` directory. The subdirectory should be named using the same URL slug used to name the lesson. Make sure the images are in web-friendly formats such as PNG or JPEG and sized appropriately (both in terms of pixels and bytes). -Full instructions on adding images is available in [Author Submission Instructions](/author-guidelines). +Full instructions on adding images is available in [Author Submission Instructions](/en/author-guidelines). ### E) Verify Data files @@ -284,14 +284,14 @@ difficulty: 2 ### 4) Find an Image to represent the lesson -We represent our lessons using an old image that we feel captures some element of the task described in the tutorial. You can see the full range of these on the [main Lessons directory](/lessons/). These images are selected by editors. +We represent our lessons using an old image that we feel captures some element of the task described in the tutorial. You can see the full range of these on the [main Lessons directory](/en/lessons/). These images are selected by editors. 
Here are a few places to look for lesson images: - The [British Library](https://www.flickr.com/photos/britishlibrary) - The [Internet Archive Book Images](https://archive.org/details/bookimages) - The [Virtual Manuscript Library of Switzerland](https://www.e-codices.unifr.ch/en) - - The [Library of Congress Maps](http://www.loc.gov/maps/collections) + - The [Library of Congress Maps](https://www.loc.gov/maps/collections) Ensure that the image matches the style of the other images (it should be a book image, not a photograph), is at least 200 pixels in both dimensions, and is not copyright restricted. Make sure the image is not offensive, and keeping with our [Commitment to Diversity](/posts/PH-commitment-to-diversity) try to find something that does not perpetuate stereotypes or send a subtle message about maleness and whiteness. @@ -356,7 +356,7 @@ There are several ways that you can perform a pull request to publish the files: * A) Follow our ["Making Technical Contributions" guidelines](https://github.com/programminghistorian/jekyll/wiki/Making-Technical-Contributions), which uses the Github website GUI. -* B) Use `git` from the command line. The following instructions assume that you have already cloned both the `jekyll` and `ph-submissions` repositories to your local machine. (Our [lesson on using GitHub Desktop](/lessons/getting-started-with-github-desktop) may be helpful if this is new to you.) If you are not sure how to do that or have any questions, contact the technical team for assistance. +* B) Use `git` from the command line. The following instructions assume that you have already cloned both the `jekyll` and `ph-submissions` repositories to your local machine. (Our [lesson on using GitHub Desktop](/en/lessons/getting-started-with-github-desktop) may be helpful if this is new to you.) If you are not sure how to do that or have any questions, contact the technical team for assistance. 1. Go to the directory for your local `ph-submissions` repository. 
2. `git pull` to get all of the newest changes on your machine (or `sync` if you are using GitHub Desktop) diff --git a/en/events.md b/en/events.md index dfe9a1aa2e..216a01afdb 100644 --- a/en/events.md +++ b/en/events.md @@ -1,7 +1,7 @@ --- title: Events layout: blank -redirect_from: /events +redirect_from: /events/ --- # Events diff --git a/en/feedback.md b/en/feedback.md index 876863d004..16c77ed106 100755 --- a/en/feedback.md +++ b/en/feedback.md @@ -4,7 +4,7 @@ date: 16-07-2021 layout: blank redirect_from: - /report-issue.html - - /feedback + - /feedback/ --- @@ -16,7 +16,7 @@ Have you followed the instructions in a lesson meticulously and still run into a We define bugs as: "An error in a computer program that produces an unexpected result or that behaves different from the instructions in a lesson." Note that we cannot attend to errors caused by the user editing the code or changing materials (datasets, input files, etc.) -First, we ask that you check on our [issue tracker](https://github.com/orgs/programminghistorian/projects/6) if someone has already flagged the problem and, in such case, that you leave a comment. If the issue has not been reported, follow one of these options: +First, we ask that you check on our [issue tracker](https://github.com/programminghistorian/jekyll/issues) if someone has already flagged the problem and, in such case, that you leave a comment. If the issue has not been reported, follow one of these options:
    Please, do not create a Pull Request with the correction. diff --git a/en/index.md b/en/index.md index 108c490069..ad20631c5c 100644 --- a/en/index.md +++ b/en/index.md @@ -1,6 +1,7 @@ --- layout: base title: The Programming Historian +permalink: /en/ ---
    {{ site.data.snippets.front-image-alt[page.lang] }} @@ -23,20 +24,20 @@ title: The Programming Historian

    Teach

    -

    Use the Programming Historian in your classes or workshops! Please let us know how we can improve our lessons to suit your needs, or if you run into trouble using one.

    +

    Use the Programming Historian in your classes or workshops! Please let us know how we can improve our lessons to suit your needs, or if you run into trouble using one.

    Contribute

    -

    Write a lesson, join our team of reviewers, provide feedback. We're always keen to hear from our readers!

    +

    Write a lesson, join our team of reviewers, provide feedback. We're always keen to hear from our readers!

    Our Team

    -

    As a volunteer, community-driven resource, we take pride in showing off and giving credit to the great many people who have contributed their time and energy to the Programming Historian.

    +

    As a volunteer, community-driven resource, we take pride in showing off and giving credit to the great many people who have contributed their time and energy to the Programming Historian.

    diff --git a/en/individual.md b/en/individual.md index 49edca3e6f..62e9947421 100644 --- a/en/individual.md +++ b/en/individual.md @@ -2,9 +2,9 @@ layout: blank title: Individual Supporters redirect_from: -- individual -- /en/support-us -- support-us +- /individual/ +- /en/support-us/ +- /support-us/ --- # Individual Supporters @@ -19,7 +19,7 @@ Your support directly enables the infrastructure that keeps our publications tog
    - + @@ -37,4 +37,4 @@ One-time donations can be made to the Programming Historian via [Paypal](https:/ # Organisational Support -If you work for an organisation that would be interested in contributing to *Programming Historian's* success, please direct them to our [Institutional Partnership Programme](ipp), which provides crucial support to our work. +If you work for an organisation that would be interested in contributing to *Programming Historian's* success, please direct them to our [Institutional Partnership Programme](/en/ipp), which provides crucial support to our work. diff --git a/en/ipp.md b/en/ipp.md index 8677ff4ae3..be0f86146e 100644 --- a/en/ipp.md +++ b/en/ipp.md @@ -1,7 +1,7 @@ --- layout: blank title: Institutional Partnership Programme -redirect_from: /ipp +redirect_from: /ipp/ --- @@ -50,7 +50,7 @@ By joining the Institutional Partner Programme you will receive the following be
    - + diff --git a/en/jisc-tna-partnership.md b/en/jisc-tna-partnership.md index dcbf376a67..77eb445c58 100644 --- a/en/jisc-tna-partnership.md +++ b/en/jisc-tna-partnership.md @@ -1,7 +1,7 @@ --- title: Lessons published in partnership with Jisc and The National Archives layout: blank -redirect_from: /jisc-tna-partnership +redirect_from: /jisc-tna-partnership/ --- # Lessons published in partnership with [Jisc](https://www.jisc.ac.uk/) and [The National Archives](https://www.nationalarchives.gov.uk/) diff --git a/en/lesson-retirement-policy.md b/en/lesson-retirement-policy.md index d08e03aaaf..e4536f291d 100755 --- a/en/lesson-retirement-policy.md +++ b/en/lesson-retirement-policy.md @@ -1,7 +1,7 @@ --- title: Lesson Retirement Policy layout: blank -redirect_from: /lesson-retirement-policy +redirect_from: /lesson-retirement-policy/ --- # Lesson Retirement Policy @@ -33,11 +33,11 @@ Whether or not a new derivative is created, the following steps will be taken wi ## Related Sustainability Guidelines -[Author Guidelines for Writing Sustainably](/author-guidelines#write-sustainably) +[Author Guidelines for Writing Sustainably](/en/author-guidelines#sustainable-writing) -[Reviewer Guidelines for Assessing Lesson Sustainability](/reviewer-guidelines#sustainability) +[Reviewer Guidelines for Assessing Lesson Sustainability](/en/reviewer-guidelines#sustainability) -[Editor Guidelines for Fostering Lesson Sustainability](/editor-guidelines#c-sustainability-review) +[Editor Guidelines for Fostering Lesson Sustainability](/en/editor-guidelines#c-sustainability--internationalization-review) ## Retired Lessons diff --git a/en/lessons/analyzing-documents-with-tfidf.md b/en/lessons/analyzing-documents-with-tfidf.md index e24508472a..26c0fe1250 100644 --- a/en/lessons/analyzing-documents-with-tfidf.md +++ b/en/lessons/analyzing-documents-with-tfidf.md @@ -326,7 +326,7 @@ As I have described, __tf-idf__ has its origins in information retrieval, and th ### 1. 
As an Exploratory Tool or Visualization Technique -As I've already demonstrated, terms lists with __tf-idf__ scores for each document in a corpus can be a strong interpretive aid in themselves, they can help generate hypotheses or research questions. Word lists can also be the building bocks for more sophisticated browsing and visualization strategies. ["A full-text visualization of the Iraq War Logs"](http://jonathanstray.com/a-full-text-visualization-of-the-iraq-war-logs), by Jonathan Stray and Julian Burgess, is a good example of this use case.[^11] Using __tf-idf__-transformed features, Stray and Burgess build a network visualization that positions Iraq War logs in relation to their most distinctive keywords. This way of visualizing textual information led Stray to develop [the Overview Project](https://www.overviewdocs.com), which provides a dashboard for users to visualize and search thousands of documents at a time. We could use this kind of approach to graph our obituaries corpus and see if there are keyword communities. +As I've already demonstrated, terms lists with __tf-idf__ scores for each document in a corpus can be a strong interpretive aid in themselves, they can help generate hypotheses or research questions. Word lists can also be the building bocks for more sophisticated browsing and visualization strategies. ["A full-text visualization of the Iraq War Logs"](https://jonathanstray.com/a-full-text-visualization-of-the-iraq-war-logs), by Jonathan Stray and Julian Burgess, is a good example of this use case.[^11] Using __tf-idf__-transformed features, Stray and Burgess build a network visualization that positions Iraq War logs in relation to their most distinctive keywords. This way of visualizing textual information led Stray to develop [the Overview Project](https://www.overviewdocs.com), which provides a dashboard for users to visualize and search thousands of documents at a time. 
We could use this kind of approach to graph our obituaries corpus and see if there are keyword communities. ### 2. Textual Similarity and Feature Sets @@ -435,7 +435,7 @@ If you are not using Anaconda, you will need to cover the following dependencies 1. Install Python 2 or 3 (preferably Python 3.6 or later) 2. Recommended: install and run a virtual environment -3. Install the Scikit-Learn library and its dependencies (see [http://scikit-learn.org/stable/install.html](http://scikit-learn.org/stable/install.html)). +3. Install the Scikit-Learn library and its dependencies (see [https://scikit-learn.org/stable/install.html](https://scikit-learn.org/stable/install.html)). 4. Install Jupyter Notebook and its dependencies # Endnotes diff --git a/en/lessons/analyzing-multilingual-text-nltk-spacy-stanza.md b/en/lessons/analyzing-multilingual-text-nltk-spacy-stanza.md index 179ed4ac9c..6f85b281d4 100644 --- a/en/lessons/analyzing-multilingual-text-nltk-spacy-stanza.md +++ b/en/lessons/analyzing-multilingual-text-nltk-spacy-stanza.md @@ -153,7 +153,7 @@ with open("war-and-peace-excerpt.txt") as file: print(war_and_peace) ``` -Running this code should output the text as shown in [Developing Python Code for Multilingual Text Analysis](#Developing-Python-Code-for-Multilingual-Text-Analysis) above. +Running this code should output the text as shown in [Developing Python Code for Multilingual Text Analysis](#developing-python-code-for-multilingual-text-analysis) above. Now, let’s remove the [newline characters](https://perma.cc/UX3B-R2WF). Newline characters are used to signify the end of a line in character encoding specifications such as Unicode. We will replace all newlines (represented as a `\n` in the code) with a space, assign the cleaned text to a new variable named `cleaned_war_and_peace` and print it to check what we’ve done. Replacing the newline characters with a space will combine the text into a continuous string and homogenize the text. 
This ensures that the tokenizer is not mislead into creating sentence splits where there shouldn’t be any. This is the only modification to the text that we will be doing for the purposes of this lesson, but if you are interested in different steps you can take to prepare your text for multilingual analysis, please consult [this article](https://perma.cc/Z4VX-RHT2). diff --git a/en/lessons/applied-archival-downloading-with-wget.md b/en/lessons/applied-archival-downloading-with-wget.md index 90761c2f88..c9b5d69204 100755 --- a/en/lessons/applied-archival-downloading-with-wget.md +++ b/en/lessons/applied-archival-downloading-with-wget.md @@ -15,8 +15,8 @@ exclude_from_check: activity: acquiring topics: [web-scraping] abstract: "Now that you have learned how Wget can be used to mirror or download specific files from websites via the command line, it's time to expand your web-scraping skills through a few more lessons that focus on other uses for Wget's recursive retrieval function." -previous: automated-downloading-with-wget -redirect_from: /lessons/applied-archival-downloading-with-wget +previous: /en/lessons/automated-downloading-with-wget +redirect_from: /lessons/applied-archival-downloading-with-wget/ avatar_alt: Diagram of a well-drilling aparatus doi: 10.46430/phen0022 --- @@ -72,8 +72,8 @@ identify the beginning URL in the series of documents that you want to download. Because of its smaller size we're going to use the online war diary for [No. 14 Canadian General Hospital][] as our example. The entire war diary is 80 pages long. The URL for page 1 is - and the URL for page -80 is '. Note that + and the URL for page +80 is '. Note that they are in sequential order. We want to download the .jpeg images for *all* of the pages in the diary. To do this, we need to design a script to generate all of the URLs for the pages in between (and including) the @@ -163,7 +163,7 @@ and press enter again. You now have the directory selected and are ready to begin downloading. 
Based on what you have learned from [Ian Milligan's Wget -lesson](../lessons/automated-downloading-with-wget), enter the following into +lesson](/en/lessons/automated-downloading-with-wget), enter the following into the command line (note you can choose whatever you like for your 'limit rate', but be a responsible internet citizen and keep it under 200kb/s!): @@ -197,9 +197,9 @@ Mutineers, 1789" which provides an account of the mutiny aboard the HMS (pages) to the notebook. This is somewhat misleading. Click on the first thumbnail in the top right to view the whole page. Now, *right-click -\> view image*. The URL should be -``. If you browse through +''. If you browse through the thumbnails, the last one is 'Part 127', which is located at -``. The discrepancy +''. The discrepancy between the range of URLs and the total number of files means that you may miss a page or two in the automated download – in this case there are a few URLs that include a letter in the name of the .jpeg @@ -209,7 +209,7 @@ if you miss a page or two during an automated download. Note that a potential workaround could include using regular expressions to make more complicated queries if appropriate -(for more, see the [Understanding Regular Expressions](/lessons/understanding-regular-expressions) +(for more, see the [Understanding Regular Expressions](/en/lessons/understanding-regular-expressions) lesson). Let's run the script and Wget command once more: @@ -258,7 +258,7 @@ complicate matters and do not permit us to easily generate URLs with the first script we used. Here's a workaround. Click on this link: - + The page you just opened is a sub-directory of the website that lists the .jpeg files for a selection of the Jefferson Papers. This means that @@ -270,15 +270,15 @@ URLs you do not actually need to write a script (although you could using my final example, which discusses the problem of leading zeros). Instead, simply manipulate the URLs in a .txt file as follows: - + - + - + ... 
all the way up to - + This is the last sub-directory on the Library of Congress site for these dates in Series 1. This last URL contains images 1400-1487. @@ -319,12 +319,12 @@ Archives example, to get the simplified URL you must *right-click -\> view image* using your web-browser. The URL for the first poster should be: - + Follow the same steps for the last poster in the gallery – the URL should be: -. +. The script we used to download from LAC will not work because the range function cannot comprehend leading zeros. The script below provides an @@ -383,12 +383,12 @@ toolkit. As new methods for scraping online repositories become available, we will continue to update this lesson with additional examples of Wget's power and potential. - [ActiveHistory.ca]: http://www.activehistory.ca - [curl]: http://chronicle.com/blogs/profhacker/download-a-sequential-range-of-urls-with-curl/41055 + [ActiveHistory.ca]: https://www.activehistory.ca + [curl]: https://chronicle.com/blogs/profhacker/download-a-sequential-range-of-urls-with-curl/41055 [Indian Affairs Annual Reports database]: https://recherche-collection-search.bac-lac.gc.ca/eng/Home/Search?q=%20Indian%20Affairs%20Annual%20Reports%20database [View a scanned page of original Report]: https://recherche-collection-search.bac-lac.gc.ca/eng/home/record?app=fonandcol&IdNumber=2061374&q=Indian%20Affairs%20Annual%20Reports [No. 14 Canadian General Hospital]: https://recherche-collection-search.bac-lac.gc.ca/eng/Home/Record?app=fonandcol&IdNumber=2005110&new=-8585971893141232328 [http://data2.archives.ca/e/e061/e001518109.jpg]: http://data2.archives.ca/e/e061/e001518029.jpg - [leading zeros]: http://en.wikipedia.org/wiki/Leading_zero - [Series 1: General Correspondence. 
1651-1827]: http://memory.loc.gov/cgi-bin/ampage?collId=mtj1&fileName=mtj1page001.db&recNum=1&itemLink=/ammem/collections/jefferson_papers/mtjser1.html&linkText=6 - [Historical Medical Poster Collection]: http://cushing.med.yale.edu/gsdl/collect/mdposter/ + [leading zeros]: https://en.wikipedia.org/wiki/Leading_zero + [Series 1: General Correspondence. 1651-1827]: https://www.loc.gov/search/?fa=partof:the+thomas+jefferson+papers+at+the+library+of+congress:+series+1:+general+correspondence.+1651-1827 + [Historical Medical Poster Collection]: https://library.medicine.yale.edu/collections/digitized-collections/medical-historical-posters/ diff --git a/en/lessons/automated-downloading-with-wget.md b/en/lessons/automated-downloading-with-wget.md index 1b3ec73cca..e7044f6d64 100755 --- a/en/lessons/automated-downloading-with-wget.md +++ b/en/lessons/automated-downloading-with-wget.md @@ -15,8 +15,8 @@ activity: acquiring topics: [web-scraping] abstract: "Wget is a useful program, run through your computer's command line, for retrieving online material." -next: applied-archival-downloading-with-wget -redirect_from: /lessons/automated-downloading-with-wget +next: /en/lessons/applied-archival-downloading-with-wget +redirect_from: /lessons/automated-downloading-with-wget/ avatar_alt: Diagram of an elevator system in a mineshaft doi: 10.46430/phen0001 --- @@ -32,7 +32,7 @@ Editor's Note This lesson requires you to use the command line. If you have no previous experience using the command line you may find it helpful to -work through the *Programming Historian’s* [Introduction to the Bash Programming Language](/lessons/intro-to-bash). +work through the *Programming Historian’s* [Introduction to the Bash Programming Language](/en/lessons/intro-to-bash). Lesson Goals ------------ @@ -277,10 +277,10 @@ manual][] page. Let's take an example dataset. Say you wanted to download all of the papers hosted on the website ActiveHistory.ca. 
They are all located at: -; in the sense that they are all +; in the sense that they are all contained within the `/papers/` directory: for example, the 9th paper published on the website -is . Think of this +is . Think of this structure in the same way as directories on your own computer: if you have a folder labeled `/History/`, it likely contains several files within it. The same structure holds true for websites, and we are using @@ -331,7 +331,7 @@ Saving to: `index.html.1' ``` What you have done is downloaded just the first page of -, the index page for the papers to your +, the index page for the papers to your new directory. If you open it, you'll see the main text on the home page of ActiveHistory.ca. So at a glance, we have already quickly downloaded something. @@ -357,10 +357,10 @@ options. So let's learn a few commands now: Recursive retrieval is the most important part of wget. What this means is that the program begins following links from the website and downloading them too. So for example, the - has a link to -, so it will download + has a link to +, so it will download that too if we use recursive retrieval. However, it will also follow any -other links: if there was a link to somewhere on that +other links: if there was a link to somewhere on that page, it would follow that and download it as well. By default, -r sends wget to a depth of five sites after the first one. This is following links, to a limit of five clicks after the first website. At this point, @@ -376,8 +376,8 @@ have a short version, this could be initiated using -np). This is an important one. What this means is that wget should follow links, but not beyond the last parent directory. In our case, that means that it won't go anywhere that is not part of the -http://activehistory.ca/papers/ hierarchy. If it was a long path such as -http://niche-canada.org/projects/events/new-events/not-yet-happened-events/, +https://activehistory.ca/papers/ hierarchy. 
If it was a long path such as +https://niche-canada.org/projects/events/new-events/not-yet-happened-events/, it would only find files in the `/not-yet-happened-events/` folder. It is a critical command for delineating your search. @@ -493,12 +493,12 @@ files, backups, etc. I've only given a snapshot of some of wget's functionalities. For more, please visit the [wget manual][GNU wget manual]. - [Command Line Bootcamp]: http://praxis.scholarslab.org/scratchpad/bash/ + [Command Line Bootcamp]: https://praxis.scholarslab.org/scratchpad/bash/ [download XCode via this link]: https://itunes.apple.com/us/app/xcode/id497799835?mt=12 [Apple Developer website]: https://developer.apple.com/xcode/ [View Downloads]: https://developer.apple.com/downloads/ - [GNU website]: http://www.gnu.org/software/wget/ - [HTTP]: http://ftp.gnu.org/gnu/wget/ + [GNU website]: https://www.gnu.org/software/wget/ + [HTTP]: https://ftp.gnu.org/gnu/wget/ [FTP]: ftp://ftp.gnu.org/gnu/wget/ - [ugent website]: http://users.ugent.be/~bpuype/wget/ - [GNU wget manual]: http://www.gnu.org/software/wget/manual/wget.html + [ugent website]: https://users.ugent.be/~bpuype/wget/ + [GNU wget manual]: https://www.gnu.org/software/wget/manual/wget.html diff --git a/en/lessons/basic-text-processing-in-r.md b/en/lessons/basic-text-processing-in-r.md index 9a24abc88c..a1e91fdb45 100755 --- a/en/lessons/basic-text-processing-in-r.md +++ b/en/lessons/basic-text-processing-in-r.md @@ -18,7 +18,7 @@ activity: analyzing topics: [distant-reading, r, data-visualization] abstract: | Learn how to use R to analyze high-level patterns in texts, apply stylometric methods over time and across authors, and use summary methods to describe items in a corpus. 
-redirect_from: /lessons/basic-text-processing-in-r +redirect_from: /lessons/basic-text-processing-in-r/ avatar_alt: Children visiting a mobile book-mobile doi: 10.46430/phen0061 --- @@ -640,11 +640,11 @@ Many generic tutorials exist for all three of these, as well as extensive packag # Endnotes -[^1]: Taryn Dewar, "R Basics with Tabular Data," Programming Historian (05 September 2016), [/lessons/r-basics-with-tabular-data](/lessons/r-basics-with-tabular-data). +[^1]: Taryn Dewar, "R Basics with Tabular Data," Programming Historian (05 September 2016), [/en/lessons/r-basics-with-tabular-data](/en/lessons/r-basics-with-tabular-data). [^2]: Our corpus has 236 State of the Union addresses. Depending on exactly what is counted, this number can be slightly higher or lower. -[^3]: All Presidential State of the Union Addresses were downloaded from The American Presidency Project at the University of California Santa Barbara. (Accessed 2016-11-11) [http://www.presidency.ucsb.edu/sou.php](http://www.presidency.ucsb.edu/sou.php). +[^3]: All Presidential State of the Union Addresses were downloaded from The American Presidency Project at the University of California Santa Barbara. (Accessed 2016-11-11) [https://www.presidency.ucsb.edu/sou.php](https://www.presidency.ucsb.edu/sou.php). [^4]: Peter Norvig. "Google Web Trillion Word Corpus". (Accessed 2016-11-11) [http://norvig.com/ngrams/](https://web.archive.org/web/20260326183858/http://norvig.com/ngrams/). 
diff --git a/en/lessons/beginners-guide-to-twitter-data.md b/en/lessons/beginners-guide-to-twitter-data.md index c934a56fca..166c790767 100644 --- a/en/lessons/beginners-guide-to-twitter-data.md +++ b/en/lessons/beginners-guide-to-twitter-data.md @@ -16,7 +16,7 @@ reviewers: - Frédéric Clavert - Telmo Menezes - Ed Summers -review-ticket: http://programminghistorian.github.io/ph-submissions/lessons/beginners-guide-to-twitter-data +review-ticket: https://programminghistorian.github.io/ph-submissions/lessons/beginners-guide-to-twitter-data difficulty: 1 activity: acquiring topics: [data-manipulation, api] @@ -140,7 +140,7 @@ TweetSets provides additional files from the Hurricane Irma dataset. These data Download and extract the files. On a Windows computer, you can use an application such as [7-Zip](https://www.7-zip.org/) to uncompress files with a .gz exension. -If you are unfamiliar with social network analysis, it might be worthwhile to check out one of Scott Weingart’s ["Demystifying Networks"](http://journalofdigitalhumanities.org/1-1/demystifying-networks-by-scott-weingart/) series to familiarize yourself with the basic linguistic and visual vocabularies. If you have done so, you will recognize that the TweetSets outputs show us some basic information that can be used to reconstruct a social network. The edges file shows us who is tweeting to whom; the nodes files associates user names with ID numbers; and the top mentions and users files do the same, but for the most actively mentioned and most actively tweeting users. +If you are unfamiliar with social network analysis, it might be worthwhile to check out one of Scott Weingart’s ["Demystifying Networks"](https://journalofdigitalhumanities.org/1-1/demystifying-networks-by-scott-weingart/) series to familiarize yourself with the basic linguistic and visual vocabularies. 
If you have done so, you will recognize that the TweetSets outputs show us some basic information that can be used to reconstruct a social network. The edges file shows us who is tweeting to whom; the nodes files associates user names with ID numbers; and the top mentions and users files do the same, but for the most actively mentioned and most actively tweeting users. The edges file is 13,856,080 lines, so too large to work with in Excel. For this lesson, we will work with only the first 1,000 lines of data in the file. The [Introduction to the Bash Command Line](/en/lessons/intro-to-bash) lesson describes how you can use a command-line interface to read parts of a file using commands such as `head`. We can read the first 1,001 lines (1,000 lines of data plus a header) of the file into a new file using the following command: @@ -208,7 +208,7 @@ Once you've done this, you can see the returned value is now in the formula fiel {% include figure.html filename="vlookup-final.png" caption="Done with data formatting!" %} ## Further Applications -After repeating this process on the second column, this spreadsheet is ready to be used in a variety of social network visualizations. It will drop right in to a SNA tool like [Palladio](http://hdlab.stanford.edu/palladio/), or, with some light reformatting, into software like [Gephi](https://gephi.org/) or [Cytoscape](https://cytoscape.org/). The VLOOKUP we did makes it so you can do the visualizations with human-legible user names, rather than rather meaningless user IDs. +After repeating this process on the second column, this spreadsheet is ready to be used in a variety of social network visualizations. It will drop right in to a SNA tool like [Palladio](https://hdlab.stanford.edu/palladio/), or, with some light reformatting, into software like [Gephi](https://gephi.org/) or [Cytoscape](https://cytoscape.org/). 
The VLOOKUP we did makes it so you can do the visualizations with human-legible user names, rather than fairly meaningless user IDs. {% include figure.html filename="palladio.png" caption="A very quick social network sketch showing the users who most often mentioned @realDonaldTrump in their hurricane tweets. Done in Palladio." %} diff --git a/en/lessons/building-static-sites-with-jekyll-github-pages.md b/en/lessons/building-static-sites-with-jekyll-github-pages.md index f661475ede..e76cd7a0b3 100755 --- a/en/lessons/building-static-sites-with-jekyll-github-pages.md +++ b/en/lessons/building-static-sites-with-jekyll-github-pages.md @@ -16,14 +16,14 @@ review-ticket: https://github.com/programminghistorian/ph-submissions/issues/3 activity: presenting topics: [website, data-management] abstract: "This lesson will help you create entirely free, easy-to-maintain, preservation-friendly, secure website over which you have full control, such as a scholarly blog, project website, or online portfolio." -redirect_from: /lessons/building-static-sites-with-jekyll-github-pages +redirect_from: /lessons/building-static-sites-with-jekyll-github-pages/ avatar_alt: An illustration of Dr. Jekyll transforming into Mr. Hyde doi: 10.46430/phen0048 --- **This lesson is for you if** you'd like an entirely free, easy-to-maintain, preservation-friendly, secure website over which you have full control, such as a scholarly blog, project website, or online portfolio. -**At the end of this lesson**, you'll have a basic live website where you can publish content that other people can visit—it will look like [this](http://amandavisconti.github.io/JekyllDemo/)!—and you'll also have some resources to explore if you want to further customize the site. 
+**At the end of this lesson**, you'll have a basic live website where you can publish content that other people can visit—it will look like [this](https://amandavisconti.github.io/JekyllDemo/)!—and you'll also have some resources to explore if you want to further customize the site. **Requirements:** A computer (Mac/Windows/Linux are all okay, but this lesson doesn't cover some aspects of Linux use, and you may encounter some issues if you are using a Mac with an M-series (silicon) chip), the ability to download and install software on the computer, an internet connection that can support downloading software. Users have reported needing between 1-3 hours to complete the entire lesson. @@ -33,11 +33,11 @@ doi: 10.46430/phen0048 -## What are static sites, Jekyll, etc. & why might I care? +## What are static sites, Jekyll, etc. & why might I care? -*This tutorial is built on the [official Jekyll Documentation](http://jekyllrb.com/docs/home/) written by the Jekyll community. See the ["Read more"](#section9-3) section below if you'd like to know even more about these terms!* +*This tutorial is built on the [official Jekyll Documentation](https://jekyllrb.com/docs/home/) written by the Jekyll community. See the ["Read more"](#further-reading) section below if you'd like to know even more about these terms!* -### Dynamic websites, static websites, & Jekyll +### Dynamic websites, static websites, & Jekyll *Dynamic websites*, such as those created and managed by a content management system such as [Drupal](https://www.drupal.com/), [WordPress](https://wordpress.org/), and [Omeka](https://omeka.org/), pull information from a database to fill in the content on a webpage. 
When you search for a book on Amazon.com, for example, the search results page you are shown didn’t already exist as a full HTML page; instead, Amazon.com has a template for search results page that includes things all results pages share (like the main menu and Amazon logo), but it queries the database to insert the results of that search you initiated into that template. @@ -49,15 +49,15 @@ Note that when someone refers to a "Jekyll website", they really mean a static ( Because static sites are really just text files (no database to complicate matters), you can easily *version* a static site—that is, use a tool to keep track of the different versions of the site over time by tracking how the text files that compose the site have been altered. Versioning is especially helpful when you need to merge two files (e.g. two students are writing a blog post together, and you want to combine their two versions), or when you want compare files to look for differences among them (e.g. "How did the original About page describe this project?"). Versioning is great when working with a team (e.g. helps you combine and track different people's work), but it's also useful when writing or running a website on your own. -Read more about [Jekyll here](http://jekyllrb.com/docs/home/) or [static site generators here](https://davidwalsh.name/introduction-static-site-generators). +Read more about [Jekyll here](https://jekyllrb.com/docs/home/) or [static site generators here](https://davidwalsh.name/introduction-static-site-generators). -### GitHub & GitHub Pages +### GitHub & GitHub Pages *[GitHub Pages](https://pages.github.com/)* is a free place to store the files that run a website and host that website for people to visit (it only works for particular types of website, like basic HTML sites or Jekyll sites, and does not host databases). 
-*[GitHub](https://github.com/)* is a visual way to use *[git]( https://git-scm.com/doc)*, a system for *versioning*: keeping track of changes to computer files (including code and text documents) over time (as explained [above](#section0-1)). If you're curious, here's [a friendly lesson for exploring GitHub](https://guides.github.com/activities/hello-world/). +*[GitHub](https://github.com/)* is a visual way to use *[git]( https://git-scm.com/doc)*, a system for *versioning*: keeping track of changes to computer files (including code and text documents) over time (as explained [above](#dynamic-websites-static-websites--jekyll)). If you're curious, here's [a friendly lesson for exploring GitHub](https://guides.github.com/activities/hello-world/). -### What are the reasons for using a static website? +### What are the reasons for using a static website? Options like [Drupal](https://www.drupal.com/), [WordPress](https://wordpress.org/), and [Omeka](https://omeka.org/) are good for the needs of complex, interactive websites like Amazon or an interactive digital edition of a novel—but for many blogs, project websites, and online portfolios, a static website (such as a website created using Jekyll) can do everything you need while providing some nice perks: @@ -69,7 +69,7 @@ Options like [Drupal](https://www.drupal.com/), [WordPress](https://wordpress.or - **More customization possible**: Since learning to master your website is easier, things you'll definitely want to do, like changing the look (the "theme") of a Jekyll-created site, are much easier than altering the look of a WordPress or Drupal site. - **Free hosting:** While many website tools like Drupal, WordPress, and Omeka are free, hosting them (paying for someone to serve your website's files to site visitors) can cost money. 
-- **Versioning:** Hosting on GitHub Pages means your site is linked into GitHub's visual interface for git versioning, so you can track changes to your site and always roll back to an earlier state of any blog post, page, or the site itself if needed. This includes uploaded files you might want to store on the site, like old syllabi and publications. (Versioning is [explained in more detail above](#section0-1).) +- **Versioning:** Hosting on GitHub Pages means your site is linked into GitHub's visual interface for git versioning, so you can track changes to your site and always roll back to an earlier state of any blog post, page, or the site itself if needed. This includes uploaded files you might want to store on the site, like old syllabi and publications. (Versioning is [explained in more detail above](#dynamic-websites-static-websites--jekyll).) - **Security:** There's no database to protect from hackers. - **Speed:** Minimal website files and no database to query mean a faster page-loading time. @@ -79,29 +79,29 @@ Creating a static website using Jekyll offers more perks in addition to all the - **Built for blogging:** Jekyll was built to support blog posts, so it's easy to blog (add new, date-sorted content) and do related tasks like display an archive of all blog posts by month, or include a link to the three most recent blog posts at the bottom of each post. - **Templating automates repeated tasks:** Jekyll makes it easy to automate repeated website tasks via its "templating" system: you can create content that should, for example, appear on the header and footer of every page (e.g. logo image, main menu), or following the title of every blog post (e.g. author name and publication date). This templated information will automatically be repeated on every appropriate webpage, instead of forcing you to manually rewrite that information on every webpage where you want it to appear. 
Not only does this save a lot of copying and pasting—if you ever want to change something that appears on every page of your website (e.g. a new site logo or a new item in the main menu), changing it once in a template will change in on every place it appears on your website. -## Preparing for installation +## Preparing for installation -We're ready to get to work! In the rest of this lesson, we're going to get a few programs installed on your computer, use the command line to install a few things that can only be installed that way, look at and customize a private version of your website, and finally make your website publicly accessible on the Web. If you run into problems at any point in this lesson, see the [help section](#section9) for how to ask questions or report issues. +We're ready to get to work! In the rest of this lesson, we're going to get a few programs installed on your computer, use the command line to install a few things that can only be installed that way, look at and customize a private version of your website, and finally make your website publicly accessible on the Web. If you run into problems at any point in this lesson, see the [help section](#help-credits--further-reading) for how to ask questions or report issues. In this section, we'll make sure you have a couple things ready on your computer for when we need them later in the lesson by covering what operating system you can use (i.e. Mac/Windows/Linux), creating a GitHub account and installing the GitHub app, why you should use a "text editor" program to work on your website, and how to use the command line. Everything this lesson has you install is a standard and trusted web development tool, so it isn't important to know exactly what each of these things do before installing it. 
I'll try to balance more information about the things it's most useful for you to fully understand, with providing a brief explanation for each piece and also link to further information in case you'd like to know more about what you're putting on your computer. -### Operating systems +### Operating systems This tutorial should be usable by both Mac and Windows users. Jekyll can also work for Linux; this tutorial uses the GitHub Desktop software (Mac and Windows only) for simplicity, but Linux users will need to use git over the command line instead (not covered here). -Jekyll isn't officially supported for Windows, which means none of the official Jekyll documentation (the pages that walk you through setting up Jekyll and what its different pieces do, which you could consult instead of or in addition to this lesson) addresses Windows use. I've used [David Burela's Windows instructions]( https://davidburela.wordpress.com/2015/11/28/easily-install-jekyll-on-windows-with-3-command-prompt-entries-and-chocolatey/) to note the places in the ["Installing Dependencies" section](#section2) when Windows users should do something different; the rest of the lesson should work the same for both Mac and Windows users, though note that screenshots throughout the lesson are all from a Mac (so thing may look slightly different for a Windows user). +Jekyll isn't officially supported for Windows, which means none of the official Jekyll documentation (the pages that walk you through setting up Jekyll and what its different pieces do, which you could consult instead of or in addition to this lesson) addresses Windows use. 
I've used [David Burela's Windows instructions]( https://davidburela.wordpress.com/2015/11/28/easily-install-jekyll-on-windows-with-3-command-prompt-entries-and-chocolatey/) to note the places in the ["Installing Dependencies" section](#installing-dependencies) when Windows users should do something different; the rest of the lesson should work the same for both Mac and Windows users, though note that screenshots throughout the lesson are all from a Mac (so things may look slightly different for a Windows user). -### GitHub user account +### GitHub user account *A GitHub user account will let you host your website (make it available for others to visit) for free on GitHub (we'll cover how in a later step). As a bonus, it will also let you keep track of versions of the website and its writing as it grows or changes over time.* -1\. Visit [GitHub.com](https://github.com/) and click on the "Sign up" button on the upper right. Write your desired username. This will be visible to others, identify you on GitHub, and also be part of your site's URL; for example, the author's GitHub username is amandavisconti and her demo Jekyll site's URL is http://amandavisconti.github.io/JekyllDemo/. (*Note you can also purchase your own domain name and use it for this site, but that won't be covered in this tutorial*). Also write your desired email address and password, then click "Create an account". +1\. Visit [GitHub.com](https://github.com/) and click on the "Sign up" button on the upper right. Write your desired username. This will be visible to others, identify you on GitHub, and also be part of your site's URL; for example, the author's GitHub username is amandavisconti and her demo Jekyll site's URL is https://amandavisconti.github.io/JekyllDemo/. (*Note you can also purchase your own domain name and use it for this site, but that won't be covered in this tutorial*). Also write your desired email address and password, then click "Create an account". 2\. 
On the next page, click the "Choose" button next to the "Free" plan option, ignore the "Help me set up an organization next" checkbox, and click "Finish sign up". 3\. *Optional*: Visit https://github.com/settings/profile to add a full name (can be your real name, GitHub user name, or something else) and other public profile information, if desired. -### GitHub Desktop app +### GitHub Desktop app *The GitHub Desktop app will make updating your live website (one we set it up) easy—instead of using the command line every time you want to update your site, you'll be able to use an easier visual tool to update your site.* @@ -113,15 +113,15 @@ Jekyll isn't officially supported for Windows, which means none of the official 6\. The last page will ask "Which repositories would you like to use?". Ignore this and click the "Done" button. 7\. *Optional:* Follow the walkthrough of the GitHub Desktop app that will appear (this isn't necessary; we will cover anything you need to do with GitHub in this lesson). -### Text editor +### Text editor -You'll need to download and install a "text editor" program on your computer for making small customizations to your Jekyll site's code. Good free options include [TextWrangler](http://www.barebones.com/products/textwrangler/download.html) (Mac) or [Notepad++](https://notepad-plus-plus.org/) (Windows). Software aimed at word processing, like Microsoft Word or Word Pad, isn't a good choice because it's easy to forget how to format and save the file, accidentally adding in extra and/or invisible formatting and characters that will break your site. You'll want something that specifically can save what you write as plaintext (e.g. HTML, Markdown). +You'll need to download and install a "text editor" program on your computer for making small customizations to your Jekyll site's code. 
Good free options include [TextWrangler](https://www.barebones.com/products/textwrangler/download.html) (Mac) or [Notepad++](https://notepad-plus-plus.org/) (Windows). Software aimed at word processing, like Microsoft Word or Word Pad, isn't a good choice because it's easy to forget how to format and save the file, accidentally adding in extra and/or invisible formatting and characters that will break your site. You'll want something that specifically can save what you write as plaintext (e.g. HTML, Markdown). -*Optional:* See [the "Authoring in Markdown" section below](#section5-2) for notes on a Markdown-specific editing program, which you may also wish to install when you get to the point of authoring webpages and/or blog posts. +*Optional:* See [the "Authoring in Markdown" section below](#authoring-in-markdown) for notes on a Markdown-specific editing program, which you may also wish to install when you get to the point of authoring webpages and/or blog posts. -### Command line +### Command line -The command line is a way to interact with your computer using text: it lets you type in commands for actions from simpler things such as "show me a list of the files in this directory" or "change who is allowed to access this file", to more complex behavior. Sometimes there are nice visual ways to do things on your computer (e.g. the GitHub Desktop app [we installed above](#section1-2)), and sometimes you'll need to use the command line to type out commands to get your computer to do things. The Programming Historian has [an in-depth lesson exploring the command line written](/lessons/intro-to-bash) by Ian Milligan and James Baker if you want more information than provided here, but this lesson will cover everything you need to know to complete the lesson (and we'll only use the command line when it's necessary or much easier than a visual interface). 
+The command line is a way to interact with your computer using text: it lets you type in commands for actions from simpler things such as "show me a list of the files in this directory" or "change who is allowed to access this file", to more complex behavior. Sometimes there are nice visual ways to do things on your computer (e.g. the GitHub Desktop app [we installed above](#github-desktop-app)), and sometimes you'll need to use the command line to type out commands to get your computer to do things. The Programming Historian has [an in-depth lesson exploring the command line written](/en/lessons/intro-to-bash) by Ian Milligan and James Baker if you want more information than provided here, but this lesson will cover everything you need to know to complete the lesson (and we'll only use the command line when it's necessary or much easier than a visual interface). Where the command line uses text commands, a "graphical user interface" (aka GUI) is what you probably normally use to work with your computer: anything where commands are given through a visual interface containing icons, images, mouse-clicking, etc. is a GUI. Often it's simpler and faster to type in (or cut and paste from a tutorial) a series of commands via the command line, than to do something using a GUI; sometimes there are things you'll want to do for which no one has yet created a GUI, and you'll need to do them via the command line. @@ -146,11 +146,11 @@ When asked to open a command line window and enter commands in this lesson, keep Starting with macOS Catalina, zsh has replaced bash as the default shell for Macs. This lesson has not been tested on the zsh shell. If you wish to set your shell to bash to follow this tutorial, have a look at the instructions [here](https://support.apple.com/en-us/HT208050). -## Installing dependencies +## Installing dependencies -*We'll install some software dependencies (i.e. 
code Jekyll depends on to be able to work), using the command line because there isn't a visual interface for doing this. This section is divided into instructions for if you're [On a Mac](#sectionMac) or [On Windows](#sectionWindows), so skip down to [On Windows](#sectionWindows) now if you're using Windows.* +*We'll install some software dependencies (i.e. code Jekyll depends on to be able to work), using the command line because there isn't a visual interface for doing this. This section is divided into instructions for if you're [On a Mac](#on-a-mac) or [On Windows](#on-windows), so skip down to [On Windows](#on-windows) now if you're using Windows.* -### On a Mac +### On a Mac *If you're using a Mac computer, follow the instructions below until you hit a line that says the Windows-specific instructions are beginning.* @@ -158,11 +158,11 @@ Starting with macOS Catalina, zsh has replaced bash as the default shell for Mac Please note that if you are using a Mac with an M-series (silicon) chip, you may encounter some compatibility issues when installing Jekyll or its dependencies. If that’s the case, we advise you to follow additional installation instructions for the relevant package, either from the official documentation, or online help forums.
    -Open a command line window (Applications > Utilities > Terminal) and enter the code shown in the steps below (*`code is formatted like this`*), keeping [the command line tips from above](#section1-4) in mind. +Open a command line window (Applications > Utilities > Terminal) and enter the code shown in the steps below (*`code is formatted like this`*), keeping [the command line tips from above](#command-line) in mind. -### Command line tools suite +### Command line tools suite -You'll need to first install the Mac "command line tools" suite to be able to use [Homebrew](http://brew.sh/) (which we'll install next). Homebrew lets you download and install open-source software on Macs from the command line (it's a "package manager"), which will make installing Ruby (the language Jekyll is built on) easier. +You'll need to first install the Mac "command line tools" suite to be able to use [Homebrew](https://brew.sh/) (which we'll install next). Homebrew lets you download and install open-source software on Macs from the command line (it's a "package manager"), which will make installing Ruby (the language Jekyll is built on) easier. 
In Terminal, paste the following code then press enter: @@ -180,9 +180,9 @@ You'll see a message that "The software was installed" when the installation is {% include figure.html filename="building-static-sites-with-jekyll-github-pages-2.5.png" caption="Popup message stating the software was installed" %} -### Homebrew +### Homebrew -After the command line tools suite has completed installation, return to your command line window and enter the following to install [Homebrew](http://brew.sh/): +After the command line tools suite has completed installation, return to your command line window and enter the following to install [Homebrew](https://brew.sh/): ``` /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" @@ -192,7 +192,7 @@ You'll need to press enter when prompted and enter your computer password when a {% include figure.html filename="building-static-sites-with-jekyll-github-pages-4.png" caption="The command entered into the author’s command line, followed by all the text that appeared (including the prompt to press enter, and to enter my password)" %} -### Ruby & Ruby Gems +### Ruby & Ruby Gems Jekyll is built from the [Ruby coding language](https://en.wikipedia.org/wiki/Ruby_%28programming_language%29). [Ruby Gems](https://rubygems.org/) makes setting up Ruby software like Jekyll easy (it's a package manager, just like Homebrew—instead of making installation easy on Macs, it adds some stuff to make Ruby installations simpler). @@ -213,23 +213,23 @@ Some users of macOS Catalina and macOS Big Sur have reported encountering diffic
    -### NodeJS +### NodeJS [NodeJS](https://nodejs.org/en/) (or Node.js) is a development platform (in particular, a "runtime environment") that does things like making Javascript run faster. `brew install node` -### Jekyll +### Jekyll -[Jekyll](https://jekyllrb.com/) is the code that creates your website (i.e. "site generation"), making it easier to do certain common tasks such as using the same template (same logo, menu, author information…) on all your blog post pages. There's more info on [what Jekyll and static sites are](#section0-1), and on [why you'd want to use Jekyll to make a static website](#section0-3), above. +[Jekyll](https://jekyllrb.com/) is the code that creates your website (i.e. "site generation"), making it easier to do certain common tasks such as using the same template (same logo, menu, author information…) on all your blog post pages. There's more info on [what Jekyll and static sites are](#dynamic-websites-static-websites--jekyll), and on [why you'd want to use Jekyll to make a static website](#what-are-the-reasons-for-using-a-static-website), above. `gem install jekyll` If you get a permissions error at this point, entering `usr/local/bin/gem install jekyll` instead of the command above may help. -**Skip the following steps (which are for Windows users only) and jump down to [Setting up Jekyll](#section3).** +**Skip the following steps (which are for Windows users only) and jump down to [Setting up Jekyll](#setting-up-jekyll).** -### On Windows +### On Windows *Instructions for Windows users differ from those for Mac users just in this one "Installing dependencies" section. Only do the following if you're using Windows.* @@ -243,7 +243,7 @@ If you get a permissions error at this point, entering `usr/local/bin/gem instal `Install MSYS2 and MINGW development toolchain succeeded` -5\. Close this command prompt and open a new one to install Jekyll. [Jekyll](https://jekyllrb.com/) is the code that creates your website (i.e. 
'site generation'), making it easier to do certain common tasks such as using the same template (same logo, menu, author information…) on all your blog post pages. There's more info on [what Jekyll and static sites are](#section0-1), and on [why you'd want to use Jekyll to make a static website](#section0-3), above. We'll now install Jekyll (if Windows Security gives you a warning popup, ignore it): +5\. Close this command prompt and open a new one to install Jekyll. [Jekyll](https://jekyllrb.com/) is the code that creates your website (i.e. 'site generation'), making it easier to do certain common tasks such as using the same template (same logo, menu, author information…) on all your blog post pages. There's more info on [what Jekyll and static sites are](#dynamic-websites-static-websites--jekyll), and on [why you'd want to use Jekyll to make a static website](#what-are-the-reasons-for-using-a-static-website), above. We'll now install Jekyll (if Windows Security gives you a warning popup, ignore it): `gem install jekyll bundler` @@ -254,7 +254,7 @@ If you get a permissions error at this point, entering `usr/local/bin/gem instal **From now on, all instructions are for both Mac and PC users!** -## Setting up Jekyll +## Setting up Jekyll *You've now installed everything needed to make your website. In this section, we'll use Jekyll to generate a new folder full of the files that constitute your website. We'll also locate this folder in a place accessible to the GitHub Desktop app so they're in the right place when we want to publish them as a public website later in the lesson.* @@ -282,7 +282,7 @@ If you get a permissions error at this point, entering `usr/local/bin/gem instal Don't forget to wait until the command prompt appears again to move to the next step. -4\. 
Your site's public URL will take the form http://amandavisconti.github.io/JekyllDemo/, with *amandavisconti* being the author's GitHub username and *JekyllDemo* the name of the site I entered at this step (*an option to purchase and use your own [custom URL](#section7-2) is possible, but not covered in this lesson*). **Lowercase and uppercase website names do *not* point to the same website automatically**, so unlike my *JekyllDemo* example you might wish to pick an all-lowercase name to make sure people who hear about the site tend to type its URL correctly. +4\. Your site's public URL will take the form https://amandavisconti.github.io/JekyllDemo/, with *amandavisconti* being the author's GitHub username and *JekyllDemo* the name of the site I entered at this step (*an option to purchase and use your own [custom URL](#functionality) is possible, but not covered in this lesson*). **Lowercase and uppercase website names do *not* point to the same website automatically**, so unlike my *JekyllDemo* example you might wish to pick an all-lowercase name to make sure people who hear about the site tend to type its URL correctly. 
At the command line, type in the following (but replace *JekyllDemo* with whatever you want your site to be called): @@ -294,11 +294,11 @@ Don't forget to wait until the command prompt appears again to move to the next `cd JekyllDemo` - If you look in the *GitHub > JekyllDemo* folder in Finder, you'll see that a bunch of new files—the files that will run your website!—have been installed (we'll describe what each does [further on in the lesson](#section4-2)): + If you look in the *GitHub > JekyllDemo* folder in Finder, you'll see that a bunch of new files—the files that will run your website!—have been installed (we'll describe what each does [further on in the lesson](#where-and-what-is-everything)): {% include figure.html filename="building-static-sites-with-jekyll-github-pages-9.png" caption="In Finder, we can see that a bunch of new files—the files that will run your website!—have been installed" %} -## Running a website locally +## Running a website locally *This section will describe how to run your website* ***locally****—meaning you'll be able to see what your website will look like in a web browser just on your computer (aka locally), but not anywhere else. Working on a "local" version of a website means that it's private to your computer; no one else can see your website yet (your website isn't "live" or "public": no one can type in the URL and see it in their browser).* *This means you can experiment all you want, and only publish your site for the world to see when it's ready. Or, once you've made your site live, you can continue to experiment locally with new writing, design, etc.
and only add these to the public site once you're happy with how they look on the local site.* @@ -313,7 +313,7 @@ Don't forget to wait until the command prompt appears again to move to the next *--watch* together with *bundle exec* tells Jekyll to watch for changes to the website's files, such as you writing and saving a new blog post or webpage, and to include these changes on refreshing your web browser. **An exception to this** is the _config.yml file, which I'll discuss in more detail in the next section (any changes made there *won't* show up until you stop and restart Jekyll). -2\. After typing in the command in the previous step, you'll notice that the process never finishes. Remember how on the command line, if you type in anything while the previous command is still processing, you can cause problems? Jekyll is now being run from this command line window, so you'll need to open a new command line window if you want to type other commands while your local site is still accessible to you (see [the section on command line usage above](#section1-4).) +2\. After typing in the command in the previous step, you'll notice that the process never finishes. Remember how on the command line, if you type in anything while the previous command is still processing, you can cause problems? Jekyll is now being run from this command line window, so you'll need to open a new command line window if you want to type other commands while your local site is still accessible to you (see [the section on command line usage above](#command-line).) 
{% include figure.html filename="building-static-sites-with-jekyll-github-pages-10.png" caption="The command line after entering the command to start serving your Jekyll website" %} @@ -325,7 +325,7 @@ Don't forget to wait until the command prompt appears again to move to the next {% include figure.html filename="building-static-sites-with-jekyll-github-pages-11.png" caption="A basic Jekyll website with boilerplate text" %} -### Mini cheatsheet +### Mini cheatsheet - Type `bundle exec jekyll serve --watch` at the command line to start running your website locally. You'd visit **localhost:4000** in a browser to see your local site now, but in the next section we'll be changing things such that you'll need to visit **localhost:4000/JekyllDemo/** to see the site from then on (filling in your website folder name for *JekyllDemo*, and making sure to include the last slash). @@ -335,13 +335,13 @@ Don't forget to wait until the command prompt appears again to move to the next - Typing or pasting in `bundle exec jekyll serve --watch` a lot? Instead, you can type the ↑ (up arrow) at the command line to scroll through recently typed commands; just press enter after the command you want to use appears. -## Tweaking the settings +## Tweaking the settings *You now have a basic, private website accessible only on your computer. In this section, we'll begin to customize your site by changing the website title and author information, and giving a brief overview of what the different website files do.* -### Basic site settings via _config.yml +### Basic site settings via _config.yml -1\. Navigate to your website folder in Finder (Macs) or the directory folder (Windows. The author's website at */Users/DrJekyll/GitHub/JekyllDemo* (*DrJekyll* is my logged in username, and *JekyllDemo* is the name of my website folder). [Return to the "Setting up Jekyll" section](#section3) if you need help locating your website folder. +1\. 
Navigate to your website folder in Finder (Macs) or the directory folder (Windows). The author's website is at */Users/DrJekyll/GitHub/JekyllDemo* (*DrJekyll* is my logged in username, and *JekyllDemo* is the name of my website folder). [Return to the "Setting up Jekyll" section](#setting-up-jekyll) if you need help locating your website folder. You'll notice that generating and running your site in the previous section added a new "_site" folder. This is where Jekyll puts the HTML files it generates from the other files in your website folder. Jekyll works by taking various files like your site configuration settings (_config.yml) and files that just contain post or page content without other webpage information (e.g. about.md), putting these all together, and spitting out HTML pages that a web browser is able to read and display to site visitors. @@ -370,11 +370,11 @@ You'll notice that generating and running your site in the previous section adde - **email**: Your email address. - **description**: A description of your website that will be used in search engine results and the site's RSS feed. - **baseurl**: Fill in the quotation marks with a forward slash followed by the name of your website folder (e.g. "/JekyllDemo/") to help locate the site at the correct URL. Make sure that your folder is the same as the GitHub repository name and ends with a forward slash (`/`). It will be required for publishing it on GitHub Pages. - - **url**: Replace "http://yourdomain.com" with "localhost:4000" to help locate your local version of the site at the correct URL. + - **url**: Replace "https://yourdomain.com" with "localhost:4000" to help locate your local version of the site at the correct URL. - **twitter_username**: Your Twitter username (do not include @ symbol). - **github_username**: Your GitHub username.
- The changes you made to the *baseurl* and *url* lines will let your site run from the same files both locally on your computer and live on the Web, but **doing this changed the URL where you'll see your local site from now on** (while [Jekyll is running](#section3-1)) from localhost:4000 to **localhost:4000/JekyllDemo/** (substitute your website folder name for *JekyllDemo* and remembering the last slash mark). + The changes you made to the *baseurl* and *url* lines will let your site run from the same files both locally on your computer and live on the Web, but **doing this changed the URL where you'll see your local site from now on** (while Jekyll is running) from localhost:4000 to **localhost:4000/JekyllDemo/** (substitute your website folder name for *JekyllDemo* and remembering the last slash mark). In the screenshot below, I have deleted the initial commented lines 1-9 and 12-15, as well as the commented text stating what "description" does (not necessary, just to show you can delete comments that you don't care about seeing!) and customized the rest of the file as instructed above: @@ -384,43 +384,43 @@ You'll notice that generating and running your site in the previous section adde {% include figure.html filename="building-static-sites-with-jekyll-github-pages-17.png" caption="The author's customized local website" %} -### Where (and what) is everything? +### Where (and what) is everything? {#where-and-what-is-everything} -To get a sense of how your site works and what files you'd experiment with to do more advanced things, here are some notes on what each folder or file in your current website folder does. Remember to always open and edit any files with a text editor (e.g. TextWrangler) and not a word processor (e.g. not Microsoft Word or anything that lets you add formatting like italic and bold); this prevents invisible formatting characters from being saved in the file and messing up the website. 
If you just want to start adding content to your site and make it public, you can [skip to the next section](#section5). +To get a sense of how your site works and what files you'd experiment with to do more advanced things, here are some notes on what each folder or file in your current website folder does. Remember to always open and edit any files with a text editor (e.g. TextWrangler) and not a word processor (e.g. not Microsoft Word or anything that lets you add formatting like italic and bold); this prevents invisible formatting characters from being saved in the file and messing up the website. If you just want to start adding content to your site and make it public, you can [skip to the next section](#writing-pages-and-posts). {% include figure.html filename="building-static-sites-with-jekyll-github-pages-18.png" caption="A Finder window showing the default files and folders in a Jekyll website folder" %} -- **_config.yml** is discussed [above](#section4-1); it provides basic settings information about your site, such as the site's title and additional possibilities we won't cover here, like how to structure links to posts (e.g. should they follow the pattern MySite.com/year/month/day/post-title?). +- **_config.yml** is discussed [above](#basic-site-settings-via-_configyml); it provides basic settings information about your site, such as the site's title and additional possibilities we won't cover here, like how to structure links to posts (e.g. should they follow the pattern MySite.com/year/month/day/post-title?). - **_includes** folder has files that get included on all or certain pages (e.g. 
code to make the header contain your site title and main menu on every page of the site) - **_layouts** folder contains code that controls how the pages on your site look (default.html), as well as customizations of that code to further style blog posts (post.html) and pages (page.html) -- **_posts** folder holds the individual files that each represent a blog post on your website. Adding a new post to this folder will make a new blog post appear on your website, in reverse chronological order (newest post to oldest). We'll cover adding blog posts in the [next section](#section5-2). +- **_posts** folder holds the individual files that each represent a blog post on your website. Adding a new post to this folder will make a new blog post appear on your website, in reverse chronological order (newest post to oldest). We'll cover adding blog posts in the [next section](#authoring-posts). - **_sass** folder holds SCSS files that control the visual design of the site - **_site** folder is where the HTML pages that appear on the web are generated and stored (e.g. you'll write and save posts as Markdown files, but Jekyll will convert these to HTML for display in a web browser) - **index.md** is a place to add content that you want to appear on your homepage, such as a biography blurb to appear above the "Posts" list -- **about.md** is an example of a Jekyll *page*. It's already linked in the header of your website, and you can customize its text by opening and writing in that file. We'll cover adding more site pages in the [next section](#section5-3). +- **about.md** is an example of a Jekyll *page*. It's already linked in the header of your website, and you can customize its text by opening and writing in that file. We'll cover adding more site pages in the [next section](#authoring-pages). 
- **css** folder holds CSS converted from SCSS that controls the visual design of the site - **feed.xml** lets people follow the RSS feed of your blog posts - **index.html** controls the structuring of content on your site's homepage -## Writing pages and posts +## Writing pages and posts {#writing-pages-and-posts} *This section will describe how to create pages and blog posts on your website.* **Pages** and **posts** are just two types of written content that's styled differently. Pages are content (like an "About" page) that isn't organized or displayed chronologically, but might be included in your website's main menu; posts are meant to be used for content best organized by publication date. The URLs (links) for pages and posts are also different by default (although you can change this): page URLs look like *MySite.com/about/*, while post URLs look like *MySite.com/2016/02/29/my-post-title.html.* -#### Authoring in Markdown +#### Authoring in Markdown Markdown is a way of formatting your writing for reading on the web: it's a set of easy-to-remember symbols that show where text formatting should be added (e.g. a # in front of text means to format it as a heading, while a * in front of text means to format it as a bulleted list item). For Jekyll in particular, Markdown means you can write webpages and blog posts in a way that's comfortable to authors (e.g. no need to look up/add in HTML tags while trying to write an essay), but have that writing show up formatted nicely on the web (i.e. a text-to-HTML convertor). -We won't cover Markdown in this lesson; if you're not familiar with it, for now you can just create posts and pages with no formatting (i.e. no bold/italic, no headers, no bulleted lists). 
But these are easy to learn how to add: there's a handy markdown [reference](http://kramdown.gettalong.org/quickref.html), as well as [a Programming Historian lesson by Sarah Simpkin on the hows and whys of writing with Markdown](/lessons/getting-started-with-markdown). Check out these links if you'd like to format text (italics, bold, headings, bullet/numbered lists) or add hyperlinks or embedded images and other files. +We won't cover Markdown in this lesson; if you're not familiar with it, for now you can just create posts and pages with no formatting (i.e. no bold/italic, no headers, no bulleted lists). But these are easy to learn how to add: there's a handy markdown [reference](https://kramdown.gettalong.org/quickref.html), as well as [a Programming Historian lesson by Sarah Simpkin on the hows and whys of writing with Markdown](/en/lessons/getting-started-with-markdown). Check out these links if you'd like to format text (italics, bold, headings, bullet/numbered lists) or add hyperlinks or embedded images and other files. -Make sure any Markdown cheatsheets you look at are for the "[kramdown](http://kramdown.gettalong.org/quickref.html)" flavor of Markdown, which is what GitHub Pages (where we'll be hosting our website) supports. (*There are [various "flavors" of Markdown](https://github.com/jgm/CommonMark/wiki/Markdown-Flavors) that have subtle differences in what various symbols do, but for the most part frequently used symbols like those that create heading formatting are the same—so you're actually probably okay using a markdown cheatsheet that doesn't specify it's kramdown, but if you're getting errors on your site using symbols that aren't included in kramdown might be why*). +Make sure any Markdown cheatsheets you look at are for the "[kramdown](https://kramdown.gettalong.org/quickref.html)" flavor of Markdown, which is what GitHub Pages (where we'll be hosting our website) supports. 
(*There are [various "flavors" of Markdown](https://github.com/jgm/CommonMark/wiki/Markdown-Flavors) that have subtle differences in what various symbols do, but for the most part frequently used symbols like those that create heading formatting are the same—so you're actually probably okay using a markdown cheatsheet that doesn't specify it's kramdown, but if you're getting errors on your site using symbols that aren't included in kramdown might be why*). -You might be interested in "markdown editor" software such as [Typora](http://www.typora.io/) (OS X and Windows; free during current beta period), which will let you use popular keyboard shortcuts to write Markdown (e.g. highlight text and press command-B to make it bold) and/or type in Markdown but have it show as it will look on the web (see headings styled like headings, instead of like normal text with a # in front of them). +You might be interested in "markdown editor" software such as [Typora](https://www.typora.io/) (OS X and Windows; free during current beta period), which will let you use popular keyboard shortcuts to write Markdown (e.g. highlight text and press command-B to make it bold) and/or type in Markdown but have it show as it will look on the web (see headings styled like headings, instead of like normal text with a # in front of them). -### Authoring pages +### Authoring pages -1\. To see an existing page on your website (created as a default part of a Jekyll website [when you created the rest of your website's files](#section3)), navigate to your website folder and open the **about.md** file either in a text editor (e.g. TextWrangler) or a Markdown editor (e.g. Typora) to see the file that creates the "About" page. Also click on the "About" link in the top-right of your webpage to see what the webpage the file creates looks like in a browser. +1\. 
To see an existing page on your website (created as a default part of a Jekyll website [when you created the rest of your website's files](#setting-up-jekyll)), navigate to your website folder and open the **about.md** file either in a text editor (e.g. TextWrangler) or a Markdown editor (e.g. Typora) to see the file that creates the "About" page. Also click on the "About" link in the top-right of your webpage to see what the webpage the file creates looks like in a browser. 2\. The stuff between the \--- dashes is called "front matter" (*note that opening the file in a Markdown editor might make the front matter appear on a gray background instead of between dashes*). The front matter tells your site whether to format the content below the front matter as a page or blog post, the title of the post, the date and time the post should show it was published, and any categories you'd like the post or page listed under. @@ -430,15 +430,15 @@ You might be interested in "markdown editor" software such as [Typora](http://ww - **title:** Change this to the desired page title (unlike posts, no quotation marks around the title). In the screenshot below, I added a page with the title "Resume". - **permalink:** change the text between the two forward slash marks to the word (*or phrase—but you'll need to use hyphens and not spaces!*) that you want to follow your site's main URL to reach the page. For example, **permalink: /about/** locates a page at **localhost:4000/yourwebsitefoldername/about/** -3\. The space below the front matter's second — dashes (or below the front matter's gray box, if using a Markdown editor) is where you write the content of your page, using [the Markdown formatting described above](#section5-1). +3\. The space below the front matter's second — dashes (or below the front matter's gray box, if using a Markdown editor) is where you write the content of your page, using [the Markdown formatting described above](#authoring-in-markdown). 4\. 
To create a new page in addition to the "About" page that already exists on the site (and can be customized or deleted), create a copy of the *about.md* file in the same folder (the main website folder) and change its filename to the title you wish, using hyphens instead of spaces (e.g. *resume.md* or *contact-me.md*). Also change the title and permalink in the file's front matter, and the content of the file. The new page should automatically appear in the main menu in the site's header: {% include figure.html filename="building-static-sites-with-jekyll-github-pages-22.png" caption="After adding a new page file to the website folder, the new page appears in the website's header menu" %} -For reference, you can check out [an example of a page](http://amandavisconti.github.io/JekyllDemo/resume/) on my demo site, or see [the file that's behind that page](https://raw.githubusercontent.com/amandavisconti/JekyllDemo/gh-pages/resume.md). +For reference, you can check out [an example of a page](https://amandavisconti.github.io/JekyllDemo/resume/) on my demo site, or see [the file that's behind that page](https://raw.githubusercontent.com/amandavisconti/JekyllDemo/gh-pages/resume.md). -### Authoring posts +### Authoring posts 1\. In Finder, navigate to your website folder (e.g. *JekyllDemo*) and the *_posts* folder inside it. Open the file inside it with either a text editor (e.g. TextWrangler) or a Markdown editor (e.g. Typora). The file will be named something like *2016-02-28-welcome-to-jekyll.markdown* (the date will match when you created the Jekyll site). @@ -456,7 +456,7 @@ For reference, you can check out [an example of a page](http://amandavisconti.gi - **title:** Change "Welcome to Jekyll!" to whatever title you'd like for your new post (keeping the quotation marks around the title). It's the norm to make the title the same as the words in the filename (except with added spaces and capitalization). This is how the title will appear on the post's webpage). 
- **date:** Change this to when you want the post to show as its publication date and time, making sure to match the date that's part of the filename. (The date *and* time should have occurred already, for your post to show up.) - **categories:** Delete the words "jekyll update" for now, and don't add anything else here—the current theme doesn't use these and they mess up the post URLs. (*Other themes can use this field to sort blog posts by categories*.) - - **The space below the second \--- (or below the gray box, if using a Markdown editor):** This is where you write your blog post, using [the Markdown formatting described above](#section5-1). + - **The space below the second \--- (or below the gray box, if using a Markdown editor):** This is where you write your blog post, using [the Markdown formatting described above](#authoring-in-markdown). After saving, you should now be able to see your second post on the front page of your site, and clicking on the link should take you to the post's page: @@ -469,13 +469,13 @@ Notice that **the URL of the post** is your local website URL (e.g. *localhost:4 **To create further posts**, duplicate an existing file, then remember to change not just the front matter and content inside the post as described above, but also the file name (date and title) of the new file. -For reference, you can check out [an example of a post](https://amandavisconti.github.io/JekyllDemo/2016/11/12/a-post-about-my-research.html) on my demo site, or see [the code running that post](http://raw.githubusercontent.com/amandavisconti/JekyllDemo/gh-pages/_posts/2016-02-29-a-post-about-my-research.markdown). +For reference, you can check out [an example of a post](https://amandavisconti.github.io/JekyllDemo/2016/11/12/a-post-about-my-research.html) on my demo site, or see [the code running that post](https://raw.githubusercontent.com/amandavisconti/JekyllDemo/gh-pages/_posts/2016-02-29-a-post-about-my-research.markdown). 
-## Hosting on GitHub Pages +## Hosting on GitHub Pages {#hosting-on-github-pages} *You now know how to add text pages and posts to your website. In this section, we'll move your local site live so that others can visit it on the Web.* **At this point, we are making a version of your website publicly viewable** *(e.g. to search engines and to anyone who knows of or happens on the link).* -*[Earlier in the lesson,](#section1-2) you installed the GitHub Desktop app. We'll now use this app to easily move your website files to a place that will serve them to visitors as webpages (GitHub Pages), where the public can then visit them online. This first time, we'll move all your website's files to the Web since none of them are there yet; in the future, you'll use this app whenever you've adjusted the website's files (added, edited, or deleted content or files) on your local version of the website and are ready for the same changes to appear on the public website (there's [a cheatsheet at the end of this section](#section6-1) for this).* +*[Earlier in the lesson,](#github-desktop-app) you installed the GitHub Desktop app. We'll now use this app to easily move your website files to a place that will serve them to visitors as webpages (GitHub Pages), where the public can then visit them online. This first time, we'll move all your website's files to the Web since none of them are there yet; in the future, you'll use this app whenever you've adjusted the website's files (added, edited, or deleted content or files) on your local version of the website and are ready for the same changes to appear on the public website (there's [a cheatsheet at the end of this section](#mini-cheatsheet-1) for this).* 1\. Open the GitHub Desktop app. Click the **+** icon in the top left corner, and click on the "Add" option along the top of the box that appears (if "Add" isn't already selected). 2\.
Click on the "Choose…" button and choose the folder (*JekyllDemo* in my example) containing your website files (if on a Mac and unable to locate this folder, your Library folder may be hidden; [use these directions](https://support.native-instruments.com/hc/en-us/articles/360020012418-Accessing-the-Hidden-User-Library-macOS) to make it visible so the GitHub Desktop app can navigate inside it). @@ -502,9 +502,9 @@ For reference, you can check out [an example of a post](https://amandavisconti.g 9\. Click the "Sync" button in the upper-right. {% include figure.html filename="building-static-sites-with-jekyll-github-pages-28.png" caption="Click the 'Sync' button in the upper-right" %} -10\. You can now visit (and share the link to!) your live website. The URL will follow the pattern of *your GitHub username DOT github.io SLASH name of your website SLASH*. (For example, the author's URL is [amandavisconti.github.io/JekyllDemo/](http://amandavisconti.github.io/JekyllDemo/).) +10\. You can now visit (and share the link to!) your live website. The URL will follow the pattern of *your GitHub username DOT github.io SLASH name of your website SLASH*. (For example, the author's URL is [amandavisconti.github.io/JekyllDemo/](https://amandavisconti.github.io/JekyllDemo/).) -### Mini cheatsheet +### Mini cheatsheet In the future when you want to move changes you've made locally to your live site, just follow these steps: @@ -513,11 +513,11 @@ In the future when you want to move changes you've made locally to your live sit 3\. Once the commit has finished, click the "Sync" button in the top-right. 4\. Give GitHub a little time to receive these changes (about 10-90 seconds) before refreshing your live site to see your changes there. 
-## Getting fancy +## Getting fancy *This lesson won't cover advanced work like changing the visual appearance of your site or adding new functionality, but here is some information to get you started on your own.* -### Visual design +### Visual design The visual design of a website is often referred to as its *theme* (more properly, a theme is a set of code and image files that together make a major change to the appearance of a website). @@ -527,16 +527,16 @@ Or, you can add in (and further customize, if desired) a theme already created b - [Alex Gil's "Ed" theme for minimal digital editions](https://github.com/minicomp/ed/) and [its documentation](https://github.com/minicomp/ed/blob/main/documentation.md) (free) - [Rebecca Sutton Koeser's "Digital Edition" theme](https://github.com/emory-libraries-ecds/digitaledition-jekylltheme) (free) -- The [Jekyll Themes](http://jekyllthemes.org/) directory (free) -- [JekyllThemes.io](http://jekyllthemes.io/) (free and paid) +- The [Jekyll Themes](https://jekyllthemes.org/) directory (free) +- [JekyllThemes.io](https://jekyllthemes.io/) (free and paid) -### Functionality +### Functionality -- [Jekyll plugins](http://jekyllrb.com/docs/plugins/) allow you to add small bits of code that add functionality to your site such as [full-text search](https://github.com/PascalW/jekyll_indextank), [emoji support](https://github.com/yihangho/emoji-for-jekyll), and [tag clouds](https://gist.github.com/ilkka/710577). +- [Jekyll plugins](https://jekyllrb.com/docs/plugins/) allow you to add small bits of code that add functionality to your site such as [full-text search](https://github.com/PascalW/jekyll_indextank), [emoji support](https://github.com/yihangho/emoji-for-jekyll), and [tag clouds](https://gist.github.com/ilkka/710577). 
- If you want to host your site on GitHub Pages as we did in this lesson, you can only use the Jekyll plugins already included in the GitHub Pages gem we installed (here's [a full list of what you installed](https://pages.github.com/versions/) when adding the GitHub Pages gem to your Gemfile earlier). - - If you choose to host your Jekyll website elsewhere than GitHub Pages, you can use any Jekyll plugin (instructions to self-host vary by web host and won't be covered here, but [this](http://jekyllrb.com/docs/plugins/) is a page about how to install plugins once you've set up your self-hosted Jekyll site). You can search for "Jekyll plugin" plus the functionality you need to see if one is available, or check out the "Available plugins" section near the bottom of [this page](http://jekyllrb.com/docs/plugins/) for a list of plugins. + - If you choose to host your Jekyll website elsewhere than GitHub Pages, you can use any Jekyll plugin (instructions to self-host vary by web host and won't be covered here, but [this](https://jekyllrb.com/docs/plugins/) is a page about how to install plugins once you've set up your self-hosted Jekyll site). You can search for "Jekyll plugin" plus the functionality you need to see if one is available, or check out the "Available plugins" section near the bottom of [this page](https://jekyllrb.com/docs/plugins/) for a list of plugins. - You can keep GitHub Page's free hosting of your Jekyll website, but give the site a **custom domain name** (domain names are purchased for a reasonable yearly fee—usually around $10—from a "domain name registrar" such as [NearlyFreeSpeech.net](https://www.nearlyfreespeech.net/services/domains)). For example, the author's LiteratureGeek.com blog is built with Jekyll and hosted on GitHub Pages just like the site you built with this lesson, but it uses a custom domain name I purchased and configured to point to my site. 
Instructions on setting up a custom domain name can be found [here](https://help.github.com/articles/using-a-custom-domain-with-github-pages/).
    @@ -545,7 +545,7 @@ If you set up a custom domain with your GitHub Pages-hosted website, to avoid a - You can **migrate an existing blog** from many other systems including WordPress, Blogger, Drupal, and Tumblr by following the links on the right side of [this page](https://import.jekyllrb.com/docs/home/). When migrating a site, make sure to back up your original site in case it takes a couple tries to get posts living at the same URL as before (so search engine results and bookmarks don't break). -## Cheatsheet +## Cheatsheet **To test stuff locally** (new plugin, theme, how a new blog post looks): @@ -562,28 +562,28 @@ If you set up a custom domain with your GitHub Pages-hosted website, to avoid a * After the commit has completed, click "Sync" in the upper right. * Allow 10-90 seconds for your changes to reach GitHub's web servers, then visit your website and refresh the page to see your changes live. -## Help, credits, & further reading +## Help, credits, & further reading -### Help +### Help -If you run into an issue, [Jekyll has a page on troubleshooting](https://jekyllrb.com/docs/troubleshooting/) that might help. If you're working on the command line and get an error message, don't forget to try searching for that specific error message online. Besides search engines, [the StackExchange site](http://stackexchange.com/) is a good place to find questions and answers from people who have run into the same problem as you in the past. +If you run into an issue, [Jekyll has a page on troubleshooting](https://jekyllrb.com/docs/troubleshooting/) that might help. If you're working on the command line and get an error message, don't forget to try searching for that specific error message online. Besides search engines, [the StackExchange site](https://stackexchange.com/) is a good place to find questions and answers from people who have run into the same problem as you in the past. 
-### Credits +### Credits Thanks to *Programming Historian* Editor Fred Gibbs for editing, discussing, and reviewing this lesson; Paige Morgan and Jaime Howe for reviewing this lesson; Scott Weingart and students for testing the lesson with Windows; Tod Robbins and Matthew Lincoln for suggestions on the [DH Slack](https://digitalhumanities.slack.com) on what to cover in this lesson; and Roxanne Shirazi for solutions to possible permission and navigation problems. -The Editorial Board would like to thank [spswanz](https://github.com/spswanz) for pointing out to a bug in the [Ruby & Ruby Gems](#section2-3) section. +The Editorial Board would like to thank [spswanz](https://github.com/spswanz) for pointing out a bug in the [Ruby & Ruby Gems](#ruby--ruby-gems) section. -### Further reading +### Further reading Check out the following links for documentation, inspiration, and further reading about Jekyll: -* [Official Jekyll Documentation](http://jekyllrb.com/docs/home/) -* Jekyll "unofficially" links to two Windows + Jekyll resources: [http://jekyll-windows.juthilo.com/](http://jekyll-windows.juthilo.com/) and [https://davidburela.wordpress.com/2015/11/28/easily-install-jekyll-on-windows-with-3-command-prompt-entries-and-chocolatey/](https://davidburela.wordpress.com/2015/11/28/easily-install-jekyll-on-windows-with-3-command-prompt-entries-and-chocolatey/) +* [Official Jekyll Documentation](https://jekyllrb.com/docs/home/) +* Jekyll "unofficially" links to two Windows + Jekyll resources: [https://jekyll-windows.juthilo.com/](https://jekyll-windows.juthilo.com/) and [https://davidburela.wordpress.com/2015/11/28/easily-install-jekyll-on-windows-with-3-command-prompt-entries-and-chocolatey/](https://davidburela.wordpress.com/2015/11/28/easily-install-jekyll-on-windows-with-3-command-prompt-entries-and-chocolatey/) * [https://help.github.com/articles/using-jekyll-with-pages/](https://help.github.com/articles/using-jekyll-with-pages/) -* Amanda Visconti, ["Introducing 
Static Sites for Digital Humanities Projects (why & what are Jekyll, GitHub, etc.?)"](http://literaturegeek.com/2015/12/08/WhyJekyllGitHub) -* Alex Gil, ["How (and Why) to Generate a Static Website Using Jekyll, Part 1"](http://chronicle.com/blogs/profhacker/jekyll1/60913) +* Amanda Visconti, ["Introducing Static Sites for Digital Humanities Projects (why & what are Jekyll, GitHub, etc.?)"](https://literaturegeek.com/2015/12/08/WhyJekyllGitHub) +* Alex Gil, ["How (and Why) to Generate a Static Website Using Jekyll, Part 1"](https://chronicle.com/blogs/profhacker/jekyll1/60913) * Eduardo Bouças, ["An Introduction to Static Site Generators"](https://davidwalsh.name/introduction-static-site-generators) * Ben Balter, [Jekyll: Where content is truly king](https://ben.balter.com/2013/10/30/content-is-king/) -* The [Prose](http://prose.io/) content editor (built on Jekyll) +* The [Prose](https://prose.io/) content editor (built on Jekyll) * [Join the Digital Humanities Slack](https://digitalhumanities.slack.com) (anyone can join, even if you have no DH experience) and check out the #publishing channel for discussions of Jekyll and other DH publishing platforms diff --git a/en/lessons/calibrating-radiocarbon-dates-r.md b/en/lessons/calibrating-radiocarbon-dates-r.md index fd280dbe1b..1a61fec5fc 100644 --- a/en/lessons/calibrating-radiocarbon-dates-r.md +++ b/en/lessons/calibrating-radiocarbon-dates-r.md @@ -159,7 +159,7 @@ By now it is clear that these details, if poorly understood, can quickly lead to ## Applications with R -Many tools are now available to calibrate radiocarbon data, like [OxCal](https://c14.arch.ox.ac.uk/oxcal/), [CALIB](http://calib.org) and [ChronoModel](https://chronomodel.com). But these tools are rather intended to deal with [Bayesian](https://perma.cc/R247-RG8E) modeling problems of chronological sequences (which we don't cover in this lesson). R offers an interesting alternative to these tools which suits our needs. 
R is distributed under an open license, promotes reproducibility and lets you integrate the processing of radiocarbon date into larger projects (spatial analysis, etc.). +Many tools are now available to calibrate radiocarbon data, like [OxCal](https://c14.arch.ox.ac.uk/oxcal/), [CALIB](https://calib.org) and [ChronoModel](https://chronomodel.com). But these tools are rather intended to deal with [Bayesian](https://perma.cc/R247-RG8E) modeling problems of chronological sequences (which we don't cover in this lesson). R offers an interesting alternative to these tools which suits our needs. R is distributed under an open license, promotes reproducibility and lets you integrate the processing of radiocarbon date into larger projects (spatial analysis, etc.). Several R packages are useful for calibrating radiocarbon dates: for example, packages like [Bchron](https://cran.r-project.org/package=Bchron) and [oxcAAR](https://cran.r-project.org/package=oxcAAR) are often oriented towards modeling (constructing chronologies, age-depth models, etc.). The package you will use in this lesson is called [rcarbon](https://cran.r-project.org/package=rcarbon).[^14] It allows you to easily calibrate and analyze radiocarbon ages. @@ -524,7 +524,7 @@ In this lesson, you learned how to combine conventional dates and check for cons [^7]: See, for example, Calabrisotto, C. S., Amadio, M., Fedi, M. E., Liccioli, L. & Bombardieri, L. 2017. "Strategies for Sampling Difficult Archaeological Contexts and Improving the Quality of Radiocarbon Data: The Case of Erimi Laonin Tou Porakou, Cyprus." *Radiocarbon* 59 (6): 1919–30. . -[^8]: Arnold, J. R., & W. F. Libby. 1949. "Age Determinations by Radiocarbon Content: Checks with Samples of Known Age". *Science* 110 (2869): 678‑80. ; Libby, W. F. "Radiocarbon Dating". *Nobel Lecture*. Stockholm, 12 December 1960. [http://www.nobelprize.org/nobel_prizes/chemistry/laureates/1960/libby-lecture.html](https://perma.cc/HPU7-F8GD). +[^8]: Arnold, J. R., & W. 
F. Libby. 1949. "Age Determinations by Radiocarbon Content: Checks with Samples of Known Age". *Science* 110 (2869): 678‑80. ; Libby, W. F. "Radiocarbon Dating". *Nobel Lecture*. Stockholm, 12 December 1960. [https://www.nobelprize.org/nobel_prizes/chemistry/laureates/1960/libby-lecture.html](https://perma.cc/HPU7-F8GD). [^9]: There actually exists three series of calibration curves: IntCal for the northern hemisphere, SHCal for the southern hemisphere, and Marine for marine samples. diff --git a/en/lessons/choropleth-maps-python-folium.md b/en/lessons/choropleth-maps-python-folium.md index 73389bb86a..13b6c10361 100644 --- a/en/lessons/choropleth-maps-python-folium.md +++ b/en/lessons/choropleth-maps-python-folium.md @@ -743,7 +743,7 @@ Now, high population counties (like Los Angeles and Cook) don't appear so bad. I ### Uneven Distribution of Normalized Data -[Earlier](#The-Problem-of-Uneven-Distribution-of-Data), you saw that the distribution of the `count` variable was wildly uneven. Is `count_per_100k` any better? +[Earlier](#the-problem-of-unevenly-distributed-data), you saw that the distribution of the `count` variable was wildly uneven. Is `count_per_100k` any better? ```python map_df['count_per_100k'].describe() diff --git a/en/lessons/cleaning-data-with-openrefine.md b/en/lessons/cleaning-data-with-openrefine.md index 764254e38b..0c1cae42a5 100755 --- a/en/lessons/cleaning-data-with-openrefine.md +++ b/en/lessons/cleaning-data-with-openrefine.md @@ -20,7 +20,7 @@ activity: transforming topics: [data-manipulation] abstract: "This tutorial focuses on how scholars can diagnose and act upon the accuracy of data." 
-redirect_from: /lessons/cleaning-data-with-openrefine +redirect_from: /lessons/cleaning-data-with-openrefine/ avatar_alt: Two men laundering clothes outside doi: 10.46430/phen0023 --- @@ -395,28 +395,28 @@ cleaning features, as you're performing these steps on a copy of your data set, and *OpenRefine* allows you to trace back all of your steps in the case you have made an error. - [*OpenRefine*]: http://openrefine.org "OpenRefine" + [*OpenRefine*]: https://openrefine.org "OpenRefine" [Powerhouse museum]: https://powerhouse.com.au/ "Powerhouse museum" [*Potter’s Wheel ABC*]: https://perma.cc/Q6QD-E64N "Potter's Wheel ABC " - [*Wrangler*]: http://vis.stanford.edu/papers/wrangler/ "Wrangler" - [data profiling]: http://en.wikipedia.org/wiki/Data_profiling - [named-entity recognition]: http://en.wikipedia.org/wiki/Named-entity_recognition - [Library of Congress]: http://www.loc.gov/index.html + [*Wrangler*]: https://vis.stanford.edu/papers/wrangler/ "Wrangler" + [data profiling]: https://en.wikipedia.org/wiki/Data_profiling + [named-entity recognition]: https://en.wikipedia.org/wiki/Named-entity_recognition + [Library of Congress]: https://www.loc.gov/index.html "Library of Congress" - [OCLC]: http://www.oclc.org/home.en.html "OCLC" + [OCLC]: https://www.oclc.org/home.en.html "OCLC" [website]: https://powerhouse.com.au/ "website" - [Creative Commons Attribution Share Alike (CCASA) license]: http://creativecommons.org/licenses/by-nc/2.5/au/ - [Controlled vocabulary]: http://en.wikipedia.org/wiki/Controlled_vocabulary - [Linked Data]: http://en.wikipedia.org/wiki/Linked_data + [Creative Commons Attribution Share Alike (CCASA) license]: https://creativecommons.org/licenses/by-nc/2.5/au/ + [Controlled vocabulary]: https://en.wikipedia.org/wiki/Controlled_vocabulary + [Linked Data]: https://en.wikipedia.org/wiki/Linked_data [Download OpenRefine]: https://openrefine.org/download [phm-collection]: /assets/cleaning-data-with-openrefine/phm-collection.tsv [Powerhouse Museum 
Website]: /images/powerhouseScreenshot.png - [facet]: http://en.wikipedia.org/wiki/Faceted_search + [facet]: https://en.wikipedia.org/wiki/Faceted_search [Screenshot of OpenRefine Example]: /images/overviewOfSomeClusters.png [GREL documentation]: https://openrefine.org/docs/manual/grelfunctions - [CSV]: http://en.wikipedia.org/wiki/Comma-separated_values + [CSV]: https://en.wikipedia.org/wiki/Comma-separated_values [RDF Transform extension]: https://github.com/AtesComp/rdf-transform#rdf-transform [NER extension]: https://github.com/stkenny/Refine-NER-Extension diff --git a/en/lessons/cleaning-ocrd-text-with-regular-expressions.md b/en/lessons/cleaning-ocrd-text-with-regular-expressions.md index 5f3f658eec..184bc672e9 100755 --- a/en/lessons/cleaning-ocrd-text-with-regular-expressions.md +++ b/en/lessons/cleaning-ocrd-text-with-regular-expressions.md @@ -14,7 +14,7 @@ machine-encoded text—has proven a godsend for historical research. This lesson exclude_from_check: - reviewers - review-ticket -redirect_from: /lessons/cleaning-ocrd-text-with-regular-expressions +redirect_from: /lessons/cleaning-ocrd-text-with-regular-expressions/ avatar_alt: A typesetter and inker at work on a printing press doi: 10.46430/phen0024 --- @@ -417,27 +417,27 @@ Regular Expressions are powerful. Yes, they are powerful enough to completely destroy your data. So practice on copies and take it one itty bitty step at a time. 
- [HeinOnline]: http://home.heinonline.org/ + [HeinOnline]: https://home.heinonline.org/ "Source for Legal and Government-based documents" - [pdfminer]: http://www.unixuser.org/~euske/python/pdfminer/index.html + [pdfminer]: https://www.unixuser.org/~euske/python/pdfminer/index.html "PDF Miner Module" [Pythonium’s Pyrexp]: https://pythonium.net/regex "Python Regex Tester" - [Patterns App]: http://krillapps.com/patterns/ + [Patterns App]: https://krillapps.com/patterns/ "Patterns App for RegEx Experimentation" [cheat sheet]: https://cheatography.com/davechild/cheat-sheets/regular-expressions/ "Reg Ex Cheat Sheet" - [documentation]: http://docs.python.org/2/library/re.html + [documentation]: https://docs.python.org/2/library/re.html "Re Module Documentation" - [Regular Expression HOWTO documentation]: http://docs.python.org/2/howto/regex.html#regex-howto + [Regular Expression HOWTO documentation]: https://docs.python.org/2/howto/regex.html#regex-howto "Regular Expressions HOWTO" - [sed]: http://www.gnu.org/software/sed/ "GNU's sed editor" - [grep]: http://www.gnu.org/software/grep/ "GNU's grep editor" - [re.search()]: http://docs.python.org/2/library/re.html#re.search + [sed]: https://www.gnu.org/software/sed/ "GNU's sed editor" + [grep]: https://www.gnu.org/software/grep/ "GNU's grep editor" + [re.search()]: https://docs.python.org/2/library/re.html#re.search "Explanation of re.search() function" - [re.sub()]: http://docs.python.org/2/library/re.html#re.sub + [re.sub()]: https://docs.python.org/2/library/re.html#re.sub "Explanation of re.sub() function" - [verbose mode]: http://docs.python.org/2/library/re.html#re.VERBOSE + [verbose mode]: https://docs.python.org/2/library/re.html#re.VERBOSE "Explanation of re.verbose mode" - [re.compile()]: http://docs.python.org/2/library/re.html#re.compile + [re.compile()]: https://docs.python.org/2/library/re.html#re.compile "Explanation of re.compile() function" diff --git a/en/lessons/clustering-visualizing-word-embeddings.md 
b/en/lessons/clustering-visualizing-word-embeddings.md index 4d30868c8f..413c783190 100644 --- a/en/lessons/clustering-visualizing-word-embeddings.md +++ b/en/lessons/clustering-visualizing-word-embeddings.md @@ -19,7 +19,7 @@ topics: [machine-learning, network-analysis, python, data-visualization] abstract: This lesson uses word embeddings and clustering algorithms in Python to identify groups of similar documents in a corpus of approximately 9,000 academic abstracts. It will teach you the basics of dimensionality reduction for extracting structure from a large corpus and how to evaluate your results. avatar_alt: Drawing of a star-cluster lesson-partners: [Jisc, The National Archives] -partnership-url: /jisc-tna-partnership +partnership-url: /en/jisc-tna-partnership mathjax: true doi: 10.46430/phen0111 --- diff --git a/en/lessons/code-reuse-and-modularity.md b/en/lessons/code-reuse-and-modularity.md index b559cd473e..07c40ba476 100755 --- a/en/lessons/code-reuse-and-modularity.md +++ b/en/lessons/code-reuse-and-modularity.md @@ -19,13 +19,13 @@ special mechanisms for managing complexity. This lesson will show you how to reuse parts of your code by writing functions and break your programs into modules, in order to keep everything concise and easier to debug." 
-next: working-with-web-pages -previous: working-with-text-files +next: /en/lessons/working-with-web-pages +previous: /en/lessons/working-with-text-files series_total: 15 lessons sequence: 4 categories: [lessons, original-ph, python] python_warning: false -redirect_from: /lessons/code-reuse-and-modularity +redirect_from: /lessons/code-reuse-and-modularity/ avatar_alt: Three caricature heads doi: 10.46430/phen0002 --- diff --git a/en/lessons/collaborative-blog-with-jekyll-github.md b/en/lessons/collaborative-blog-with-jekyll-github.md index 85f1351006..0d643ccc58 100644 --- a/en/lessons/collaborative-blog-with-jekyll-github.md +++ b/en/lessons/collaborative-blog-with-jekyll-github.md @@ -27,7 +27,7 @@ doi: 10.46430/phen0090 ## Introduction: A Small Collaborative Publishing Ecosystem This lesson will help you turn a basic Jekyll-generated website into a digital humanities (DH) community blog or other multi-author scholarly website, such as a simpler version of the DH center website and blog [ScholarsLab.org](https://scholarslab.org). The ["Building a static website with Jekyll and GitHub Pages" lesson](/en/lessons/building-static-sites-with-jekyll-github-pages) explained how to create a free, easy-to-maintain, preservation-friendly, secure website over which full control is held, such as a scholarly blog, project website, or online portfolio. In this lesson, we provide novice-friendly instructions on how to turn that basic Jekyll website into an active, community-authored blog with a system for reviewing writing and other site changes before moving them to the website.[^1] -The tutorial is divided into two parts: initial, one-time start-up actions; and the steps to follow each time you want to author or edit the site. This lesson will cover creating and editing blog posts on your site and creating and integrating author information for sites supporting multiple authors. 
We also offer practical advice on the challenges this sort of set-up offers for community authorship as well as questions to consider before undertaking this sort of workflow. If you have an existing blog you are hoping to migrate to Jekyll, we briefly advise on this process [near the end of this lesson](#moving-an-existing-website-to-Jekyll). +The tutorial is divided into two parts: initial, one-time start-up actions; and the steps to follow each time you want to author or edit the site. This lesson will cover creating and editing blog posts on your site and creating and integrating author information for sites supporting multiple authors. We also offer practical advice on the challenges this sort of set-up offers for community authorship as well as questions to consider before undertaking this sort of workflow. If you have an existing blog you are hoping to migrate to Jekyll, we briefly advise on this process [near the end of this lesson](#moving-an-existing-website-to-jekyll). ### Pre-requisites and requirements This lesson assumes you are starting from an existing Jekyll website you have created yourself, either by: @@ -46,7 +46,7 @@ You may be able to follow this lesson using any kind of computer, but this lesso In the version of this lesson that we use for our research center's blog, users tend to complete the lesson in 1–1.5 hours. ### How difficult is this lesson? -The steps in this tutorial are unambiguous and there are very few choices you need to make as you work through this lesson. We tried to make it very detailed, combining the steps to take along with explanations of why you are taking these actions. We have included screenshots so you can compare what you are seeing with what the lesson thinks you should be seeing. A [help](#help) section includes a handy [recap](#workflow-recap) of the steps described below (useful once you understand them and need an accessible reference), links to a glossary of key terms, and further reading. 
+The steps in this tutorial are unambiguous and there are very few choices you need to make as you work through this lesson. We tried to make it very detailed, combining the steps to take along with explanations of why you are taking these actions. We have included screenshots so you can compare what you are seeing with what the lesson thinks you should be seeing. A [help](#help) section includes a handy [recap](#workflow-summary) of the steps described below (useful once you understand them and need an accessible reference), links to a glossary of key terms, and further reading. You will learn some new terms and gain familiarity with the GitHub.com interface. You will not need to use the command line or understand git/versioning. (We discuss two versioning concepts briefly, but you will not need to understand these to do this tutorial.) @@ -82,7 +82,7 @@ Log into GitHub (create an account if you don't have one). On the upper left of {% include figure.html filename="groupblogging-new-demo-repo.png" caption="Screenshot of creating a new repository" %} -In the "Repository name" field, write a short name for your repository. We recommend "CollabDemo", as this matches the demo repository we set up. +In the "Repository name" field, write a short name for your repository. We recommend "CollabDemo", as this matches the demo repository we set up. Skip all other options on the page and click on the green "Create repository" button at the bottom of the page. @@ -297,7 +297,7 @@ Even if you are the only person authoring on your site, using branches and pull Remember to substitute *https://github.com/your-username/your-repo-name* for *https://github.com/scholarslab/CollabDemo* in these instructions. To set up your site for our review process: -Each person in addition to you you who will write on the site should [create a user account on GitHub.com](http://github.com/join), if they do not already have one. 
They will need to share their username with you (or someone else who owns the repository). +Each person in addition to you you who will write on the site should [create a user account on GitHub.com](https://github.com/join), if they do not already have one. They will need to share their username with you (or someone else who owns the repository). Your repository page (for example, https://github.com/scholarslab/CollabDemo) has a horizontal row of links just below the name of the repository. Click on the "settings" link, then click on "Manage Access" in the left menu. @@ -311,7 +311,7 @@ Click on "Integrations" in the left menu. Under "Installed GitHub Apps", Netlify Scroll down to the "Repository access" section. Both options will work: the radio button next to "All repositories" is selected; or if you have other repositories you are not sure you want Netlify to run on, select the radio button next to "Only select repositories". For the latter choice, your repository should appear in the list immediately below; if it does not, use the "Select repositories" dropdown menu to add your /CollabDemo repository. Click the green "save" button. You might get redirected to Netlify at this point, but we want to keep doing a few other things in the GitHub interface. -Now we will verify the name of the default branch that GitHub Pages publishes as your website. This should be "gh-pages" if you set up your repository for Netlify according to the earlier instructions. To check this is the case, in your repository settings click on "Options" in the upper left menu, and scroll down to the "GitHub Pages" section to look at what the dropdown under "Source" says. What you see should look similar to the screenshot below, but it might contain a different branch name in the dropdown. Remember whatever branch name is listed here for use in the next step. +Now we will verify the name of the default branch that GitHub Pages publishes as your website. 
This should be "gh-pages" if you set up your repository for Netlify according to the earlier instructions. To check this is the case, in your repository settings click on "Options" in the upper left menu, and scroll down to the "GitHub Pages" section to look at what the dropdown under "Source" says. What you see should look similar to the screenshot below, but it might contain a different branch name in the dropdown. Remember whatever branch name is listed here for use in the next step. {% include figure.html filename="groupblogging-check-publication-branch.png" caption="Screenshot showing how to check the name of the repository branch that publishes to GitHub Pages" %} @@ -346,7 +346,7 @@ In our workflow, there are two parts to authoring on or editing your website: If you forget what any of these technical terms mean, visit our [glossary](https://scholarslab.lib.virginia.edu/blog/github-jekyll-glossary/) for a reminder. -The previous Jekyll lesson included a section on [how to create and edit posts and pages](/en/lessons/building-static-sites-with-jekyll-github-pages#writing-pages-and-posts-), so we suggest you review that lesson for a general introduction to blogging in Jekyll[^8]. In what follows, we describe the changes to those instructions that will be required for your site to function better as a collaborative blog. The key differences from the last lesson are: +The previous Jekyll lesson included a section on [how to create and edit posts and pages](/en/lessons/building-static-sites-with-jekyll-github-pages/#writing-pages-and-posts), so we suggest you review that lesson for a general introduction to blogging in Jekyll[^8]. In what follows, we describe the changes to those instructions that will be required for your site to function better as a collaborative blog. 
The key differences from the last lesson are: * the use of branches * authoring and editing on the GitHub.com website (in your browser) rather than locally (in your computer's file system) @@ -383,7 +383,7 @@ You can also look at the address bar; the URL will have changed from https://git Now you are on a new branch where you can work without affecting the repository that determines what is on your website so that work-in-progress will not appear publicly. ### Authoring and editing on GitHub.com -The [previous Jekyll lesson](/en/lessons/building-static-sites-with-jekyll-github-pages#writing-pages-and-posts-) shows how to use Markdown and YAML to write a post or alter a page's content and front matter. Here, we will explain: how to create, commit, and edit a post on GitHub.com (rather than locally); changes to post front matter to support a collaborative website; and ways to check how your post appears from GitHub.com (that is, when you are not running your website locally). +The [previous Jekyll lesson](/en/lessons/building-static-sites-with-jekyll-github-pages/#writing-pages-and-posts) shows how to use Markdown and YAML to write a post or alter a page's content and front matter. Here, we will explain: how to create, commit, and edit a post on GitHub.com (rather than locally); changes to post front matter to support a collaborative website; and ways to check how your post appears from GitHub.com (that is, when you are not running your website locally). #### Create a new post You should still be in the branch you created to contain your work in progress. @@ -422,7 +422,7 @@ Commit (save) as often as you would save any other work-in-progress. 
Or, draft y {% include figure.html filename="groupblogging-preview-changes.png" caption="Screenshot of where the text editor's preview button is found" %} -The [previous lesson](/en/lessons/building-static-sites-with-jekyll-github-pages#hosting-on-github-pages-) had you use the GitHub Desktop app to commit and merge, instead we will use the GitHub.com interface. This lets contributors unfamiliar or uncomfortable with the command line or running a site locally do everything from GitHub.com. +The [previous lesson](/en/lessons/building-static-sites-with-jekyll-github-pages/#hosting-on-github-pages) had you use the GitHub Desktop app to commit and merge; instead, we will use the GitHub.com interface. This lets contributors unfamiliar or uncomfortable with the command line or running a site locally do everything from GitHub.com. To commit your work, scroll down to the bottom of the text editor page. @@ -441,7 +441,7 @@ Leave the radio buttons as-is ("Commit directly to the [your branch name] branch Click the green "Commit new file" button to finish saving your work. #### Adjustments to front matter -You will need to make four changes to how the [previous lesson](/en/lessons/building-static-sites-with-jekyll-github-pages#writing-pages-and-posts-) directed you to write the front matter of a blog post: +You will need to make four changes to how the [previous lesson](/en/lessons/building-static-sites-with-jekyll-github-pages/#writing-pages-and-posts) directed you to write the front matter of a blog post: * Add the "author" field (such as "author: Amanda Visconti") * Remove the hour, minute, and second info from the "date" YAML (we have not found it useful to track times and it can cause problems and confusion when publishing from different time zones) * Remove the "categories" YAML field as these lessons do not explain its use @@ -514,7 +514,7 @@ Click on the green "Create pull request."
You will be offered another box to des Under the menu on the right, you will find a section labeled “Reviewers”. Click on the word “reviewers” to see a dropdown menu of contributors associated with your repo who you could ask to review your work (added using the [reviewer permissions](#reviewer-permissions) steps). You will want to tag someone to review your work who has owner permissions on your repo. This will notify the site authors who are confident with Jekyll (the people with “owner” privileges) that you are making a change (like adding a blog post) to the website. -*If you are not an owner*, your work is now done and you are waiting for someone with owner privileges to briefly review your changes, using the steps in ["Merging as an owner" below](#merging-as-an-owner) to check for anything that might break part of the site (highly unlikely with a blog post, more likely with changes to other repo code). Then, they will push your content to the live website. If you would like a preview of what the website will look like when your changes are merged, the next section (["Merging as an owner"](#merging-as-an-owner)) will show you how to use Netlify to do this. +*If you are not an owner*, your work is now done and you are waiting for someone with owner privileges to briefly review your changes, using the steps in ["Merging as an owner" below](#merging-a-pull-request) to check for anything that might break part of the site (highly unlikely with a blog post, more likely with changes to other repo code). Then, they will push your content to the live website. If you would like a preview of what the website will look like when your changes are merged, the next section (["Merging as an owner"](#merging-a-pull-request)) will show you how to use Netlify to do this. ### Merging a pull request If you are the one setting up your GitHub and Jekyll combination, you already have owner permissions for your repository. 
If not, you will need to contact the owner of the repository to give you access using the steps in the [Reviewer Permissions](#reviewer-permissions) section before being able to follow the steps below to merge collaborators' changes. @@ -547,7 +547,7 @@ Find the list item that starts "netlify/" followed by some gibberish and then by There are three possible next steps, which one you choose depends on how the preview looks and your review of the post. -*Option #1:* If the preview looks correct, you can click on the "Merge pull request" button in the bottom left of the section. Click the checkbox next to the "Use your administrator privileges to merge this pull request" message that appears, then click the "Confirm merge" button, followed by the "delete branch" button that will appear to the right. (Getting rid of branches once we are done with them helps us keep the repository clean, as we may have multiple branches open at one time that are being actively worked on.) +*Option #1:* If the preview looks correct, you can click on the "Merge pull request" button in the bottom left of the section. Click the checkbox next to the "Use your administrator privileges to merge this pull request" message that appears, then click the "Confirm merge" button, followed by the "delete branch" button that will appear to the right. (Getting rid of branches once we are done with them helps us keep the repository clean, as we may have multiple branches open at one time that are being actively worked on.) {% include figure.html filename="groupblogging-delete-branch-PR-done.png" caption="Screenshot showing deleting branch after pull request" %} @@ -586,7 +586,7 @@ Content management systems like WordPress[^9] have been designed to accommodate ### Export your data from WordPress -When you upload content to a WordPress site, your data gets stored in a database that is hidden. So, the first step to transfer a project from WordPress to another format is to retrieve that data. 
Sometimes, this can be a bit difficult. Interfaces are likely to change after publication of this lesson, so we refer you to [WordPressʼs documentation] (https://en.support.wordpress.com/export/) on the exact steps needed to separate your own content from its CMS. WordPress exports your data in a series of XML files that contain both the content and metadata (information like author, publication date, and tags) for the elements of your site. Please be aware that while these XML files might reference the images and media uploads for a website, the uploaded files themselves must be exported separately. +When you upload content to a WordPress site, your data gets stored in a database that is hidden. So, the first step to transfer a project from WordPress to another format is to retrieve that data. Sometimes, this can be a bit difficult. Interfaces are likely to change after publication of this lesson, so we refer you to [WordPress's documentation](https://en.support.wordpress.com/export/) on the exact steps needed to separate your own content from its CMS. WordPress exports your data in a series of XML files that contain both the content and metadata (information like author, publication date, and tags) for the elements of your site. Please be aware that while these XML files might reference the images and media uploads for a website, the uploaded files themselves must be exported separately. ### Migrate the exported data to a format appropriate to a new platform @@ -600,7 +600,7 @@ Migrating a site from one platform to another is a time-consuming and labor-inte ## Help ### Workflow summary -If you have read the longer explanations above already and just want a checklist, you can [bookmark this section](#workflow-recap): +If you have read the longer explanations above already and just want a checklist, you can [bookmark this section](#workflow-summary): 1. Create new branch & switch into that branch 2. 
Create new file or edit existing file @@ -613,24 +613,24 @@ If you have read the longer explanations above already and just want a checklist ### Cheat sheets - [Glossary of frequently used terms](https://scholarslab.lib.virginia.edu/blog/github-jekyll-glossary/) (pull, merge, branch, etc.) -- [Overview](/en/lessons/building-static-sites-with-jekyll-github-pages#where-and-what-is-everything-) of what various files in your website folder do +- [Overview](/en/lessons/building-static-sites-with-jekyll-github-pages/#where-and-what-is-everything) of what various files in your website folder do - [Scholars' Lab cheatsheet](https://github.com/scholarslab/scholarslab.org/blob/master/docs/authoring-and-editing.md#markdown--formatting) on basic Markdown formatting, limited to the most frequently used formatting for our particular needs ### Troubleshooting -If you run into an a problem, try reading [Jekyll 's troubleshooting page](https://jekyllrb.com/docs/troubleshooting/). Besides search engines, [the StackExchange site](http://stackexchange.com/) is a good place to find questions and answers from people who have run into the same problem as you in the past (and, hopefully, recorded how they solved it). You might also [join the Digital Humanities Slack](http://tinyurl.com/DHslack) (anyone can join, even if you have no DH experience) and ask questions in the #DHanswers channel. +If you run into a problem, try reading [Jekyll's troubleshooting page](https://jekyllrb.com/docs/troubleshooting/). Besides search engines, [the StackExchange site](https://stackexchange.com/) is a good place to find questions and answers from people who have run into the same problem as you in the past (and, hopefully, recorded how they solved it). You might also [join the Digital Humanities Slack](https://tinyurl.com/DHslack) (anyone can join, even if you have no DH experience) and ask questions in the #DHanswers channel.
### Advanced learning The following links are helpful for learning more about documentation, inspiration, and further reading about Jekyll: *Introductions to Jekyll and static sites* -* Amanda Visconti, ["Introducing Static Sites for Digital Humanities Projects (why & what are Jekyll, GitHub, etc.?)"](http://literaturegeek.com/2015/12/08/WhyJekyllGitHub) +* Amanda Visconti, ["Introducing Static Sites for Digital Humanities Projects (why & what are Jekyll, GitHub, etc.?)"](https://literaturegeek.com/2015/12/08/WhyJekyllGitHub) * [Building a static website with Jekyll and GitHub Pages](/en/lessons/building-static-sites-with-jekyll-github-pages) -* Alex Gil, ["How (and Why) to Generate a Static Website Using Jekyll, Part 1"](http://chronicle.com/blogs/profhacker/jekyll1/60913) +* Alex Gil, ["How (and Why) to Generate a Static Website Using Jekyll, Part 1"](https://chronicle.com/blogs/profhacker/jekyll1/60913) * Eduardo Bouças, ["An Introduction to Static Site Generators"](https://davidwalsh.name/introduction-static-site-generators) *Deeper understanding of Jekyll and GitHub Pages* -* [Official Jekyll Documentation](http://jekyllrb.com/docs/home/) -* Jekyll "unofficially" links to two Windows + Jekyll resources: [http://jekyll-windows.juthilo.com/](http://jekyll-windows.juthilo.com/) and [https://davidburela.wordpress.com/2015/11/28/easily-install-jekyll-on-windows-with-3-command-prompt-entries-and-chocolatey/](https://davidburela.wordpress.com/2015/11/28/easily-install-jekyll-on-windows-with-3-command-prompt-entries-and-chocolatey/) +* [Official Jekyll Documentation](https://jekyllrb.com/docs/home/) +* Jekyll "unofficially" links to two Windows + Jekyll resources: [https://jekyll-windows.juthilo.com/](https://jekyll-windows.juthilo.com/) and 
[https://davidburela.wordpress.com/2015/11/28/easily-install-jekyll-on-windows-with-3-command-prompt-entries-and-chocolatey/](https://davidburela.wordpress.com/2015/11/28/easily-install-jekyll-on-windows-with-3-command-prompt-entries-and-chocolatey/) * [https://help.github.com/articles/using-jekyll-with-pages/](https://help.github.com/articles/using-jekyll-with-pages/) * Ben Balter, [Jekyll: Where content is truly king](https://ben.balter.com/2013/10/30/content-is-king/) * [Using a custom domain with GitHub Pages hosting](https://help.github.com/en/articles/using-a-custom-domain-with-github-pages) You can purchase a domain (such as my-own-domain.com; average costs run around $10-20/year) and switch your website to using that instead of *username.github.io/repo-name* but still use GitHub Pages' free hosting. @@ -653,7 +653,7 @@ If you set up a custom domain with your GitHub Pages-hosted website, to avoid a * [Exitwp](https://github.com/thomasf/exitwp), a Python script developed by Thomas Frössman that Scholars' Lab used to migrate our blog from WordPress to Jekyll *Tools* -* Robust plain text editor options: [Atom](https://atom.io), [Sublime Text](https://www.sublimetext.com/), [Prose](http://prose.io/) content editor (built on Jekyll) +* Robust plain text editor options: [Atom](https://atom.io), [Sublime Text](https://www.sublimetext.com/), [Prose](https://prose.io/) content editor (built on Jekyll) * Project management options: [Trello](https://trello.com/en), [GitHub's project boards](https://help.github.com/en/articles/about-project-boards) *Case study links* diff --git a/en/lessons/common-similarity-measures.md b/en/lessons/common-similarity-measures.md index 25ac34e181..16dafc5f19 100644 --- a/en/lessons/common-similarity-measures.md +++ b/en/lessons/common-similarity-measures.md @@ -245,7 +245,7 @@ vectorizer = CountVectorizer(input="filename", max_features=1000, max_df=0.7) wordcounts = vectorizer.fit_transform(filenames).toarray() ``` -And that's it! 
You've now counted every word in all 142 texts in the test corpus. To interpret the results, you'll also need to open the metadata file as a [Pandas DataFrame](https://pandas.pydata.org/docs/getting_started/dsintro.html#dataframe). Add the following to the next line of your file: +And that's it! You've now counted every word in all 142 texts in the test corpus. To interpret the results, you'll also need to open the metadata file as a [Pandas DataFrame](https://pandas.pydata.org/docs/reference/frame.html). Add the following to the next line of your file: ```py metadata = pd.read_csv("1666_metadata.csv", index_col="TCP ID") diff --git a/en/lessons/computer-vision-deep-learning-pt1.md b/en/lessons/computer-vision-deep-learning-pt1.md index 926c9f0623..2368a6b0ca 100644 --- a/en/lessons/computer-vision-deep-learning-pt1.md +++ b/en/lessons/computer-vision-deep-learning-pt1.md @@ -23,7 +23,7 @@ topics: [python, machine-learning] abstract: This is the first of a two-part lesson introducing deep learning based computer vision methods for humanities research. Using a dataset of historical newspaper advertisements and the fastai Python library, the lesson walks through the pipeline of training a computer vision model to perform image classification. mathjax: true avatar_alt: An illustration of a camera on top of a wooden stand with a dark cloth. -next: computer-vision-deep-learning-pt2 +next: /en/lessons/computer-vision-deep-learning-pt2 series_total: 2 lessons sequence: 1 doi: 10.46430/phen0101 diff --git a/en/lessons/computer-vision-deep-learning-pt2.md b/en/lessons/computer-vision-deep-learning-pt2.md index 5a32c3fee0..a4dd45a7c5 100644 --- a/en/lessons/computer-vision-deep-learning-pt2.md +++ b/en/lessons/computer-vision-deep-learning-pt2.md @@ -23,7 +23,7 @@ topics: [python, machine-learning] abstract: This is the second of a two-part lesson introducing deep learning based computer vision methods for humanities research. 
This lesson digs deeper into the details of training a deep learning based computer vision model. It covers some challenges one may face due to the training data used and the importance of choosing an appropriate metric for your model. It presents some methods for evaluating the performance of a model. mathjax: true avatar_alt: A cropped illustration of a mechanical diagram of a machine with pipes. -previous: computer-vision-deep-learning-pt1 +previous: /en/lessons/computer-vision-deep-learning-pt1 series_total: 2 lessons sequence: 2 doi: 10.46430/phen0102 diff --git a/en/lessons/corpus-analysis-voyant-tools.md b/en/lessons/corpus-analysis-voyant-tools.md index f3438d5ed5..d72b8b0a7a 100644 --- a/en/lessons/corpus-analysis-voyant-tools.md +++ b/en/lessons/corpus-analysis-voyant-tools.md @@ -35,7 +35,7 @@ doi: 10.46430/phen0128 ## Introduction -In this lesson, you will learn how to organize a set of texts for research; that is, you will learn the basic steps of creating a 'corpus'. You will also learn the main metrics of quantitative text analysis. For this purpose, you will use [Voyant Tools](http://voyant-tools.org/),[^1] a web-based platform that does not require installation and works in any browser with an internet connection. +In this lesson, you will learn how to organize a set of texts for research; that is, you will learn the basic steps of creating a 'corpus'. You will also learn the main metrics of quantitative text analysis. For this purpose, you will use [Voyant Tools](https://voyant-tools.org/),[^1] a web-based platform that does not require installation and works in any browser with an internet connection. This lesson is designed as a beginner-friendly introduction to corpus analysis and is part of a growing ecosystem of tools and methods in digital humanities. 
You might also like to explore Heather Froehlich's _Programming Historian_ lesson [Corpus Analysis with Antconc](/en/lessons/corpus-analysis-with-antconc), Peter Organisciak and Boris Capitanu's [Text Mining in Python through the HTRC Feature Reader](/en/lessons/text-mining-with-extracted-features), and Shawn Graham, Scott Weingart, and Ian Milligan's lesson [Getting Started with Topic Modeling and MALLET](/en/lessons/topic-modeling-and-mallet). diff --git a/en/lessons/corpus-analysis-with-antconc.md b/en/lessons/corpus-analysis-with-antconc.md index 61f3d56158..302a026c59 100755 --- a/en/lessons/corpus-analysis-with-antconc.md +++ b/en/lessons/corpus-analysis-with-antconc.md @@ -15,7 +15,7 @@ exclude_from_check: activity: analyzing topics: [distant-reading] abstract: "Corpus analysis is a form of text analysis which allows you to make comparisons between textual objects at a large scale (so-called 'distant reading')." -redirect_from: /lessons/corpus-analysis-with-antconc +redirect_from: /lessons/corpus-analysis-with-antconc/ avatar_alt: Three large ornate bookcases doi: 10.46430/phen0043 --- @@ -47,12 +47,12 @@ You have done this sort of thing before, if you have ever... * Used [Voyant Tools][48] for looking at patterns in one text * Followed [Programming Historian][51]’s Introduction to Python tutorials -In many ways [Voyant](http://voyant-tools.org/) is a gateway into conducting more sophisticated, replicable analysis, as the DIY aesthetic of Python or R scripting may not appeal to everyone. [AntConc](http://www.laurenceanthony.net/software/antconc/) fills this void by being a standalone software package for linguistic analysis of texts, freely available for Windows, Mac OS, and Linux and is highly maintained by its creator, [Laurence Anthony](http://www.laurenceanthony.net/). There are other concordance software packages available, but it is freely available across platforms and very well maintained. See the [concordance bibliography][56] for other resources. 
+In many ways [Voyant](https://voyant-tools.org/) is a gateway into conducting more sophisticated, replicable analysis, as the DIY aesthetic of Python or R scripting may not appeal to everyone. [AntConc](https://www.laurenceanthony.net/software/antconc/) fills this void by being a standalone software package for linguistic analysis of texts, freely available for Windows, Mac OS, and Linux and is highly maintained by its creator, [Laurence Anthony](https://www.laurenceanthony.net/). There are other concordance software packages available, but it is freely available across platforms and very well maintained. See the [concordance bibliography][56] for other resources. This tutorial explores several different ways to approach a corpus of texts. It's important to note that corpus linguistic approaches are rarely, if ever, a one-size-fits all affair. So, as you go through each step, it's worth thinking about what you're doing and how it can help you answer a specific question with your data. Although I present this tutorial in a building-block approach of 'do this then that to achieve x', it's not always necessary to follow the exact order outlined here. This lessons provides an outline of some of the methods available, rather than a recipe for success. ### Tutorial downloads -1. Software:[AntConc](http://www.laurenceanthony.net/software/antconc/). +1. Software:[AntConc](https://www.laurenceanthony.net/software/antconc/). Unzip the download if necessary, and launch the application. Screen shots below may vary slightly from the version you have (and by operationg system, of course), but the procedures are more or less the same across platforms and recent versions of AntConc. This tutorial is written with a (much older) version of AntConc in mind, as I find it easier to use in an introductory context. 
You are welcome to use the most recent version, but if you wish to follow along with the screenshots provided, you can download the version used here, [version 3.2.4](https://www.laurenceanthony.net/software/antconc/releases/AntConc324/). 2. Sample Corpus: Download the [zip file of movie reviews](/assets/corpus-analysis-with-antconc/antconc_corpus_files.zip). @@ -77,7 +77,7 @@ Visit your favorite website for news, and navigate to a news article (doesn't ma Open a text editor such as Notepad (on Windows) or TextEdit (on Mac) and paste in your text. -Other free options for text editors include [Notepad++][53] (Windows) or [TextWrangler][54] (Mac), which offer more advanced features, and are especially good for doing a lot of text clean-up. By text clean-up, I mean removing extratextual information such as "boilerplate", which appears regularly throughout. If you keep this information, it's going to throw your data off; text analysis software will address these words in word counts, statistical analyses, and lexical relationships. For example, you might want to remove standard headers and footers which will appear on every page. Please see [“Cleaning Data with OpenRefine"](/lessons/cleaning-data-with-openrefine) for more on how to automate this task. On smaller corpora it may be more feasible to do this yourself, plus you'll get a much better sense of your corpus this way. +Other free options for text editors include [Notepad++][53] (Windows) or [TextWrangler][54] (Mac), which offer more advanced features, and are especially good for doing a lot of text clean-up. By text clean-up, I mean removing extratextual information such as "boilerplate", which appears regularly throughout. If you keep this information, it's going to throw your data off; text analysis software will address these words in word counts, statistical analyses, and lexical relationships. For example, you might want to remove standard headers and footers which will appear on every page. 
Please see [“Cleaning Data with OpenRefine”](/en/lessons/cleaning-data-with-openrefine) for more on how to automate this task. On smaller corpora it may be more feasible to do this yourself, plus you'll get a much better sense of your corpus this way. Save the article as a .txt file to the desktop. You may want to do some follow-up text cleanup on other information, such as author by-line or title (remove them, then save the file again.) Remember that anything you leave in the text file can and will be addressed by text analysis software. @@ -86,10 +86,10 @@ Go to your desktop and check to see you can find your text file. Repeating this a lot is how you would build a corpus of plain text files; this process is called _corpus construction_, which very often involves addressing questions of sampling, representativeness and organization. Remember, *each file you want to use in your corpus _must_ be a plain text file for Antconc to use it.* It is customary to name files with the .txt suffix so that you know what kind of file it is. As you might imagine, it can be rather tedious to build up a substantial corpus one file at a time, especially if you intend to process a large set of documents. It is very common, therefore, to use webscraping (using a small program to automatically grab files from the web for you) to construct your corpus. To learn more about the concepts and techniques for webscraping, see the _Programming Historian_ tutorials [scraping with Beautiful Soup][50] and [automatic downloading with wget][51]. -Rather than build a corpus one document at a time, we're going to use a prepared corpus of positive and negative movie reviews, borrowed from the [Natural Language Processing Toolkit](http://www.nltk.org/). The NLTK movie review corpus has 2000 reviews, organized by positive and negative outcomes; today we will be addressing a small subset of them (200 positive, 200 negative).
+Rather than build a corpus one document at a time, we're going to use a prepared corpus of positive and negative movie reviews, borrowed from the [Natural Language Processing Toolkit](https://www.nltk.org/). The NLTK movie review corpus has 2000 reviews, organized by positive and negative outcomes; today we will be addressing a small subset of them (200 positive, 200 negative). Corpus construction is a subfield in its own right. Please see [Representativeness in Corpus Design](https://academic.oup.com/dsh/article-abstract/8/4/243/928942)," _Literary and Linguistic Computing_, 8 (4): 243-257 -and [_Developing Linguistic Corpora: a Guide to Good Practice_](http://www.amazon.com/Developing-Linguistic-Corpora-Practice-Guides/dp/1842172050/ref=sr_1_1) for more information. +and [_Developing Linguistic Corpora: a Guide to Good Practice_](https://www.amazon.com/Developing-Linguistic-Corpora-Practice-Guides/dp/1842172050/ref=sr_1_1) for more information. ### Getting Started with AntConc: The AntConc user interface, loading corpora @@ -255,6 +255,7 @@ You can also opt to swap reference corpus & main files (SWAP REF/MAIN FILES) In Keyword List, just hit Start (with nothing typed in the search box). If you've just swapped the reference corpus and the target files, you may be prompted to create a new word list before AntConc will calculate the keywords. We see a list of Keywords that have words that are much more "unusual" – more statistically unexpected – in the corpus we are looking at when compared to the reference corpus. +> Keyness: this is the frequency of a word in the text when compared with its frequency in a reference corpus, "such that the statistical probability as computed by an appropriate procedure is smaller than or equal to a p value specified by the user." – taken from [here][41].) 
For those interested in the statistical details, see the section on keyness on p7 of Laurence Anthony's [readme file](https://www.laurenceanthony.net/software/antconc/releases/AntConc335/help.pdf). > Keyness: this is the frequency of a word in the text when compared with its frequency in a reference corpus, "such that the statistical probability as computed by an appropriate procedure is smaller than or equal to a p value specified by the user." – taken from [here][41].) For those interested in the statistical details, see the section on keyness on p7 of Laurence Anthony's [readme file](https://www.laurenceanthony.net/software/antconc/releases/AntConc335/help.pdf). What are our keywords? @@ -298,17 +299,17 @@ In summary: it's worth thinking about: ### Further resources for this tutorial [A short bibliography on corpus linguistics][43]. -[A more step-by-step version of this tutorial, assuming no computer knowledge](http://hfroehli.ch/workshops/getting-started-with-antconc/) - - -[41]: http://www.lexically.net/downloads/version6/HTML/index.html?keyness_definition.htm -[43]: http://hfroehlich.wordpress.com/2014/05/11/intro-bibliography-corpus-linguistics/ -[47]: http://hfroehli.ch/workshops/getting-started-with-antconc/ -[48]: http://voyant-tools.org/ -[50]: /lessons/intro-to-beautiful-soup -[51]: /lessons/automated-downloading-with-wget -[52]: http://www.antlab.sci.waseda.ac.jp/ -[53]: http://notepad-plus-plus.org/ -[54]: http://www.barebones.com/products/textwrangler/ -[55]: http://www.wordfrequency.info/free.asp -[56]: http://hfroehli.ch/2014/05/11/intro-bibliography-corpus-linguistics/ +[A more step-by-step version of this tutorial, assuming no computer knowledge](https://hfroehli.ch/workshops/getting-started-with-antconc/) + + +[41]: https://www.lexically.net/downloads/version6/HTML/index.html?keyness_definition.htm +[43]: https://hfroehlich.wordpress.com/2014/05/11/intro-bibliography-corpus-linguistics/ +[47]: 
https://hfroehli.ch/workshops/getting-started-with-antconc/ +[48]: https://voyant-tools.org/ +[50]: /en/lessons/intro-to-beautiful-soup +[51]: /en/lessons/automated-downloading-with-wget +[52]: https://www.antlab.sci.waseda.ac.jp/ +[53]: https://notepad-plus-plus.org/ +[54]: https://www.barebones.com/products/textwrangler/ +[55]: https://www.wordfrequency.info/free.asp +[56]: https://hfroehli.ch/2014/05/11/intro-bibliography-corpus-linguistics/ diff --git a/en/lessons/correspondence-analysis-in-R.md b/en/lessons/correspondence-analysis-in-R.md index c1aef75e1c..9f341cb664 100755 --- a/en/lessons/correspondence-analysis-in-R.md +++ b/en/lessons/correspondence-analysis-in-R.md @@ -16,7 +16,7 @@ activity: analyzing topics: [data-manipulation, network-analysis, r, data-visualization] abstract: | This tutorial explains how to carry out and interpret a correspondence analysis, which can be used to identify relationships within categorical data. -redirect_from: /lessons/correspondence-analysis-in-R +redirect_from: /lessons/correspondence-analysis-in-R/ avatar_alt: Diagram of a cube with labeled edges doi: 10.46430/phen0062 mathjax: true @@ -39,9 +39,9 @@ After reading this tutorial, you should: This tutorial is for intermediate programming historians. It assumes you have basic understanding of R and some basic statistical knowledge. -Taryn Dewar's tutorial on [R Basics with Tabular Data](/lessons/r-basics-with-tabular-data) has information on how to set up and configure R. Taylor Arnold and Lauren Tilton's tutorial on [Basic Text Processing in R](/lessons/basic-text-processing-in-r) could be helpful as a warm-up, also. +Taryn Dewar's tutorial on [R Basics with Tabular Data](/en/lessons/r-basics-with-tabular-data) has information on how to set up and configure R. Taylor Arnold and Lauren Tilton's tutorial on [Basic Text Processing in R](/en/lessons/basic-text-processing-in-r) could be helpful as a warm-up, also. 
- Since CA is a kind of social network analysis, it would not hurt to look at Marten Düring's [From Hermeneutics to Data to Networks: Data Extraction and Network Visualization of Historical Sources](/lessons/creating-network-diagrams-from-historical-sources) which also has some useful information on structuring data for network analysis. + Since CA is a kind of social network analysis, it would not hurt to look at Marten Düring's [From Hermeneutics to Data to Networks: Data Extraction and Network Visualization of Historical Sources](/en/lessons/creating-network-diagrams-from-historical-sources) which also has some useful information on structuring data for network analysis. ## What is Correspondence Analysis? @@ -91,7 +91,7 @@ As a historian, I suspect that MPs are organized according to committee topics d ## Setting Up R for CA -To do a CA, we will need a library that will do linear algebra for us. For the more mathematics inclined, there is an appendix with some of the details about how this is done. In R, there are a number of options for CA, but we will use the [FactoMineR library](http://factominer.free.fr/)[^factominer] a library focussed on "multivariate exploratory data analysis." FactoMineR can be used to conduct all kinds of different multivariate analysis including hierarchical clusters, factor analysis and so on. +To do a CA, we will need a library that will do linear algebra for us. For the more mathematics inclined, there is an appendix with some of the details about how this is done. In R, there are a number of options for CA, but we will use the [FactoMineR library](https://factominer.free.fr/)[^factominer] a library focussed on "multivariate exploratory data analysis." FactoMineR can be used to conduct all kinds of different multivariate analysis including hierarchical clusters, factor analysis and so on. But first, here is how to install and call the libraries, then pop them into an R object for wrangling. 
@@ -348,7 +348,7 @@ We also learned how to interpret a CA and how to detect potential analytical pit In general, the benefit of this analysis is to provide a quick overview of two-category dataset as a pathfinder to more substantive historical issues. The use of members and meetings or events in all areas of life (business, not-for-profit, municipal meetings, twitter hashtags etc.) is a common approach to such analysis. Social groups and their preferences is another common use for CA. In each case, the visualisation offers a map with which to observe a snapshot of social, cultural and political life. -Next steps may include adding further categorical dimensions to our analysis, such as incorporating political party, age or gender. When you do CA with more than two categories, it is called [Multiple Correspondence Analysis or MCA](http://www.sthda.com/english/wiki/multiple-correspondence-analysis-essentials-interpretation-and-application-to-investigate-the-associations-between-categories-of-multiple-qualitative-variables-r-software-and-data-mining). While the Mathematics for MCA is more complicated, the end results are quite similar to CA. +Next steps may include adding further categorical dimensions to our analysis, such as incorporating political party, age or gender. When you do CA with more than two categories, it is called [Multiple Correspondence Analysis or MCA](https://www.sthda.com/english/wiki/multiple-correspondence-analysis-essentials-interpretation-and-application-to-investigate-the-associations-between-categories-of-multiple-qualitative-variables-r-software-and-data-mining). While the Mathematics for MCA is more complicated, the end results are quite similar to CA. Hopefully, you can now apply these methods to your own data, helping you to uncover questions and hypotheses that enrich your historical research. Good luck! 
@@ -464,7 +464,7 @@ Another important score is visible on the CA graph - the percentage of explanato [^inertia]: In general, inertia in statistics refers to the variation or "spread" of a dataset. It is analogous to standard deviation in distribution data. -[^pickton]: See Laura Kane (April 3, 2017), "Missing and murdered women's inquiry not reaching out to families, say advocates." *CBC News Indigenous*. +[^pickton]: See Laura Kane (April 3, 2017), "Missing and murdered women's inquiry not reaching out to families, say advocates." *CBC News Indigenous*. [^pvalue]: In statistics, a p-value, short for _probability value_, is an indicator of how likely an outcome would have occurred under random circumstances. A low p-value would suggest a low probability that the result would have occurred at random and thus provides some evidence that a null hypothesis (in this case, that the MPs and CPCs are independent categories) is unlikely. diff --git a/en/lessons/counting-frequencies.md b/en/lessons/counting-frequencies.md index 4a7d676f4f..c2aa9f24ea 100755 --- a/en/lessons/counting-frequencies.md +++ b/en/lessons/counting-frequencies.md @@ -16,12 +16,12 @@ exclude_from_check: activity: analyzing topics: [python] abstract: "Counting the frequency of specific words in a list can provide illustrative data. This lesson will teach you Python's easy way to count such frequencies." -next: creating-and-viewing-html-files-with-python -previous: normalizing-data +next: /en/lessons/creating-and-viewing-html-files-with-python +previous: /en/lessons/normalizing-data series_total: 15 lessons sequence: 10 python_warning: false -redirect_from: /lessons/counting-frequencies +redirect_from: /lessons/counting-frequencies/ avatar_alt: Disgruntled man sitting on a log surrounded by birds doi: 10.46430/phen0003 --- @@ -434,8 +434,8 @@ file to make sure you have the correct code. 
- programming-historian-5 ([zip sync][]) - [list comprehension]: http://docs.python.org/tutorial/datastructures.html#list-comprehensions - [computer scientists at Glasgow]: http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words - [Regular Expressions]: https://web.archive.org/web/20180416143856/http://www.diveintopython.net/regular_expressions/index.html + [list comprehension]: https://docs.python.org/tutorial/datastructures.html#list-comprehensions + [computer scientists at Glasgow]: https://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words + [Regular Expressions]: https://web.archive.org/web/20180416143856/https://www.diveintopython.net/regular_expressions/index.html [zip]: /assets/python-lessons4.zip [zip sync]: /assets/python-lessons5.zip diff --git a/en/lessons/creating-an-omeka-exhibit.md b/en/lessons/creating-an-omeka-exhibit.md index 7bc865b75c..347a7c3c49 100755 --- a/en/lessons/creating-an-omeka-exhibit.md +++ b/en/lessons/creating-an-omeka-exhibit.md @@ -18,7 +18,7 @@ exclude_from_check: activity: presenting topics: [website] abstract: "Now that you've added items to your Omeka site and grouped them into collections, you're ready for the next step: taking your users on a guided tour through the items you've collected." -redirect_from: /lessons/creating-an-omeka-exhibit +redirect_from: /lessons/creating-an-omeka-exhibit/ avatar_alt: Ornate room filled with paintings hung salon-style doi: 10.46430/phen0049 --- @@ -29,7 +29,7 @@ doi: 10.46430/phen0049 -Now that you've [added items to your Omeka site](/lessons/up-and-running-with-omeka) and grouped them into collections, you're ready for the next step: taking your users on a guided tour through the items you've collected. +Now that you've [added items to your Omeka site](/en/lessons/up-and-running-with-omeka) and grouped them into collections, you're ready for the next step: taking your users on a guided tour through the items you've collected. 
Before you begin: Map your exhibit ---------------------------------- diff --git a/en/lessons/creating-and-viewing-html-files-with-python.md b/en/lessons/creating-and-viewing-html-files-with-python.md index e4dce5baab..c1283d749d 100755 --- a/en/lessons/creating-and-viewing-html-files-with-python.md +++ b/en/lessons/creating-and-viewing-html-files-with-python.md @@ -16,12 +16,12 @@ activity: presenting topics: [python, website] abstract: "Here you will learn how to create HTML files with Python scripts, and how to use Python to automatically open an HTML file in Firefox." -next: output-data-as-html-file -previous: counting-frequencies +next: /en/lessons/output-data-as-html-file +previous: /en/lessons/counting-frequencies series_total: 15 lessons sequence: 11 python_warning: false -redirect_from: /lessons/creating-and-viewing-html-files-with-python +redirect_from: /lessons/creating-and-viewing-html-files-with-python/ avatar_alt: Child drawing on a tablet doi: 10.46430/phen0004 --- @@ -199,7 +199,7 @@ path to the directory on your own computer. 
- python-lessons6.zip [zip sync] [zip file from the previous lesson]: /assets/python-lessons5.zip - [Zotero]: http://zotero.org - [W3 Schools HTML tutorial]: http://www.w3schools.com/html/default.asp - [doctype declaration]: http://www.w3schools.com/tags/tag_doctype.asp + [Zotero]: https://zotero.org + [W3 Schools HTML tutorial]: https://www.w3schools.com/html/default.asp + [doctype declaration]: https://www.w3schools.com/tags/tag_doctype.asp [zip sync]: /assets/python-lessons6.zip diff --git a/en/lessons/creating-apis-with-python-and-flask.md b/en/lessons/creating-apis-with-python-and-flask.md index 482a4369ab..30cc062355 100755 --- a/en/lessons/creating-apis-with-python-and-flask.md +++ b/en/lessons/creating-apis-with-python-and-flask.md @@ -18,7 +18,7 @@ topics: [api, data-management] review-ticket: https://github.com/programminghistorian/ph-submissions/issues/106 abstract: | Learn how to set up a basic Application Programming Interface (API) to make your data more accessible to users. This lesson also discusses principles of API design and the benefits of APIs for digital projects. -redirect_from: /lessons/creating-apis-with-python-and-flask +redirect_from: /lessons/creating-apis-with-python-and-flask/ avatar_alt: Diagram with a series of arcs describing a quarter circle doi: 10.46430/phen0072 --- @@ -47,7 +47,7 @@ Web APIs are tools for making information and application functionality accessib You can use the Windows, macOS, or Linux operating systems to complete this tutorial, and those few instructions that are not the same across platforms will be explicitly noted. Python 3, the Flask web framework, and a web browser are required for this tutorial, and installation instructions for all platforms are outlined below. -The only knowledge explicitly assumed for this lesson is the ability to use a text editor, such as BBEdit on macOS or Notepad++ on Windows. 
However, knowledge of the command line, Python, and web concepts such as HTTP may make this tutorial easier to follow. If you're new to Python, consider working through the Programming Historian series on [dealing with online sources](/lessons/introduction-and-installation) to familiarize yourself with fundamental concepts in Python programming. +The only knowledge explicitly assumed for this lesson is the ability to use a text editor, such as BBEdit on macOS or Notepad++ on Windows. However, knowledge of the command line, Python, and web concepts such as HTTP may make this tutorial easier to follow. If you're new to Python, consider working through the Programming Historian series on [dealing with online sources](/en/lessons/introduction-and-installation) to familiarize yourself with fundamental concepts in Python programming. ## Installing Python and Flask @@ -72,7 +72,7 @@ This will install Flask using the pip package manager for Python. You should see As an alternative to the above installation instructions, you can install the Python 3 version of Anaconda, which can be downloaded [here](https://www.continuum.io). Anaconda comes with Flask, so if you go this route you will not need to install Flask using the pip package manager. -If you're running into trouble installing Python, you may find [this Programming Historian article on installing Python](/lessons/introduction-and-installation) helpful. Note that the instructions in that tutorial are for installing Python 2—make sure you choose Python 3 when downloading installers from the Python website, since this tutorial uses Python 3. +If you're running into trouble installing Python, you may find [this Programming Historian article on installing Python](/en/lessons/introduction-and-installation) helpful. Note that the instructions in that tutorial are for installing Python 2—make sure you choose Python 3 when downloading installers from the Python website, since this tutorial uses Python 3. 
If you don't have a preferred text editor, I recommend [BBEdit](https://www.barebones.com/products/bbedit/download.html) for macOS or [Notepad++](https://notepad-plus-plus.org/) for Windows. @@ -117,13 +117,13 @@ The primary focus of this lesson is on creating an API, not exploring or using a Imagine that our research area is sensationalism and the press: has newspaper coverage of major events in the United States become more or less sensational over time? Narrowing the topic, we might ask whether press coverage of, for example, urban fires has increased or decreased with government reporting on fire-related relief spending. -While we won't be able to explore this question thoroughly, we can begin to approach this research space by collecting historical data on newspaper coverage of fires using an API—in this case, the [Chronicling America Historical Newspaper API](http://chroniclingamerica.loc.gov/about/api/). The Chronicling America API allows access to metadata and text for millions of scanned newspaper pages. In addition, unlike many other APIs, it also does not require an authentication process, allowing us to immediately explore the available data without signing up for an account. +While we won't be able to explore this question thoroughly, we can begin to approach this research space by collecting historical data on newspaper coverage of fires using an API—in this case, the [Chronicling America Historical Newspaper API](https://chroniclingamerica.loc.gov/about/api/). The Chronicling America API allows access to metadata and text for millions of scanned newspaper pages. In addition, unlike many other APIs, it also does not require an authentication process, allowing us to immediately explore the available data without signing up for an account. -Our initial goal in approaching this research question is to find all newspaper stories in the Chronicling America database that use the term "fire." Typically, use of an API starts with its documentation. 
On the [Chronicling America API page](http://chroniclingamerica.loc.gov/about/api/), we find two pieces of information critical for getting the data we want from the API: the API's **base URL** and the **path** corresponding to the function we want to perform on the API—in this case, searching the database. +Our initial goal in approaching this research question is to find all newspaper stories in the Chronicling America database that use the term "fire." Typically, use of an API starts with its documentation. On the [Chronicling America API page](https://chroniclingamerica.loc.gov/about/api/), we find two pieces of information critical for getting the data we want from the API: the API's **base URL** and the **path** corresponding to the function we want to perform on the API—in this case, searching the database. Our base URL is: - http://chroniclingamerica.loc.gov + https://chroniclingamerica.loc.gov All requests we make to the API must begin with this portion of the URL. All APIs have a base URL like this one that is the same across all requests to the API. @@ -133,13 +133,13 @@ Our path is: If we combine the base URL and the path together into one URL, we'll have created a request to the Chronicling America API that returns all available data in the database: - http://chroniclingamerica.loc.gov/search/pages/results/ + https://chroniclingamerica.loc.gov/search/pages/results/ -If you [visit the link above](http://chroniclingamerica.loc.gov/search/pages/results/), you'll see all items available in Chronicling America (12,243,633 at the time of writing), , not just the entries related to our search term, "fire." This request also returns a formatted HTML view, rather than the structured view we want to use to collect data. +If you [visit the link above](https://chroniclingamerica.loc.gov/search/pages/results/), you'll see all items available in Chronicling America (12,243,633 at the time of writing), not just the entries related to our search term, "fire."
This request also returns a formatted HTML view, rather than the structured view we want to use to collect data. According to the Chronicling America documentation, in order to get structured data specifically relating to fire, we need to pass one more kind of data in our request: **query parameters**. - http://chroniclingamerica.loc.gov/search/pages/results/?format=json&proxtext=fire + https://chroniclingamerica.loc.gov/search/pages/results/?format=json&proxtext=fire The query parameters follow the `?` in the request, and are separated from one another by the `&` symbol. The first query parameter, `format=json`, changes the returned data from HTML to JSON. The second, `proxtext=fire`, narrows the returned entries to those that include our search term. @@ -176,7 +176,7 @@ We'll begin by using Flask to create a home page for our site. In this step, we' ## Creating a Basic Flask Application -[Flask](http://flask.pocoo.org/) is a web framework for Python, meaning that it provides functionality for building web applications, including managing HTTP requests and rendering templates. In this section, we will create a basic Flask application. In later sections, we'll add to this application to create our API. Don't worry if you don't understand each individual line of code yet—explanations will be forthcoming once you have this initial version of the application working. +[Flask](https://flask.pocoo.org/) is a web framework for Python, meaning that it provides functionality for building web applications, including managing HTTP requests and rendering templates. In this section, we will create a basic Flask application. In later sections, we'll add to this application to create our API. Don't worry if you don't understand each individual line of code yet—explanations will be forthcoming once you have this initial version of the application working.

    Why Flask?

    @@ -184,7 +184,7 @@ We'll begin by using Flask to create a home page for our site. In this step, we'

    Python has a number of web frameworks that can be used to create web apps and APIs. The most well-known is Django, a framework that has a set project structure and which includes many built-in tools. This can save time and effort for experienced programmers, but can be overwhelming. Flask applications tend to be written on a blank canvas, so to speak, and so are more suited to a contained application such as our prototype API.

    -First, create a new folder on your computer that will serve as a project folder. This can be in your `Desktop` folder, but I recommend creating a dedicated `projects` folder for this and similar projects. This tutorial will assume that the files related to this lesson will be stored in a folder called `api` inside a folder named `projects` in your home directory. If you need help with navigation on the command line, see the [Programming Historian Introduction to the Bash Command Line](/lessons/intro-to-bash) for the macOS and Linux command line or the [Introduction to the Windows Command Line with PowerShell](/lessons/intro-to-powershell) for Windows. +First, create a new folder on your computer that will serve as a project folder. This can be in your `Desktop` folder, but I recommend creating a dedicated `projects` folder for this and similar projects. This tutorial will assume that the files related to this lesson will be stored in a folder called `api` inside a folder named `projects` in your home directory. If you need help with navigation on the command line, see the [Programming Historian Introduction to the Bash Command Line](/en/lessons/intro-to-bash) for the macOS and Linux command line or the [Introduction to the Windows Command Line with PowerShell](/en/lessons/intro-to-powershell) for Windows. In macOS, you can directly create an `api` folder inside a `projects` folder in your home directory with this terminal command: @@ -228,9 +228,9 @@ You can check if you're in the correct folder by running the `pwd` command. Once You should see output similar to this: - * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit) + * Running on `http://127.0.0.1:5000/` (Press CTRL+C to quit) -You may also see some lines related to debugging. This message means that Flask is running your application locally (on your computer) at that address.
Follow the link above, [http://127.0.0.1:5000/](http://127.0.0.1:5000/), using your web browser to see the running application: +You may also see some lines related to debugging. This message means that Flask is running your application locally (on your computer) at that address. Follow the link above, `http://127.0.0.1:5000/`, using your web browser to see the running application: {% include figure.html filename="welcome.png" caption="The home page when rendered in a browser." %} @@ -240,7 +240,7 @@ Congratulations, you've created a working web application! Now that we have a homepage for our archive, let's talk about how Flask works and what the above code is doing. -Flask maps HTTP requests to Python functions. In this case, we've mapped one URL path ('`/`') to one function, `home`. When we connect to the Flask server at [http://127.0.0.1:5000/](http://127.0.0.1:5000/), Flask checks if there is a match between the path provided and a defined function. Since `/`, or no additional provided path, has been mapped to the `home` function, Flask runs the code in the function and displays the returned result in the browser. In this case, the returned result is HTML markup for a home page welcoming visitors to the site hosting our API. +Flask maps HTTP requests to Python functions. In this case, we've mapped one URL path ('`/`') to one function, `home`. When we connect to the Flask server at `http://127.0.0.1:5000/`, Flask checks if there is a match between the path provided and a defined function. Since `/`, or no additional provided path, has been mapped to the `home` function, Flask runs the code in the function and displays the returned result in the browser. In this case, the returned result is HTML markup for a home page welcoming visitors to the site hosting our API. The process of mapping URLs to functions is called **routing**. The @@ -337,7 +337,7 @@ app.run() Run the code (navigate to your `api` folder in the command line and enter `python api.py`). 
Once the server is running, visit our route URL to view the data in the catalog: -[http://127.0.0.1:5000/api/v1/resources/books/all](http://127.0.0.1:5000/api/v1/resources/books/all) +`http://127.0.0.1:5000/api/v1/resources/books/all` You should see JSON output for the three entries in our test catalog. Flask provides us with a `jsonify` function that allows us to convert lists and dictionaries to JSON format. In the route we created, our book entries are converted from a list of Python dictionaries to JSON before being returned to a user. @@ -411,20 +411,20 @@ def api_id(): return jsonify(results) app.run() -`````` +``` Once you've updated your API with the `api_id` function, run your code as before (`python api.py` from your `api` directory) and visit the below URLs to test the new filtering capability: -[127.0.0.1:5000/api/v1/resources/books?id=0](http://127.0.0.1:5000/api/v1/resources/books?id=0) -[127.0.0.1:5000/api/v1/resources/books?id=1](http://127.0.0.1:5000/api/v1/resources/books?id=1) -[127.0.0.1:5000/api/v1/resources/books?id=2](http://127.0.0.1:5000/api/v1/resources/books?id=2) -[127.0.0.1:5000/api/v1/resources/books?id=3](http://127.0.0.1:5000/api/v1/resources/books?id=3) +- `http://127.0.0.1:5000/api/v1/resources/books?id=0` +- `http://127.0.0.1:5000/api/v1/resources/books?id=1` +- `http://127.0.0.1:5000/api/v1/resources/books?id=2` +- `http://127.0.0.1:5000/api/v1/resources/books?id=3` Each of these should return a different entry, except for the last, which should return an empty list: `[]`, since there is no book for which the id value is 3. (Counting in programming typically starts from 0, so id=3 would be a request for a nonexistent fourth item.) In the next section, we'll explore our updated API in more detail. ## Understanding Our Updated API -In this code, we first create a new function, called `api_id`, with the `@app.route` syntax that maps the function to the path `/api/v1/resources/books`. 
That means that this function will run when we access [http://127.0.0.1:5000/api/v1/resources/books](http://127.0.0.1:5000/api/v1/resources/books). (Note that accessing this link without providing an ID will give the error message we provided in the code: `Error: No id field provided. Please specify an id.`) +In this code, we first create a new function, called `api_id`, with the `@app.route` syntax that maps the function to the path `/api/v1/resources/books`. That means that this function will run when we access `http://127.0.0.1:5000/api/v1/resources/books`. (Note that accessing this link without providing an ID will give the error message we provided in the code: `Error: No id field provided. Please specify an id.`) Inside our function, we do two things: @@ -432,7 +432,7 @@ First, examine the provided URL for an id and select the books that match that i This part of the code determines if there is a query parameter, like `?id=0`, and then assigns the provided ID to a variable. -``` +```python if 'id' in request.args: id = int(request.args['id']) else: @@ -441,7 +441,7 @@ This part of the code determines if there is a query parameter, like `?id=0`, an Then this section moves through our test catalog of books, matches those books that have the provided ID, and appends them to the list that will be returned to the user: -``` +```python for book in books: if book['id'] == id: results.append(book) @@ -463,38 +463,38 @@ The prevailing design philosophy of modern APIs is called REST. For our purposes Because HTTP requests are so integral to using a REST API, many design principles revolve around how requests should be formatted. We've already created one HTTP request, which returns all books provided in our sample data. 
To understand the considerations that go into formatting this request, let's first consider a weak or poorly-designed example of an API endpoint: - http://api.example.com/getbook/10 + `http://api.example.com/getbook/10` The formatting of this request has a number of issues. The first is semantic—in a REST API, our verbs are typically `GET`, `POST`, `PUT`, or `DELETE`, and are determined by the request method rather than in the request URL. That means that the word "get" should not appear in our request, since "get" is implied by the fact that we're using a HTTP GET method. In addition, resource collections such as `books` or `users` should be denoted with plural nouns. This makes it clear when an API is referring to a collection (`books`) or an entry (`book`). Incorporating these principles, our API would look like this: - http://api.example.com/books/10 + `http://api.example.com/books/10` The above request uses part of the path (`/10`) to provide the ID. While this is not an uncommon approach, it's somewhat inflexible—with URLs constructed in this manner, you can generally only filter by one field at a time. Query parameters allow for filtering by multiple database fields and make more sense when providing "optional" data, such as an output format: - http://api.example.com/books?author=Ursula+K.+Le Guin&published=1969&output=xml + `http://api.example.com/books?author=Ursula+K.+Le Guin&published=1969&output=xml` When designing how requests to your API should be structured, it also makes sense to plan for future additions. 
Even if the current version of your API serves information on only one type of resource—`books`, for example—it makes sense to plan as if you might add other resources or non-resource functionality to your API in the future: - http://api.example.com/resources/books?id=10 + `http://api.example.com/resources/books?id=10` Adding an extra segment on your path such as "resources" or "entries" gives you the option to allow users to search across all resources available, making it easier for you to later support requests such as these: - https://api.example.com/v1/resources/images?id=10 - https://api.example.com/v1/resources/all + `https://api.example.com/v1/resources/images?id=10` + `https://api.example.com/v1/resources/all` Another way to plan for your API's future is to add a version number to the path. This means that, should you have to redesign your API, you can continue to support the old version of the API under the old version number while releasing, for example, a second version (`v2`) with improved or different functionality. This way, applications and scripts built using the old version of your API won't cease to function after your upgrade. After incorporating these design improvements, a request to our API might look like this: - https://api.example.com/v1/resources/books?id=10 + `https://api.example.com/v1/resources/books?id=10` ## Documentation and Examples Without documentation, even the best-designed API will be unusable. Your API should have documentation describing the resources or functionality available through your API that also provides concrete working examples of request URLs or code for your API. You should have a section for each resource that describes which fields, such as `id` or `title`, it accepts. Each section should have an example in the form of a sample HTTP request or block of code. 
-A fairly common practice in documenting APIs is to provide annotations in your code that are then automatically collated into documentation using a tool such as [Doxygen](https://www.doxygen.nl/) or [Sphinx](http://www.sphinx-doc.org/en/stable/). These tools create documentation from **docstrings**—comments you make on your function definitions. While this kind of documentation is a good idea, you shouldn't consider your job done if you've only documented your API to this level. Instead, try to imagine yourself as a potential user of your API and provide working examples. In an ideal world, you would have three kinds of documentation for your API: a reference that details each route and its behavior, a guide that explains the reference in prose, and at least one or two tutorials that explain every step in detail. +A fairly common practice in documenting APIs is to provide annotations in your code that are then automatically collated into documentation using a tool such as [Doxygen](https://www.doxygen.nl/) or [Sphinx](https://www.sphinx-doc.org/en/stable/). These tools create documentation from **docstrings**—comments you make on your function definitions. While this kind of documentation is a good idea, you shouldn't consider your job done if you've only documented your API to this level. Instead, try to imagine yourself as a potential user of your API and provide working examples. In an ideal world, you would have three kinds of documentation for your API: a reference that details each route and its behavior, a guide that explains the reference in prose, and at least one or two tutorials that explain every step in detail. -For inspiration on how to approach API documentation, see the [New York Public Library Digital Collections API](http://api.repo.nypl.org/), which sets a standard of documentation achievable for many academic projects. 
For an extensively documented (though sometimes overwhelming) API, see the [MediaWiki Action API](https://www.mediawiki.org/wiki/API:Main_page), which provides documentation to users who pass partial queries to the API. (In our example above, we returned an error on a partial query.) For other professionally maintained API documentation examples, consider the [World Bank API](https://datahelpdesk.worldbank.org/knowledgebase/articles/889392-api-documentation), the various [New York Times APIs](https://developer.nytimes.com/), or the [Europeana Pro API](https://pro.europeana.eu/resources/apis). +For inspiration on how to approach API documentation, see the [New York Public Library Digital Collections API](https://api.repo.nypl.org/), which sets a standard of documentation achievable for many academic projects. For an extensively documented (though sometimes overwhelming) API, see the [MediaWiki Action API](https://www.mediawiki.org/wiki/API:Main_page), which provides documentation to users who pass partial queries to the API. (In our example above, we returned an error on a partial query.) For other professionally maintained API documentation examples, consider the [World Bank API](https://datahelpdesk.worldbank.org/knowledgebase/articles/889392-api-documentation), the various [New York Times APIs](https://developer.nytimes.com/), or the [Europeana Pro API](https://pro.europeana.eu/resources/apis). # Connecting Our API to a Database @@ -576,16 +576,19 @@ def api_filter(): app.run() ``` + Save the code as `api_final.py` in your `api` folder and run it by navigating to your project folder in the terminal and entering the command: - python api_final.py +```python +python api_final.py +``` Note that if a previous version of the code is still running, you will first need to end that process by pressing `Control-C` before executing the new code. 
Once this example is running, try out the filtering functionality with these HTTP requests: -[http://127.0.0.1:5000/api/v1/resources/books/all](http://127.0.0.1:5000/api/v1/resources/books/all) -[http://127.0.0.1:5000/api/v1/resources/books?author=Connie+Willis](http://127.0.0.1:5000/api/v1/resources/books?author=Connie+Willis) -[http://127.0.0.1:5000/api/v1/resources/books?author=Connie+Willis&published=1999](http://127.0.0.1:5000/api/v1/resources/books?author=Connie+Willis&published=1993) -[http://127.0.0.1:5000/api/v1/resources/books?published=2010](http://127.0.0.1:5000/api/v1/resources/books?published=2010) +- `http://127.0.0.1:5000/api/v1/resources/books/all` +- `http://127.0.0.1:5000/api/v1/resources/books?author=Connie+Willis` +- `http://127.0.0.1:5000/api/v1/resources/books?author=Connie+Willis&published=1999` +- `http://127.0.0.1:5000/api/v1/resources/books?published=2010` The database downloaded for this lesson has 67 entries, one for each of the winners of the Hugo Award for best science fiction novel between 1953 and 2014 (avoiding the voting controversy of 2015). The data set includes the novel's title, author, year of publication, and first sentence. Our API allows users to filter by three fields: `id`, `published` (year of publication), and `author`. @@ -625,24 +628,32 @@ In HTML responses, the code `200` means "OK"(the expected data transferred), whi Our `api_filter` function is an improvement on our previous `api_id` function that returns a book based on its ID. This new function allows for filtering by three different fields: `id`, `published`, and `author`. The function first grabs all the query parameters provided in the URL (remember, query parameters are the part of the URL that follows the `?`, like `?id=10`). 
+```python query_parameters = request.args +``` It then pulls the supported parameters `id`, `published`, and `author` and binds them to appropriate variables: +```python id = query_parameters.get('id') published = query_parameters.get('published') author = query_parameters.get('author') +``` The next segment begins to build an SQL query that will be used to find the requested information in the database. SQL queries used to find data in a database take this form: +```sql SELECT <columns> FROM <table> WHERE <column=match> AND <column=match>; +``` To get the correct data, we need to build both an SQL query that looks like the above and a list with the filters that will be matched. Combined, the query and the filters provided by the user will allow us to pull the correct books from our database. We begin to define both the query and the filter list: +```python query = "SELECT * FROM books WHERE" to_filter = [] +``` Then, if `id`, `published`, or `author` were provided as query parameters, we add them to both the query and the filter list: @@ -673,15 +684,19 @@ To perfect our query, we remove the trailing ` AND` and cap the query with the ` Finally, we connect to our database as in our `api_all` function, then execute the query we've built using our filter list: +```python conn = sqlite3.connect('books.db') conn.row_factory = dict_factory cur = conn.cursor() results = cur.execute(query, to_filter).fetchall() +``` Finally, we return the results of our executed SQL query as JSON to the user: +```python return jsonify(results) +``` Whew! When all is said and done, this section of code reads query parameters provided by the user, builds an SQL query based on those parameters, executes that query to find matching books in the database, and returns those matches as JSON to the user. This section of code makes our API's filtering capability considerably more sophisticated—users can now find books by, for example, Ursula K. Le Guin that were published in 1975 or all books in the database published in 2010.
@@ -695,7 +710,7 @@ One of the advantages of providing data through an API, as opposed to providing As new Hugo winners were added to the database, the script that generated this visualization would immediately be able to use the new information. If the visualization were created in D3 or another web-based utility, this plot would actually reflect additional data added to the book archive as soon as the archive was updated—that is, in real time. As additional data accrued, we might, for example, learn if John Scalzi's unusually lengthy opening to his 2013 *Red Shirts* was an aberration or the continuation of a longer trend toward wordiness in science fiction. Conversely, if your API were to change its URL structure or cease to function, applications based on it will no longer work. Remember that, when creating an API, you are assuming some responsibility for the applications that others may build with it. -A strong API can be considered the backbone of a potentially limitless number of projects or avenues of research. Though the above example takes the form of a visualization of the limited amount of data we've provided in our Distant Reading Archive, a project based on this API might just as easily take the form of a Twitterbot that shares first sentences (learn how to make one with [this](/lessons/intro-to-twitterbots) Programming Historian lesson) or a library webpage that displays book openings and year of publication alongside other book metadata. In many cases, it makes sense to first create an API interface to your core data or functionality before extrapolating on it to create a visualization, application, or website. Not only does it make your work accessible to researchers working on other projects, but it often leads to a more comprehensible and maintainable project. +A strong API can be considered the backbone of a potentially limitless number of projects or avenues of research. 
Though the above example takes the form of a visualization of the limited amount of data we've provided in our Distant Reading Archive, a project based on this API might just as easily take the form of a Twitterbot that shares first sentences (learn how to make one with [this](/en/lessons/intro-to-twitterbots) Programming Historian lesson) or a library webpage that displays book openings and year of publication alongside other book metadata. In many cases, it makes sense to first create an API interface to your core data or functionality before extrapolating on it to create a visualization, application, or website. Not only does it make your work accessible to researchers working on other projects, but it often leads to a more comprehensible and maintainable project. # Resources @@ -703,13 +718,13 @@ The below resources provide information on useful APIs for researchers in the hu ## APIs for Humanities Researchers -[Chronicling America \(Library Of Congress\)](http://chroniclingamerica.loc.gov/) - A digitized collection of American newspaper articles from the 18th to the 20th century. +[Chronicling America \(Library Of Congress\)](https://chroniclingamerica.loc.gov/) - A digitized collection of American newspaper articles from the 18th to the 20th century. [Connecting Repositories \(CORE\)](https://core.ac.uk/) - A collection of open access articles from various sources hosted by the Open University. [English Broadside Ballad Archive \(EBBA\)](https://diggingintodata.org/repositories/english-broadside-ballad-archive-ebba) -[History Data Service (HDS)](http://hds.essex.ac.uk/) - A collection of data from a wide variety of historical sources. +[History Data Service (HDS)](https://hds.essex.ac.uk/) - A collection of data from a wide variety of historical sources. 
[Europeana](https://pro.europeana.eu/) diff --git a/en/lessons/creating-mobile-augmented-reality-experiences-in-unity.md b/en/lessons/creating-mobile-augmented-reality-experiences-in-unity.md index 888ec8199c..1adf05b11a 100644 --- a/en/lessons/creating-mobile-augmented-reality-experiences-in-unity.md +++ b/en/lessons/creating-mobile-augmented-reality-experiences-in-unity.md @@ -52,13 +52,13 @@ In addition to the above software requirements, you will also need to make sure ## How can Humanists use Augmented Reality? -Novel applications of AR continue to surface within a variety of industries: [museums](https://www.youtube.com/watch?v=gx_UQxx54lo) are integrating AR content into their displays, [companies](http://www.gizmag.com/ikea-augmented-reality-catalog-app/28703/) are promoting AR apps in lieu of print or even web-based catalogs, and [engineering firms](https://www.youtube.com/watch?v=bXqe2zSepQ4) are creating AR applications showcasing their efforts to promote sustainability. [Predicted to grow](https://www.statista.com/statistics/786821/ar-device-and-services-revenue-worldwide/) into a multi-billion industry by 2020, augmented reality is an exciting new medium that humanists cannot afford to ignore. Indeed, many scholars within the growing field of digital humanities are beginning to explore how AR can be utilized as a viable medium of scholarly engagement within public spaces, objects, images, and texts. +Novel applications of AR continue to surface within a variety of industries: [museums](https://www.youtube.com/watch?v=gx_UQxx54lo) are integrating AR content into their displays, [companies](https://www.gizmag.com/ikea-augmented-reality-catalog-app/28703/) are promoting AR apps in lieu of print or even web-based catalogs, and [engineering firms](https://www.youtube.com/watch?v=bXqe2zSepQ4) are creating AR applications showcasing their efforts to promote sustainability. 
[Predicted to grow](https://www.statista.com/statistics/786821/ar-device-and-services-revenue-worldwide/) into a multi-billion industry by 2020, augmented reality is an exciting new medium that humanists cannot afford to ignore. Indeed, many scholars within the growing field of digital humanities are beginning to explore how AR can be utilized as a viable medium of scholarly engagement within public spaces, objects, images, and texts. {% include figure.html filename="ar-dev-1.png" caption="Augmented reality can be used to overlay digital information onto existing texts such as historical markers. This modified image is based on a photograph by Nicholas Henderson. 2015." %} Since at least 2010, [digital artists](https://manifestarblog.wordpress.com/about/) have been creating AR applications for social advocacy and cultural intervention. For example, Tamiko Thiel's AR project [Clouding Green](https://perma.cc/6NLX-AJBH) reveals the carbon footprint of specific technology companies. More recently, a group of New York artists created a ["vandalized" version of Jeff Koon's Snapchat sculptures](https://techcrunch.com/2017/10/08/jeff-koons-augmented-reality-snapchat-artwork-gets-vandalized/) as a way of protesting the digital takeover of public AR spaces. -At the [Trace Initiative](http://web.archive.org/web/20180421163517/http://english.ufl.edu/trace_arcs/), a digital humanities organization in the University of Florida English Department, we seek to build upon the work of these artists by promoting the creation and circulation of humanities-focused mobile AR applications. We released our first AR application [to the Google Play store](https://web.archive.org/web/20210421123810/http://trace-arcs.english.ufl.edu/projects/scramble.html) in spring 2016. 
+At the [Trace Initiative](https://web.archive.org/web/20180421163517/https://english.ufl.edu/trace_arcs/), a digital humanities organization in the University of Florida English Department, we seek to build upon the work of these artists by promoting the creation and circulation of humanities-focused mobile AR applications. We released our first AR application [to the Google Play store](https://web.archive.org/web/20210421123810/https://trace-arcs.english.ufl.edu/projects/scramble.html) in spring 2016. The augmented reality software used in this tutorial relies on image-recognition technology, meaning that it requires some kind of visual trigger (a logo, painting, etc.) to know when to display digital content. In the example application depicted in the image above, the application is programmed to only display the digital image of John C. Calhoun if the camera "recognizes" the specific historical marker with which it is associated. For this lesson, you will augment the cover of a physical book with a digital overlay that displays a picture of the author. You could use the technical skills gained throughout this tutorial to create digital overlays for a variety of texts such as historical documents or signs. For example, you might create an application that allows readers to scan the pages of a book or document and access historical context or critique related to that specific page. Humanities scholars could also use this tutorial to create site-specific AR applications to educate visitors about cultural aspects of a location that have been excluded from its historical presentation. @@ -82,11 +82,11 @@ HP Reveal is a fantastic AR creation platform that can be learned fairly quickly ### Installing Unity and Vuforia -Since the release of Unity 2017.2, the Vuforia SDK is integrated into the Unity Editor. 
The [Vuforia SDK](https://developer.vuforia.com/) is a suite of assets and code snippets that work alonside Unity to make it easier to quickly develop and build AR content for a variety of platforms. If you are unable to download Unity 2017.2 or later, [consult this archived lesson for earlier versions of Unity](/lessons/intro-to-augmented-reality-with-unity). To download Unity and Vuforia, go to the [Unity website](https://unity3d.com/get-unity/download/archive) and download Unity 2017.2. In the Components dialog box, select Vuforia Augmented Reality Support in addition to either Android Build Support and/or iOS Build Support, depending on your target mobile device platform. Once the download completes, start Unity and follow the setup prompts. If Unity asks if you are creating a personal or professional account, choose personal account. +Since the release of Unity 2017.2, the Vuforia SDK is integrated into the Unity Editor. The [Vuforia SDK](https://developer.vuforia.com/) is a suite of assets and code snippets that work alongside Unity to make it easier to quickly develop and build AR content for a variety of platforms. If you are unable to download Unity 2017.2 or later, [consult this archived lesson for earlier versions of Unity](/en/lessons/retired/intro-to-augmented-reality-with-unity). To download Unity and Vuforia, go to the [Unity website](https://unity3d.com/get-unity/download/archive) and download Unity 2017.2. In the Components dialog box, select Vuforia Augmented Reality Support in addition to either Android Build Support and/or iOS Build Support, depending on your target mobile device platform. Once the download completes, start Unity and follow the setup prompts. If Unity asks if you are creating a personal or professional account, choose personal account. ### Java Development Kit -Download and install the [Java Development Kit 8](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html) for your operating system.
At this time, Unity is incompatible with JDK 10. +Download and install the [Java Development Kit 8](https://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html) for your operating system. At this time, Unity is incompatible with JDK 10. Click the file once it has finished downloading, and follow the installation guide. @@ -190,7 +190,7 @@ This cover of *Of Mice and Men* has sufficient visual complexity; however, it is {% include figure.html filename="ar-dev-10.png" caption="Photo courtesy of Mark Skwarek." %} -If you are taking a picture of your book cover, make sure that there are no extraneous features present in the image. In the case of the *Of Mice and Men* image above, this would be anything beyond the edge of the cover. If your image contains such extraneous features, either take another picture or open it in a photo editor such as [Gimp](http://www.gimp.org/) and crop out these features. [Consult the latest Gimp documentation](https://www.gimp.org/docs/) for help on cropping and resizing images. Make sure that your image file is under 2.5 mb and that it is a .jpg or .png file. +If you are taking a picture of your book cover, make sure that there are no extraneous features present in the image. In the case of the *Of Mice and Men* image above, this would be anything beyond the edge of the cover. If your image contains such extraneous features, either take another picture or open it in a photo editor such as [Gimp](https://www.gimp.org/) and crop out these features. [Consult the latest Gimp documentation](https://www.gimp.org/docs/) for help on cropping and resizing images. Make sure that your image file is under 2.5 mb and that it is a .jpg or .png file. {% include figure.html filename="ar-dev-11.png" caption="Crop out the area around the book." %} @@ -343,7 +343,7 @@ Return to Unity to setup your application for an Android or iOS build: To install your own applications on your Android device, -1. 
[Enable USB debugging](http://developer.Android.com/tools/device.html) by going to Setting > About Device. +1. [Enable USB debugging](https://developer.Android.com/tools/device.html) by going to Setting > About Device. 2. Tap the Build number seven times. 3. Return to the previous screen and you should now see a Developer Options tab. Click it and make sure the option for USB debugging is checked. diff --git a/en/lessons/creating-network-diagrams-from-historical-sources.md b/en/lessons/creating-network-diagrams-from-historical-sources.md index 07203f36a6..5c4c4f1697 100755 --- a/en/lessons/creating-network-diagrams-from-historical-sources.md +++ b/en/lessons/creating-network-diagrams-from-historical-sources.md @@ -15,7 +15,7 @@ exclude_from_check: activity: transforming topics: [network-analysis, data-visualization] abstract: "Network visualizations can help humanities scholars reveal hidden and complex patterns and structures in textual sources. This tutorial explains how to extract network data (people, institutions, places, etc) from historical sources through the use of non-technical methods developed in Qualitative Data Analysis (QDA) and Social Network Analysis (SNA), and how to visualize this data with the platform-independent and particularly easy-to-use Palladio." -redirect_from: /lessons/creating-network-diagrams-from-historical-sources +redirect_from: /lessons/creating-network-diagrams-from-historical-sources/ avatar_alt: Diagram of the earth and moon's revolution around the sun doi: 10.46430/phen0044 --- @@ -26,10 +26,11 @@ doi: 10.46430/phen0044 -Introduction +## Introduction + ------------ -Network visualizations can help humanities scholars reveal hidden and complex patterns and structures in textual sources. 
This tutorial explains how to extract network data (people, institutions, places, etc) from historical sources through the use of non-technical methods developed in Qualitative Data Analysis (QDA) and Social Network Analysis (SNA), and how to visualize this data with the platform-independent and particularly easy-to-use [*Palladio*](http://hdlab.stanford.edu/palladio/). +Network visualizations can help humanities scholars reveal hidden and complex patterns and structures in textual sources. This tutorial explains how to extract network data (people, institutions, places, etc) from historical sources through the use of non-technical methods developed in Qualitative Data Analysis (QDA) and Social Network Analysis (SNA), and how to visualize this data with the platform-independent and particularly easy-to-use [*Palladio*](https://hdlab.stanford.edu/palladio/). {% include figure.html caption="Figure 1: A network visualization in Palladio and what you will be able to create by the end of this tutorial." filename="image09.png" %} @@ -39,7 +40,7 @@ The graph above shows an excerpt from the network of Ralph Neumann, particularly Generally, network analysis provides the tools to explore highly complex constellations of relations between entities. Think of your friends: You will find it very easy to map out who are close and who don't get along well. Now imagine you had to explain these various relationships to somebody who does not know any of your friends. Or you wanted to include the relationships between your friends’ friends. In situations like this language and our capacity to comprehend social structures quickly reach their limits. Graph visualizations can be means to effectively communicate and explore such complex constellations. Generally you can think of Social Network Analysis as a means to transform complexity from a problem to an object of research. Often, nodes in a network represent humans connected to other humans by all imaginable types of social relations. 
But pretty much anything can be understood as a node: A film, a place, a job title, a point in time, a venue. Similarly, the concept of a tie (also called edge) between nodes is just as flexible: two theaters could be connected by a film shown in both of them, or by co-ownership, geographical proximity, or being in business in the same year. All this depends on your research interests and how you express them in form of nodes and relations in a network. -This tutorial can not replace any of the many existing generic network analysis handbooks, such as [John Scott's _Social Network Analysis_](https://uk.sagepub.com/en-gb/eur/the-sage-handbook-of-social-network-analysis/book277881). For a great general introduction to the field and all its pitfalls for humanists I recommend[ ](https://web.archive.org/web/20240203222438/https://www.scottbot.net/HIAL/index.html@p=6279.html)[*Scott Weingart’s blog post series “Networks Demystified”*](https://web.archive.org/web/20240203222438/https://www.scottbot.net/HIAL/index.html@p=6279.html) as well as[ ](http://hal.archives-ouvertes.fr/docs/00/64/93/16/PDF/lemercier_A_zg.pdf)[*Claire Lemercier’s paper “Formal network methods in history: why and how?"*](http://hal.archives-ouvertes.fr/docs/00/64/93/16/PDF/lemercier_A_zg.pdf). You may also want to explore the bibliography and event calendar over at [_Historical Network Research_](http://historicalnetworkresearch.org/) to get a sense of how historians have made use of networks in their research. +This tutorial can not replace any of the many existing generic network analysis handbooks, such as [John Scott's _Social Network Analysis_](https://uk.sagepub.com/en-gb/eur/the-sage-handbook-of-social-network-analysis/book277881). 
For a great general introduction to the field and all its pitfalls for humanists I recommend [*Scott Weingart’s blog post series “Networks Demystified”*](https://web.archive.org/web/20240203222438/https://www.scottbot.net/HIAL/index.html@p=6279.html) as well as [*Claire Lemercier’s paper “Formal network methods in history: why and how?”*](https://hal.archives-ouvertes.fr/docs/00/64/93/16/PDF/lemercier_A_zg.pdf). You may also want to explore the bibliography and event calendar over at [_Historical Network Research_](https://historicalnetworkresearch.org/) to get a sense of how historians have made use of networks in their research. This tutorial will focus on data extraction from unstructured text and shows one way to visualize it using Palladio. It is purposefully designed to be as simple and robust as possible. For the limited scope of this tutorial it will suffice to say that an actor refers to the persons, institutions, etc. which are the object of study and which are connected by relations. Within the context of a network visualization or computation (also called graph), we call them nodes and we call the connections ties. In all cases it is important to remember that nodes and ties are drastically simplified models used to represent the complexities of past events, and in themselves do not always suffice to generate insight. But it is likely that the graph will highlight interesting aspects, challenge your hypothesis and/or lead you to generate new ones. *Network diagrams become meaningful when they are part of a dialogue with data and other sources of information.* @@ -48,15 +49,17 @@ Many network analysis projects in the social sciences rely on pre-existing data In other words, the challenge is to systematize text interpretation.
Networks created from pre-existing data sets need to be considered within the context in which they were created (e.g. wording of questions in a questionnaire and selected target groups). Networks created from unstructured text pose challenges on top of this: interpretations are highly individual and depend on viewpoints and context knowledge. -About the case study +## About the case study + -------------------- -The case study I use for this tutorial is a first-person narrative of Ralph Neumann, a Jewish survivor of the Holocaust. You can find the text [*online*](http://web.archive.org/web/20180422010025/http://www.gdw-berlin.de/fileadmin/bilder/publ/publikationen_in_englischer_sprache/2006_Neuman_eng.pdf). The coding scheme which I will introduce below is a simplified version of the one I developed during [*my PhD project on covert support networks during the Second World War*](http://martenduering.com/research/covert-networks-during-the-holocaust/). My research was driven by three questions: To what extent can social relationships help explain why ordinary people took the risks associated with helping? How did such relationships enable people to provide these acts of help given that only very limited resources were available to them? How did social relationships help Jewish refugees to survive in the underground? +The case study I use for this tutorial is a first-person narrative of Ralph Neumann, a Jewish survivor of the Holocaust. You can find the text [*online*](https://web.archive.org/web/20180422010025/https://www.gdw-berlin.de/fileadmin/bilder/publ/publikationen_in_englischer_sprache/2006_Neuman_eng.pdf). The coding scheme which I will introduce below is a simplified version of the one I developed during [*my PhD project on covert support networks during the Second World War*](https://martenduering.com/research/covert-networks-during-the-holocaust/). 
My research was driven by three questions: To what extent can social relationships help explain why ordinary people took the risks associated with helping? How did such relationships enable people to provide these acts of help given that only very limited resources were available to them? How did social relationships help Jewish refugees to survive in the underground? In this project network visualisations helped me to discover hitherto forgotten yet highly important contact brokers, highlight the overall significance of Jewish refugees as contact brokers and generally to navigate through a total of some 5,000 acts of help which connected some 1,400 people between 1942 and 1945. -Developing a coding scheme +## Developing a coding scheme + -------------------------- In visualizing network relationships, one of the first and most difficult challenges is to decide who should be part of the network and which relations between the selected actors are to be coded. It will probably take some time to figure this out and will likely be an iterative process since you will need to balance your research interests and hypotheses with the availability of information in your texts and represent both in a rigid and necessarily simplifying coding scheme. @@ -135,7 +138,7 @@ The following steps will explain how to visualize network data in Palladio but I Step by Step: -**1. Palladio.** Go to [*http://hdlab.stanford.edu/palladio/*](http://hdlab.stanford.edu/palladio/)*.* +**1. Palladio.** Go to [*https://hdlab.stanford.edu/palladio/*](https://hdlab.stanford.edu/palladio/)*.* **2. Start.** On their website click the “Start” button. @@ -157,7 +160,7 @@ Step by Step: {% include figure.html caption="Figure 9: Linking People to Relations." filename="image08.png" %} -**7. Identify temporal data.** Palladio has nice time visualization features. You can use it if you have start and end points for each relation. The sample data contains two columns with suitable data. 
Click on “Time Step Start” and select the data type “Year or Date”. Do the same for “Time Step End” (Figure 10). The Palladio team recommends that your data is in the YYYY-MM-DD format, but my more abstract time steps worked well. If you were to load geographical coordinates (not covered by this tutorial but here: [*Palladio Simple Map Scenario*](http://hdlab.stanford.edu/doc/scenario-simple-map.pdf)) you would select the “Coordinates” data type. +**7. Identify temporal data.** Palladio has nice time visualization features. You can use it if you have start and end points for each relation. The sample data contains two columns with suitable data. Click on “Time Step Start” and select the data type “Year or Date”. Do the same for “Time Step End” (Figure 10). The Palladio team recommends that your data is in the YYYY-MM-DD format, but my more abstract time steps worked well. If you were to load geographical coordinates (not covered by this tutorial but here: [*Palladio Simple Map Scenario*](https://hdlab.stanford.edu/doc/scenario-simple-map.pdf)) you would select the “Coordinates” data type. {% include figure.html caption="Figure 10: Changing the data type to 'Year or Date'" filename="image05.png"%} @@ -178,7 +181,7 @@ Network visualizations can be incredibly suggestive. Remember that whatever you {% include figure.html caption="Figure 13: The Facet filter in Palladio." filename="image15.png" %} -**12. Bipartite network visualization.** Now this is nice. But there is something else which makes Palladio a great tool to start out with network visualization: It makes it very easy to produce [*bipartite, or 2-mode networks*](http://en.wikipedia.org/wiki/Bipartite_graph#Examples). What you have seen until now is a so-called unipartite or 1-mode network: It represents relations between source and target nodes of one type (for example “people”) through one or more types of relations, Figures 13 and 14 are examples of this type of graph. +**12. 
Bipartite network visualization.** Now this is nice. But there is something else which makes Palladio a great tool to start out with network visualization: It makes it very easy to produce [*bipartite, or 2-mode networks*](https://en.wikipedia.org/wiki/Bipartite_graph#Examples). What you have seen until now is a so-called unipartite or 1-mode network: It represents relations between source and target nodes of one type (for example “people”) through one or more types of relations, Figures 13 and 14 are examples of this type of graph. Network analysis however gives you a lot of freedom to rethink what source and targets are. Bipartite networks have two different types of nodes, an example could be to select “people” as the first node type and “point in time” as the second. Figure 15 shows a bipartite network and reveals which recipients of help were present in the network at the same time. Compare this graph to Figure 16 which shows which givers of help were present at the same time. This points at a high rate of fluctuation among helpers, an observation which holds true for all of the networks I studied. While humans are very good at processing people-to-people networks, we find it harder to process these more abstract networks. Give it a try and experiment with different bipartite networks: Click again on “Target” but this time select “Form of Help” or “Sex” or any other category. @@ -204,7 +207,7 @@ Note that if you wanted to see "Giver" and "Recipients" as one node type and "Da {% include figure.html caption="Figure 17: Timeline. isualization of Time Steps." filename="image12.png" %} -**15. Node size.** Palladio lets you size your nodes based on actor attributes. Note that this does not make sense for the sample data given that numerical values represent categories. 
Node sizes can however be useful if you were to represent the sum of a person’s acts of help, which in this case would correspond to his or her [*Out-Degree*](http://en.wikipedia.org/wiki/Directed_graph#Indegree_and_outdegree), the number of outgoing relations for a node. +**15. Node size.** Palladio lets you size your nodes based on actor attributes. Note that this does not make sense for the sample data given that numerical values represent categories. Node sizes can however be useful if you were to represent the sum of a person’s acts of help, which in this case would correspond to his or her [*Out-Degree*](https://en.wikipedia.org/wiki/Directed_graph#Indegree_and_outdegree), the number of outgoing relations for a node. **16. Export your visualizations.** Palladio lets you export your network as .svg files, a vector-based image format. Use your browser of choice to open them. @@ -241,14 +244,14 @@ Good luck! Other network visualization tools to consider --------------------------------------------- -[*Nodegoat*](http://nodegoat.net/) – similar to Palladio in that it makes data collection, mapping and graph visualizations easy. Allows easy setup of relational databases and lets users store data on their servers. [*Tutorial available here*](http://nodegoat.net/cms/UPLOAD/AsmallguidebyYanan11082014.pdf). +[*Nodegoat*](https://nodegoat.net/) – similar to Palladio in that it makes data collection, mapping and graph visualizations easy. Allows easy setup of relational databases and lets users store data on their servers. [*Tutorial available here*](https://nodegoat.net/cms/UPLOAD/AsmallguidebyYanan11082014.pdf). -[*NodeXL*](https://www.smrfoundation.org/nodexl/) – capable to perform many tasks common in SNA, easy-to-use, open source but requires Windows and MS Office 2007 or newer.[ ](https://www.youtube.com/watch?v=pwsImFyc0lE)[*Tutorial 1*](https://www.youtube.com/watch?v=pwsImFyc0lE), [*Tutorial 2*](http://www.youtube.com/watch?v=xKhYGRpbwOc). 
+[*NodeXL*](https://www.smrfoundation.org/nodexl/) – capable of performing many tasks common in SNA, easy-to-use, open source but requires Windows and MS Office 2007 or newer. [*Tutorial 1*](https://www.youtube.com/watch?v=pwsImFyc0lE), [*Tutorial 2*](https://www.youtube.com/watch?v=xKhYGRpbwOc). -[*Gephi*](https://gephi.github.io/) – open source, platform independent. The best known and most versatile visualization tool available but expect a steep learning curve. The developers announce support for parallel edges in version 1.0. Tutorials: by [*Clement Levallois*](http://www.clementlevallois.net/training.html) and [*Sebastien Heymann*](http://www.youtube.com/watch?v=L6hHv6y5GsQ). +[*Gephi*](https://gephi.github.io/) – open source, platform independent. The best known and most versatile visualization tool available but expect a steep learning curve. The developers announce support for parallel edges in version 1.0. Tutorials: by [*Clement Levallois*](https://www.clementlevallois.net/training.html) and [*Sebastien Heymann*](https://www.youtube.com/watch?v=L6hHv6y5GsQ). [*VennMaker*](https://www.vennmaker.com) – is platform-independent and can be tested for free. VennMaker inverts the process of data collection: Users start with a customizable canvas and draw self-defined nodes and relations on it. The tool collects the corresponding data in the background. -The most commonly used tools for more mathematical analyses are [*UCINET*](https://sites.google.com/site/ucinetsoftware/home) (licensed, tutorials available on their website) and [*Pajek*](http://pajek.imfm.si/doku.php) (free) for which a great [*handbook*](http://www.cambridge.org/us/academic/subjects/sociology/research-methods-sociology-and-criminology/exploratory-social-network-analysis-pajek-2nd-edition) exists. Both were developed for Windows but run well elsewhere using Wine.
+The most commonly used tools for more mathematical analyses are [*UCINET*](https://sites.google.com/site/ucinetsoftware/home) (licensed, tutorials available on their website) and [*Pajek*](https://pajek.imfm.si/doku.php) (free) for which a great [*handbook*](https://www.cambridge.org/us/academic/subjects/sociology/research-methods-sociology-and-criminology/exploratory-social-network-analysis-pajek-2nd-edition) exists. Both were developed for Windows but run well elsewhere using Wine. For Python users the very well documented package[ ](https://networkx.github.io/)[*Networkx*](https://networkx.github.io/) is a great starting point; other packages exist for other programming languages. diff --git a/en/lessons/crowdsourced-data-normalization-with-pandas.md b/en/lessons/crowdsourced-data-normalization-with-pandas.md index 2b0d1beefe..78107800f6 100644 --- a/en/lessons/crowdsourced-data-normalization-with-pandas.md +++ b/en/lessons/crowdsourced-data-normalization-with-pandas.md @@ -38,7 +38,7 @@ At the end of the lesson you will: This tutorial is for you if you are new to crowdsourcing and have little previous Python experience. ### Why Use Crowdsourcing? -In recent years, crowdsourcing cultural heritage projects such as [Transcribe Bentham](http://transcribe-bentham.ucl.ac.uk/td/Transcribe_Bentham) have made new research possible. In this example, volunteers can create accounts and transcribe the over 60,000 manuscripts of English philosopher, [Jeremy Bentham (1748-1832)](https://en.wikipedia.org/wiki/Jeremy_Bentham). Transcribe Bentham is making these important historical and philosophical manuscripts accessible to researchers, particularly those participating in text analysis. Other projects, such as [Penguin Watch](https://www.zooniverse.org/projects/penguintom79/penguin-watch) on [Zooniverse](https://www.zooniverse.org/), allowed members of the public to classify different images of penguins, which contributed to identifying environmental threats. 
Zooniverse itself is an online platform for "people-powered research," allowing millions of people worldwide to contribute to different research projects. These are all cases where data is collected and analyzed on a massive scale and public assistance is needed to complete very large projects. +In recent years, crowdsourcing cultural heritage projects such as [Transcribe Bentham](https://transcribe-bentham.ucl.ac.uk/td/Transcribe_Bentham) have made new research possible. In this example, volunteers can create accounts and transcribe the more than 60,000 manuscripts of the English philosopher [Jeremy Bentham (1748-1832)](https://en.wikipedia.org/wiki/Jeremy_Bentham). Transcribe Bentham is making these important historical and philosophical manuscripts accessible to researchers, particularly those participating in text analysis. Other projects, such as [Penguin Watch](https://www.zooniverse.org/projects/penguintom79/penguin-watch) on [Zooniverse](https://www.zooniverse.org/), allowed members of the public to classify different images of penguins, which contributed to identifying environmental threats. Zooniverse itself is an online platform for "people-powered research," allowing millions of people worldwide to contribute to different research projects. These are all cases where data is collected and analyzed on a massive scale and public assistance is needed to complete very large projects. Computation and programming methods are very powerful but some jobs are only possible because of human expertise. There are elements of transcription or identification that are not easy to do using programming alone. Humans are better able to identify small differences and unusual data. However, people can also contribute to projects in larger ways, usually by competing in a contest. An example of macrotasking - a type of crowdsourcing for larger, more specialized projects - is the [Netflix Prize](https://www.netflixprize.com/). 
The Netflix Prize called for people to develop an algorithm to better predict movie recommendations for customers and winners received a reward or prize. @@ -78,7 +78,7 @@ Unfortunately, the What's on the menu? website was retired in January 202 Although the website is no longer live, you can still follow along with this lesson without any adjustments. -No matter how strict your guidelines or your submission protocols, variation will always be present in your crowdsourced data. However, there are ways to identify and normalize data in those cases. The New York Public Library (NYPL) possesses a digitized collection of approximately 45,000 menus, dating from the 1840s to today, and offers a good case study on how to correct some of these unavoidable issues. This collection is made public through [What's on the menu?](http://menus.nypl.org/). Instead of using optical character recognition (OCR) – a way of programmatically reading hand-written or printed documents into machine-searchable text - NYPL crowdsources transcription of the collection. Methods like OCR can save time but do not guarantee accuracy and often require humans to check and correct the output. In addition, the NYPL’s menus include a wide variety of handwritten texts and complex fonts which meant writing a universal code to ensure OCR accuracy was very difficult. Even if a universal code could be developed, the NYPL determined several parts of each menu that could only be identified by the human eye. +No matter how strict your guidelines or your submission protocols, variation will always be present in your crowdsourced data. However, there are ways to identify and normalize data in those cases. The New York Public Library (NYPL) possesses a digitized collection of approximately 45,000 menus, dating from the 1840s to today, and offers a good case study on how to correct some of these unavoidable issues. This collection is made public through [What's on the menu?](https://menus.nypl.org/). 
Instead of using optical character recognition (OCR) – a way of programmatically reading hand-written or printed documents into machine-searchable text - NYPL crowdsources transcription of the collection. Methods like OCR can save time but do not guarantee accuracy and often require humans to check and correct the output. In addition, the NYPL’s menus include a wide variety of handwritten texts and complex fonts which meant writing a universal code to ensure OCR accuracy was very difficult. Even if a universal code could be developed, the NYPL determined several parts of each menu that could only be identified by the human eye. Generated twice a month and available for public download, *What’s on the menu?* provides access to four distinct related datasets. The dataset we will use in this tutorial lists each menu and includes location and date information (the other datasets are relational and focus on different elements of each menu). This collection details meals over 150 years and shows what and when people ate in the past, adding a new dimension to historical understanding. The datasets curated by *What's on the menu?* include `Dish.csv`, `MenuItem.csv`, `MenuPage.csv`, and `Menu.csv`. @@ -415,7 +415,7 @@ To avoid this problem, require date- or time-based data entry conform to a stand #### Converting Datatype to Date Once in a determined format, pandas has a function that can help with date normalization. If the dates you are working with are in a standardized specific order, you can use the function `to_datetime()`. This will convert the `date` column from an object datatype (meaning that the contents of the column are made up of either text or numeric and non-numeric values) to a datetime (meaning that the contents within the column consist of a specifically formatted date and time values) datatype. 
Further [documentation](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html) details how to customize this function based on the unique date formats in your dataset. -This function is powerful but also potentially limiting because the pandas library only recognizes dates within a [given period of time](http://pandas-docs.github.io/pandas-docs-travis/user_guide/timeseries.html#timestamp-limitations). Because of how the datetime timestamps are calculated within the built-in function, pandas can only deal with a time span of approximately 584 years; the minimum date is 1677 and the maximum date is 2262. Dates outside this timeframe will produce an error. If your datasets date from before 1677, the pandas library is not a good option for this conversion. Other ways to approach date data normalization include using [regular expressions](https://www.oreilly.com/library/view/regular-expressions-cookbook/9781449327453/ch04s04.html), however, this involves being able to identify the specific written pattern(s) in which the errors manifest. +This function is powerful but also potentially limiting because the pandas library only recognizes dates within a [given period of time](https://pandas.pydata.org/docs/user_guide/timeseries.html). Because of how the datetime timestamps are calculated within the built-in function, pandas can only deal with a time span of approximately 584 years; the minimum date is 1677 and the maximum date is 2262. Dates outside this timeframe will produce an error. If your datasets date from before 1677, the pandas library is not a good option for this conversion. Other ways to approach date data normalization include using [regular expressions](https://www.oreilly.com/library/view/regular-expressions-cookbook/9781449327453/ch04s04.html); however, this involves being able to identify the specific written pattern(s) in which the errors manifest. 
Because of this limitation, data entry errors related to the date produce an error when the `to_datetime` function is run. Our dataset contains several such errors. An example is entry 13,112, where the date is entered as `0190-03-06`. This is most likely an example of an input error, which is normal in transcription (human error). This error is identified if you enter this code in your Python file and run it to convert the column datatype to date: @@ -527,7 +527,7 @@ replaced_dates.to_csv("NYPL_NormalMenus.csv") ``` ## Conclusion -The process of normalizing your data is rarely straightforward. In ["Against Cleaning"](http://curatingmenus.org/articles/against-cleaning/), authors Katie Rawson and Trevor Muñoz discuss what makes “cleaning” the NYPL menu datasets difficult. For example, there were changes in the spelling of different foods over time as well as differences in how dishes and drinks were referenced, to properly reflect their period. To “clean” that data - to normalize it - would diminish the historical value. In addition, as the authors discovered, it proved complex to distinguish “which variants in the names of dishes revealed new information (they) should account for in (their) own data, and which variants were simply accidents of transcription or typesetting.” Methods typically used to clean data were no longer sufficient. +The process of normalizing your data is rarely straightforward. In ["Against Cleaning"](https://curatingmenus.org/articles/against-cleaning/), authors Katie Rawson and Trevor Muñoz discuss what makes “cleaning” the NYPL menu datasets difficult. For example, there were changes in the spelling of different foods over time as well as differences in how dishes and drinks were referenced, to properly reflect their period. To “clean” that data - to normalize it - would diminish the historical value. 
In addition, as the authors discovered, it proved complex to distinguish “which variants in the names of dishes revealed new information (they) should account for in (their) own data, and which variants were simply accidents of transcription or typesetting.” Methods typically used to clean data were no longer sufficient. Collecting data through crowdsourced means can be highly efficient, but normalizing humanities data can be complicated. Rawson and Muñoz found that the concept of “data cleaning” was no longer accurate and the process could not be completed using the “usual” methods. Humanities data is unique. It is diverse. It is complex. And, in many cases, historical detail is vital. Many techniques for normalization can be carried out programmatically but computers are unable to interpret unique situations with ease. As noted by Rawson and Muñoz, variability is not always a bad thing; it is not a mess that requires order above all else - it is a complex diversity that needs to be preserved. Data variability cannot be avoided when data is crowdsourced. Ultimately, it is up to you to determine whether common normalization practices are appropriate for your data as well as for your research questions. diff --git a/en/lessons/data-mining-the-internet-archive.md b/en/lessons/data-mining-the-internet-archive.md index f20cd0a901..983147e005 100755 --- a/en/lessons/data-mining-the-internet-archive.md +++ b/en/lessons/data-mining-the-internet-archive.md @@ -15,7 +15,7 @@ exclude_from_check: activity: acquiring topics: [web-scraping] abstract: "The collections of the Internet Archive include many digitized historical sources. Many contain rich bibliographic data in a format called MARC. In this lesson, you'll learn how to use Python to automate the downloading of large numbers of MARC files from the Internet Archive and the parsing of MARC records for specific information such as authors, places of publication, and dates. 
The lesson can be applied more generally to other Internet Archive files and to MARC records found elsewhere." -redirect_from: /lessons/data-mining-the-internet-archive +redirect_from: /lessons/data-mining-the-internet-archive/ avatar_alt: Group of of men working in a mine doi: 10.46430/phen0035 --- @@ -649,37 +649,37 @@ analyze which subjects are common in the MARC records. Now that you have the MARC records downloaded and can use `pymarc` to extract information from the fields, the possibilities can multiply rapidly! - [Internet Archive]: http://archive.org/ - [early JSTOR journal content]: https://archive.org/details/jstor_ejc - [John Adams's personal library]: https://archive.org/details/johnadamsBPL - [Haiti collection]: https://archive.org/details/jcbhaiti - [Ian Milligan]: http://activehistory.ca/2013/09/the-internet-archive-rocks-or-two-million-plus-free-sources-to-explore/ - [Anti-Slavery Collection]: http://archive.org/details/bplscas - [internetarchive]: https://pypi.python.org/pypi/internetarchive - [pymarc]: https://pypi.python.org/pypi/pymarc/ - [this letter]: http://archive.org/details/lettertowilliaml00doug - [original manuscript]: http://archive.org/stream/lettertowilliaml00doug/39999066767938#page/n0/mode/2up - [multiple files]: http://archive.org/download/lettertowilliaml00doug - [Dublin Core]: http://archive.org/download/lettertowilliaml00doug/lettertowilliaml00doug_dc.xml - [MARCXML]: http://archive.org/download/lettertowilliaml00doug/lettertowilliaml00doug_marc.xml - [Library of Congress's MARC 21 Format for Bibliographic Data]: http://www.loc.gov/marc/bibliographic/ - [thousands of antislavery letters, manuscripts, and publications]: http://archive.org/search.php?query=collection%3Abplscas&sort=-publicdate - [eBook and Texts]: https://archive.org/details/texts - [the way that items and item URLs are structured]: http://blog.archive.org/2011/03/31/how-archive-org-items-are-structured/ - [advanced search]: https://archive.org/advancedsearch.php 
- [this page]: https://archive.org/search.php?query=collection%3A%28bplscas%29 - [search the Archive using the Python module that we installed]: http://internetarchive.readthedocs.io/en/latest/quickstart.html#searching - [the advanced search for the collection]: http://archive.org/search.php?query=collection%3Abplscas - [downloading]: http://internetarchive.readthedocs.io/en/latest/quickstart.html#downloading - [remember those?]: /lessons/code-reuse-and-modularity - [item files are named according to specific rules]: https://archive.org/about/faqs.php#140 - [handling exceptions]: http://docs.python.org/2/tutorial/errors.html#handling-exceptions - [rules specified for the 260 datafield]: http://www.loc.gov/marc/bibliographic/bd260.html - [MARC standards]: http://www.loc.gov/marc/ - [1]: https://github.com/edsu/pymarc - [functions that it provides for working with MARC XML records]: https://github.com/edsu/pymarc/blob/master/pymarc/marcxml.py - [Counting Frequencies]: /lessons/counting-frequencies - [Google Maps lesson]: /lessons/googlemaps-googleearth - [Wordle word cloud]: https://web.archive.org/web/20201202151557/http://www.wordle.net/ - [cleaning of your data]: /lessons/cleaning-ocrd-text-with-regular-expressions - [Installing Python Modules with pip]: /lessons/installing-python-modules-pip +- [Internet Archive](https://archive.org/) +- [early JSTOR journal content](https://archive.org/details/jstor_ejc) +- [John Adams's personal library](https://archive.org/details/johnadamsBPL) +- [Haiti collection](https://archive.org/details/jcbhaiti) +- [Ian Milligan](https://activehistory.ca/2013/09/the-internet-archive-rocks-or-two-million-plus-free-sources-to-explore/) +- [Anti-Slavery Collection](https://archive.org/details/bplscas) +- [internetarchive Python module](https://pypi.python.org/pypi/internetarchive) +- [pymarc](https://pypi.python.org/pypi/pymarc/) +- [this letter](https://archive.org/details/lettertowilliaml00doug) +- [original 
manuscript](https://archive.org/stream/lettertowilliaml00doug/39999066767938#page/n0/mode/2up) +- [multiple files](https://archive.org/download/lettertowilliaml00doug) +- [Dublin Core metadata](https://archive.org/download/lettertowilliaml00doug/lettertowilliaml00doug_dc.xml) +- [MARCXML metadata](https://archive.org/download/lettertowilliaml00doug/lettertowilliaml00doug_marc.xml) +- [Library of Congress's MARC 21 Format for Bibliographic Data](https://www.loc.gov/marc/bibliographic/) +- [thousands of antislavery letters, manuscripts, and publications](https://archive.org/search.php?query=collection%3Abplscas&sort=-publicdate) +- [eBook and Texts collection](https://archive.org/details/texts) +- [how items and item URLs are structured](https://blog.archive.org/2011/03/31/how-archive-org-items-are-structured/) +- [advanced search](https://archive.org/advancedsearch.php) +- [this page](https://archive.org/search.php?query=collection%3A%28bplscas%29) +- [search the Archive using the Python module](https://archive.org/developers/internetarchive/cli.html#cli-search) +- [the advanced search for the collection](https://archive.org/search.php?query=collection%3Abplscas) +- [downloading](https://archive.org/developers/internetarchive/cli.html#cli-download) +- [remember those?](/en/lessons/code-reuse-and-modularity) +- [item files are named according to specific rules](https://archive.org/about/faqs.php#140) +- [handling exceptions](https://docs.python.org/2/tutorial/errors.html#handling-exceptions) +- [rules specified for the 260 datafield](https://www.loc.gov/marc/bibliographic/bd260.html) +- [MARC standards](https://www.loc.gov/marc/) +- [pymarc GitHub repo](https://github.com/edsu/pymarc) +- [functions for working with MARC XML records](https://github.com/edsu/pymarc/blob/master/pymarc/marcxml.py) +- [Counting Frequencies](/en/lessons/counting-frequencies) +- [Google Maps lesson](/en/lessons/googlemaps-googleearth) +- [Wordle word 
cloud](https://web.archive.org/web/20201202151557/https://www.wordle.net/) +- [cleaning of your data](/en/lessons/cleaning-ocrd-text-with-regular-expressions) +- [Installing Python Modules with pip](/en/lessons/installing-python-modules-pip) diff --git a/en/lessons/data-wrangling-and-management-in-r.md b/en/lessons/data-wrangling-and-management-in-r.md index b90b51fa28..3c6fa3372d 100755 --- a/en/lessons/data-wrangling-and-management-in-r.md +++ b/en/lessons/data-wrangling-and-management-in-r.md @@ -1,608 +1,608 @@ ---- -title: Data Wrangling and Management in R -slug: data-wrangling-and-management-in-r -layout: lesson -collection: lessons -authors: -- Nabeel Siddiqui -date: 2017-07-31 -reviewers: -- Lauren Tilton -- Ryan Deschamps -editors: -- Ian Milligan -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/60 -difficulty: 2 -activity: transforming -topics: [data-manipulation, data-management, distant-reading, r, data-visualization] -abstract: "This tutorial explores how scholars can organize 'tidy' data, understand R packages to manipulate data, and conduct basic data analysis." -avatar_alt: Bar of soap -doi: 10.46430/phen0063 ---- - -{% include toc.html %} - - - - - -Assumptions -=========== - -This lesson makes a few assumptions about your understanding of R. If -you have not completed the [R Basics with Tabular -Data](/lessons/r-basics-with-tabular-data) -lesson, I suggest you complete that first. Having a background in -another programming language will also be beneficial. If you need a -place to start, I recommend working through the *Programming -Historian's* excellent Python tutorials. - -Lesson Goals -============ - -By the end of this lesson, you will: - -1. Know how to organize data to be "tidy" and why this is important. -2. Understand the dplyr package and use it to manipulate and wrangle - with data. -3. Become acquainted with the pipe operator in R and observe how it can - assist you in creating more readable code. -4. 
Learn to work through some basic examples of data manipulation to - gain a foundation in exploratory data analysis. - -Introduction -============ - -Data you find "in the wild" will rarely be in a format necessary for -analysis, and you will need to manipulate it before exploring the -questions you are interested in. This may take more time than doing the -analysis itself! In this tutorial, we will learn some basic techniques -for manipulating, managing, and wrangling with our data in R. -Specifically, we will rely on the philosophy of ["tidy -data"](https://www.jstatsoft.org/article/view/v059i10) as articulated by -Hadley Wickham. - -According to [Wickham](http://hadley.nz/), data is "tidy" when it meets -three key criteria: - -1. Each observation is in a row. -2. Each variable is in a column. -3. Each value has its own cell. - -Being observant of these criteria allows us to recognize when data is -organized or unorganized. It also provides us a standardized schema and -set of tools for cleaning up some of the most common ways that datasets -are "messy:" - -1. Column headers are values, not variable names. -2. Multiple variables are stored in one column. -3. Variables are stored in both rows and columns. -4. Multiple types of observational units are stored in the same table. -5. A single observational unit is stored in multiple tables. - -Perhaps most importantly, keeping our data in this format allows us to -use a collection of packages in the -["tidyverse,"](http://tidyverse.org/) which are designed to specifically -work with tidy data. By making sure that our input and output are tidy, -we only have to use a small set of tools to solve a large number of -questions. In addition, we can combine, manipulate, and split tidy -datasets as we see fit. 
- -In this tutorial, we will be focusing on the -[dplyr](https://cran.r-project.org/web/packages/dplyr/index.html) -package of the tidyverse, but it is worth briefly mentioning some others -we will be running into: - -[**magittr**](http://magrittr.tidyverse.org)--This package gives us -access to the forward pipe operator and makes our code easier to read. -[**ggplot2**](http://ggplot2.tidyverse.org/)--This package utilizes the -["Grammar of Graphics"](http://www.springer.com/us/book/9780387245447) -to provide an easy way to visualize our data. -[**readr**](http://readr.tidyverse.org)--This package makes available a -faster and more streamlined method of importing rectangular data, such -as csv files. -[**tibble**](http://tibble.tidyverse.org/)--This package provides us -access to a reconceptualization of data frames that are easier to work -with and print. - -If you have not already done so, you should install and load the -"tidyverse" before beginning. In addition, make sure that you have the -[latest version of R](https://cran.rstudio.com/) and the [latest version -of R Studio](https://www.rstudio.com/products/rstudio/download/) -installed for your respective platform. - -Copy the following code into RStudio. To run it, you need to highlight -the lines and press Ctrl+Enter (Command+Enter on Mac OS): - - # Install tidyverse libraries and load it - # Do not worry if this takes a while - - install.packages("tidyverse") - library(tidyverse) - -An Example of dplyr in Action -============================= - -Let's go through an example to see how dplyr can aid us as historians by -inputting U.S. decennial census data from 1790 to 2010. Download the -data by [clicking -here](/assets/data-wrangling-and-management-in-r/introductory_state_example.csv) -and place it in the folder that you will use to work through the examples -in this tutorial. 
- -Since the data is in a csv file, we are going to use the read\_csv() -command in tidyverse's -[readr](https://cran.r-project.org/web/packages/readr/vignettes/readr.html) -package. - -The read\_csv function takes the path of a file we want to import from -as a variable so make sure that you have it set up correctly. - - # Import CSV File and save to us_state_populations_import - # Make sure you set the path of the file correctly - us_state_populations_import<-read_csv("introductory_state_example.csv") - -After you import the data, you will notice that there are three columns: -one for the population, one for the year, and one for the state. This -data is already in a tidy format providing us a multitude of options for -further exploration. - -For this example, let's visualize the population growth of California and -New York to gain a better understanding of Western migration. We will -use dplyr to filter our data so that it only contains information about -the states we are interested in, and we will use ggplot2 to visualize -this information. This exercise is just to provide you a taste of what -dplyr can do, so don't worry if you don't understand the code at this -time. - - # Filter to California and New York states only - california_and_new_york_state_populations<-us_state_populations_import %>% - filter(state %in% c("California", "New York")) - - # Plot California and New York State Populations - ggplot(data=california_and_new_york_state_populations, aes(x=year, y=population, color=state)) + - geom_line() + - geom_point() - -{% include figure.html filename="en-or-data-wrangling-and-management-in-r-01.png" caption="Graph of California and New York population" %} - -As we can see, the population of California has grown considerably -compared to New York. While this particular example may seem obvious -given the history of U.S. migration, the code itself provides us a -foundation that we can build on to ask a multitude of similar questions. 
-For instance, with a quick change of code, we can create a similar graph -with two different states such as Mississippi and Virginia. - - # Filter to Mississippi and Virginia - mississippi_and_virginia_state_populations<-us_state_populations_import %>% - filter(state %in% c("Mississippi", "Virginia")) - - # Plot California and New York State Populations - ggplot(data=mississippi_and_virginia_state_populations, aes(x=year, y=population, color=state)) + - geom_line() + - geom_point() - -{% include figure.html filename="en-or-data-wrangling-and-management-in-r-02.png" caption="Graph of Mississippi and Virginia population" %} - -Quickly making changes to our code and reanalyzing our data is a -fundamental part of exploratory data analysis (EDA). Rather than trying -to "prove" a hypothesis, exploratory data analysis helps us understand -our data better and ask questions about it. For historians, EDA provides -an easy means of knowing when to dig deeper into a subject and when to -step back, and it is an area where R excels. - -Pipe Operator -============= - -Before looking at dplyr, we need to go over the pipe operator (%>%) -in R since we will often run into it in our examples. As mentioned -earlier, the pipe operator is part of the -[magrittr](https://cran.r-project.org/web/packages/magrittr/vignettes/magrittr.html) -package created by [Stefan Milton Bache](http://stefanbache.dk/) and -[Hadley Wickham](http://hadley.nz/) and is included in the tidyverse. -Its name is an homage to surrealest painter Rene Magritte, whose "The -Treachery of Images" famously depicted a pipe with the words "this is -not a pipe" underneath in French. - -The pipe operator allows you to pass what is to the left of the pipe as -the first variable in a function specified on the right. Although it may -seem strange at first, once you learn it, you will find that it makes -your code more readable by avoiding nested statements. Don't worry if -all this is a little confusing right now. 
It will become more clear as -we go through the examples. - -Let's say that we are interested in getting the square root of each -population value and then summing all the square roots before getting -the mean. Obviously, this isn't a useful measurement, but it demonstrates -just how quickly R code can become difficult to read. Normally, we would -nest such statements: - - mean(sum(sqrt(us_state_populations_import$population))) - - ## [1] 1256925 - -As you can see, with enough nested commands, it is hard to remember how -many parenthesis you need and makes the code awkward to read. To mitigate -this, some people may create temporary vectors in between each function -call. - - # Get square root of all the state populations - - sqrt_state_populations_vector<-sqrt(us_state_populations_import$population) - - # Get sum of all the sqrts of the temporary variable - - sum_sqrt_state_populations_vector<-sum(sqrt_state_populations_vector) - - # Get mean of the temporary variable - - mean_sum_sqrt_state_populations_vector<-mean(sum_sqrt_state_populations_vector) - - # Display the mean - - mean_sum_sqrt_state_populations_vector - - ## [1] 1256925 - -Although you get the same answer, this is a lot more readable. However, -it can quickly clutter your workspace if you forget to delete the -temporary vectors. The pipe operator does all this for you. Here is the -same code using the pipe operator. - - us_state_populations_import$population%>%sqrt%>%sum%>%mean - - ## [1] 1256925 - -This is a lot easier to read, and you could make it even more clear by -writing this on multiple lines. - - # Make sure to put the operator at the end of the line - us_state_populations_import$population%>% - sqrt%>% - sum%>% - mean - - ## [1] 1256925 - -Please note that the vectors or data frames that the pipe operator -creates are discarded after the operation is complete. If you want to -store them, you should pass them to a new variable. 
- - permanent_sqrt_and_sum_state_populations_vector <- us_state_populations_import$population%>%sqrt%>%sum%>%mean - permanent_sqrt_and_sum_state_populations_vector - - ## [1] 1256925 - -We Need a New Dataset -===================== - -Now that we have an understanding of the pipe operator, we are ready to -begin looking at and wrangling with some data. Unfortunately, for -historians, there are only a few easily available datasets--perhaps you -can help change this by making yours available to the public! We are -going to rely on the [history -data](https://cran.r-project.org/package=historydata) package -created by [Lincoln Mullen](http://lincolnmullen.com/). - -Lets go ahead and install and load the package: - - # Install historydata package - install.packages("historydata") - - # Load historydata package - library(historydata) - -This packages contains samples of historical datasets--the earlier U.S. -Census data sample was taken from this package. Throughout this -tutorial, we are specifically going to work with the early\_colleges -dataset that contains data about colleges founded before 1848. Lets -start by loading the data and view it. - - # Make sure you have installed the historydata package and loaded it before this - - data(early_colleges) - early_colleges - - ## # A tibble: 65 × 6 - ## college original_name city state established sponsorship - ## - ## 1 Harvard Cambridge MA 1636 Congregational; after 1805 Unitarian - ## 2 William and Mary Williamsburg VA 1693 Anglican - ## 3 Yale New Haven CT 1701 Congregational - ## 4 Pennsylvania, Univ. of Philadelphia PA 1740 Nondenominational - ## 5 Princeton College of New Jersey Princeton NJ 1746 Presbyterian - ## 6 Columbia King's College New York NY 1754 Anglican - ## 7 Brown Providence RI 1765 Baptist - ## 8 Rutgers Queen's College New Brunswick NJ 1766 Dutch Reformed - ## 9 Dartmouth Hanover NH 1769 Congregational - ## 10 Charleston, Coll. Of Charleston SC 1770 Anglican - ## # ... 
with 55 more rows - -As you can observe, this dataset contains the current name of the college, its -original name, the city and state where it was founded, when the college was -established, and its sponsorship. As we discussed earlier, before we can -work with a dataset, it is important to think about how to organize the -data. Let's see if any of our data is not in a "tidy" format. Do you see -any cells that do not match the three criteria for tidy data? - -If you guessed the sponsorship of Harvard, you are correct. In addition -to noting the original sponsorship, it also mentions that it changed -sponsorship in 1805. Usually, you want to keep as much information about -your data that you can, but for the purposes of this tutorial, we are -going to change the column to only have the original sponsorship. - - early_colleges[1,6] <- "Congregational" - early_colleges - - ## # A tibble: 65 × 6 - ## college original_name city state established sponsorship - ## - ## 1 Harvard Cambridge MA 1636 Congregational - ## 2 William and Mary Williamsburg VA 1693 Anglican - ## 3 Yale New Haven CT 1701 Congregational - ## 4 Pennsylvania, Univ. of Philadelphia PA 1740 Nondenominational - ## 5 Princeton College of New Jersey Princeton NJ 1746 Presbyterian - ## 6 Columbia King's College New York NY 1754 Anglican - ## 7 Brown Providence RI 1765 Baptist - ## 8 Rutgers Queen's College New Brunswick NJ 1766 Dutch Reformed - ## 9 Dartmouth Hanover NH 1769 Congregational - ## 10 Charleston, Coll. Of Charleston SC 1770 Anglican - ## # ... with 55 more rows - -Now that we have our data in a tidy format, we can shape it through the -dplyr package. - -What is Dplyr? -============== - -[Dplyr](https://cran.rstudio.com/web/packages/dplyr/vignettes/dplyr.html) -is another part of the tidyverse that provides functions for -manipulating and transforming your data. Because we are keeping our data -"tidy," we only need a small set of tools to explore our data. 
Compared -to base R, using dplyr is often faster, and guarantees that if our -input is tidy then our output will also be tidy. Perhaps most importantly, -dplyr makes our code easier to read and utilizes "verbs" that are, in -most cases, intuitive. Each function in dplyr corresponds to these verbs, -with the five key ones being filter, select, arrange, mutate, and -summarise--dplyr uses the British spelling. Let's go through each of them -individually to see how they work in practice. - -### Select - -If we look at the early\_colleges data, we can observe that there are a -lot of NA's in the original names column. NA signifies that the data is -not available, and we may want to view our data with this column -removed. dplyr's select() function gives us the ability to do this. It -takes the data frame you want to manipulate as the first argument, followed by a -list signifying which columns you would like to keep: - - # Remove the original names column using select() - # Note that you do not have to append the column name with a $ to the end of early_colleges since - # dplyr automatically assumes that a "," represents AND - - select(early_colleges, college, city, state, established, sponsorship) - - ## # A tibble: 65 × 5 - ## college city state established sponsorship - ## - ## 1 Harvard Cambridge MA 1636 congregational - ## 2 William and Mary Williamsburg VA 1693 Anglican - ## 3 Yale New Haven CT 1701 Congregational - ## 4 Pennsylvania, Univ. of Philadelphia PA 1740 Nondenominational - ## 5 Princeton Princeton NJ 1746 Presbyterian - ## 6 Columbia New York NY 1754 Anglican - ## 7 Brown Providence RI 1765 Baptist - ## 8 Rutgers New Brunswick NJ 1766 Dutch Reformed - ## 9 Dartmouth Hanover NH 1769 Congregational - ## 10 Charleston, Coll. 
Of Charleston SC 1770 Anglican - ## # ℹ 55 more rows - -Let's also go ahead and see how to write this using the pipe operator -(%>%): - - early_colleges%>% - select(college, city, state, established, sponsorship) - - ## # A tibble: 65 × 5 - ## college city state established sponsorship - ## - ## 1 Harvard Cambridge MA 1636 congregational - ## 2 William and Mary Williamsburg VA 1693 Anglican - ## 3 Yale New Haven CT 1701 Congregational - ## 4 Pennsylvania, Univ. of Philadelphia PA 1740 Nondenominational - ## 5 Princeton Princeton NJ 1746 Presbyterian - ## 6 Columbia New York NY 1754 Anglican - ## 7 Brown Providence RI 1765 Baptist - ## 8 Rutgers New Brunswick NJ 1766 Dutch Reformed - ## 9 Dartmouth Hanover NH 1769 Congregational - ## 10 Charleston, Coll. Of Charleston SC 1770 Anglican - ## # ℹ 55 more rows - -Referencing each of the columns that we want to keep just to get rid of -one is a little tedous. We can use the minus symbol (-) to demonstrate -that we want to remove a column. - - early_colleges%>% - select(-original_name) - - ## # A tibble: 65 × 5 - ## college city state established sponsorship - ## - ## 1 Harvard Cambridge MA 1636 congregational - ## 2 William and Mary Williamsburg VA 1693 Anglican - ## 3 Yale New Haven CT 1701 Congregational - ## 4 Pennsylvania, Univ. of Philadelphia PA 1740 Nondenominational - ## 5 Princeton Princeton NJ 1746 Presbyterian - ## 6 Columbia New York NY 1754 Anglican - ## 7 Brown Providence RI 1765 Baptist - ## 8 Rutgers New Brunswick NJ 1766 Dutch Reformed - ## 9 Dartmouth Hanover NH 1769 Congregational - ## 10 Charleston, Coll. Of Charleston SC 1770 Anglican - ## # ℹ 55 more rows - -### Filter - -The filter() function does the same thing as the select function but -rather than choosing the column name, we can use it to filter rows using -a test requirement. For instance, we can view all the colleges that -existed before the turn of the century. 
- - early_colleges%>% - filter(established < 1800) - - ## # A tibble: 20 × 6 - ## college original_name city state established sponsorship - ## - ## 1 Harvard Cambridge MA 1636 Congregational - ## 2 William and Mary Williamsburg VA 1693 Anglican - ## 3 Yale New Haven CT 1701 Congregational - ## 4 Pennsylvania, Univ. of Philadelphia PA 1740 Nondenominational - ## 5 Princeton College of New Jersey Princeton NJ 1746 Presbyterian - ## 6 Columbia King's College New York NY 1754 Anglican - ## 7 Brown Providence RI 1765 Baptist - ## 8 Rutgers Queen's College New Brunswick NJ 1766 Dutch Reformed - ## 9 Dartmouth Hanover NH 1769 Congregational - ## 10 Charleston, Coll. Of Charleston SC 1770 Anglican - ## 11 Hampden-Sydney Hampden-Sydney VA 1775 Presbyterian - ## 12 Transylvania Lexington KY 1780 Disciples of Christ - ## 13 Georgia, Univ. of Athens GA 1785 Secular - ## 14 Georgetown Washington DC 1789 Roman Catholic - ## 15 North Carolina, Univ. of Chapel Hill NC 1789 Secular - ## 16 Vermont, Univ. of Burlington VT 1791 Nondenominational - ## 17 Williams Williamstown MA 1793 Congregational - ## 18 Tennessee, Univ. of Blount College Knoxville TN 1794 Secular - ## 19 Union College Schenectady NY 1795 Presbyterian with Congregational - ## 20 Marietta Marietta OH 1797 Congregational - -### Mutate - -The mutate command allows you to add a column to your data frame. Right -now, we have the city and state in two separate columns. We can use the -paste command to combine two strings and specify a seperator. Let's place -them in a single column called "location." - - early_colleges%>%mutate(location=paste(city,state,sep=",")) - - ## # A tibble: 65 × 7 - ## college original_name city state established sponsorship location - ## - ## 1 Harvard Cambridge MA 1636 Congregational Cambridge,MA - ## 2 William and Mary Williamsburg VA 1693 Anglican Williamsburg,VA - ## 3 Yale New Haven CT 1701 Congregational New Haven,CT - ## 4 Pennsylvania, Univ. 
of Philadelphia PA 1740 Nondenominational Philadelphia,PA - ## 5 Princeton College of New Jersey Princeton NJ 1746 Presbyterian Princeton,NJ - ## 6 Columbia King's College New York NY 1754 Anglican New York,NY - ## 7 Brown Providence RI 1765 Baptist Providence,RI - ## 8 Rutgers Queen's College New Brunswick NJ 1766 Dutch Reformed New Brunswick,NJ - ## 9 Dartmouth Hanover NH 1769 Congregational Hanover,NH - ## 10 Charleston, Coll. Of Charleston SC 1770 Anglican Charleston,SC - ## # ... with 55 more rows - -Again, you need to remember that dplyr does not save the data or -manipulate the original. Instead, it creates a temporary data frame at -each step. If you want to keep it, you need to create a permanent -variable. - - early_colleges_with_location <- early_colleges%>% - mutate(location=paste(city, state, sep=",")) - - # View the new tibble with the location added - early_colleges_with_location - - ## # A tibble: 65 × 7 - ## college original_name city state established sponsorship location - ## - ## 1 Harvard Cambridge MA 1636 Congregational Cambridge,MA - ## 2 William and Mary Williamsburg VA 1693 Anglican Williamsburg,VA - ## 3 Yale New Haven CT 1701 Congregational New Haven,CT - ## 4 Pennsylvania, Univ. of Philadelphia PA 1740 Nondenominational Philadelphia,PA - ## 5 Princeton College of New Jersey Princeton NJ 1746 Presbyterian Princeton,NJ - ## 6 Columbia King's College New York NY 1754 Anglican New York,NY - ## 7 Brown Providence RI 1765 Baptist Providence,RI - ## 8 Rutgers Queen's College New Brunswick NJ 1766 Dutch Reformed New Brunswick,NJ - ## 9 Dartmouth Hanover NH 1769 Congregational Hanover,NH - ## 10 Charleston, Coll. Of Charleston SC 1770 Anglican Charleston,SC - ## # ... with 55 more rows - -### Arrange - -The arrange() function allows us to order our columns in a new way. -Currently, the colleges are organized by year in ascending order. Lets -place them in descending order of establishment, in this case, from the end of the Mexican-American War. 
- - early_colleges %>% - arrange(desc(established)) - - ## # A tibble: 65 × 6 - ## college original_name city state established sponsorship - ## - ## 1 Wisconsin, Univ. of Madison WI 1848 Secular - ## 2 Earlham Richmond IN 1847 Quaker - ## 3 Beloit Beloit WI 1846 Congregational - ## 4 Bucknell Lewisburg PA 1846 Baptist - ## 5 Grinnell Grinnell IA 1846 Congregational - ## 6 Mount Union Alliance OH 1846 Methodist - ## 7 Louisiana, Univ. of New Orleans LA 1845 Secular - ## 8 U.S. Naval Academy Annapolis MD 1845 Secular - ## 9 Mississipps, Univ. of Oxford MI 1844 Secular - ## 10 Holy Cross Worchester MA 1843 Roman Catholic - ## # ... with 55 more rows - -### Summarise - -The last key function in dplyr is summarise()--note the British -spelling. Summarise() takes a function or operation, and is usually used -to create a data frame that contains summary statistics for plotting. We -will use it to calculate the average year that colleges before 1848 were -founded. - - early_colleges%>%summarise(mean(established)) - - ## # A tibble: 1 x 1 - ## `mean(established)` - ## - ## 1 1809.831 - -Putting it All Together -======================= - -Now that we have gone through the five main verbs for dplyr, we can use -them to create a quick visualization of our data. Let's go ahead and -create a bar graph showing the number of secular and non-secular -colleges founded before the U.S. 
War of 1812: - - secular_colleges_before_1812<-early_colleges%>% - filter(established < 1812)%>% - mutate(is_secular=ifelse(sponsorship!="Secular", "no", "yes")) - - ggplot(secular_colleges_before_1812) + - geom_bar(aes(x=is_secular, fill=is_secular))+ - labs(x="Is the college secular?") - -{% include figure.html filename="en-or-data-wrangling-and-management-in-r-03.png" caption="Number of secular and non-secular colleges before War of 1812" %} - -Again, by making a quick change to our code, we can also look at the -number of secular versus non-secular colleges founded after the start of -the War of 1812: - - secular_colleges_after_1812<-early_colleges%>% - filter(established > 1812)%>% - mutate(is_secular=ifelse(sponsorship!="Secular", "no", "yes")) - - ggplot(secular_colleges_after_1812) + - geom_bar(aes(x=is_secular, fill=is_secular))+ - labs(x="Is the college secular?") - -({% include figure.html filename="en-or-data-wrangling-and-management-in-r-04.png" caption="Number of secular and non-secular colleges after War of 1812" %} - -Conclusion -========== - -This tutorial should put you well on the way to thinking about how to -organize and manipulate your data in R. Later, you will probably want to -graph your data in some way. I recommend that you begin looking at the -[ggplot2](https://ggplot2.tidyverse.org/) package for a set of tools that work -well with dplyr. In addition, you may want to examine some of the -other functions that come with dplyr to hone your skills. Either way, -this should provide a good foundation to build on and cover a lot of -the common problems you will encounter. 
+--- +title: Data Wrangling and Management in R +slug: data-wrangling-and-management-in-r +layout: lesson +collection: lessons +authors: +- Nabeel Siddiqui +date: 2017-07-31 +reviewers: +- Lauren Tilton +- Ryan Deschamps +editors: +- Ian Milligan +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/60 +difficulty: 2 +activity: transforming +topics: [data-manipulation, data-management, distant-reading, r, data-visualization] +abstract: "This tutorial explores how scholars can organize 'tidy' data, understand R packages to manipulate data, and conduct basic data analysis." +avatar_alt: Bar of soap +doi: 10.46430/phen0063 +--- + +{% include toc.html %} + + + + + +Assumptions +=========== + +This lesson makes a few assumptions about your understanding of R. If +you have not completed the [R Basics with Tabular +Data](/en/lessons/r-basics-with-tabular-data) +lesson, I suggest you complete that first. Having a background in +another programming language will also be beneficial. If you need a +place to start, I recommend working through the *Programming +Historian's* excellent Python tutorials. + +Lesson Goals +============ + +By the end of this lesson, you will: + +1. Know how to organize data to be "tidy" and why this is important. +2. Understand the dplyr package and use it to manipulate and wrangle + with data. +3. Become acquainted with the pipe operator in R and observe how it can + assist you in creating more readable code. +4. Learn to work through some basic examples of data manipulation to + gain a foundation in exploratory data analysis. + +Introduction +============ + +Data you find "in the wild" will rarely be in a format necessary for +analysis, and you will need to manipulate it before exploring the +questions you are interested in. This may take more time than doing the +analysis itself! In this tutorial, we will learn some basic techniques +for manipulating, managing, and wrangling with our data in R. 
+Specifically, we will rely on the philosophy of ["tidy +data"](https://www.jstatsoft.org/article/view/v059i10) as articulated by +Hadley Wickham. + +According to [Wickham](https://hadley.nz/), data is "tidy" when it meets +three key criteria: + +1. Each observation is in a row. +2. Each variable is in a column. +3. Each value has its own cell. + +Being observant of these criteria allows us to recognize when data is +organized or unorganized. It also provides us a standardized schema and +set of tools for cleaning up some of the most common ways that datasets +are "messy:" + +1. Column headers are values, not variable names. +2. Multiple variables are stored in one column. +3. Variables are stored in both rows and columns. +4. Multiple types of observational units are stored in the same table. +5. A single observational unit is stored in multiple tables. + +Perhaps most importantly, keeping our data in this format allows us to +use a collection of packages in the +["tidyverse,"](https://tidyverse.org/) which are designed to specifically +work with tidy data. By making sure that our input and output are tidy, +we only have to use a small set of tools to solve a large number of +questions. In addition, we can combine, manipulate, and split tidy +datasets as we see fit. + +In this tutorial, we will be focusing on the +[dplyr](https://cran.r-project.org/web/packages/dplyr/index.html) +package of the tidyverse, but it is worth briefly mentioning some others +we will be running into: + +[**magrittr**](https://magrittr.tidyverse.org)--This package gives us +access to the forward pipe operator and makes our code easier to read. +[**ggplot2**](https://ggplot2.tidyverse.org/)--This package utilizes the +["Grammar of Graphics"](https://www.springer.com/us/book/9780387245447) +to provide an easy way to visualize our data. 
+[**readr**](https://readr.tidyverse.org)--This package makes available a +faster and more streamlined method of importing rectangular data, such +as csv files. +[**tibble**](https://tibble.tidyverse.org/)--This package provides us +access to a reconceptualization of data frames that are easier to work +with and print. + +If you have not already done so, you should install and load the +"tidyverse" before beginning. In addition, make sure that you have the +[latest version of R](https://cran.rstudio.com/) and the [latest version +of R Studio](https://www.rstudio.com/products/rstudio/download/) +installed for your respective platform. + +Copy the following code into RStudio. To run it, you need to highlight +the lines and press Ctrl+Enter (Command+Enter on Mac OS): + + # Install tidyverse libraries and load it + # Do not worry if this takes a while + + install.packages("tidyverse") + library(tidyverse) + +An Example of dplyr in Action +============================= + +Let's go through an example to see how dplyr can aid us as historians by +inputting U.S. decennial census data from 1790 to 2010. Download the +data by [clicking +here](/assets/data-wrangling-and-management-in-r/introductory_state_example.csv) +and place it in the folder that you will use to work through the examples +in this tutorial. + +Since the data is in a csv file, we are going to use the read\_csv() +command in tidyverse's +[readr](https://cran.r-project.org/web/packages/readr/vignettes/readr.html) +package. + +The read\_csv function takes the path of a file we want to import from +as a variable so make sure that you have it set up correctly. + + # Import CSV File and save to us_state_populations_import + # Make sure you set the path of the file correctly + us_state_populations_import<-read_csv("introductory_state_example.csv") + +After you import the data, you will notice that there are three columns: +one for the population, one for the year, and one for the state. 
This +data is already in a tidy format providing us a multitude of options for +further exploration. + +For this example, let's visualize the population growth of California and +New York to gain a better understanding of Western migration. We will +use dplyr to filter our data so that it only contains information about +the states we are interested in, and we will use ggplot2 to visualize +this information. This exercise is just to provide you a taste of what +dplyr can do, so don't worry if you don't understand the code at this +time. + + # Filter to California and New York states only + california_and_new_york_state_populations<-us_state_populations_import %>% + filter(state %in% c("California", "New York")) + + # Plot California and New York State Populations + ggplot(data=california_and_new_york_state_populations, aes(x=year, y=population, color=state)) + + geom_line() + + geom_point() + +{% include figure.html filename="en-or-data-wrangling-and-management-in-r-01.png" caption="Graph of California and New York population" %} + +As we can see, the population of California has grown considerably +compared to New York. While this particular example may seem obvious +given the history of U.S. migration, the code itself provides us a +foundation that we can build on to ask a multitude of similar questions. +For instance, with a quick change of code, we can create a similar graph +with two different states such as Mississippi and Virginia. 
+ + # Filter to Mississippi and Virginia + mississippi_and_virginia_state_populations<-us_state_populations_import %>% + filter(state %in% c("Mississippi", "Virginia")) + + # Plot Mississippi and Virginia State Populations + ggplot(data=mississippi_and_virginia_state_populations, aes(x=year, y=population, color=state)) + + geom_line() + + geom_point() + +{% include figure.html filename="en-or-data-wrangling-and-management-in-r-02.png" caption="Graph of Mississippi and Virginia population" %} + +Quickly making changes to our code and reanalyzing our data is a +fundamental part of exploratory data analysis (EDA). Rather than trying +to "prove" a hypothesis, exploratory data analysis helps us understand +our data better and ask questions about it. For historians, EDA provides +an easy means of knowing when to dig deeper into a subject and when to +step back, and it is an area where R excels. + +Pipe Operator +============= + +Before looking at dplyr, we need to go over the pipe operator (%>%) +in R since we will often run into it in our examples. As mentioned +earlier, the pipe operator is part of the +[magrittr](https://cran.r-project.org/web/packages/magrittr/vignettes/magrittr.html) +package created by [Stefan Milton Bache](https://stefanbache.dk/) and +[Hadley Wickham](https://hadley.nz/) and is included in the tidyverse. +Its name is an homage to surrealist painter Rene Magritte, whose "The +Treachery of Images" famously depicted a pipe with the words "this is +not a pipe" underneath in French. + +The pipe operator allows you to pass what is to the left of the pipe as +the first argument in a function specified on the right. Although it may +seem strange at first, once you learn it, you will find that it makes +your code more readable by avoiding nested statements. Don't worry if +all this is a little confusing right now. It will become more clear as +we go through the examples. 
+ +Let's say that we are interested in getting the square root of each +population value and then summing all the square roots before getting +the mean. Obviously, this isn't a useful measurement, but it demonstrates +just how quickly R code can become difficult to read. Normally, we would +nest such statements: + + mean(sum(sqrt(us_state_populations_import$population))) + + ## [1] 1256925 + +As you can see, with enough nested commands, it is hard to remember how +many parentheses you need, and the code becomes awkward to read. To mitigate +this, some people may create temporary vectors in between each function +call. + + # Get square root of all the state populations + + sqrt_state_populations_vector<-sqrt(us_state_populations_import$population) + + # Get sum of all the sqrts of the temporary variable + + sum_sqrt_state_populations_vector<-sum(sqrt_state_populations_vector) + + # Get mean of the temporary variable + + mean_sum_sqrt_state_populations_vector<-mean(sum_sqrt_state_populations_vector) + + # Display the mean + + mean_sum_sqrt_state_populations_vector + + ## [1] 1256925 + +Although you get the same answer, this is a lot more readable. However, +it can quickly clutter your workspace if you forget to delete the +temporary vectors. The pipe operator does all this for you. Here is the +same code using the pipe operator. + + us_state_populations_import$population%>%sqrt%>%sum%>%mean + + ## [1] 1256925 + +This is a lot easier to read, and you could make it even more clear by +writing this on multiple lines. + + # Make sure to put the operator at the end of the line + us_state_populations_import$population%>% + sqrt%>% + sum%>% + mean + + ## [1] 1256925 + +Please note that the vectors or data frames that the pipe operator +creates are discarded after the operation is complete. If you want to +store them, you should pass them to a new variable. 
+ + permanent_sqrt_and_sum_state_populations_vector <- us_state_populations_import$population%>%sqrt%>%sum%>%mean + permanent_sqrt_and_sum_state_populations_vector + + ## [1] 1256925 + +We Need a New Dataset +===================== + +Now that we have an understanding of the pipe operator, we are ready to +begin looking at and wrangling with some data. Unfortunately, for +historians, there are only a few easily available datasets--perhaps you +can help change this by making yours available to the public! We are +going to rely on the [history +data](https://cran.r-project.org/package=historydata) package +created by [Lincoln Mullen](https://lincolnmullen.com/). + +Let's go ahead and install and load the package: + + # Install historydata package + install.packages("historydata") + + # Load historydata package + library(historydata) + +This package contains samples of historical datasets--the earlier U.S. +Census data sample was taken from this package. Throughout this +tutorial, we are specifically going to work with the early\_colleges +dataset that contains data about colleges founded before 1848. Let's +start by loading the data and viewing it. + + # Make sure you have installed the historydata package and loaded it before this + + data(early_colleges) + early_colleges + + ## # A tibble: 65 × 6 + ## college original_name city state established sponsorship + ## + ## 1 Harvard Cambridge MA 1636 Congregational; after 1805 Unitarian + ## 2 William and Mary Williamsburg VA 1693 Anglican + ## 3 Yale New Haven CT 1701 Congregational + ## 4 Pennsylvania, Univ. of Philadelphia PA 1740 Nondenominational + ## 5 Princeton College of New Jersey Princeton NJ 1746 Presbyterian + ## 6 Columbia King's College New York NY 1754 Anglican + ## 7 Brown Providence RI 1765 Baptist + ## 8 Rutgers Queen's College New Brunswick NJ 1766 Dutch Reformed + ## 9 Dartmouth Hanover NH 1769 Congregational + ## 10 Charleston, Coll. Of Charleston SC 1770 Anglican + ## # ... 
with 55 more rows + +As you can observe, this dataset contains the current name of the college, its +original name, the city and state where it was founded, when the college was +established, and its sponsorship. As we discussed earlier, before we can +work with a dataset, it is important to think about how to organize the +data. Let's see if any of our data is not in a "tidy" format. Do you see +any cells that do not match the three criteria for tidy data? + +If you guessed the sponsorship of Harvard, you are correct. In addition +to noting the original sponsorship, it also mentions that it changed +sponsorship in 1805. Usually, you want to keep as much information about +your data that you can, but for the purposes of this tutorial, we are +going to change the column to only have the original sponsorship. + + early_colleges[1,6] <- "Congregational" + early_colleges + + ## # A tibble: 65 × 6 + ## college original_name city state established sponsorship + ## + ## 1 Harvard Cambridge MA 1636 Congregational + ## 2 William and Mary Williamsburg VA 1693 Anglican + ## 3 Yale New Haven CT 1701 Congregational + ## 4 Pennsylvania, Univ. of Philadelphia PA 1740 Nondenominational + ## 5 Princeton College of New Jersey Princeton NJ 1746 Presbyterian + ## 6 Columbia King's College New York NY 1754 Anglican + ## 7 Brown Providence RI 1765 Baptist + ## 8 Rutgers Queen's College New Brunswick NJ 1766 Dutch Reformed + ## 9 Dartmouth Hanover NH 1769 Congregational + ## 10 Charleston, Coll. Of Charleston SC 1770 Anglican + ## # ... with 55 more rows + +Now that we have our data in a tidy format, we can shape it through the +dplyr package. + +What is Dplyr? +============== + +[Dplyr](https://cran.rstudio.com/web/packages/dplyr/vignettes/dplyr.html) +is another part of the tidyverse that provides functions for +manipulating and transforming your data. Because we are keeping our data +"tidy," we only need a small set of tools to explore our data. 
Compared +to base R, using dplyr is often faster, and guarantees that if our +input is tidy then our output will also be tidy. Perhaps most importantly, +dplyr makes our code easier to read and utilizes "verbs" that are, in +most cases, intuitive. Each function in dplyr corresponds to these verbs, +with the five key ones being filter, select, arrange, mutate, and +summarise--dplyr uses the British spelling. Let's go through each of them +individually to see how they work in practice. + +### Select + +If we look at the early\_colleges data, we can observe that there are a +lot of NA's in the original names column. NA signifies that the data is +not available, and we may want to view our data with this column +removed. dplyr's select() function gives us the ability to do this. It +takes the data frame you want to manipulate as the first argument, followed by a +list signifying which columns you would like to keep: + + # Remove the original names column using select() + # Note that you do not have to append the column name with a $ to the end of early_colleges since + # dplyr automatically assumes that a "," represents AND + + select(early_colleges, college, city, state, established, sponsorship) + + ## # A tibble: 65 × 5 + ## college city state established sponsorship + ## + ## 1 Harvard Cambridge MA 1636 Congregational + ## 2 William and Mary Williamsburg VA 1693 Anglican + ## 3 Yale New Haven CT 1701 Congregational + ## 4 Pennsylvania, Univ. of Philadelphia PA 1740 Nondenominational + ## 5 Princeton Princeton NJ 1746 Presbyterian + ## 6 Columbia New York NY 1754 Anglican + ## 7 Brown Providence RI 1765 Baptist + ## 8 Rutgers New Brunswick NJ 1766 Dutch Reformed + ## 9 Dartmouth Hanover NH 1769 Congregational + ## 10 Charleston, Coll. 
Of Charleston SC 1770 Anglican + ## # ℹ 55 more rows + +Let's also go ahead and see how to write this using the pipe operator +(%>%): + + early_colleges%>% + select(college, city, state, established, sponsorship) + + ## # A tibble: 65 × 5 + ## college city state established sponsorship + ## + ## 1 Harvard Cambridge MA 1636 Congregational + ## 2 William and Mary Williamsburg VA 1693 Anglican + ## 3 Yale New Haven CT 1701 Congregational + ## 4 Pennsylvania, Univ. of Philadelphia PA 1740 Nondenominational + ## 5 Princeton Princeton NJ 1746 Presbyterian + ## 6 Columbia New York NY 1754 Anglican + ## 7 Brown Providence RI 1765 Baptist + ## 8 Rutgers New Brunswick NJ 1766 Dutch Reformed + ## 9 Dartmouth Hanover NH 1769 Congregational + ## 10 Charleston, Coll. Of Charleston SC 1770 Anglican + ## # ℹ 55 more rows + +Referencing each of the columns that we want to keep just to get rid of +one is a little tedious. We can use the minus symbol (-) to indicate +that we want to remove a column. + + early_colleges%>% + select(-original_name) + + ## # A tibble: 65 × 5 + ## college city state established sponsorship + ## + ## 1 Harvard Cambridge MA 1636 Congregational + ## 2 William and Mary Williamsburg VA 1693 Anglican + ## 3 Yale New Haven CT 1701 Congregational + ## 4 Pennsylvania, Univ. of Philadelphia PA 1740 Nondenominational + ## 5 Princeton Princeton NJ 1746 Presbyterian + ## 6 Columbia New York NY 1754 Anglican + ## 7 Brown Providence RI 1765 Baptist + ## 8 Rutgers New Brunswick NJ 1766 Dutch Reformed + ## 9 Dartmouth Hanover NH 1769 Congregational + ## 10 Charleston, Coll. Of Charleston SC 1770 Anglican + ## # ℹ 55 more rows + +### Filter + +The filter() function does the same thing as the select function but +rather than choosing the column name, we can use it to filter rows using +a test requirement. For instance, we can view all the colleges that +existed before the turn of the nineteenth century. 
+ + early_colleges%>% + filter(established < 1800) + + ## # A tibble: 20 × 6 + ## college original_name city state established sponsorship + ## + ## 1 Harvard Cambridge MA 1636 Congregational + ## 2 William and Mary Williamsburg VA 1693 Anglican + ## 3 Yale New Haven CT 1701 Congregational + ## 4 Pennsylvania, Univ. of Philadelphia PA 1740 Nondenominational + ## 5 Princeton College of New Jersey Princeton NJ 1746 Presbyterian + ## 6 Columbia King's College New York NY 1754 Anglican + ## 7 Brown Providence RI 1765 Baptist + ## 8 Rutgers Queen's College New Brunswick NJ 1766 Dutch Reformed + ## 9 Dartmouth Hanover NH 1769 Congregational + ## 10 Charleston, Coll. Of Charleston SC 1770 Anglican + ## 11 Hampden-Sydney Hampden-Sydney VA 1775 Presbyterian + ## 12 Transylvania Lexington KY 1780 Disciples of Christ + ## 13 Georgia, Univ. of Athens GA 1785 Secular + ## 14 Georgetown Washington DC 1789 Roman Catholic + ## 15 North Carolina, Univ. of Chapel Hill NC 1789 Secular + ## 16 Vermont, Univ. of Burlington VT 1791 Nondenominational + ## 17 Williams Williamstown MA 1793 Congregational + ## 18 Tennessee, Univ. of Blount College Knoxville TN 1794 Secular + ## 19 Union College Schenectady NY 1795 Presbyterian with Congregational + ## 20 Marietta Marietta OH 1797 Congregational + +### Mutate + +The mutate command allows you to add a column to your data frame. Right +now, we have the city and state in two separate columns. We can use the +paste command to combine two strings and specify a separator. Let's place +them in a single column called "location." + + early_colleges%>%mutate(location=paste(city,state,sep=",")) + + ## # A tibble: 65 × 7 + ## college original_name city state established sponsorship location + ## + ## 1 Harvard Cambridge MA 1636 Congregational Cambridge,MA + ## 2 William and Mary Williamsburg VA 1693 Anglican Williamsburg,VA + ## 3 Yale New Haven CT 1701 Congregational New Haven,CT + ## 4 Pennsylvania, Univ. 
of Philadelphia PA 1740 Nondenominational Philadelphia,PA + ## 5 Princeton College of New Jersey Princeton NJ 1746 Presbyterian Princeton,NJ + ## 6 Columbia King's College New York NY 1754 Anglican New York,NY + ## 7 Brown Providence RI 1765 Baptist Providence,RI + ## 8 Rutgers Queen's College New Brunswick NJ 1766 Dutch Reformed New Brunswick,NJ + ## 9 Dartmouth Hanover NH 1769 Congregational Hanover,NH + ## 10 Charleston, Coll. Of Charleston SC 1770 Anglican Charleston,SC + ## # ... with 55 more rows + +Again, you need to remember that dplyr does not save the data or +manipulate the original. Instead, it creates a temporary data frame at +each step. If you want to keep it, you need to create a permanent +variable. + + early_colleges_with_location <- early_colleges%>% + mutate(location=paste(city, state, sep=",")) + + # View the new tibble with the location added + early_colleges_with_location + + ## # A tibble: 65 × 7 + ## college original_name city state established sponsorship location + ## + ## 1 Harvard Cambridge MA 1636 Congregational Cambridge,MA + ## 2 William and Mary Williamsburg VA 1693 Anglican Williamsburg,VA + ## 3 Yale New Haven CT 1701 Congregational New Haven,CT + ## 4 Pennsylvania, Univ. of Philadelphia PA 1740 Nondenominational Philadelphia,PA + ## 5 Princeton College of New Jersey Princeton NJ 1746 Presbyterian Princeton,NJ + ## 6 Columbia King's College New York NY 1754 Anglican New York,NY + ## 7 Brown Providence RI 1765 Baptist Providence,RI + ## 8 Rutgers Queen's College New Brunswick NJ 1766 Dutch Reformed New Brunswick,NJ + ## 9 Dartmouth Hanover NH 1769 Congregational Hanover,NH + ## 10 Charleston, Coll. Of Charleston SC 1770 Anglican Charleston,SC + ## # ... with 55 more rows + +### Arrange + +The arrange() function allows us to order our rows in a new way. +Currently, the colleges are organized by year in ascending order. Let's +place them in descending order of establishment, in this case, from the end of the Mexican-American War. 
+ + early_colleges %>% + arrange(desc(established)) + + ## # A tibble: 65 × 6 + ## college original_name city state established sponsorship + ## + ## 1 Wisconsin, Univ. of Madison WI 1848 Secular + ## 2 Earlham Richmond IN 1847 Quaker + ## 3 Beloit Beloit WI 1846 Congregational + ## 4 Bucknell Lewisburg PA 1846 Baptist + ## 5 Grinnell Grinnell IA 1846 Congregational + ## 6 Mount Union Alliance OH 1846 Methodist + ## 7 Louisiana, Univ. of New Orleans LA 1845 Secular + ## 8 U.S. Naval Academy Annapolis MD 1845 Secular + ## 9 Mississipps, Univ. of Oxford MI 1844 Secular + ## 10 Holy Cross Worchester MA 1843 Roman Catholic + ## # ... with 55 more rows + +### Summarise + +The last key function in dplyr is summarise()--note the British +spelling. Summarise() takes a function or operation, and is usually used +to create a data frame that contains summary statistics for plotting. We +will use it to calculate the average year that colleges before 1848 were +founded. + + early_colleges%>%summarise(mean(established)) + + ## # A tibble: 1 x 1 + ## `mean(established)` + ## + ## 1 1809.831 + +Putting it All Together +======================= + +Now that we have gone through the five main verbs for dplyr, we can use +them to create a quick visualization of our data. Let's go ahead and +create a bar graph showing the number of secular and non-secular +colleges founded before the U.S. 
 War of 1812: + + secular_colleges_before_1812<-early_colleges%>% + filter(established < 1812)%>% + mutate(is_secular=ifelse(sponsorship!="Secular", "no", "yes")) + + ggplot(secular_colleges_before_1812) + + geom_bar(aes(x=is_secular, fill=is_secular))+ + labs(x="Is the college secular?") + +{% include figure.html filename="en-or-data-wrangling-and-management-in-r-03.png" caption="Number of secular and non-secular colleges before War of 1812" %} + +Again, by making a quick change to our code, we can also look at the +number of secular versus non-secular colleges founded after the start of +the War of 1812: + + secular_colleges_after_1812<-early_colleges%>% + filter(established > 1812)%>% + mutate(is_secular=ifelse(sponsorship!="Secular", "no", "yes")) + + ggplot(secular_colleges_after_1812) + + geom_bar(aes(x=is_secular, fill=is_secular))+ + labs(x="Is the college secular?") + +{% include figure.html filename="en-or-data-wrangling-and-management-in-r-04.png" caption="Number of secular and non-secular colleges after War of 1812" %} + +Conclusion +========== + +This tutorial should put you well on the way to thinking about how to +organize and manipulate your data in R. Later, you will probably want to +graph your data in some way. I recommend that you begin looking at the +[ggplot2](https://ggplot2.tidyverse.org/) package for a set of tools that work +well with dplyr. In addition, you may want to examine some of the +other functions that come with dplyr to hone your skills. Either way, +this should provide a good foundation to build on and cover a lot of +the common problems you will encounter. 
diff --git a/en/lessons/dealing-with-big-data-and-network-analysis-using-neo4j.md b/en/lessons/dealing-with-big-data-and-network-analysis-using-neo4j.md index 013ebe33ee..32818d89fb 100755 --- a/en/lessons/dealing-with-big-data-and-network-analysis-using-neo4j.md +++ b/en/lessons/dealing-with-big-data-and-network-analysis-using-neo4j.md @@ -16,7 +16,7 @@ topics: [network-analysis, data-visualization] activity: analyzing abstract: "In this lesson we will learn how to use a graph database to store and analyze complex networked information. This tutorial will focus on the Neo4j graph database, and the Cypher query language that comes with it." layout: lesson -redirect_from: /lessons/dealing-with-big-data-and-network-analysis-using-neo4j +redirect_from: /lessons/dealing-with-big-data-and-network-analysis-using-neo4j/ avatar_alt: Constellation chart doi: 10.46430/phen0074 --- @@ -62,8 +62,8 @@ The final section of this lesson contains code and data to illustrate the key po Although beyond the scope of this tutorial, those interested in trying to better understand social networks can refer to a number of sources. -Sociologists Robert A. Hanneman and Mark Riddle maintain an [on-line textbook on network analysis](http://faculty.ucr.edu/~hanneman/nettext/). -There are also regular conferences hosted and useful resources available from the [International Network for Social Network Analysis](http://www.insna.org). +Sociologists Robert A. Hanneman and Mark Riddle maintain an [on-line textbook on network analysis](https://faculty.ucr.edu/~hanneman/nettext/). +There are also regular conferences hosted and useful resources available from the [International Network for Social Network Analysis](https://www.insna.org).
    I strongly recommend that you read the lesson through before trying the example data. @@ -82,7 +82,7 @@ ties. (Nodes are also referred to as "vertices" and ties are referred to as "edg Databases are designed for dealing with large amounts of data. However, when working with small datasets it is often more efficient not to use a database. The *Programming Historian* has excellent tutorials for dealing with network data. -For an introduction, see [Exploring and Analyzing Network Data with Python](/lessons/exploring-and-analyzing-network-data-with-python). +For an introduction, see [Exploring and Analyzing Network Data with Python](/en/lessons/exploring-and-analyzing-network-data-with-python). # Installing and creating a Neo4j database @@ -348,7 +348,7 @@ provides us with the basic corporate interlock network that existed in Canada in If we use the web interface that comes with Neo4j we'll be able to see what parts of this network looks like by using a simple query. With the Neo4j database running, we can open up the built in browser to make more Cypher queries. -(Or we can put the following URL into a browser [http://localhost:7474/browser/](http://localhost:7474/browser/). +(Or we can put the following URL into a browser [http://localhost:7474/browser/](http://localhost:7474/browser/). Add the following Cypher query. @@ -402,7 +402,7 @@ web site. # Putting it all together: A working example -If we return to the [web interface on your local machine](http://localhost:7474) we can query our new database. +If we return to the [web interface on your local machine](http://localhost:7474) we can query our new database. Let's look at the firms that have the greatest number of connections (i.e. the highest degree). To calculate degree we can make a simple query with Cypher. 
@@ -438,7 +438,7 @@ return c0, r, c1; {% include figure.html filename="graph_example.png" caption="Example graph" %} -You can download the data used in this lesson [here](http://jgmackay.com/) (search for the relevant blog posts). +You can download the data used in this lesson [here](https://jgmackay.com/) (search for the relevant blog posts). If you make use of this data, please cite the following in addition to this lesson: Mackay, Jon. 2017. "Canadian Regional and National Business Elites in 1912: Who Was Connected, Who Wasn't and diff --git a/en/lessons/detecting-text-reuse-with-passim.md b/en/lessons/detecting-text-reuse-with-passim.md index 93d55f4c1d..e00ac86bc8 100644 --- a/en/lessons/detecting-text-reuse-with-passim.md +++ b/en/lessons/detecting-text-reuse-with-passim.md @@ -40,7 +40,7 @@ The following list includes just some of the libraries available that perform au - [Basic Local Alignment Search Tool (BLAST)](https://blast.ncbi.nlm.nih.gov/Blast.cgi) - [Tesserae](https://github.com/tesserae/tesserae) (PHP, Perl) - [TextPAIR (Pairwise Alignment for Intertextual Relations)](https://github.com/ARTFL-Project/text-pair) -- [Passim](https://github.com/dasmiq/passim) (Scala) developed by [David Smith](http://www.ccs.neu.edu/home/dasmith/ +- [Passim](https://github.com/dasmiq/passim) (Scala) developed by [David Smith](https://www.ccs.neu.edu/home/dasmith/ ) (Northeastern University) For this tutorial we chose the Passim library for three main reasons. Firstly, it can be adapted to a variety of use cases as it works well on a small text collection as well as on a large-scale corpus. Secondly, while the documentation for Passim is extensive, because of its relatively advanced user audience, a more user-centered step-by-step tutorial about detecting text reuse with Passim would be beneficial to the user community. 
Lastly, the following examples illustrate the variety of scenarios in which text reuse is a useful methodology: @@ -204,7 +204,7 @@ export PATH="/home/simon/Passim/bin:$PATH" ### Installing Spark -1. Navigate to the [download section](http://spark.apache.org/downloads) of the Spark website and select Spark release version '3.x.x' (where '*x*' means any version that starts with '3.'), and package type 'Pre-built for Apache Hadoop 2.7' from the dropdown menus. +1. Navigate to the [download section](https://spark.apache.org/downloads) of the Spark website and select Spark release version '3.x.x' (where '*x*' means any version that starts with '3.'), and package type 'Pre-built for Apache Hadoop 2.7' from the dropdown menus. 2. Extract the compressed binaries to a directory of your choice (e.g. `/Applications`): ```bash @@ -252,7 +252,7 @@ If the command above returns `1.8.0_252` or similar, then you have Java Developm ### Compiling Passim from the Sources -Refer to the [compilation instructions for macOS](#compiling-passim-from-the-sources-(macOS)), as they are the same for the Linux environment. +Refer to the [compilation instructions for macOS](#compiling-passim-from-the-sources-macos), as they are the same for the Linux environment. ### Installing Spark @@ -402,7 +402,7 @@ Ultimately, what constitutes a document, and how these documents should be divid ## Basic JSON format -The input format for Passim consists of JSON documents in the [JSON lines format](http://jsonlines.org/) (i.e. each line of text contains a single JSON document). +The input format for Passim consists of JSON documents in the [JSON lines format](https://jsonlines.org/) (i.e. each line of text contains a single JSON document). 
The following file content for a file named `test.json` illustrates a minimal example of the input format for Passim: @@ -647,7 +647,7 @@ The following example illustrates a step-by-step approach to troubleshooting thi Passim has failed to recognize the coordinate field as containing integer values and it has interpreted as a long data type. At this point, we need to change the type of the sub-fields of `coords` (i.e. `h`, `w`, `x`, and `y`) from `"type": "long"` to `"type": "integer"`. This type mismatch needs to be fixed, otherwise Passim will treat `int` values as if they were `long`, thus potentially leading to issues or inconsistencies in the generated output. -We can now save the schema for later into a new file (`passim.schema`) for later use. This schema is needed when processing the input data provided for [the second case study](#case-study-2:-text-reuse-in-a-large-corpus-of-historical-newspapers) presented in this lesson. +We can now save the schema for later into a new file (`passim.schema`) for later use. This schema is needed when processing the input data provided for [the second case study](#case-study-2-text-reuse-in-a-large-corpus-of-historical-newspapers) presented in this lesson. # Running Passim @@ -705,9 +705,9 @@ You are now ready to go forward with your first text reuse project. >>> SPARK_SUBMIT_ARGS='--master local[12] --driver-memory 8G --executor-memory 4G' passim passim_in.json passim_output_bible/ ``` -For now, do not worry about the additional arguments `SPARK_SUBMIT_ARGS='--master local[12] --driver-memory 8G --executor-memory 4G'`; in the section ["Case Study 2"](#case-study-2:-text-reuse-in-a-large-corpus-of-historical-newspapers) we will explain them in detail. +For now, do not worry about the additional arguments `SPARK_SUBMIT_ARGS='--master local[12] --driver-memory 8G --executor-memory 4G'`; in the section ["Case Study 2"](#case-study-2-text-reuse-in-a-large-corpus-of-historical-newspapers) we will explain them in detail. 
-This test case takes approximatively eight minutes on a recent laptop with eight threads. You can also follow the progress of the detection at http://localhost:4040 — an interactive dashboard created by Spark (Note: the dashboard will shut down as soon as Passim has finished running). +This test case takes approximately eight minutes on a recent laptop with eight threads. You can also follow the progress of the detection at http://localhost:4040 — an interactive dashboard created by Spark (Note: the dashboard will shut down as soon as Passim has finished running). ## Case study 2: Text Reuse in a large corpus of historical newspapers @@ -895,15 +895,15 @@ MR gratefully acknowledges the financial support of the Swiss National Science F # Bibliography -1. Greta Franzini, Maria Moritz, Marco Büchler, Marco Passarotti. Using and evaluating TRACER for an Index fontium computatus of the Summa contra Gentiles of Thomas Aquinas. In *Proceedings of the Fifth Italian Conference on Computational Linguistics (CLiC-it 2018)*. (2018). [Link](http://ceur-ws.org/Vol-2253/paper22.pdf) -2. David A. Smith, Ryan Cordell, Abby Mullen. Computational Methods for Uncovering Reprinted Texts in Antebellum Newspapers. *American Literary History* **27**, E1–E15 Oxford University Press, 2015. [Link](http://dx.doi.org/10.1093/alh/ajv029) -3. Ryan Cordell. Reprinting Circulation, and the Network Author in Antebellum Newspapers. *American Literary History* **27**, 417–445 Oxford University Press (OUP), 2015. [Link](http://dx.doi.org/10.1093/alh/ajv028) -4. Daniel Vogler, Linards Udris, Mark Eisenegger. Measuring Media Content Concentration at a Large Scale Using Automated Text Comparisons. *Journalism Studies* **0**, 1–20 Taylor & Francis, 2020. [Link](http://dx.doi.org/10.1080/1461670x.2020.1761865) +1. Greta Franzini, Maria Moritz, Marco Büchler, Marco Passarotti. Using and evaluating TRACER for an Index fontium computatus of the Summa contra Gentiles of Thomas Aquinas. 
In *Proceedings of the Fifth Italian Conference on Computational Linguistics (CLiC-it 2018)*. (2018). [Link](https://ceur-ws.org/Vol-2253/paper22.pdf) +2. David A. Smith, Ryan Cordell, Abby Mullen. Computational Methods for Uncovering Reprinted Texts in Antebellum Newspapers. *American Literary History* **27**, E1–E15 Oxford University Press, 2015. [Link](https://dx.doi.org/10.1093/alh/ajv029) +3. Ryan Cordell. Reprinting Circulation, and the Network Author in Antebellum Newspapers. *American Literary History* **27**, 417–445 Oxford University Press (OUP), 2015. [Link](https://dx.doi.org/10.1093/alh/ajv028) +4. Daniel Vogler, Linards Udris, Mark Eisenegger. Measuring Media Content Concentration at a Large Scale Using Automated Text Comparisons. *Journalism Studies* **0**, 1–20 Taylor & Francis, 2020. [Link](https://dx.doi.org/10.1080/1461670x.2020.1761865) 5. Lincoln Mullen. textreuse: Detect Text Reuse and Document Similarity. (2016). [Link](https://github.com/ropensci/textreuse) -6. Marco Büchler, Philip R. Burns, Martin Müller, Emily Franzini, Greta Franzini. Towards a Historical Text Re-use Detection. 221–238 In *Text Mining: From Ontology Learning to Automated Text Processing Applications*. Springer International Publishing, 2014. [Link](http://dx.doi.org/10.1007/978-3-319-12655-5_11) -7. Paul Vierthaler, Meet Gelein. A BLAST-based, Language-agnostic Text Reuse Algorithm with a MARKUS Implementation and Sequence Alignment Optimized for Large Chinese Corpora. *Journal of Cultural Analytics* (2019). [Link](http://dx.doi.org/10.22148/16.034) +6. Marco Büchler, Philip R. Burns, Martin Müller, Emily Franzini, Greta Franzini. Towards a Historical Text Re-use Detection. 221–238 In *Text Mining: From Ontology Learning to Automated Text Processing Applications*. Springer International Publishing, 2014. [Link](https://dx.doi.org/10.1007/978-3-319-12655-5_11) +7. Paul Vierthaler, Meet Gelein. 
A BLAST-based, Language-agnostic Text Reuse Algorithm with a MARKUS Implementation and Sequence Alignment Optimized for Large Chinese Corpora. *Journal of Cultural Analytics* (2019). [Link](https://dx.doi.org/10.22148/16.034) 8. Aleksi Vesanto, Asko Nivala, Heli Rantala, Tapio Salakoski, Hannu Salmi, Filip Ginter. Applying BLAST to Text Reuse Detection in Finnish Newspapers and Journals, 1771-1910. 54–58 In *Proceedings of the NoDaLiDa 2017 Workshop on Processing Historical Language*. Linköping University Electronic Press, 2017. [Link](https://aclanthology.org/W17-0510.pdf) 9. Hannu Salmi, Heli Rantala, Aleksi Vesanto, Filip Ginter. The long-term reuse of text in the Finnish press, 1771–1920. **2364**, 394–544 In *CEUR Workshop Proceedings*. (2019). -10. Axel J Soto, Abidalrahman Mohammad, Andrew Albert, Aminul Islam, Evangelos Milios, Michael Doyle, Rosane Minghim, Maria Cristina de Oliveira. Similarity-Based Support for Text Reuse in Technical Writing. 97–106 In *Proceedings of the 2015 ACM Symposium on Document Engineering*. ACM, 2015. [Link](http://dx.doi.org/10.1145/2682571.2797068) +10. Axel J Soto, Abidalrahman Mohammad, Andrew Albert, Aminul Islam, Evangelos Milios, Michael Doyle, Rosane Minghim, Maria Cristina de Oliveira. Similarity-Based Support for Text Reuse in Technical Writing. 97–106 In *Proceedings of the 2015 ACM Symposium on Document Engineering*. ACM, 2015. [Link](https://doi.org/10.1145/2682571.2797068) 11. Alexandra Schofield, Laure Thompson, David Mimno. Quantifying the Effects of Text Duplication on Semantic Models. 2737–2747 In *Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing*. Association for Computational Linguistics, 2017. [https://doi.org/10.18653/v1/D17-1290](https://perma.cc/KSK6-5TXP) 12. Matteo Romanello, Aurélien Berra, Alexandra Trachsel. Rethinking Text Reuse as Digital Classicists. *Digital Humanities conference*, 2014. 
[Link](https://web.archive.org/web/20140829121705/https://wiki.digitalclassicist.org/Text_Reuse) diff --git a/en/lessons/displaying-georeferenced-map-knightlab-storymap-js.md b/en/lessons/displaying-georeferenced-map-knightlab-storymap-js.md index ffe4b03d3c..d6209f2aed 100644 --- a/en/lessons/displaying-georeferenced-map-knightlab-storymap-js.md +++ b/en/lessons/displaying-georeferenced-map-knightlab-storymap-js.md @@ -25,7 +25,7 @@ abstract: In this lesson, you will learn how to display a georeferenced map from # Lesson Goals -Georeferencing is the process of assigning geographic coordinates to a scanned map or raster image. Many historians are now [georeferencing](https://perma.cc/5CLB-HNXN) historical maps to study how places have changed over time. After georeferencing a historic map, you may want to display it online to share your research or tell a story. In this lesson, we will show you how to share a georeferenced map online using StoryMap JS, an interactive web-based mapping platform. [StoryMap JS](https://perma.cc/6PNW-KTM3) is an open-source tool from Knight Lab at Northwestern University, which allows you to integrate historical and contemporary maps into interactive stories. If you haven’t yet created a georeferenced map, the *Programming Historian* lessons on [Georeferencing in QGIS 2.0](/lessons/georeferencing-qgis) and [Introduction to Map Warper](/en/lessons/introduction-map-warper) offer detailed tutorials. +Georeferencing is the process of assigning geographic coordinates to a scanned map or raster image. Many historians are now [georeferencing](https://perma.cc/5CLB-HNXN) historical maps to study how places have changed over time. After georeferencing a historic map, you may want to display it online to share your research or tell a story. In this lesson, we will show you how to share a georeferenced map online using StoryMap JS, an interactive web-based mapping platform. 
[StoryMap JS](https://perma.cc/6PNW-KTM3) is an open-source tool from Knight Lab at Northwestern University, which allows you to integrate historical and contemporary maps into interactive stories. If you haven’t yet created a georeferenced map, the *Programming Historian* lessons on [Georeferencing in QGIS 2.0](/en/lessons/georeferencing-qgis) and [Introduction to Map Warper](/en/lessons/introduction-map-warper) offer detailed tutorials. # Creating your narrative with StoryMap JS @@ -58,7 +58,7 @@ To access the StoryMap JS authoring tool, go to the [StoryMap JS website](https: {% include figure.html filename="make_storymapjs.png" caption="Story Map JS: Make a StoryMap." %} -You will be prompted to login with a Google account. If you don’t have one, you can create one at [gmail.com](http://gmail.com). +You will be prompted to login with a Google account. If you don’t have one, you can create one at [gmail.com](https://gmail.com). {% include figure.html filename="gmail_signin.png" caption="StoryMap JS: Sign in with Google." %} diff --git a/en/lessons/downloading-multiple-records-using-query-strings.md b/en/lessons/downloading-multiple-records-using-query-strings.md index cc2bbe44dd..4aa938051d 100644 --- a/en/lessons/downloading-multiple-records-using-query-strings.md +++ b/en/lessons/downloading-multiple-records-using-query-strings.md @@ -16,11 +16,11 @@ exclude_from_check: activity: acquiring topics: [web-scraping] abstract: "Downloading a single record from a website is easy, but downloading many records at a time – an increasingly frequent need for a historian – is much more efficient using a programming language such as Python. In this lesson, we will write a program that will download a series of records from the Old Bailey Online using custom search criteria, and save them to a directory on our computer." 
-previous: output-keywords-in-context-in-html-file +previous: /en/lessons/output-keywords-in-context-in-html-file series_total: 15 lessons sequence: 15 python_warning: false -redirect_from: /lessons/downloading-multiple-records-using-query-strings +redirect_from: /lessons/downloading-multiple-records-using-query-strings/ avatar_alt: Figures working in a mine, pushing carts doi: 10.46430/phen0005 --- @@ -1166,20 +1166,20 @@ have recently released an API and the documentation can be quite helpful: - Old Bailey Online API - () -- Python Best way to create directory if it doesn’t exist for file write? () - - - [Old Bailey Online]: http://www.oldbaileyonline.org/ - [Automated Downloading with WGET]: /lessons/automated-downloading-with-wget - [Benjamin Bowsey’s case]: http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33 - [advanced search form]: http://www.oldbaileyonline.org/forms/formMain.jsp - [Viewing HTML Files]: /lessons/viewing-html-files - [Working with Webpages]: /lessons/working-with-web-pages - [From HTML to a List of Words 2]: /lessons/from-html-to-list-of-words-2 - [range]: https://docs.python.org/3/tutorial/controlflow.html#the-range-function - [regular expressions]: https://docs.python.org/3/library/re.html - [Counting Frequencies]: /lessons/counting-frequencies - [time out]: http://www.checkupdown.com/status/E408.html - [Python Programming Basics]: /lessons/introduction-and-installation - [try / except]: http://docs.python.org/tutorial/errors.html + () +- Python Best way to create directory if it doesn’t exist for file write? 
() + + +- [Old Bailey Online](https://www.oldbaileyonline.org/) +- [Automated Downloading with WGET](/en/lessons/automated-downloading-with-wget) +- [Benjamin Bowsey’s case](https://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33) +- [advanced search form](https://www.oldbaileyonline.org/forms/formMain.jsp) +- [Viewing HTML Files](/en/lessons/viewing-html-files) +- [Working with Webpages](/en/lessons/working-with-web-pages) +- [From HTML to a List of Words 2](/en/lessons/from-html-to-list-of-words-2) +- [range](https://docs.python.org/3/tutorial/controlflow.html#the-range-function) +- [regular expressions](https://docs.python.org/3/library/re.html) +- [Counting Frequencies](/en/lessons/counting-frequencies) +- [time out](https://www.checkupdown.com/status/E408.html) +- [Python Programming Basics](/en/lessons/introduction-and-installation) +- [try / except](https://docs.python.org/tutorial/errors.html) \ No newline at end of file diff --git a/en/lessons/editing-audio-with-audacity.md b/en/lessons/editing-audio-with-audacity.md index b1b1b92cac..31cab32dde 100755 --- a/en/lessons/editing-audio-with-audacity.md +++ b/en/lessons/editing-audio-with-audacity.md @@ -14,7 +14,7 @@ activity: transforming topics: [data-manipulation] abstract: "In this lesson you will learn how to use Audacity to load, record, edit, mix, and export audio files." review-ticket: https://github.com/programminghistorian/ph-submissions/issues/15 -redirect_from: /lessons/editing-audio-with-audacity +redirect_from: /lessons/editing-audio-with-audacity/ avatar_alt: Two gramophones facing each other doi: 10.46430/phen0050 --- @@ -29,7 +29,7 @@ doi: 10.46430/phen0050 For those interested in audio, basic sound editing skills go a long way. 
Being able to handle and manipulate the materials can help you take control of your object of study: you can zoom in and extract particular moments to analyze, process the audio, and upload the materials to a server to compliment a blog post on the topic. On a more practical level, these skills could also allow you to record and package recordings of yourself or others for distribution. That guest lecture taking place in your department? Record it and edit it yourself! Doing so is a lightweight way to distribute resources among various institutions, and it also helps make the materials more accessible for readers and listeners with a wide variety of learning needs. -In this lesson you will learn how to use *[Audacity](http://audacityteam.org/)* to load, record, edit, mix, and export audio files. Sound editing platforms are often expensive and offer extensive capabilities that can be overwhelming to the first-time user, but *Audacity* is a free and open source alternative that offers powerful capabilities for sound editing with a low barrier for entry. +In this lesson you will learn how to use *[Audacity](https://audacityteam.org/)* to load, record, edit, mix, and export audio files. Sound editing platforms are often expensive and offer extensive capabilities that can be overwhelming to the first-time user, but *Audacity* is a free and open source alternative that offers powerful capabilities for sound editing with a low barrier for entry. For this lesson we will work with two audio files: a recording of [Bach's Goldberg Variations]({{ root_url }}/assets/editing-audio-with-audacity/bach-goldberg-variations.mp3) and another recording of your own voice that will be made in the course of the lesson. @@ -41,7 +41,7 @@ First, download the necessary files. You will need the [mp3 file of Bach's Goldberg Variations]({{ root_url }}/assets/editing-audio-with-audacity/bach-goldberg-variations.mp3). 
To download, right click [here]({{ root_url }}/assets/editing-audio-with-audacity/bach-goldberg-variations.mp3) and select 'Save Link As' to download the file onto your computer as an MP3. -Next, download and install *Audacity*, which is available on [the project site](http://audacityteam.org/). *Audacity* can be used on Mac OSX, Windows, or Linux. +Next, download and install *Audacity*, which is available on [the project site](https://audacityteam.org/). *Audacity* can be used on Mac OSX, Windows, or Linux. Download the program and double-click to install. @@ -53,7 +53,7 @@ The interface will change to reflect the loaded data: *Audacity* converts your sound into a waveform, a commonly used mode for representing sound. The x-axis represents time as seconds (or minutes and seconds, depending on the length of the clip). The beginning of the sound occurs at the far left of the interface, and *Audacity* ticks off periodic time markers as the wave continues to the right. If we click the play button, *Audacity* will move from left to right over the sound, with a vertical line representing our currrent point in the clip. -The y-axis represents amplitude, what we experience as loudness or volume. By default, the y-axis measures volume on a vertical linear scale from -1 to 1: the -1 and 1 extremes represent the loudest possible recorded sound without distortion, while 0 represents silence. So silence begins as a flat line, and the sound will get taller and deeper as it increases in intensity. For more information on why some of the numbers are negative, check out Jeffrey Hass' very short [primer to acoustics](http://web.archive.org/web/20161119231053/http://www.indiana.edu:80/~emusic/acoustics/amplitude.htm). +The y-axis represents amplitude, what we experience as loudness or volume. 
By default, the y-axis measures volume on a vertical linear scale from -1 to 1: the -1 and 1 extremes represent the loudest possible recorded sound without distortion, while 0 represents silence. So silence begins as a flat line, and the sound will get taller and deeper as it increases in intensity. For more information on why some of the numbers are negative, check out Jeffrey Hass' very short [primer to acoustics](https://web.archive.org/web/20161119231053/https://www.indiana.edu:80/~emusic/acoustics/amplitude.htm). *Audacity*'s representation of time and amplitude are your first and easiest point of reference for editing sound, and the tool offers handy ways to navigate around them. I keep calling this a wave, but it doesn't look all that much like one just yet. Let's take a closer look by selecting a piece of the audio track. @@ -89,7 +89,7 @@ You will be presented with something that looks like this: Our original Bach recording stays at the top of the interface, while our new recording gets added below it. By default, *Audacity* will not overwrite your previous recording. Instead, it isolates both soundstreams, or tracks, allowing us to manipulate separate components before we mix them together in a final recording. We can make changes to one without affecting the other. Note how, time-wise, the new track by default was recorded at the beginning of the audacity project. For right now, the Bach and vocal tracks both begin at the same time. There are potentially some other imperfections in your unique recording, some of which we can fix. -Finally, note how in my example there are two waveforms for the Bach recording but only one for the recording of my own voice. The Bach recording was made in *stereo*, meaning there were two input feeds, while my own recording was made in *mono*. Audacity allows you to record in both, and either one will work for this lesson, so don't worry if your recording appears in stereo. 
You can change from mono to stereo recording and vice versa from the 'Edit' toolbar, accessible from the 'Toolbars' portion of the 'View' menu. For more information on mono vs stereo, check out this [reading](http://www.diffen.com/difference/Mono_vs_Stereo). +Finally, note how in my example there are two waveforms for the Bach recording but only one for the recording of my own voice. The Bach recording was made in *stereo*, meaning there were two input feeds, while my own recording was made in *mono*. Audacity allows you to record in both, and either one will work for this lesson, so don't worry if your recording appears in stereo. You can change from mono to stereo recording and vice versa from the 'Edit' toolbar, accessible from the 'Toolbars' portion of the 'View' menu. For more information on mono vs stereo, check out this [reading](https://www.diffen.com/difference/Mono_vs_Stereo). An aside: it can frequently be helpful to turn your laptop's sound output into its input, so that you can record the sounds playing from your computer without worrying about extraneous noise from the outside world or to rerecord digital audio. For information on how to carry out this process, check out [Soundflower](https://github.com/mattingalls/Soundflower). @@ -143,7 +143,7 @@ But we will eventually want to transition the track's focus away from the intro * Selecting "Crossfade Tracks" from the Effect menu will tell Audacity to fade out the top track while fading in the bottom track - the positioning of the tracks matters in this case. -*Audacity* will prompt you with options for your crossfade, but for now it is fine to go with the default setting of "Constant Gain." 
This setting ensures that both tracks will fade in or linearly (for more information, check out the *Audacity* [documentation on crossfades](http://manual.audacityteam.org/man/crossfade_tracks.html)) +*Audacity* will prompt you with options for your crossfade, but for now it is fine to go with the default setting of "Constant Gain." This setting ensures that both tracks will fade in or out linearly (for more information, check out the *Audacity* [documentation on crossfades](https://manual.audacityteam.org/man/crossfade_tracks.html)) {% include figure.html filename="editing-audio-with-audacity-13.png" caption="Post-crossfade" %} @@ -157,6 +157,6 @@ By default, everything you do in *Audacity* is saved in the tool's own filetype, Doing so will mix the multiple tracks down to a single audio file and give you the opportunity to provide your work with metadata. -There are a range of different options for you to refine the exporting process, but the most important is "File Type." MP3 and Ogg are good options for audio meant to be displayed on the web, as they both compress the files so that they will be quicker to load. For best results, you can actually include both formats and only display the one as a fallback when one is not supported by a user's web browser. For more information, *NCH Software* provides a [good technical breakdown of the different options](http://www.nch.com.au/acm/formats.html), while Jonathan Sterne has done [fascinating work](https://www.dukeupress.edu/mp3) on the cultural implications of such format decisions. And the W3Schools offer a [good comparison](http://www.w3schools.com/html/html5_audio.asp) of these file formats for use in web development. +There are a range of different options for you to refine the exporting process, but the most important is "File Type." MP3 and Ogg are good options for audio meant to be displayed on the web, as they both compress the files so that they will be quicker to load.
For best results, you can actually include both formats and only display the one as a fallback when one is not supported by a user's web browser. For more information, *NCH Software* provides a [good technical breakdown of the different options](https://www.nch.com.au/acm/formats.html), while Jonathan Sterne has done [fascinating work](https://www.dukeupress.edu/mp3) on the cultural implications of such format decisions. And the W3Schools offer a [good comparison](https://www.w3schools.com/html/html5_audio.asp) of these file formats for use in web development. Congratulations! You have successfully produced a baby podcast. It might not seem like much, but I frequently employ this same bag of tricks for presentations, websites, and scholarship. This lesson has by no means begun to exhaust the many topics under that umbrella. But it should have given you some basic tools useful to working with sound in digital humanities projects. diff --git a/en/lessons/exploring-and-analyzing-network-data-with-python.md b/en/lessons/exploring-and-analyzing-network-data-with-python.md index 0b2a6ff95a..fb8f34fa17 100755 --- a/en/lessons/exploring-and-analyzing-network-data-with-python.md +++ b/en/lessons/exploring-and-analyzing-network-data-with-python.md @@ -22,7 +22,7 @@ activity: analyzing topics: [network-analysis, data-visualization] date: 2017-08-23 abstract: "This lesson introduces network metrics and how to draw conclusions from them when working with humanities data. You will learn how to use the NetworkX Python package to produce and work with these network statistics." 
-redirect_from: /lessons/exploring-and-analyzing-network-data-with-python +redirect_from: /lessons/exploring-and-analyzing-network-data-with-python/ avatar_alt: Train tracks intersecting doi: 10.46430/phen0064 modified: 2023-08-25 @@ -37,23 +37,23 @@ tested-date: 2023-08-21 ## Lesson Goals In this tutorial, you will learn: -- To use the [**NetworkX**](https://perma.cc/P9PX-GUE6) package for working with network data in [**Python**](/lessons/introduction-and-installation); and +- To use the [**NetworkX**](https://perma.cc/P9PX-GUE6) package for working with network data in [**Python**](/en/lessons/introduction-and-installation); and - To analyze humanities network data to find: - Network structure and path lengths, - Important or central nodes, and - Communities and subgroups -**n.b.:** This is a tutorial for exploring network statistics and metrics. We will therefore focus on ways to analyze, and draw conclusions from, networks without visualizing them. You'll likely want a combination of visualization and network metrics in your own project, and so we recommend this article as a companion to [this earlier Programming Historian tutorial](/lessons/creating-network-diagrams-from-historical-sources). +**n.b.:** This is a tutorial for exploring network statistics and metrics. We will therefore focus on ways to analyze, and draw conclusions from, networks without visualizing them. You'll likely want a combination of visualization and network metrics in your own project, and so we recommend this article as a companion to [this earlier Programming Historian tutorial](/en/lessons/creating-network-diagrams-from-historical-sources). 
## Prerequisites This tutorial assumes that you have: -- a basic familiarity with networks and/or have read ["From Hermeneutics to Data to Networks: Data Extraction and Network Visualization of Historical Sources"](/lessons/creating-network-diagrams-from-historical-sources) by Martin Düring here on *Programming Historian*; -- Installed Python 3, not the Python 2 that is installed natively in Unix-based operating systems such as Macs (If you need assistance installing Python 3, check out the [Hitchhiker's Guide to Python](http://docs.python-guide.org/en/latest/starting/installation/)); and +- a basic familiarity with networks and/or have read ["From Hermeneutics to Data to Networks: Data Extraction and Network Visualization of Historical Sources"](/en/lessons/creating-network-diagrams-from-historical-sources) by Martin Düring here on *Programming Historian*; +- Installed Python 3, not the Python 2 that is installed natively in Unix-based operating systems such as Macs (If you need assistance installing Python 3, check out the [Hitchhiker's Guide to Python](https://docs.python-guide.org/en/latest/starting/installation/)); and - Installed the `pip` package installer.[^pipinstall] -It's possible to have two versions of Python (2 *and* 3) installed on your computer at one time. For this reason, when accessing Python 3 you will often have to explicitly declare it by typing `python3` and `pip3` instead of simply `python` and `pip`. Check out the *Programming Historian* tutorials on [installing Python](/lessons/introduction-and-installation) and [working with pip](/lessons/installing-python-modules-pip) for more information. +It's possible to have two versions of Python (2 *and* 3) installed on your computer at one time. For this reason, when accessing Python 3 you will often have to explicitly declare it by typing `python3` and `pip3` instead of simply `python` and `pip`. 
Check out the *Programming Historian* tutorials on [installing Python](/en/lessons/introduction-and-installation) and [working with pip](/en/lessons/installing-python-modules-pip) for more information. ## What might you learn from network data? @@ -69,13 +69,13 @@ This tutorial will help you answer questions such as: Before there were Facebook friends, there was the Society of Friends, known as the Quakers. Founded in England in the mid-seventeenth century, the Quakers were Protestant Christians who dissented from the official Church of England and promoted broad religious toleration, preferring Christians' supposed "inner light" and consciences to state-enforced orthodoxy. Quakers' numbers grew rapidly in the mid- to late-seventeenth century and their members spread through the British Isles, Europe, and the New World colonies---especially Pennsylvania, founded by Quaker leader William Penn and the home of your four authors. -Since scholars have long linked Quakers' growth and endurance to the effectiveness of their networks, the data used in this tutorial is a list of names and relationships among the earliest seventeenth-century Quakers. This dataset is derived from the *[Oxford Dictionary of National Biography](http://www.oxforddnb.com)* and from the ongoing work of the *[Six Degrees of Francis Bacon](http://www.sixdegreesoffrancisbacon.com)* project, which is reconstructing the social networks of early modern Britain (1500-1700). +Since scholars have long linked Quakers' growth and endurance to the effectiveness of their networks, the data used in this tutorial is a list of names and relationships among the earliest seventeenth-century Quakers. This dataset is derived from the *[Oxford Dictionary of National Biography](https://www.oxforddnb.com)* and from the ongoing work of the *[Six Degrees of Francis Bacon](https://www.sixdegreesoffrancisbacon.com)* project, which is reconstructing the social networks of early modern Britain (1500-1700). 
# Data Prep and NetworkX Installation Before beginning this tutorial, you will need to download two files that together constitute our network dataset. The file quakers_nodelist.csv is a list of early modern Quakers (nodes) and the file quakers_edgelist.csv is a list of relationships between those Quakers (edges). To download these files, simply right-click on the links and select "Save Link As...". -It will be extremely helpful to familiarize yourself with the structure of the dataset before continuing. For more on the general structure of network datasets, see [this tutorial](/lessons/creating-network-diagrams-from-historical-sources#developing-a-coding-scheme). When you open the node file in the program of your choice, you will see that each Quaker is primarily identified by their name. Each Quaker node also has a number of associated attributes including historical significance, gender, birth/death dates, and SDFB ID---a unique numerical identifier that will enable you to cross-reference nodes in this dataset with the original *Six Degrees of Francis Bacon* dataset, if desired. Here are the first few lines: +It will be extremely helpful to familiarize yourself with the structure of the dataset before continuing. For more on the general structure of network datasets, see [this tutorial](/en/lessons/creating-network-diagrams-from-historical-sources#developing-a-coding-scheme). When you open the node file in the program of your choice, you will see that each Quaker is primarily identified by their name. Each Quaker node also has a number of associated attributes including historical significance, gender, birth/death dates, and SDFB ID---a unique numerical identifier that will enable you to cross-reference nodes in this dataset with the original *Six Degrees of Francis Bacon* dataset, if desired. 
Here are the first few lines: ``` Name,Historical Significance,Gender,Birthdate,Deathdate,ID @@ -99,9 +99,9 @@ George Keith,Franciscus Mercurius van Helmont George Keith,William Penn ``` -Now that you've downloaded the Quaker data and had a look at how it's structured, it's time to begin working with that data in Python. Once both Python and pip are installed (see Prerequisites, above) you'll want to install NetworkX, by typing this into your [command line](/lessons/intro-to-bash):[^pip] +Now that you've downloaded the Quaker data and had a look at how it's structured, it's time to begin working with that data in Python. Once both Python and pip are installed (see Prerequisites, above) you'll want to install NetworkX, by typing this into your [command line](/en/lessons/intro-to-bash):[^pip] -``` +```python pip3 install networkx==3.1 ``` @@ -113,7 +113,7 @@ And that's it! You're ready to start coding. ## Reading files, importing data -Start a new, blank plaintext file in the same directory as your data files called `quaker_network.py` (For more details on installing and running Python, see [this tutorial](/lessons/mac-installation)). At the top of that file, import the libraries you need. You'll need three libraries---the one we just installed, and two built-in Python libraries. You can type: +Start a new, blank plaintext file in the same directory as your data files called `quaker_network.py` (For more details on installing and running Python, see [this tutorial](/en/lessons/mac-installation)). At the top of that file, import the libraries you need. You'll need three libraries---the one we just installed, and two built-in Python libraries. 
You can type: ```python import csv @@ -137,7 +137,7 @@ with open('quakers_edgelist.csv', 'r') as edgecsv: # Open the file edges = [tuple(e) for e in edgereader][1:] # Retrieve the data ``` -This code performs similar functions to the ones in [this tutorial](/lessons/working-with-text-files) but uses the CSV module to load your nodes and edges. You'll go back and get more node information later, but for now you need two things: the full list of nodes and a list of edge pairs (as tuples of nodes).[^slicing] These are the forms NetworkX will need to create a "graph object," a special NetworkX data type you'll learn about in the next section. +This code performs similar functions to the ones in [this tutorial](/en/lessons/working-with-text-files) but uses the CSV module to load your nodes and edges. You'll go back and get more node information later, but for now you need two things: the full list of nodes and a list of edge pairs (as tuples of nodes).[^slicing] These are the forms NetworkX will need to create a "graph object," a special NetworkX data type you'll learn about in the next section. At this stage, before you start using NetworkX, you can do some basic sanity checks to make sure that your data loaded correctly using built-in Python functions and methods. Typing @@ -326,11 +326,11 @@ After seeing what the *dataset* looks like, it's important to see what the *netw The network's shape and basic properties will give you a handle on what you're working with and what analyses seem reasonable. You already know the number of nodes and edges, but what does the network 'look' like? Do nodes cluster together, or are they equally spread out? Are there complex structures, or is every node arranged along a straight line? 
-The visualization below, created in network visualization tool [Gephi](https://gephi.org/), will give you an idea of the topology of this network.[^singletons] You could create a similar graph in Palladio following [this tutorial](/lessons/creating-network-diagrams-from-historical-sources). +The visualization below, created in network visualization tool [Gephi](https://gephi.org/), will give you an idea of the topology of this network.[^singletons] You could create a similar graph in Palladio following [this tutorial](/en/lessons/creating-network-diagrams-from-historical-sources). {% include figure.html filename="exploring-and-analyzing-network-data-with-python-1.png" caption="Force-directed network visualization of the Quaker data, created in Gephi" %} -There are lots of ways to visualize a network, and a [force-directed layout](https://en.wikipedia.org/wiki/Force-directed_graph_drawing), of which the above image is an example, is among the most common. Force-directed graphs attempt to find the optimum placement for nodes with a calculation based on the [tension of springs in Hooke's Law](http://6dfb.tumblr.com/post/159420498411/ut-tensio-sic-vis-introducing-the-hooke-graph), which for smaller graphs often creates clean, easy-to-read visualizations. The visualization embedded above shows you there is a single large **component** of connected nodes (in the center) and several small components with just one or two connections around the edges. This is a fairly common network structure. Knowing that there are multiple components in the network will usefully limit the calculations you'll want to perform on it. By displaying the number of connections (known as **degree**, see below) as the size of nodes, the visualization also shows that there are a few nodes with lots of connections that keep the central component tied together. 
These large nodes are known as **hubs**, and the fact that they show up so clearly here gives you a clue as to what you'll find when you measure **centrality** in the next section. +There are lots of ways to visualize a network, and a [force-directed layout](https://en.wikipedia.org/wiki/Force-directed_graph_drawing), of which the above image is an example, is among the most common. Force-directed graphs attempt to find the optimum placement for nodes with a calculation based on the [tension of springs in Hooke's Law](https://6dfb.tumblr.com/post/159420498411/ut-tensio-sic-vis-introducing-the-hooke-graph), which for smaller graphs often creates clean, easy-to-read visualizations. The visualization embedded above shows you there is a single large **component** of connected nodes (in the center) and several small components with just one or two connections around the edges. This is a fairly common network structure. Knowing that there are multiple components in the network will usefully limit the calculations you'll want to perform on it. By displaying the number of connections (known as **degree**, see below) as the size of nodes, the visualization also shows that there are a few nodes with lots of connections that keep the central component tied together. These large nodes are known as **hubs**, and the fact that they show up so clearly here gives you a clue as to what you'll find when you measure **centrality** in the next section. Visualizations, however, only get you so far. The more networks you work with, the more you realize most appear similar enough that it's hard to tell one from the next. Quantitative metrics let you differentiate networks, learn about their topologies, and turn a jumble of nodes and edges into something you can learn from. @@ -347,7 +347,7 @@ print("Network density:", density) The output of density is a number, so that's what you'll see when you print the value. In this case, the density of our network is approximately 0.0248. 
On a scale of 0 to 1, not a very dense network, which comports with what you can see in the visualization.[^density] A 0 would mean that there are no connections at all, and a 1 would indicate that all *possible* edges are present (a perfectly connected network): this Quaker network is on the lower end of that scale, but still far from 0. -A shortest path measurement is a bit more complex. It calculates the shortest possible series of nodes and edges that stand between any two nodes, something hard to see in large network visualizations. This measure is essentially finding friends-of-friends---if my mother knows someone that I don't, then mom is the shortest path between me and that person. The Six Degrees of Kevin Bacon game, from which [our project](http://sixdegreesoffrancisbacon.com/) takes its name, is basically a game of finding shortest paths (with a **path length** of six or less) from Kevin Bacon to any other actor. +A shortest path measurement is a bit more complex. It calculates the shortest possible series of nodes and edges that stand between any two nodes, something hard to see in large network visualizations. This measure is essentially finding friends-of-friends---if my mother knows someone that I don't, then mom is the shortest path between me and that person. The Six Degrees of Kevin Bacon game, from which [our project](https://sixdegreesoffrancisbacon.com/) takes its name, is basically a game of finding shortest paths (with a **path length** of six or less) from Kevin Bacon to any other actor. To calculate a shortest path, you'll need to pass several input variables (information you give to a Python function): the whole graph, your source node, and your target node. Let's find the shortest path between Margaret Fell and George Whitehead. Since we used names to uniquely identify our nodes in the network, you can access those nodes (as the **source** and **target** of your path), using the names directly. 
@@ -488,7 +488,7 @@ Another common thing to ask about a network dataset is what the subgroups or com Very dense networks are often more difficult to split into sensible partitions. Luckily, as you discovered earlier, this network is not all that dense. There aren't nearly as many actual connections as possible connections, and there are several altogether disconnected components. Its worthwhile partitioning this sparse network with modularity and seeing if the result make historical and analytical sense. -Community detection and partitioning in NetworkX requires a little more setup than some of the other metrics. There are some built-in approaches to community detection (like [minimum cut](https://perma.cc/B6CN-LQX4), but modularity is not included with NetworkX. Fortunately there's an [additional python module](https://github.com/taynaud/python-louvain/) you can use with NetworkX, which you already installed and imported at the beginning of this tutorial. You can read the [full documentation](http://perso.crans.org/aynaud/communities/api.html) for all of the functions it offers, but for most community detection purposes you'll only want `best_partition()`: +Community detection and partitioning in NetworkX requires a little more setup than some of the other metrics. There are some built-in approaches to community detection (like [minimum cut](https://perma.cc/B6CN-LQX4)), but modularity is not included with NetworkX. Fortunately there's an [additional python module](https://github.com/taynaud/python-louvain/) you can use with NetworkX, which you already installed and imported at the beginning of this tutorial.
You can read the [full documentation](https://perso.crans.org/aynaud/communities/api.html) for all of the functions it offers, but for most community detection purposes you'll only want `best_partition()`: ```python communities = community.greedy_modularity_communities(G) @@ -541,7 +541,7 @@ Working with NetworkX alone will get you far, and you can find out a lot about m # Exporting Data -NetworkX supports a very large number of file formats for [data export](https://perma.cc/CYJ5-P6MR). If you wanted to export a plaintext edgelist to load into Palladio, there's a [convenient wrapper](https://perma.cc/MW25-9VMN) for that. Frequently at *Six Degrees of Francis Bacon*, we export NetworkX data in [D3's specialized JSON format](https://perma.cc/454D-C3FS), for visualization in the browser. You could even [export](https://perma.cc/PGS5-SKYC) your graph as a [Pandas dataframe](http://pandas.pydata.org/) if there were more advanced statistical operations you wanted to run. There are lots of options, and if you've been diligently adding all your metrics back into your Graph object as attributes, all your data will be exported in one fell swoop. +NetworkX supports a very large number of file formats for [data export](https://perma.cc/CYJ5-P6MR). If you wanted to export a plaintext edgelist to load into Palladio, there's a [convenient wrapper](https://perma.cc/MW25-9VMN) for that. Frequently at *Six Degrees of Francis Bacon*, we export NetworkX data in [D3's specialized JSON format](https://perma.cc/454D-C3FS), for visualization in the browser. You could even [export](https://perma.cc/PGS5-SKYC) your graph as a [Pandas dataframe](https://pandas.pydata.org/) if there were more advanced statistical operations you wanted to run. There are lots of options, and if you've been diligently adding all your metrics back into your Graph object as attributes, all your data will be exported in one fell swoop. 
Most of the export options work in roughly the same way, so for this tutorial you'll learn how to export your data into Gephi's GEXF format. Once you've exported the file, you can upload it [directly into Gephi](https://gephi.org/quickstart/) for visualization. diff --git a/en/lessons/extracting-illustrated-pages.md b/en/lessons/extracting-illustrated-pages.md index 076bf92ac3..7c249a08fb 100644 --- a/en/lessons/extracting-illustrated-pages.md +++ b/en/lessons/extracting-illustrated-pages.md @@ -35,7 +35,7 @@ To see how many *unillustrated* pages have been filtered out, compare against th {% include figure.html filename="parley-full-thumbnails.png" caption="View of HathiTrust thumbnails for all pages." %} -This lesson shows how complete these filtering and downloading steps for public-domain text volumes held by HathiTrust (HT) and Internet Archive (IA), two of the largest digital libraries in the world. It will be of interest to anyone who wants to create image corpora in order to learn about the history of illustration and the layout (*mise en page*) of books. Visual approaches to digital bibliography are becoming popular, following the pioneering efforts of [EBBA](https://ebba.english.ucsb.edu/) and [AIDA](http://projectaida.org/). Recently completed or funded projects explore ways to [identify footnotes](https://web.archive.org/web/20190526050917/http://culturalanalytics.org/2018/12/detecting-footnotes-in-32-million-pages-of-ecco/) and [track marginalia](http://www.ccs.neu.edu/home/dasmith/ichneumon-proposal.pdf), to give just two [examples](https://www.neh.gov/divisions/odh/grant-news/announcing-new-2017-odh-grant-awards). +This lesson shows how to complete these filtering and downloading steps for public-domain text volumes held by HathiTrust (HT) and Internet Archive (IA), two of the largest digital libraries in the world.
It will be of interest to anyone who wants to create image corpora in order to learn about the history of illustration and the layout (*mise en page*) of books. Visual approaches to digital bibliography are becoming popular, following the pioneering efforts of [EBBA](https://ebba.english.ucsb.edu/) and [AIDA](https://projectaida.org/). Recently completed or funded projects explore ways to [identify footnotes](https://web.archive.org/web/20190526050917/https://culturalanalytics.org/2018/12/detecting-footnotes-in-32-million-pages-of-ecco/) and [track marginalia](https://www.ccs.neu.edu/home/dasmith/ichneumon-proposal.pdf), to give just two [examples](https://www.neh.gov/divisions/odh/grant-news/announcing-new-2017-odh-grant-awards). My own research tries to answer empirical questions about changes in the frequency and mode of illustration in nineteenth-century medical and educational texts. This involves aggregating counts of pictures per book and trying to estimate what printing process was used to make those pictures. A more targeted use case for extracting picture pages might be the collation of illustrations across [different editions](https://www.cambridge.org/core/books/cambridge-companion-to-robinson-crusoe/iconic-crusoe-illustrations-and-images-of-robinson-crusoe/B83352C33FB1A9929A856FFA8E2D0CD0/core-reader) of the same book. Future work might profitably investigate the visual characteristics and *meaning* of the extracted pictures: their color, size, theme, genre, number of figures, and so on. @@ -116,7 +116,7 @@ Anaconda is the leading scientific Python distribution. Its `conda` package mana Download and install [Miniconda](https://conda.io/miniconda.html). Choose the latest stable release of Python 3. If everything goes well, you should be able to run `which conda` (linux/macOS) or `where conda` (Windows) in your shell and see the location of the executable program in the output. 
-Anaconda has a handy [cheat sheet](http://web.archive.org/web/20190115051900/https://conda.io/docs/_downloads/conda-cheatsheet.pdf) for frequently used commands. +Anaconda has a handy [cheat sheet](https://web.archive.org/web/20190115051900/https://conda.io/docs/_downloads/conda-cheatsheet.pdf) for frequently used commands. ### Create an Environment diff --git a/en/lessons/extracting-keywords.md b/en/lessons/extracting-keywords.md index c505eda463..90dea4b80a 100644 --- a/en/lessons/extracting-keywords.md +++ b/en/lessons/extracting-keywords.md @@ -17,7 +17,7 @@ activity: acquiring topics: [data-manipulation] abstract: "This lesson will teach you how to use Python to extract a set of keywords very quickly and systematically from a set of texts." python_warning: false -redirect_from: /lessons/extracting-keywords +redirect_from: /lessons/extracting-keywords/ avatar_alt: Woman churning butter or milk doi: 10.46430/phen0045 --- @@ -30,7 +30,7 @@ doi: 10.46430/phen0045 ## Lesson Goals -If you have a copy of a text in electronic format stored on your computer, it is relatively easy to keyword search for a single term. Often you can do this by using the built-in search features in your favourite text editor. However, scholars are increasingly needing to find instances of many terms within a text or texts. For example, a scholar may want to use a [gazetteer](http://en.wikipedia.org/wiki/Gazetteer) to extract all mentions of English placenames within a collection of texts so that those places can later be plotted on a map. Alternatively, they may want to extract all male given names, all pronouns, [stop words](http://en.wikipedia.org/wiki/Stop_words), or any other set of words. Using those same built-in search features to achieve this more complex goal is time consuming and clunky. This lesson will teach you how to use Python to extract a set of keywords very quickly and systematically from a set of texts. 
+If you have a copy of a text in electronic format stored on your computer, it is relatively easy to keyword search for a single term. Often you can do this by using the built-in search features in your favourite text editor. However, scholars are increasingly needing to find instances of many terms within a text or texts. For example, a scholar may want to use a [gazetteer](https://en.wikipedia.org/wiki/Gazetteer) to extract all mentions of English placenames within a collection of texts so that those places can later be plotted on a map. Alternatively, they may want to extract all male given names, all pronouns, [stop words](https://en.wikipedia.org/wiki/Stop_words), or any other set of words. Using those same built-in search features to achieve this more complex goal is time consuming and clunky. This lesson will teach you how to use Python to extract a set of keywords very quickly and systematically from a set of texts. It is expected that once you have completed this lesson, you will be able to generalise the skills to extract custom sets of keywords from any set of locally saved files. @@ -38,21 +38,21 @@ It is expected that once you have completed this lesson, you will be able to gen This lesson is useful for anyone who works with historical sources that are stored locally on their own computer, and that are transcribed into mutable electronic text (eg, .txt, .xml, .rtf, .md). It is particularly useful for people interested in identifying subsets of documents containing one or more of a fairly large number of keywords. This might be useful for identifying a relevant subset for closer reading, or for extracting and structuring the keywords in a format that can be used in another tool: as input for a mapping exercise, for example. 
-The present tutorial will show users how to extract all mentions of English and Welsh county names from a series of 6,692 mini-biographies of individuals who began their studies at the University of Oxford during the reign of James I of England (1603-1625). These records were transcribed by [British History Online](http://www.british-history.ac.uk/alumni-oxon/1500-1714), from the printed version of *Alumni Oxonienses, 1500-1714*. These biographies contain information about each graduate, which includes the date of their studies and the college(s) they attended. Often entries contain additional information when known, including date or birth and death, the name or occupation of their father, where they originated, and what they went on to do in later life. The biographies are a rich resource, providing reasonably comparable data about a large number of similar individuals (rich men who went to Oxford). The 6,692 entries have been pre-processed by the author and saved to a [CSV file](http://en.wikipedia.org/wiki/Comma-separated_values) with one entry per row. +The present tutorial will show users how to extract all mentions of English and Welsh county names from a series of 6,692 mini-biographies of individuals who began their studies at the University of Oxford during the reign of James I of England (1603-1625). These records were transcribed by [British History Online](https://www.british-history.ac.uk/alumni-oxon/1500-1714), from the printed version of *Alumni Oxonienses, 1500-1714*. These biographies contain information about each graduate, which includes the date of their studies and the college(s) they attended. Often entries contain additional information when known, including date of birth and death, the name or occupation of their father, where they originated, and what they went on to do in later life. The biographies are a rich resource, providing reasonably comparable data about a large number of similar individuals (rich men who went to Oxford).
The 6,692 entries have been pre-processed by the author and saved to a [CSV file](https://en.wikipedia.org/wiki/Comma-separated_values) with one entry per row. -In this tutorial, the dataset involves geographical keywords. Once extracted, these placenames could be geo-referenced to their place on the globe and then mapped using digital mapping. This might make it possible to discern which colleges attracted students from what parts of the country, or to determine if these patterns changed over time. For a practical tutorial on taking this next step, see the lesson by Fred Gibbs mentioned at the end of this lesson. Readers may also be interested in [georeferencing in QGIS 2.0](/lessons/georeferencing-qgis), also available from the *Programming Historian*. +In this tutorial, the dataset involves geographical keywords. Once extracted, these placenames could be geo-referenced to their place on the globe and then mapped using digital mapping. This might make it possible to discern which colleges attracted students from what parts of the country, or to determine if these patterns changed over time. For a practical tutorial on taking this next step, see the lesson by Fred Gibbs mentioned at the end of this lesson. Readers may also be interested in [georeferencing in QGIS 2.0](/en/lessons/georeferencing-qgis), also available from the *Programming Historian*. This approach is not limited to geographical keywords, however. It could also be used to extract given names, prepositions, food words, or any other set of terms defined by the user. This process could therefore be useful for someone seeking to isolate individual entries containing any of these keywords, or for someone looking to calculate the frequency of their keywords within a corpus of texts. This tutorial provides pathways into textual or geospatial analyses, rather than research answers in its own right. 
-Data management skills are increasingly crucial for historians and textual scholars, and anyone working with particularly messy or difficult texts might consider looking into [Cleaning Data with OpenRefine](/lessons/cleaning-data-with-openrefine) by Seth van Hooland et al. The approach outlined in this tutorial is not optimised for working with poorly transcribed texts such as text converted through [Optical Character Recognition](https://en.wikipedia.org/wiki/Optical_character_recognition) if the software has a high error rate. Readers working with early modern texts with non-standardised spelling may also find this approach challenging, as the gazetteer one uses must contain exact matches of the words sought. However, with a good enough gazetteer, this approach could prove quite useful for early modernites, and may exceed what's practical with traditional keyword searching by making [fuzzy searching](https://en.wikipedia.org/wiki/Approximate_string_matching) possible. +Data management skills are increasingly crucial for historians and textual scholars, and anyone working with particularly messy or difficult texts might consider looking into [Cleaning Data with OpenRefine](/en/lessons/cleaning-data-with-openrefine) by Seth van Hooland et al. The approach outlined in this tutorial is not optimised for working with poorly transcribed texts such as text converted through [Optical Character Recognition](https://en.wikipedia.org/wiki/Optical_character_recognition) if the software has a high error rate. Readers working with early modern texts with non-standardised spelling may also find this approach challenging, as the gazetteer one uses must contain exact matches of the words sought. However, with a good enough gazetteer, this approach could prove quite useful for early modernites, and may exceed what's practical with traditional keyword searching by making [fuzzy searching](https://en.wikipedia.org/wiki/Approximate_string_matching) possible. 
-This tutorial assumes that you have already installed Python version 3 on your computer. The lesson will use the Command Line to run Python, as this is more flexible and makes it possible for use in classrooms or computer labs in which students do not have the ability to download and install interactive development environments (IDEs) like Komodo Edit. Readers who would prefer to use an IDE might like to first read [Python Introduction and Installation](/lessons/introduction-and-installation), but this is optional. The tutorial also makes some basic assumptions about your Python skills. It assumes you know what the following Python data structures are (though not knowing will not prevent the code from working should you follow all of the steps in the tutorial): +This tutorial assumes that you have already installed Python version 3 on your computer. The lesson will use the Command Line to run Python, as this is more flexible and makes it possible for use in classrooms or computer labs in which students do not have the ability to download and install interactive development environments (IDEs) like Komodo Edit. Readers who would prefer to use an IDE might like to first read [Python Introduction and Installation](/en/lessons/introduction-and-installation), but this is optional. The tutorial also makes some basic assumptions about your Python skills. 
It assumes you know what the following Python data structures are (though not knowing will not prevent the code from working should you follow all of the steps in the tutorial): * [List](https://docs.python.org/3/tutorial/datastructures.html) * [For Loop](https://docs.python.org/3/tutorial/controlflow.html) * [String](https://docs.python.org/3/library/string.html) -The lesson touches on Regular Expressions, so some readers may find it handy to have the relevant Programming Historian lessons by [Doug Knox](/lessons/understanding-regular-expressions) or [Laura Turner O'Hara](/lessons/cleaning-ocrd-text-with-regular-expressions) open to consult as needed. +The lesson touches on Regular Expressions, so some readers may find it handy to have the relevant Programming Historian lessons by [Doug Knox](/en/lessons/understanding-regular-expressions) or [Laura Turner O'Hara](/en/lessons/cleaning-ocrd-text-with-regular-expressions) open to consult as needed. ## Familiarising yourself with the data @@ -64,7 +64,7 @@ The first step of this process is to take a look at the data that we will be usi Download the dataset and spend a couple of minutes looking at the types of information available. You should notice three columns of information. The first, 'Name', contains the name of the graduate. The second: 'Details', contains the biographical information known about that person. The final column, 'Matriculation Year', contains the year in which the person matriculated (began their studies). This final column was extracted from the details column in the pre-processing stage of this tutorial. The first two columns are as you would find them on the British History Online version of the *Alumni Oxonienses*. If you notice more than three columns then your spreadsheet programme has incorrectly set the [delimiter](https://en.wikipedia.org/wiki/Delimiter) between columns. It should be set to "," (double quotes, comma). 
How you do this depends on your spreadsheet programme, but you should be able to find the solution online. -Most (but not all) of these bibliographic entries contain enough information to tell us what county the graduate came from. Notice that a large number of entries contain placenames that correspond to either major cities ('of London', in the first entry) or English counties ('of Middlesex' in entry 5 or 'of Wilts' - short for Wiltshire in entry 6). If you are not British you may not be familiar with these county names. You can find a list of [historic counties of England](http://en.wikipedia.org/wiki/Historic_counties_of_England) on Wikipedia. +Most (but not all) of these bibliographic entries contain enough information to tell us what county the graduate came from. Notice that a large number of entries contain placenames that correspond to either major cities ('of London', in the first entry) or English counties ('of Middlesex' in entry 5 or 'of Wilts' - short for Wiltshire in entry 6). If you are not British you may not be familiar with these county names. You can find a list of [historic counties of England](https://en.wikipedia.org/wiki/Historic_counties_of_England) on Wikipedia. Unfortunately, the information is not always available in the same format. Sometimes it's the first thing mentioned in an entry. Sometimes it's in the middle. Our challenge is to extract those counties of origin from within this messy text, and store it in a new column next to that person's entry. @@ -116,9 +116,9 @@ Worcestershire Yorkshire ``` -Make sure that there are no blank lines in the gazetteer file. If there are, your program will think all spaces are a matching keyword. Some text editing programs (particularly in Linux) will want to add a blank line at the end of your file. If this is the case, try another text editor. It's best to use software that puts you in control. 
For more on this problem, see [the explanation on Stack Overflow](http://stackoverflow.com/questions/3056740/gedit-adds-line-at-end-of-file) - with thanks to John Levin for the link. +Make sure that there are no blank lines in the gazetteer file. If there are, your program will think all spaces are a matching keyword. Some text editing programs (particularly in Linux) will want to add a blank line at the end of your file. If this is the case, try another text editor. It's best to use software that puts you in control. For more on this problem, see [the explanation on Stack Overflow](https://stackoverflow.com/questions/3056740/gedit-adds-line-at-end-of-file) - with thanks to John Levin for the link. -If you ever need to add to this set of keywords, you can open this file in your text editor and add new words, each on their own line. Komodo Edit is a good text editor for this task, especially if you have set it up to run with Python, but you can also use any plain text editor as long as it is *not* a [word processor](http://en.wikipedia.org/wiki/Word_processor) such as Microsoft Word or Open Office. Word processors are inappropriate for writing code because of how they stylise apostrophes and quotes, causing havoc for your code. +If you ever need to add to this set of keywords, you can open this file in your text editor and add new words, each on their own line. Komodo Edit is a good text editor for this task, especially if you have set it up to run with Python, but you can also use any plain text editor as long as it is *not* a [word processor](https://en.wikipedia.org/wiki/Word_processor) such as Microsoft Word or Open Office. Word processors are inappropriate for writing code because of how they stylise apostrophes and quotes, causing havoc for your code. 
## Loading your texts @@ -158,7 +158,7 @@ The first line is a comment for our own benefit, to tells us (the human) what th The second line opens the `gazetteer.txt` file, and reads it, which is signified by the 'r' (as opposed to 'w' for write, or 'a' for append). That means we will not be changing the contents of the file. Only reading it. -The third line reads everything in that file, converts it to `lower()` case, and splits the contents into a Python list, using the [newline character](http://stackoverflow.com/questions/11497376/new-line-python) as the delimiter. Effectively that means each time the program comes across a new line, it stores it as a new entry. We then save that Python list containing the 39 counties into a variable that we have called `allKeywords`. +The third line reads everything in that file, converts it to `lower()` case, and splits the contents into a Python list, using the [newline character](https://stackoverflow.com/questions/11497376/new-line-python) as the delimiter. Effectively that means each time the program comes across a new line, it stores it as a new entry. We then save that Python list containing the 39 counties into a variable that we have called `allKeywords`. The fourth line closes the open text file. The fifth line prints out the results, and the sixth line tells us how many results were found. @@ -182,7 +182,7 @@ python extractKeywords.py ``` -Once you have run the program you should see your gazetteer printed as a Python list in the command output, along with the number of entries in your list (39). If you can, great! Move on to step 2. If the last line of your output tells you that there was 1 result, that means the code has not worked properly, since we know that there should be 39 keywords in your gazetteer. Double check your code to make sure you havn't included any typos. If you still can't solve the problem, try changing "\n" to "\r" on line three. 
Some text editors will use [carriage returns](http://en.wikipedia.org/wiki/Carriage_return) instead of 'newline characters' when creating a new line. The \r means 'carriage return' and should solve your problem if you're experiencing one. +Once you have run the program you should see your gazetteer printed as a Python list in the command output, along with the number of entries in your list (39). If you can, great! Move on to step 2. If the last line of your output tells you that there was 1 result, that means the code has not worked properly, since we know that there should be 39 keywords in your gazetteer. Double check your code to make sure you haven't included any typos. If you still can't solve the problem, try changing "\n" to "\r" on line three. Some text editors will use [carriage returns](https://en.wikipedia.org/wiki/Carriage_return) instead of 'newline characters' when creating a new line. The \r means 'carriage return' and should solve your problem if you're experiencing one. ### Step 2: Load the texts @@ -206,7 +206,7 @@ If the code worked, you should see a big wall of text. Those are the texts we in ### Step 3: Remove unwanted punctuation -When matching strings, you have to make sure the punctuation doesn't get in the way. Technically, 'London.' is a different string than 'London' or ';London' because of the added punctuation. These three strings which all mean the same thing to us as human readers will be viewed by the computer as distinct entities. To solve that problem, the easiest thing to do is just to remove all of the punctuation. You can do this with [regular expressions](http://en.wikipedia.org/wiki/Regular_expression), and [Doug Knox](/lessons/understanding-regular-expressions) and [Laura Turner O'Hara](/lessons/cleaning-ocrd-text-with-regular-expressions) have provided great introductions at *Programming Historian* for doing so. +When matching strings, you have to make sure the punctuation doesn't get in the way. Technically, 'London.'
is a different string than 'London' or ';London' because of the added punctuation. These three strings which all mean the same thing to us as human readers will be viewed by the computer as distinct entities. To solve that problem, the easiest thing to do is just to remove all of the punctuation. You can do this with [regular expressions](https://en.wikipedia.org/wiki/Regular_expression), and [Doug Knox](/en/lessons/understanding-regular-expressions) and [Laura Turner O'Hara](/en/lessons/cleaning-ocrd-text-with-regular-expressions) have provided great introductions at *Programming Historian* for doing so. To keep things simple, this program will just replace the most common types of punctuation with nothing instead (effectively deleting punctuation). @@ -354,7 +354,7 @@ If you do not like the output format, you can change it by adjusting the second f.write(matchString) f.close() ``` -Note the 'a' instead of the 'r' we used earlier. This 'appends' the text to the file called `output.txt`, which will be saved in your working directory. You will have to take care, because running the program several times will continue to append all of the outputs to this file, creating a very long file. There are ways around this, which we will cover in a moment, and you might consider looking into how the 'w' (write) feature works, and experimenting with output formats. There is more information related to these features in ['Working with Text Files in Python'](/lessons/working-with-text-files). +Note the 'a' instead of the 'r' we used earlier. This 'appends' the text to the file called `output.txt`, which will be saved in your working directory. You will have to take care, because running the program several times will continue to append all of the outputs to this file, creating a very long file. There are ways around this, which we will cover in a moment, and you might consider looking into how the 'w' (write) feature works, and experimenting with output formats. 
There is more information related to these features in ['Working with Text Files in Python'](/en/lessons/working-with-text-files). ## Refining the Gazetteer @@ -441,7 +441,7 @@ There are a few extra lines of code here, but you didn't need to cut and paste a (Error: new-line character seen in unquoted field - do you need to open the file in universal-newline mode?). ``` -To solve this problem, open your CSV file in a spreadsheet program (eg., Excel) and 'Save As' and under format chose 'Windows Comma Separated (csv)'. This should solve the problem. To read more on this issue, see [Stack Overflow](http://stackoverflow.com/questions/17315635/csv-new-line-character-seen-in-unquoted-field-error) +To solve this problem, open your CSV file in a spreadsheet program (e.g., Excel) and 'Save As' and under format choose 'Windows Comma Separated (csv)'. This should solve the problem. To read more on this issue, see [Stack Overflow](https://stackoverflow.com/questions/17315635/csv-new-line-character-seen-in-unquoted-field-error) --- @@ -565,4 +565,4 @@ This approach created longer and more complex code, but the result is a powerful ## Suggested Further Reading -Readers who have completed this lesson might be interested in then geo-referencing the output using the Google API and mapping the results. You can learn more about this process from Fred Gibbs's tutorial, [Extract and Geocode Placenames from a Text File](http://fredgibbs.net/tutorials/extract-geocode-placenames-from-text-file.html). This will let you visualise the practical outputs of this tutorial. Alternatively, readers may be interested in [Jim Clifford et. al's tutorial on georeferencing in QGIS 2.0](/lessons/georeferencing-qgis), an open source [GIS](https://en.wikipedia.org/wiki/Geographic_information_system) program. +Readers who have completed this lesson might be interested in then geo-referencing the output using the Google API and mapping the results.
You can learn more about this process from Fred Gibbs's tutorial, [Extract and Geocode Placenames from a Text File](https://fredgibbs.net/tutorials/extract-geocode-placenames-from-text-file.html). This will let you visualise the practical outputs of this tutorial. Alternatively, readers may be interested in [Jim Clifford et al.'s tutorial on georeferencing in QGIS 2.0](/en/lessons/georeferencing-qgis), an open source [GIS](https://en.wikipedia.org/wiki/Geographic_information_system) program. diff --git a/en/lessons/facial-recognition-ai-python.md b/en/lessons/facial-recognition-ai-python.md index 94bb84a6b2..a475becc0a 100644 --- a/en/lessons/facial-recognition-ai-python.md +++ b/en/lessons/facial-recognition-ai-python.md @@ -475,8 +475,8 @@ Additionally, there are several companies like [Roboflow](https://roboflow.com/) [^2]: Christina Kotchemidova, Why We Say “Cheese”: Producing the Smile in Snapshot Photography," *Critical Studies in Media Communication,* 22 no. 1 (2005): 2-25, [https://www.tandfonline.com/doi/abs/10.1080/0739318042000331853](https://www.tandfonline.com/doi/abs/10.1080/0739318042000331853). [^3]: Paul Viola and Michael Jones, "Rapid object detection using a boosted cascade of simple features," *Proceedings of the 2001 IEEE Computer Society Conference on Computer Vision and Pattern Recognition, CVPR 2001* (2001): 1-9, [https://ieeexplore.ieee.org/document/990517/authors#authors](https://ieeexplore.ieee.org/document/990517/authors#authors). [^4]: Taylor R. Wondergem and Mihaela Friedlmeier, "Gender and Ethnic Differences in Smiling: A Yearbook Photographs Analysis from Kindergarten Through 12th Grade," *Sex Roles* 67, no. 7-8 (2012): 403-411.
[https://doi.org/10.1007/s11199-012-0158-y](https://doi.org/10.1007/s11199-012-0158-y) -[^5]: Joy Buolamwini and Timnit Gebru, "Gender Shades: Intersectional Accuracy Disparities in Commercial Gender Classification," *Proceedings of Machine Learning Research,* 81 (2018): 1–15, [http://proceedings.mlr.press/v81/buolamwini18a/buolamwini18a.pdf](https://perma.cc/F8JT-R9KA). -[^6]: Hu Han and Anil K. Jain, "Age, Gender and Race Estimation from Unconstrained Face Images," (2014) [http://biometrics.cse.msu.edu/Publications/Face/HanJain_UnconstrainedAgeGenderRaceEstimation_MSUTechReport2014.pdf](https://perma.cc/J95Z-89FQ). +[^5]: Joy Buolamwini and Timnit Gebru, "Gender Shades: Intersectional Accuracy Disparities in Commercial Gender Classification," *Proceedings of Machine Learning Research,* 81 (2018): 1–15, [https://proceedings.mlr.press/v81/buolamwini18a/buolamwini18a.pdf](https://perma.cc/F8JT-R9KA). +[^6]: Hu Han and Anil K. Jain, "Age, Gender and Race Estimation from Unconstrained Face Images," (2014) [https://biometrics.cse.msu.edu/Publications/Face/HanJain_UnconstrainedAgeGenderRaceEstimation_MSUTechReport2014.pdf](https://perma.cc/J95Z-89FQ). [^7]: Angela Wang and Olga Russakovsky, "Overwriting Pretrained Bias with Finetuning Data," *2023 IEEE/CVF International Conference on Computer Vision (ICCV), Paris, France* (2023): 3934-3945, [https://openaccess.thecvf.com/content/ICCV2023/papers/Wang_Overwriting_Pretrained_Bias_with_Finetuning_Data_ICCV_2023_paper.pdf](https://perma.cc/2TE4-ED6Z). [^8]: Mei Wang, Weihong Deng, *et al.*, "Racial Faces in-the-Wild: Reducing Racial Bias by Information Maximization Adaptation Network," *Proceedings of the 2019 IEEE Computer Society Conference on Computer Vision and Pattern Recognition, CVPR 2019* (2019): 692-702, [https://arxiv.org/pdf/1812.00194.pdf](https://perma.cc/Y2Y3-G7R9). [^9]: Claudia Goldin and Lawrence F. 
Katz, "Putting the “Co” in Education: Timing, Reasons, and Consequences of College Coeducation from 1835 to the Present," *Journal of Human Capital*, 5 no. 4 (2011): 377-417. diff --git a/en/lessons/fetch-and-parse-data-with-openrefine.md b/en/lessons/fetch-and-parse-data-with-openrefine.md index d9604cff06..6ecefb326c 100755 --- a/en/lessons/fetch-and-parse-data-with-openrefine.md +++ b/en/lessons/fetch-and-parse-data-with-openrefine.md @@ -14,7 +14,7 @@ review-ticket: https://github.com/programminghistorian/ph-submissions/issues/69 activity: acquiring topics: [data-manipulation, web-scraping, api] abstract: "OpenRefine is a powerful tool for exploring, cleaning, and transforming data. In this lesson you will learn how to use Refine to fetch URLs and parse web content." -redirect_from: /lessons/fetch-and-parse-data-with-openrefine +redirect_from: /lessons/fetch-and-parse-data-with-openrefine/ avatar_alt: Machine for water filtration doi: 10.46430/phen0065 --- @@ -28,7 +28,7 @@ The Chronicling America website has recently been updated, so many elements of t # Lesson Goals OpenRefine is a powerful tool for exploring, cleaning, and transforming data. -An earlier Programming Historian lesson, ["Cleaning Data with OpenRefine"](/lessons/cleaning-data-with-openrefine), introduced the basic functionality of Refine to efficiently discover and correct inconsistency in a data set. +An earlier Programming Historian lesson, ["Cleaning Data with OpenRefine"](/en/lessons/cleaning-data-with-openrefine), introduced the basic functionality of Refine to efficiently discover and correct inconsistency in a data set. Building on those essential data wrangling skills, this lesson focuses on Refine's ability to fetch URLs and parse web content. 
Examples introduce some of the advanced features to transform and enhance a data set including: @@ -38,12 +38,12 @@ Examples introduce some of the advanced features to transform and enhance a data - use array functions to manipulate string values - use Jython to extend Refine's functionality -It will be helpful to have basic familiarity with [OpenRefine](/lessons/cleaning-data-with-openrefine), [HTML](/lessons/viewing-html-files), and programming concepts such as variables and loops to complete this lesson. +It will be helpful to have basic familiarity with [OpenRefine](/en/lessons/cleaning-data-with-openrefine), [HTML](/en/lessons/viewing-html-files), and programming concepts such as variables and loops to complete this lesson. ## Why Use OpenRefine? The ability to create data sets from unstructured documents available on the web opens possibilities for research using digitized primary materials, web archives, texts, and contemporary media streams. -Programming Historian lessons introduce a number of methods to gather and interact with this content, from [wget](/lessons/applied-archival-downloading-with-wget) to [Python](/lessons/intro-to-beautiful-soup). +Programming Historian lessons introduce a number of methods to gather and interact with this content, from [wget](/en/lessons/applied-archival-downloading-with-wget) to [Python](/en/lessons/intro-to-beautiful-soup). When working with text documents, Refine is particularly suited for this task, allowing users to fetch urls and directly process the results in an iterative, exploratory manner. 
David Huynh, the creator of Freebase Gridworks (2009) which became GoogleRefine (2010) and then OpenRefine (2012+), describes Refine as: @@ -52,7 +52,7 @@ David Huynh, the creator of Freebase Gridworks (2009) which became GoogleRefine - more provisional / exploratory / experimental / playful than a database [^huynh] Refine is a unique tool that combines the power of databases and scripting languages into an interactive and user friendly visual interface. -Because of this flexibility it has been embraced by [journalists](https://www.propublica.org/nerds/item/using-google-refine-for-data-cleaning), [librarians](http://web.archive.org/web/20180129051941/http://data-lessons.github.io/library-openrefine/), [scientists](http://www.datacarpentry.org/OpenRefine-ecology-lesson/), and others needing to wrangle data from diverse sources and formats into structured information. +Because of this flexibility it has been embraced by [journalists](https://www.propublica.org/nerds/item/using-google-refine-for-data-cleaning), [librarians](https://web.archive.org/web/20180129051941/https://data-lessons.github.io/library-openrefine/), [scientists](https://www.datacarpentry.org/OpenRefine-ecology-lesson/), and others needing to wrangle data from diverse sources and formats into structured information. {% include figure.html filename="openrefine.png" caption="OpenRefine terminal and GUI" %} @@ -76,11 +76,11 @@ This lesson presents three examples demonstrating workflows to harvest and proce This example downloads a single web page and parses it into a structured table using Refine's built in functions. A similar workflow can be applied to a list of URLs, often generated by parsing another web page, creating a flexible web harvesting tool. -The raw data for this example is an HTML copy of Shakespeare's [Sonnets](http://www.gutenberg.org/ebooks/1105) from [Project Gutenberg](http://www.gutenberg.org/). 
+The raw data for this example is an HTML copy of Shakespeare's [Sonnets](https://www.gutenberg.org/ebooks/1105) from [Project Gutenberg](https://www.gutenberg.org/). Processing a book of poems into structured data enables new ways of reading text, allowing us to sort, manipulate, and connect with other information.
    -Please note that Project Gutenberg provides feeds to bulk download catalog data. +Please note that Project Gutenberg provides feeds to bulk download catalog data. Their public website should not be used for web scraping purposes. A copy of the HTML ebook is hosted on GitHub for this example to avoid redirects built in to the Gutenberg site.
    @@ -408,7 +408,7 @@ The url will open in a new tab, returning a JSON response. Fetch the URLs using *url* column by selecting *Edit column* > *Add column by fetching urls*. Name the new column "fetch" and click *OK*. -In a few seconds, the operation should complete and the *fetch* column will be filled with [JSON](http://www.json.org/) data. +In a few seconds, the operation should complete and the *fetch* column will be filled with [JSON](https://www.json.org/) data. ## Parse JSON to Get Items @@ -491,9 +491,9 @@ This workflow uses the HTTP GET protocol, meaning the query is encoded in the UR Instead, many API services used to enhance text data, such as [geocoding](https://en.wikipedia.org/wiki/Geocoding) or [named entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition), use HTTP POST to transfer information to the server for processing. GREL does not have a built in function to use this type of API. -However, the expression window language can be changed to [Jython](http://www.jython.org/), providing a more complete scripting environment where it is possible to implement a POST request. +However, the expression window language can be changed to [Jython](https://www.jython.org/), providing a more complete scripting environment where it is possible to implement a POST request. -> [Jython](http://www.jython.org/) is Python implemented for the Java VM and comes bundled with Refine. +> [Jython](https://www.jython.org/) is Python implemented for the Java VM and comes bundled with Refine. > This means [Python 2](https://docs.python.org/2.7/) scripts using the Standard Library can be written or loaded into the expression window, and Refine will apply them to each cell in the transformation. 
> The [official documentation](https://github.com/OpenRefine/OpenRefine/wiki/Jython) is sparse, but the built-in Jython can be extended with non-standard libraries using a [work around](https://github.com/OpenRefine/OpenRefine/wiki/Extending-Jython-with-pypi-modules). > @@ -542,8 +542,8 @@ The URL could be replaced with cell variables to construct a query similar to th ## POST Request Urllib2 will automatically send a POST if data is added to the request object. -For example, [Text Processing](http://text-processing.com/) provides natural language processing APIs based on [Python NLTK](http://www.nltk.org/). -The documentation for the [Sentiment Analysis service](http://text-processing.com/docs/sentiment.html) provides a base URL and the name of the key used for the data to be analyzed. +For example, [Text Processing](https://text-processing.com/) provides natural language processing APIs based on [Python NLTK](https://www.nltk.org/). +The documentation for the [Sentiment Analysis service](https://text-processing.com/docs/sentiment.html) provides a base URL and the name of the key used for the data to be analyzed. No authentication is required and 1,000 calls per day are free for non-commercial use.[^use] This type of API is often demonstrated using [curl](https://curl.haxx.se/) on the commandline. @@ -607,7 +607,7 @@ else: ## Compare Sentiment -To practice constructing a POST request, read the documentation for [Sentiment Tool](http://sentiment.vivekn.com/docs/api/), another free API. +To practice constructing a POST request, read the documentation for [Sentiment Tool](https://sentiment.vivekn.com/docs/api/), another free API. Find the service URL and data key necessary to modify the Jython pattern above. Create a new column from *first* named `sentiment2` and test the script. 
@@ -633,7 +633,7 @@ Archaic words and phrases contribute significantly to the sonnets' sentiment, ye While comparing the metrics is fascinating, neither is likely to produce quality results for this data set. Rather than an accurate sentiment, we might be surprised to find a quantifiable dissonance between the sonnet's English and our modern web usage. However, a model optimized to Shakespeare's words could be developed using more appropriate training data. -To learn more about classifiers and how to implement one, see Vilja Hulden's PH lesson ["Supervised Classification: The Naive Bayesian Returns to the Old Bailey"](/lessons/naive-bayesian) or Steven Bird, Ewan Klein, and Edward Loper's ["Learning to Classify Text"](http://www.nltk.org/book/ch06.html) in the [NTLK Book](http://www.nltk.org/book/). +To learn more about classifiers and how to implement one, see Vilja Hulden's PH lesson ["Supervised Classification: The Naive Bayesian Returns to the Old Bailey"](/en/lessons/naive-bayesian) or Steven Bird, Ewan Klein, and Edward Loper's ["Learning to Classify Text"](https://www.nltk.org/book/ch06.html) in the [NLTK Book](https://www.nltk.org/book/). Accessing data and services on the web opens new possibilities and efficiencies for humanities research. While powerful, these APIs are often not aimed at humanities scholarship and may not be appropriate or optimized for our inquiries. @@ -646,7 +646,7 @@ We can critically evaluate data sources, algorithms, and API services, as well a With its unique ability to interactively wrangle data from raw aggregation to analysis, Refine supports exploratory research and offers a wonderfully fluid and playful approach to tabular data. OpenRefine is a flexible, pragmatic tool that simplifies routine tasks and, when combined with domain knowledge, extends research capabilities. 
-[^huynh]: David Huynh, "Google Refine", Computer-Assisted Reporting Conference 2011, [http://web.archive.org/web/20150528125345/http://davidhuynh.net/spaces/nicar2011/tutorial.pdf](http://web.archive.org/web/20150528125345/http://davidhuynh.net/spaces/nicar2011/tutorial.pdf). -[^use]: As of July 2017, see [API Documentation](http://text-processing.com/docs/index.html). -[^1]: Jacob Perkins, "Sentiment Analysis with Python NLTK Text Classification", [http://text-processing.com/demo/sentiment/](http://text-processing.com/demo/sentiment/). +[^huynh]: David Huynh, "Google Refine", Computer-Assisted Reporting Conference 2011, [https://web.archive.org/web/20150528125345/https://davidhuynh.net/spaces/nicar2011/tutorial.pdf](https://web.archive.org/web/20150528125345/https://davidhuynh.net/spaces/nicar2011/tutorial.pdf). +[^use]: As of July 2017, see [API Documentation](https://text-processing.com/docs/index.html). +[^1]: Jacob Perkins, "Sentiment Analysis with Python NLTK Text Classification", [https://text-processing.com/demo/sentiment/](https://text-processing.com/demo/sentiment/). [^2]: Vivek Narayanan, Ishan Arora, and Arjun Bhatia, "Fast and accurate sentiment classification using an enhanced Naive Bayes model", 2013, [arXiv:1305.6143](https://arxiv.org/abs/1305.6143). diff --git a/en/lessons/from-html-to-list-of-words-1.md b/en/lessons/from-html-to-list-of-words-1.md index cd5190694f..7ef0814ae3 100755 --- a/en/lessons/from-html-to-list-of-words-1.md +++ b/en/lessons/from-html-to-list-of-words-1.md @@ -17,12 +17,12 @@ activity: transforming topics: [python] abstract: "In this two-part lesson, we will build on what you’ve learned about Downloading Web Pages with Python, learning how to remove the HTML markup from the webpage of Benjamin Bowsey’s 1780 criminal trial transcript. We will achieve this by using a variety of string operators, string methods, and close reading skills. 
We introduce looping and branching so that programs can repeat tasks and test for certain conditions, making it possible to separate the content from the HTML tags. Finally, we convert content from a long string to a list of words that can later be sorted, indexed, and counted." -next: from-html-to-list-of-words-2 -previous: manipulating-strings-in-python +next: /en/lessons/from-html-to-list-of-words-2 +previous: /en/lessons/manipulating-strings-in-python series_total: 15 lessons sequence: 7 python_warning: false -redirect_from: /lessons/from-html-to-list-of-words-1 +redirect_from: /lessons/from-html-to-list-of-words-1/ avatar_alt: A giraffe being mimicked by a human doi: 10.46430/phen0006 --- @@ -37,7 +37,7 @@ The Old Bailey Online’s website has recently been updated. Unfortunately, due ## Lesson Goals In this two-part lesson, we will build on what you’ve learned about -[Downloading Web Pages with Python](/lessons/working-with-web-pages), learning how to remove the *HTML markup* from +[Downloading Web Pages with Python](/en/lessons/working-with-web-pages), learning how to remove the *HTML markup* from the webpage of [Benjamin Bowsey’s 1780 criminal trial transcript][]. We will achieve this by using a variety of *string operators*, *string methods* and close reading skills. We introduce *looping* and *branching* so that @@ -49,7 +49,7 @@ indexed, and counted. ## The Challenge To get a clearer picture of the task ahead, open the -*obo-t17800628-33.html* file that you created in [Downloading Web Pages with Python](/lessons/working-with-web-pages) (or [download and save the trial] +*obo-t17800628-33.html* file that you created in [Downloading Web Pages with Python](/en/lessons/working-with-web-pages) (or [download and save the trial] [obo-t17800628-33.html] if you do not already have a copy), then look at the HTML source by clicking on *Tools -> Web Developer -> Page Source*. As you scroll through the @@ -250,13 +250,13 @@ that’s ok! 
- programming-historian-2 ([zip][]) - [/lessons/working-with-web-pages]: /lessons/working-with-web-pages - [Benjamin Bowsey’s 1780 criminal trial transcript]: http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33 - [HTML]: http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33-defend448&div=t17800628-33 - [XML]: http://www.oldbaileyonline.org/browse.jsp?foo=bar&path=sessionsPapers/17800628.xml&div=t17800628-33&xml=yes - [1]: http://www.w3schools.com/html/ - [zip file from the previous lesson here.]: /lessons/manipulating-strings-in-python#code-syncing - [Manipulating Strings in Python]: /lessons/manipulating-strings-in-python - [Code Reuse and Modularity]: /lessons/code-reuse-and-modularity - [zip]: /assets/python-lessons2.zip - [obo-t17800628-33.html]: /assets/from-html-to-list-of-words-1/obo-t17800628-33.html + [/lessons/working-with-web-pages]: /en/lessons/working-with-web-pages + [Benjamin Bowsey’s 1780 criminal trial transcript]: https://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33 + [HTML]: https://www.oldbaileyonline.org/browse.jsp?id=t17800628-33-defend448&div=t17800628-33 + [XML]: https://www.oldbaileyonline.org/browse.jsp?foo=bar&path=sessionsPapers/17800628.xml&div=t17800628-33&xml=yes + [1]: https://www.w3schools.com/html/ + [zip file from the previous lesson here.]: /en/lessons/manipulating-strings-in-python#code-syncing + [Manipulating Strings in Python]: /en/lessons/manipulating-strings-in-python + [Code Reuse and Modularity]: /en/lessons/code-reuse-and-modularity + [zip]: /assets/python-lessons2.zip + [obo-t17800628-33.html]: /assets/from-html-to-list-of-words-1/obo-t17800628-33.html diff --git a/en/lessons/from-html-to-list-of-words-2.md b/en/lessons/from-html-to-list-of-words-2.md index 3485dcb6b0..69a64ad197 100755 --- a/en/lessons/from-html-to-list-of-words-2.md +++ b/en/lessons/from-html-to-list-of-words-2.md @@ -16,12 +16,12 @@ exclude_from_check: activity: transforming topics: [python] 
abstract: "In this lesson, you will learn the Python commands needed to implement the second part of the algorithm begun in the lesson 'From HTML to a List of Words (part 1)'." -next: normalizing-data -previous: from-html-to-list-of-words-1 +next: /en/lessons/normalizing-data +previous: /en/lessons/from-html-to-list-of-words-1 series_total: 15 lessons sequence: 8 python_warning: false -redirect_from: /lessons/from-html-to-list-of-words-2 +redirect_from: /lessons/from-html-to-list-of-words-2/ avatar_alt: A soldier being mocked by a man doi: 10.46430/phen0007 --- @@ -350,7 +350,7 @@ to make sure you have the correct code. - python-lessons3.zip ([zip sync][]) [From HTML to a List of Words (part 1)]: /lessons/from-html-to-list-of-words-1 - [integer]: http://docs.python.org/2.4/lib/typesnumeric.html - [types]: http://docs.python.org/3/library/types.html + [integer]: https://docs.python.org/2.4/lib/typesnumeric.html + [types]: https://docs.python.org/3/library/types.html [zip]: /assets/python-lessons2.zip [zip sync]: /assets/python-lessons3.zip diff --git a/en/lessons/generating-an-ordered-data-set-from-an-OCR-text-file.md b/en/lessons/generating-an-ordered-data-set-from-an-OCR-text-file.md index e55e15e258..605a2e8e22 100755 --- a/en/lessons/generating-an-ordered-data-set-from-an-OCR-text-file.md +++ b/en/lessons/generating-an-ordered-data-set-from-an-OCR-text-file.md @@ -14,7 +14,7 @@ exclude_from_check: activity: transforming topics: [data-manipulation] abstract: "This tutorial illustrates strategies for taking raw OCR output from a scanned text, parsing it to isolate and correct essential elements of metadata, and generating an ordered data set (a python dictionary) from it." 
-redirect_from: /lessons/generating-an-ordered-data-set-from-an-OCR-text-file +redirect_from: /lessons/generating-an-ordered-data-set-from-an-OCR-text-file/ avatar_alt: A small case with a set of books doi: 10.46430/phen0036 --- @@ -37,13 +37,13 @@ It is often the case that historians involved in digital projects wish to work w 2. Even if you had such an army of helpers, proof-reading the OCR output of, say, a collection of twelfth century Italian charters transcribed and published in 1935, will quickly drive them all mad, make their eyes bleed, and the result will still be a great wad of text containing a great many errors, and you will __still__ have to do __something__ to it before it becomes useful in any context. -Going through a text file line by line and correcting OCR errors one at a time is hugely error-prone, as any proof reader will tell you. There are ways to automate some of this tedious work. A scripting language like Perl or Python can allow you to search your OCR output text for common errors and correct them using "Regular Expressions", a language for describing patterns in text. (So called because they express a ["regular language"](http://en.wikipedia.org/wiki/Regular_language). See L.T. O'Hara's [tutorial on Regular Expressions](/lessons/cleaning-ocrd-text-with-regular-expressions.html) here at the PM.) Regular Expressions, however, are only useful if the expressions you are searching for are ... well ... regular. Unfortunately, much of what you have in OCR output is highly *irregular*. If you could impose some order on it: create an ordered data set out of it, your Regular Expression tools would become much more powerful. +Going through a text file line by line and correcting OCR errors one at a time is hugely error-prone, as any proof reader will tell you. There are ways to automate some of this tedious work. 
A scripting language like Perl or Python can allow you to search your OCR output text for common errors and correct them using "Regular Expressions", a language for describing patterns in text. (So called because they express a ["regular language"](https://en.wikipedia.org/wiki/Regular_language). See L.T. O'Hara's [tutorial on Regular Expressions](/en/lessons/cleaning-ocrd-text-with-regular-expressions) here at the PM.) Regular Expressions, however, are only useful if the expressions you are searching for are ... well ... regular. Unfortunately, much of what you have in OCR output is highly *irregular*. If you could impose some order on it: create an ordered data set out of it, your Regular Expression tools would become much more powerful. Consider, for example, what happens if your OCR interpreted a lot of strings like this "21 July, 1921" as "2l July, 192l", turning the integer '1' into an 'l'. You would love to be able to write a search and replace script that would turn all instances of 2l into 21, but then what would happen if you had lots of occurrences of strings like this in your text: "2lb. hammer". You'd get a bunch of 21b. hammers; not what you want. If only you could tell your script: only change 2l into 21 in sections where there are dates, not weights. If you had an ordered data set, you could do things like that. Very often the texts that historians wish to digitize are, in fact, ordered data sets: ordered collections of primary source documents, or a legal code say, or a cartulary. But the editorial structure imposed upon such resources is usually designed for a particular kind of data retrieval technology i.e., a codex, a book. For a digitized text you need a different kind of structure. 
If you can get rid of the book related infrastructure and reorganize the text according to the sections and divisions that you're interested in, you will wind up with data that is much easier to do search and replace operations on, and as a bonus, your text will become immediately useful in a variety of other contexts as well. -This is where a scripting language like Python comes very much in handy. For our project we wanted to prepare some of the documents from a [12th century collection of *imbreviatura*](http://www.worldcat.org/oclc/17591390) from the Italian scribe known as Giovanni Scriba (you can [access the PDF here](https://notariorumitinera.eu/Docs/Biblioteca_Digitale/SB/3a47488c28eef2aedfea52ebbde2c634/dd361cb1479ab2309f5ceef1f875c2a5.pdf)) so that they could be marked up by historians for subsequent NLP analysis or potentially for other purposes as well. The pages of the 1935 published edition look like this. +This is where a scripting language like Python comes very much in handy. For our project we wanted to prepare some of the documents from a [12th century collection of *imbreviatura*](https://www.worldcat.org/oclc/17591390) from the Italian scribe known as Giovanni Scriba (you can [access the PDF here](https://notariorumitinera.eu/Docs/Biblioteca_Digitale/SB/3a47488c28eef2aedfea52ebbde2c634/dd361cb1479ab2309f5ceef1f875c2a5.pdf)) so that they could be marked up by historians for subsequent NLP analysis or potentially for other purposes as well. The pages of the 1935 published edition look like this. {% include figure.html filename="gs_pg110.png" caption="GS page 110" %} @@ -156,7 +156,7 @@ Unfortunately, regular expressions won't help you much here. 
This text can appea IL CIRTOL.'RE DI G:OV.I\N( sca:FR 339 342 NI .\ßlO CHIAUDANO 9LtTTIA MORESCO -These strings are not regular enough to reliably find with regular expressions; however, if you know what the strings are *supposed* to look like, you can compose some kind of string similarity algorithm to test each string against an exemplar and measure the likelihood that it is a page header. Fortunately, I didn't have to compose such an algorithm, Vladimir Levenshtein did it for us in 1965 (see: ). A computer language can encode this algorithm in any number of ways; here's an effective Python function that will work for us: +These strings are not regular enough to reliably find with regular expressions; however, if you know what the strings are *supposed* to look like, you can compose some kind of string similarity algorithm to test each string against an exemplar and measure the likelihood that it is a page header. Fortunately, I didn't have to compose such an algorithm, Vladimir Levenshtein did it for us in 1965 (see: ). A computer language can encode this algorithm in any number of ways; here's an effective Python function that will work for us: ```python @@ -224,21 +224,21 @@ def rom2ar(rom): (run <[this little script](/assets/generating-an-ordered-data-set-from-an-OCR-text-file/Roman_to_Arabic.txt)> to see in detail how `rome2ar` works. Elegant programming like this can offer insight; like poetry.) ## Some other things we'll need: -At the top of your Python module, you're going to want to import some python modules that are a part of the standard library. (see Fred Gibbs's tutorial [*Installing Python Modules with pip*](/lessons/installing-python-modules-pip)). +At the top of your Python module, you're going to want to import some python modules that are a part of the standard library. (see Fred Gibbs's tutorial [*Installing Python Modules with pip*](/en/lessons/installing-python-modules-pip)). 1. 
First among these is the "re" (regular expression) module `import re`. Regular expressions are your friends. However, bear in mind Jamie Zawinski's quip: >Some people, when confronted with a problem, think "I know, I'll use regular expressions." Now they have two problems. - (Again, have a look at L.T. O'Hara's introduction here at the Programming Historian [Cleaning OCR’d text with Regular Expressions](/lessons/cleaning-ocrd-text-with-regular-expressions.html)) + (Again, have a look at L.T. O'Hara's introduction here at the Programming Historian [Cleaning OCR’d text with Regular Expressions](/en/lessons/cleaning-ocrd-text-with-regular-expressions)) 2. Also: `from pprint import pprint`. `pprint` is just a pretty-printer for python objects like lists and dictionaries. You'll want it because python dictionaries are much easier to read if they are formatted. -3. And: `from collections import Counter`. We'll want this for the [Find and normalize footnote markers and texts](#footnotes) section below. This is not really necessary, but we'll do some counting that would require a lot of lines of fiddly code and this will save us the trouble. The collections module has lots of deep magic in it and is well worth getting familiar with. (Again, see Doug Hellmann's PyMOTW for the [collections](https://pymotw.com/3/collections/index.html) module. I should also point out that his book [*The Python Standard Library By Example*](https://doughellmann.com/books/the-python-3-standard-library-by-example/) is one well worth having.) +3. And: `from collections import Counter`. We'll want this for the [Find and normalize footnote markers and texts](#find-and-normalize-footnote-markers-and-texts) section below. This is not really necessary, but we'll do some counting that would require a lot of lines of fiddly code and this will save us the trouble. The collections module has lots of deep magic in it and is well worth getting familiar with. 
(Again, see Doug Hellmann's PyMOTW for the [collections](https://pymotw.com/3/collections/index.html) module. I should also point out that his book [*The Python Standard Library By Example*](https://doughellmann.com/books/the-python-3-standard-library-by-example/) is one well worth having.) ## A very brief review of regular expressions as they are implemented in python -L.T. O'Hara's [introduction](/lessons/cleaning-ocrd-text-with-regular-expressions.html) to using python flavored regular expressions is invaluable. In this context we should review a couple of basic facts about Python's implementation of regular expressions, the `re` module, which is part of Python's standard library. +L.T. O'Hara's [introduction](/en/lessons/cleaning-ocrd-text-with-regular-expressions) to using python flavored regular expressions is invaluable. In this context we should review a couple of basic facts about Python's implementation of regular expressions, the `re` module, which is part of Python's standard library. 1. `re.compile()` creates a regular expression object that has a number of methods. You should be familiar with `.match()`, and `.search()`, but also `.findall()` and `.finditer()` 2. Bear in mind the difference between `.match()` and `.search()`: `.match()` will only match at the __beginning__ of a line, whereas `.search()` will match anywhere in the line __but then it stops__, it'll __only__ return the first match it finds. @@ -490,7 +490,7 @@ Since those roman numeral headings are now reliably findable with our 'slug' reg slug_and_firstline = re.compile("(\[~~~~\sGScriba_)(.*)\s::::\s(\d+)\s~~~~\]\n(.*)(\(\d?.*\d+\))") ``` -Let's break down that regex using the verbose mode (again, see O'Hara's [tutorial](/lessons/cleaning-ocrd-text-with-regular-expressions.html)). Our 'slug' for each charter takes the form "[~\~\~\~ GScriba_CCVII :::: 207 ~~~~]" for example. 
The compiled pattern above is exactly equivalent to the folowing (note the re.VERBOSE switch at the end): +Let's break down that regex using the verbose mode (again, see O'Hara's [tutorial](/en/lessons/cleaning-ocrd-text-with-regular-expressions)). Our 'slug' for each charter takes the form "[~\~\~\~ GScriba_CCVII :::: 207 ~~~~]" for example. The compiled pattern above is exactly equivalent to the following (note the re.VERBOSE switch at the end): ```python slug_and_firstline = re.compile(r""" @@ -823,7 +823,7 @@ Note that the `try: except:` blocks come to the rescue again here. The loop abov > NOTA BENE: Again, bear in mind that we are modifying a data structure in memory rather than editing successive text files. So this loop should be __added__ to your script __below__ the summary and marginal loop, which is __below__ the loop that created your skeleton dictionary. ## Parse Dates and add to the dictionary -Dates are hard. Students of British history cling to [Cheyney](http://www.worldcat.org/oclc/41238508) as to a spar on a troubled ocean. And, given the way the Gregorian calendar was adopted so gradually, and innumerable other local variations, correct date reckoning for medieval sources will always require care and local knowledge. Nevertheless, here too Python can be of some help. +Dates are hard. Students of British history cling to [Cheyney](https://www.worldcat.org/oclc/41238508) as to a spar on a troubled ocean. And, given the way the Gregorian calendar was adopted so gradually, and innumerable other local variations, correct date reckoning for medieval sources will always require care and local knowledge. Nevertheless, here too Python can be of some help. Our Italian summary line invariably contains a date drawn from the text, and it's conveniently set off from the rest of the line by parentheses. So we can parse them and create Python `date` objects. Then, if we want, we can do some simple calendar arithmetic. 
@@ -850,7 +850,7 @@ for ch in charters: Once you're satisfied that all the parenthetical date expressions are present and correct, and conform to your regular expression, you can parse them and add them to your data structure as dates rather than just strings. For this you can use the `datetime` module. -This module is part of the standard library, is a deep subject, and ought to be the subject of its own tutorial, given the importance of dates for historians. As with a lot of other python modules, a good introduction is Doug Hellmann's [PyMOTW](https://pymotw.com/3/datetime/index.html)(module of the week). An even more able extension library is [mxDateTime](http://www.egenix.com/products/python/mxBase/mxDateTime/). Suffice it here to say that the `datetime.date` module expects parameters like this: +This module is part of the standard library, is a deep subject, and ought to be the subject of its own tutorial, given the importance of dates for historians. As with a lot of other python modules, a good introduction is Doug Hellmann's [PyMOTW](https://pymotw.com/3/datetime/index.html)(module of the week). An even more able extension library is [mxDateTime](https://www.egenix.com/products/python/mxBase/mxDateTime/). Suffice it here to say that the `datetime.date` module expects parameters like this: ```python >>> from datetime import date @@ -957,7 +957,7 @@ Print out our resulting dictionary using `pprint(charters)` and you'll see somet } ``` -Printing out your Python dictionary as a literal string is not a bad thing to do. For a text this size, the resulting file is perfectly manageable, can be mailed around usefully and read into a python repl session very simply using `eval()`, or pasted directly into a Python module file. On the other hand, if you want an even more reliable way to serialize it in an exclusively Python context, look into [`Pickle`](https://docs.python.org/3.7/library/pickle.html). 
If you need to move it to some other context, JavaScript for example, or some `RDF` triple stores, Python's [`json`](https://docs.python.org/3.7/library/json.html#module-json) module will translate effectively. If you have to get some kind of XML output, I will be very sorry for you, but the [`lxml`](http://lxml.de/) python module may ease the pain a little. +Printing out your Python dictionary as a literal string is not a bad thing to do. For a text this size, the resulting file is perfectly manageable, can be mailed around usefully and read into a python repl session very simply using `eval()`, or pasted directly into a Python module file. On the other hand, if you want an even more reliable way to serialize it in an exclusively Python context, look into [`Pickle`](https://docs.python.org/3.7/library/pickle.html). If you need to move it to some other context, JavaScript for example, or some `RDF` triple stores, Python's [`json`](https://docs.python.org/3.7/library/json.html#module-json) module will translate effectively. If you have to get some kind of XML output, I will be very sorry for you, but the [`lxml`](https://lxml.de/) python module may ease the pain a little. ## Order from disorder, huzzah. Now that we have an ordered data structure, we can do many things with it. As a very simple example, let's append some code that just prints `charters` out as html for display on a web-site: @@ -1049,7 +1049,7 @@ Being able to do this with your, still mostly uncorrected, OCR output is not a t And, our original problem, OCR cleanup, is now much more tractable because we can target regular expressions for the specific sorts of metadata we have: errors in the Italian summary or in the Latin text? Or we could design search-and-replace routines just for specific charters, or groups of charters. 
-Beyond this though, there's lots you can do with an ordered data set, including feeding it back through a markup tool like the [brat](http://brat.nlplab.org) as we did for the ChartEx project. Domain experts can then start adding layers of semantic tagging even if you don't do any further OCR error correction. Moreover, with an ordered dataset we can get all sorts of output, some other flavor of XML (if you must) for example: TEI (Text Encoding Initiative), or EAD (Encoded Archival Description). Or you could read your dataset directly into a relational database, or some kind of key/value store. All of these things are essentially impossible if you're working simply with a plain text file. +Beyond this though, there's lots you can do with an ordered data set, including feeding it back through a markup tool like the [brat](https://brat.nlplab.org) as we did for the ChartEx project. Domain experts can then start adding layers of semantic tagging even if you don't do any further OCR error correction. Moreover, with an ordered dataset we can get all sorts of output, some other flavor of XML (if you must) for example: TEI (Text Encoding Initiative), or EAD (Encoded Archival Description). Or you could read your dataset directly into a relational database, or some kind of key/value store. All of these things are essentially impossible if you're working simply with a plain text file. The bits of code above are in no way a turn-key solution for cleaning arbitrary OCR output. There is no such magic wand. The Google approach to scanning the contents of research libraries threatens to drown us in an ocean of bad data. Worse, it elides a fundamental fact of digital scholarship: digital sources are hard to get. Reliable, flexible, and useful digital texts require careful redaction and persistent curation. Google, Amazon, Facebook, *et alia* do not have to concern themselves with the quality of their data, just its quantity. 
Historians, on the other hand, must care first for the integrity of their sources. diff --git a/en/lessons/geocoding-qgis.md b/en/lessons/geocoding-qgis.md index c365e36995..feba29e6b6 100755 --- a/en/lessons/geocoding-qgis.md +++ b/en/lessons/geocoding-qgis.md @@ -13,12 +13,12 @@ difficulty: 2 review-ticket: https://github.com/programminghistorian/ph-submissions/issues/27 activity: transforming topics: [mapping] -previous: georeferencing-qgis +previous: /en/lessons/georeferencing-qgis series_total: 5 lessons sequence: 5 abstract: | Learn how to use QGIS to convert lists of place names in to geographic coordinates, allowing you to map them. -redirect_from: /lessons/geocoding-qgis +redirect_from: /lessons/geocoding-qgis/ avatar_alt: A young man kissing a young woman on the cheek doi: 10.46430/phen0066 --- @@ -37,13 +37,13 @@ Many types of sources used by historians are inherently spatial. For example: - Imports and exports - Routes and itineraries -In this tutorial, you will learn how to 'geocode' historial data containing placenames (towns, counties, countries, etc), thus making them mappable using [QGIS](http://www.qgis.org/en/site/), a digital mapping software suite. This will allow you to: +In this tutorial, you will learn how to 'geocode' historical data containing placenames (towns, counties, countries, etc), thus making them mappable using [QGIS](https://www.qgis.org/en/site/), a digital mapping software suite. This will allow you to: - Display your data as a map (whether it originated as a list, table, or prose) - Analyse distances between locations in your data - View and analyse geographical distribution within your data -This tutorial forms part of the Mapping and GIS series on *Programming Historian*, and builds upon skills you will have learned in earlier tutorials, especially [Installing QGIS 2.0 and Adding Layers](/lessons/qgis-layers). 
It presumes that you have a set of [shapefiles](https://en.wikipedia.org/wiki/Shapefile) relevant to the region for which you intend to produce a map, and data that you would like to get into those shapefiles so that it can be visualised and analysed. +This tutorial forms part of the Mapping and GIS series on *Programming Historian*, and builds upon skills you will have learned in earlier tutorials, especially [Installing QGIS 2.0 and Adding Layers](/en/lessons/qgis-layers). It presumes that you have a set of [shapefiles](https://en.wikipedia.org/wiki/Shapefile) relevant to the region for which you intend to produce a map, and data that you would like to get into those shapefiles so that it can be visualised and analysed. ## About Geocoding @@ -54,7 +54,7 @@ There is often confusion between processes of [geocoding](https://en.wikipedia.o - Georeferencing refers to placing visual elements, usually raster images such as satellite photographs, scans of old maps, or some types of vector image such as architectural or archaeological drawings, into geographical space. This involves specifying latitude, longitude coordinates, and scale. - Geocoding is the process of resolving addresses (or some other kind of spatial description) which form part of a dataset into geometries on a map. This gives the ability to view, analyse and query that dataset spatially. -In many modern applications geocoding is completed automatically, often using the mapping tools and gazetters offered seamlessly as part of [Google Maps](https://www.google.co.uk/maps) or [OpenStreetMap](https://www.openstreetmap.org/). When working with contemporary data, or data from relatively recent periods, and Western European or North American historical contexts, this is often sufficient. 
If you are using data containing place names that are consistent with the present day, you can use the QGIS web geocoder plugin detailed in the postscript to this tutorial, or the [Edinburgh Geoparser](/lessons/geoparsing-text-with-edinburgh). +In many modern applications geocoding is completed automatically, often using the mapping tools and gazetteers offered seamlessly as part of [Google Maps](https://www.google.co.uk/maps) or [OpenStreetMap](https://www.openstreetmap.org/). When working with contemporary data, or data from relatively recent periods, and Western European or North American historical contexts, this is often sufficient. If you are using data containing place names that are consistent with the present day, you can use the QGIS web geocoder plugin detailed in the postscript to this tutorial, or the [Edinburgh Geoparser](/en/lessons/geoparsing-text-with-edinburgh). Many historians will be working on contexts where the place names in their data do not match the present day. Remember that street names tend to change relatively frequently, either in terms of spelling or entirely. Administrative areas have changed relatively frequently and were sometimes used inconsistently in historical sources (e.g. was Bristol in Gloucestershire, Somerset, City of Bristol, Avon?) and indeed places have moved between countries, and countries have changed in name and extent. Even town names have changed and are subject to linguistic ambiguities (e.g. *Lynn Episcopi*, Bishop's Lynn, Lynn, King's Lynn, Kings Lynn). For these reasons it is often better to avoid using automated online geocoding tools and create a gazetteer to suit the historical context which you are researching. The processes described in this tutorial are manual, and can be modified and applied to almost any geographical or historical context. 
@@ -69,15 +69,15 @@ At the end of the tutorial there is a note on using automated geocoding tools wh ## Getting Started -This tutorial assumes that you have installed QGIS version 2 or newer and have followed the *Programming Historian* tutorial [Installing QGIS 2.0 and Adding Layers](/lessons/qgis-layers) by Jim Clifford, Josh MacFadyen and Daniel Macfarlane. Or, at least that you are familiar with the process of adding vector layers in QGIS. +This tutorial assumes that you have installed QGIS version 2 or newer and have followed the *Programming Historian* tutorial [Installing QGIS 2.0 and Adding Layers](/en/lessons/qgis-layers) by Jim Clifford, Josh MacFadyen and Daniel Macfarlane. Or, at least that you are familiar with the process of adding vector layers in QGIS. This tutorial was prepared using QGIS 2.14 'Essen' on Mac OS X 10.11. Menus, windows, and options might appear slightly different on different platforms or versions, but it should not be difficult to translate any differences. At a few points in the tutorial reference is made to how these techniques could be applied using [ArcGIS](https://www.arcgis.com/features/index.html), which is the industry standard commercial GIS application, and is widely available at universities, but is not always superior to QGIS. You will also need to use a relational database such as Microsoft Access or [LibreOffice Base](https://www.libreoffice.org/installation-instructions/), or alternatively be very proficient with spreadsheets. The instructions in the tutorial are designed for use with LibreOffice Base, which is a free download as part of the [LibreOffice](https://www.libreoffice.org/installation-instructions/) suite. -**NB** LibreOffice requires a full installation of Java in order to use the Base application. 
This is achieved most easily by downloading and installing the Java 8 Development Kit for your operating system from [Oracle](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html). The Java 8 Runtime Environment does NOT work with LibreOffice on Mac OS X 10.11. +**NB** LibreOffice requires a full installation of Java in order to use the Base application. This is achieved most easily by downloading and installing the Java 8 Development Kit for your operating system from [Oracle](https://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html). The Java 8 Runtime Environment does NOT work with LibreOffice on Mac OS X 10.11. -The tutorial will map the data extracted from [*Alumni Oxonienses*](http://www.british-history.ac.uk/alumni-oxon/1500-1714) in the *Programming Historian* lesson [Using Gazetteers to Extract Sets of Keywords from Free-Flowing Texts](/lessons/extracting-keywords) using publically available maps of English and Welsh historic counties. If you complete that tutorial first it will help you to understand the nature of the data which is being mapped here. These data are provided as both a full dataset and also a separate file which is a summary of the numbers of Oxford alumni by their county of origin, created from the first file using an Excel PivotTable. +The tutorial will map the data extracted from [*Alumni Oxonienses*](https://www.british-history.ac.uk/alumni-oxon/1500-1714) in the *Programming Historian* lesson [Using Gazetteers to Extract Sets of Keywords from Free-Flowing Texts](/en/lessons/extracting-keywords) using publically available maps of English and Welsh historic counties. If you complete that tutorial first it will help you to understand the nature of the data which is being mapped here. 
These data are provided as both a full dataset and also a separate file which is a summary of the numbers of Oxford alumni by their county of origin, created from the first file using an Excel PivotTable. # The Data @@ -116,9 +116,9 @@ In this short tutorial we will map the total numbers of early modern University * Set up a new Project file in QGIS and save it in your choice of location. (*NB.* QGIS defaults to saving 'relative pathnames' which means that as long as you save all of your project files in the same folder or its subfolders, you can move it to a different location, such as a USB stick. You can check this setting via the menu `Project>Project Properties` and the `General` side tab). * It is very important to set the [Coordinate Reference System](https://en.wikipedia.org/wiki/Spatial_reference_system) (CRS) to one that suits the data you will import, and the location you plan to map. Go to the menu `Project>Project Properties` and select the 'CRS' tab at the side. First select ‘Enable on the fly CRS transformation’ at the top of this window then use the filter box to find and select `OSGB 1936 / the British National Grid` with the authority ID `ESPG:27700` from under the projected coordinate systems heading. -There is an important distinction between Geographic Coordinate Systems, which simply define measurement units and the datum, and Projected Coordinate Systems, which also define the way in which the globe is ‘flattened’ onto a map. [OSGB](https://en.wikipedia.org/wiki/Ordnance_Survey_National_Grid) is available in both variants in QGIS, so choose the 'projected' version to get a map in which the United Kingdom appears the shape you would expect. For more details on projections in GIS, see the [Working with Projections in QGIS Tutorial](http://www.qgistutorials.com/en/docs/3/working_with_projections.html). 
+There is an important distinction between Geographic Coordinate Systems, which simply define measurement units and the datum, and Projected Coordinate Systems, which also define the way in which the globe is ‘flattened’ onto a map. [OSGB](https://en.wikipedia.org/wiki/Ordnance_Survey_National_Grid) is available in both variants in QGIS, so choose the 'projected' version to get a map in which the United Kingdom appears the shape you would expect. For more details on projections in GIS, see the [Working with Projections in QGIS Tutorial](https://www.qgistutorials.com/en/docs/3/working_with_projections.html). -* Download a Shapefile containing polygons of the historic counties of England and Wales from [http://www.county-borders.co.uk](http://www.county-borders.co.uk/) (choose the file `Definition A: SHP OSGB36 Simplified` which is a version of the pre-1843 county boundaries of Great Britain projected on the OS National Grid, without detached portions of counties). Unzip the contents of the ZIP file in the same folder as your project file +* Download a Shapefile containing polygons of the historic counties of England and Wales from [https://www.county-borders.co.uk](https://www.county-borders.co.uk/) (choose the file `Definition A: SHP OSGB36 Simplified` which is a version of the pre-1843 county boundaries of Great Britain projected on the OS National Grid, without detached portions of counties). Unzip the contents of the ZIP file in the same folder as your project file * Click the `Add Vector Layer` button (looks like a line graph) from the Manage Layers toolbar and then `Browse` to select and add the Shapefile `UKDefinitionA.shp` from within the folder you’ve unzipped. 
{% include figure.html filename="QGISFigureAddVector.png" caption="Figure 1: The QGIS Add Vector window on MacOS (the Add Vector button is circled on the left hand toolbar)" %} @@ -142,7 +142,7 @@ This data can now be shown as a [choropleth map](https://en.wikipedia.org/wiki/C {% include figure.html filename="QGISFigure2.png" caption="Figure 3: The vector layer Styles tab showing classified values based on the field joined from the table" %} -For more information on choosing the correct classification method for your data, start by looking at this article on [Classification in GIS](http://wiki.gis.com/wiki/index.php/Classification). Examine the results of your map and think about what is actually being represented. Are the raw numbers of alumni, coloured according to the same classes, for very differently sized counties, helpful? Choropleth maps should normally display data that has been normalised in some way, for example showing population density, rather than raw population. +For more information on choosing the correct classification method for your data, start by looking at this article on [Classification in GIS](https://wiki.gis.com/wiki/index.php/Classification). Examine the results of your map and think about what is actually being represented. Are the raw numbers of alumni, coloured according to the same classes, for very differently sized counties, helpful? Choropleth maps should normally display data that has been normalised in some way, for example showing population density, rather than raw population. You may wish to experiment with the Expression Builder (accessed via the ∑ symbol next to `Column` in `Properties>Style`) to normalise these values using other columns and values that are available to you. Ideally we might normalise by population, but in the absence of this data, you might experiment by using the `$area` property, which is intrinsic to polygon shape layers in GIS. 
The very simple expression needed to create a map colour ramp on this would be (note that the field name contains spaces, so needs to be contained within double quotation marks): @@ -156,17 +156,17 @@ When you alter any of these settings within the graduated style page you will ne Geocoding is a much more powerful technique than simple table joins because each and every line of your data remains visible and able to be analysed within the GIS software as an individual point on the map (as in table 2). Fundamentally the aim is to join each item of data to a pair of coordinates. Most historical data cannot be geocoded automatically using online tools or QGIS plugins. The geocoding process must therefore be carried out manually to match each data row with a location. This is a simple database operation joining (matching) your data with a gazetteer (a list of places with coordinates). Many gazetteers are available, but relatively few are suitable for use with historical data, for example, for England: -- [Association of British Counties Gazetteer](http://www.gazetteer.org.uk/index.php) (data available to purchase) +- [Association of British Counties Gazetteer](https://gazetteer.org.uk/index.php) (data available to purchase) - [The Historical Gazetteer of England's Place Names](https://www.placenames.org.uk/) allows you to geocode individual locations online only, unfortunately the API service for accessing this data for use in automated geocoding, known as DEEP, part of Unlock, has now (late 2016) been withdrawn. 
A better [browsing interface](https://www.nottingham.ac.uk/research/groups/ins/Resources/Digital-Survey-of-English-Place-Names.aspx) is available at the [Survey of English Place-Names](https://www.nottingham.ac.uk/research/groups/epns/survey.aspx) -If no gazetteer exists for the area or period that you are studying, you can make your own relatively simply from a vector map by creating a point layer containing the information that you require within QGIS (potentially by combining information from other existing layers) and exporting that complete with XY coordinates. For some parts of the world there are neither historical gazetters, nor vector maps suitable for historical periods, in these cases you will have to investigate creating your own vector and point layer; see the tutorial [Creating New Vector Layers in QGIS 2.0](/lessons/vector-layers-qgis). +If no gazetteer exists for the area or period that you are studying, you can make your own relatively simply from a vector map by creating a point layer containing the information that you require within QGIS (potentially by combining information from other existing layers) and exporting that complete with XY coordinates. For some parts of the world there are neither historical gazetters, nor vector maps suitable for historical periods, in these cases you will have to investigate creating your own vector and point layer; see the tutorial [Creating New Vector Layers in QGIS 2.0](/en/lessons/vector-layers-qgis). ### Tutorial: Creating a Custom Gazetteer and Geocoding in Relational Database If you have completed the first part, you can carry on and follow the steps below in the same project. 
If you did not, or you want to start a new clean project, follow the instructions from the first section to: * Set up a new Project file in QGIS, and set the Coordinate Reference System to `OSGB 1936/the British National Grid` with the authority ID `ESPG:27700` as a projected coordinate system using `Project>Project Properties>CRS` -* Download a Shapefile containing polygons of the historic counties of England and Wales from [http://www.county-borders.co.uk/](http://www.county-borders.co.uk/) (choose definition A and the OS National Grid). +* Download a Shapefile containing polygons of the historic counties of England and Wales from [https://www.county-borders.co.uk/](https://www.county-borders.co.uk/) (choose definition A and the OS National Grid). Using your existing project, you can now start to add more layers to create your gazetteer: @@ -188,7 +188,7 @@ This data can now be matched against your existing data to complete the geocodin We can now create a composite table of these locations and the data from our original table. This is created by matching the name of the county in the 'place' field of the alumni table with its equivalent in the new gazetteer using a relational database. This tutorial assumes that you have many hundreds or thousands or rows of data (as we do in this tutorial), requiring an automated method. If you only have a few rows, or you have difficulties using these methods, it is possible to do it manually - see 'Geocoding your own Historical Data' below. -In simple scenarios (such as this one where we are only matching a single 'place' attribute – i.e. only 'county') it is possible to code your data to a gazetteer using the [VLOOKUP](https://support.office.com/en-gb/article/VLOOKUP-function-0bbc8083-26fe-4963-8ab8-93a18ad188a1) function in Microsoft Excel (or equivalent spreadsheets) or even using the [MMQGIS](http://michaelminn.com/linux/mmqgis/) plugin within QGIS. 
However, in most practical scenarios you will probably wish to match on several attributes simultaneously (for instance town, county and country – you would want to distinguish between Sudbury, Suffolk, England; Sudbury, Derbyshire, England; Sudbury, Middlesex, England; and Sudbury, Ontario, Canada). This can be achieved in a somewhat cumbersome way using the [INDEX](https://support.office.com/en-gb/article/INDEX-function-a5dcf0dd-996d-40a4-a822-b56b061328bd) function in Excel, but is more practical, and extensible, in a relational database such as Microsoft Access or LibreOffice Base. +In simple scenarios (such as this one where we are only matching a single 'place' attribute – i.e. only 'county') it is possible to code your data to a gazetteer using the [VLOOKUP](https://support.office.com/en-gb/article/VLOOKUP-function-0bbc8083-26fe-4963-8ab8-93a18ad188a1) function in Microsoft Excel (or equivalent spreadsheets) or even using the [MMQGIS](https://michaelminn.com/linux/mmqgis/) plugin within QGIS. However, in most practical scenarios you will probably wish to match on several attributes simultaneously (for instance town, county and country – you would want to distinguish between Sudbury, Suffolk, England; Sudbury, Derbyshire, England; Sudbury, Middlesex, England; and Sudbury, Ontario, Canada). This can be achieved in a somewhat cumbersome way using the [INDEX](https://support.office.com/en-gb/article/INDEX-function-a5dcf0dd-996d-40a4-a822-b56b061328bd) function in Excel, but is more practical, and extensible, in a relational database such as Microsoft Access or LibreOffice Base. This tutorial uses LibreOffice, which is an Open Source alternative to Microsoft Office and is available for Windows, Mac OS X and all variants of Linux etc (NB it requires a full Java installation). It includes a relational database application on all platforms, unlike Microsoft Access which is available only in the Windows version of Office. 
However, it is quite restricted in its functionality. If you use Microsoft Access, or are a very proficient spreadsheet user, please feel free to replicate this process using your preferred software. @@ -244,7 +244,7 @@ A more useful way of depicting the geocoded data is to use QGIS's advanced displ You have now completed the geocoding process, and can enjoy the advantages of being able to analyse this inherently spatial historical data in a spatial way. In a real world scenario, you would probably only geocode data which is more precise than simple county level, giving a good deal more analytical potential and making maps plotted more meaningful. Where you have data which can be geocoded to a high – and crucially consistent – level of precision, it is possible to conduct a wide range of geographical analyses such as measures of clustering or distances. -For example, you can easily tweak and refine which records are mapped by changing the definition query in the properties of your geocoded layer (Right click on `GeocodedAlumni` in Layers Panel and select `Layer Properties>General>Provider Feature Filter>Query Builder`). You can use the less than or greater than operators to define years and see if trends change over time, or use the [SQL LIKE](http://www.w3schools.com/sql/sql_like.asp) statement to query the ‘details’ column to filter particular colleges – did they tend to attract students from particular counties? These queries use standard [SQL language](http://www.w3schools.com/sql/) and can be combined with `AND`, `NOT` etc. This example would select only those students who had matriculated at Magdalen College: +For example, you can easily tweak and refine which records are mapped by changing the definition query in the properties of your geocoded layer (Right click on `GeocodedAlumni` in Layers Panel and select `Layer Properties>General>Provider Feature Filter>Query Builder`). 
You can use the less than or greater than operators to define years and see if trends change over time, or use the [SQL LIKE](https://www.w3schools.com/sql/sql_like.asp) statement to query the ‘details’ column to filter particular colleges – did they tend to attract students from particular counties? These queries use standard [SQL language](https://www.w3schools.com/sql/) and can be combined with `AND`, `NOT` etc. This example would select only those students who had matriculated at Magdalen College: ``` "Details" LIKE '%Magdalen Hall%' diff --git a/en/lessons/geoparsing-text-with-edinburgh.md b/en/lessons/geoparsing-text-with-edinburgh.md index 95567c4083..f632c8c91b 100755 --- a/en/lessons/geoparsing-text-with-edinburgh.md +++ b/en/lessons/geoparsing-text-with-edinburgh.md @@ -17,7 +17,7 @@ review-ticket: https://github.com/programminghistorian/ph-submissions/issues/26 activity: presenting topics: [mapping] abstract: "This tutorial teaches users how to use the Edinburgh Geoparser to process a piece of English-language text, extract and resolve the locations contained within it, and plot them as a web map." -redirect_from: /lessons/geoparsing-text-with-edinburgh +redirect_from: /lessons/geoparsing-text-with-edinburgh/ avatar_alt: Map of the city of Edinburgh doi: 10.46430/phen0067 --- @@ -33,7 +33,7 @@ The Geoparser works best on running text, as it considers locations in context f In December 2015, the Edinburgh Geoparser was released under the University of Edinburgh’s GPL license to be used by other researchers in the field of text mining and natural language processing as well as scholars who are interested in geoparsing text. More information on its documentation, publications using it and how to download it can be found [here](https://www.ltg.ed.ac.uk/software/geoparser/). -A simple online demo of the vanilla Edinburgh Geoparser can be tried out [here](http://jekyll.inf.ed.ac.uk/geoparser.html). 
It provides only the visual interface to the Geoparser output after uploading a text file and selecting a gazetteer. The demo is otherwise not configurable and should only be used to try out small examples and not for geo-parsing a large number of files. +A simple online demo of the vanilla Edinburgh Geoparser can be tried out [here](https://jekyll.inf.ed.ac.uk/geoparser.html). It provides only the visual interface to the Geoparser output after uploading a text file and selecting a gazetteer. The demo is otherwise not configurable and should only be used to try out small examples and not for geo-parsing a large number of files. The following lesson explains how the Edinburgh Geoparser works under the hood and contains information on: @@ -151,11 +151,11 @@ It takes the stdout from the first command and runs the Geoparser with the follo * `-t` specifies the format of your input.  Text input (`plain`) is recommended for geo-parsing. - * `-g` specifies the gazetteer that should be queried.  In the above example, the gazetteer selected is [GeoNames](http://www.geonames.org/) (`geonames`), a large global gazetteer.  You can also specify other gazetteers, for example the DEEP gazetteer of historical placenames in England (`deep`) or the Pleiades+ gazetteer of ancient places (`plplus`).  For more information on the types of gazetteers offered as part of the distribution see the Geoparser documentation [here](http://groups.inf.ed.ac.uk/geoparser/documentation/v1.3/html/gaz.html). + * `-g` specifies the gazetteer that should be queried.  In the above example, the gazetteer selected is [GeoNames](https://www.geonames.org/) (`geonames`), a large global gazetteer.  You can also specify other gazetteers, for example the DEEP gazetteer of historical placenames in England (`deep`) or the Pleiades+ gazetteer of ancient places (`plplus`).  
For more information on the types of gazetteers offered as part of the distribution see the Geoparser documentation [here](https://groups.inf.ed.ac.uk/geoparser/documentation/v1.3/html/gaz.html). * `-o` specifies two pieces of information, the output directory (`../out`) which is located within the `geoparser-1.3` directory and a prefix for the output file name (in this case `172172`, the same prefix as that of the input file name). Once the command is run and the Geoparser is finished, the result files appear in the output directory (`../out`) starting with the specified prefix. -When running the Geoparser, the specified text file is going through a series of processing steps which are combined into one pipeline.  It is first [tokenised](https://en.wikipedia.org/wiki/Lexical_analysis#Tokenization_), [part-of-speech-tagged](https://en.wikipedia.org/wiki/Part-of-speech_tagging) and [lemmatised](https://en.wikipedia.org/wiki/Lemmatisation). After these initial steps, [named entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition) is performed to identify location and person names as well as dates.  It was found that identifying location and person names in parallel helps to distinguish some ambiguous cases (like the string "Lewis" which could refer to a first name or the Scottish island) and where their context helps to distinguish between them.  The extracted locations are then resolved to latitude/longitude coordinate pairs.  The text is then further processed by identifying syntactic phrases (chunking) and temporal relations.  The latter two steps are not very relevant to this lesson and will therefore not be explained in detail.  Finally, visualisations are created to be able to inspect the file and the Geoparser output using a map interface in a browser.  For more information on each of the sub-components of the Geoparser, see the documentation [here](http://groups.inf.ed.ac.uk/geoparser/documentation/v1.3/html/pipeline.html). 
+When running the Geoparser, the specified text file is going through a series of processing steps which are combined into one pipeline.  It is first [tokenised](https://en.wikipedia.org/wiki/Lexical_analysis#Tokenization_), [part-of-speech-tagged](https://en.wikipedia.org/wiki/Part-of-speech_tagging) and [lemmatised](https://en.wikipedia.org/wiki/Lemmatisation). After these initial steps, [named entity recognition](https://en.wikipedia.org/wiki/Named-entity_recognition) is performed to identify location and person names as well as dates.  It was found that identifying location and person names in parallel helps to distinguish some ambiguous cases (like the string "Lewis" which could refer to a first name or the Scottish island) and where their context helps to distinguish between them.  The extracted locations are then resolved to latitude/longitude coordinate pairs.  The text is then further processed by identifying syntactic phrases (chunking) and temporal relations.  The latter two steps are not very relevant to this lesson and will therefore not be explained in detail.  Finally, visualisations are created to be able to inspect the file and the Geoparser output using a map interface in a browser.  For more information on each of the sub-components of the Geoparser, see the documentation [here](https://groups.inf.ed.ac.uk/geoparser/documentation/v1.3/html/pipeline.html). Note that when using the Geoparser in combination with the GeoNames gazetteer some historical place names will not be identified as they are missing from the gazetteer. Also the Geoparser team can provide additional pre-processing to improve the quality of optical-character recognised output (e.g. to fix soft-hyphen splitting or to deal with the long “s” character). Those scripts are not distributed with the standard distribution but available on request. @@ -219,9 +219,9 @@ where * `W`(est) `N`(orth) `E`(ast) `S`(outh) are decimal degrees * `score` is the same as for option `-l`. 
-You can grab the coordinates of a bounding box for a particular area using this online [BoundingBox](http://boundingbox.klokantech.com) tool. For example, a bounding box for Canada is `[W:-141.002701, N:83.110619, E:-52.620201, S:41.681019]` (see Figure 5) +You can grab the coordinates of a bounding box for a particular area using this online [BoundingBox](https://boundingbox.klokantech.com) tool. For example, a bounding box for Canada is `[W:-141.002701, N:83.110619, E:-52.620201, S:41.681019]` (see Figure 5) -{% include figure.html filename="geoparser_figure03.png" caption="Figure 5: Bounding box for Canada drawn on [BoundingBox](http://boundingbox.klokantech.com)." %} +{% include figure.html filename="geoparser_figure03.png" caption="Figure 5: Bounding box for Canada drawn on [BoundingBox](https://boundingbox.klokantech.com)." %} To specify this bounding box using the previous example, go back to the scripts directory and run the following command: @@ -289,7 +289,7 @@ If the document date is not specified all temporal expressions will be interpret ### Geo-parsing Multiple Text Files -Now that you know how to geo-parse one file, you may want to do the same thing for a set of documents all at once. You can download a simple shell script which geo-parses multiple files [here](http://groups.inf.ed.ac.uk/geoparser/scripts/run-multiple-files.sh). Please refer to the [Geoparser workshop](https://web.archive.org/web/20180422010112/http://homepages.inf.ed.ac.uk/balex/publications/geoparser-workshop.pdf) slides for more information on how to make this script executable, run and it and adapt it to your needs. +Now that you know how to geo-parse one file, you may want to do the same thing for a set of documents all at once. You can download a simple shell script which geo-parses multiple files [here](https://groups.inf.ed.ac.uk/geoparser/scripts/run-multiple-files.sh). 
Please refer to the [Geoparser workshop](https://web.archive.org/web/20180422010112/http://homepages.inf.ed.ac.uk/balex/publications/geoparser-workshop.pdf) slides for more information on how to make this script executable, run it, and adapt it to your needs. ### Extracting Geo-Resolution Output to TSV @@ -367,18 +367,18 @@ The lesson is also available in workshop form. If you're interested in running The Geoparser team also welcomes suggestions for future collaboration to tailor the Geoparser to different needs. Please get in touch if you have ideas about how it could be applied. -In the past the Geoparser was used to identify place names for different purposes and in different types of data (e.g. Grover et al., 2010 and Alex et al., 2015). For example, it was adapted to perform fine-grained geo-parsing for literature set in Edinburgh ([Palimpsest](http://palimpsest.blogs.edina.ac.uk/)) presented in the [LitLong](http://litlong.org/) interface. It was used to geo-parse -* volumes of the Survey of English Place Names ([DEEP](http://web.archive.org/web/20170722115758/http://englishplacenames.cerch.kcl.ac.uk/), see Grover and Tobin, 2014), -* large historical collections related to commodity trading in the 19th century British Empire ([Trading Consequences](http://tradingconsequences.blogs.edina.ac.uk/)) and -* 19th century British newspapers by [Prof. Ian Gregory](http://www.lancaster.ac.uk/staff/gregoryi/)’s group at Lancaster University. +In the past the Geoparser was used to identify place names for different purposes and in different types of data (e.g. Grover et al., 2010 and Alex et al., 2015). For example, it was adapted to perform fine-grained geo-parsing for literature set in Edinburgh ([Palimpsest](https://palimpsest.blogs.edina.ac.uk/)) presented in the [LitLong](https://litlong.org/) interface.
It was used to geo-parse +* volumes of the Survey of English Place Names ([DEEP](https://web.archive.org/web/20170722115758/https://englishplacenames.cerch.kcl.ac.uk/), see Grover and Tobin, 2014), +* large historical collections related to commodity trading in the 19th century British Empire ([Trading Consequences](https://tradingconsequences.blogs.edina.ac.uk/)) and +* 19th century British newspapers by [Prof. Ian Gregory](https://www.lancaster.ac.uk/staff/gregoryi/)’s group at Lancaster University. -The Geoparser was also adapted to the ancient world for the [Google Ancient Places](https://googleancientplaces.wordpress.com/) project (e.g. see Isaksen et al., 2011), with its [GapVis](http://nrabinowitz.github.io/gapvis/)  interface. More recently, the Geoparser was used to geo-parse Twitter user profile locations (Alex et al, 2016) and the mass digitised text, including the Gazetteers of Scotland (Filgueira et al., 2020) and Encyclopaedia Britannica (Filgueira et al., 2021) +The Geoparser was also adapted to the ancient world for the [Google Ancient Places](https://googleancientplaces.wordpress.com/) project (e.g. see Isaksen et al., 2011), with its [GapVis](https://nrabinowitz.github.io/gapvis/)  interface. More recently, the Geoparser was used to geo-parse Twitter user profile locations (Alex et al, 2016) and the mass digitised text, including the Gazetteers of Scotland (Filgueira et al., 2020) and Encyclopaedia Britannica (Filgueira et al., 2021) ## References -Beatrice Alex, Clare Llewellyn, Claire Grover, Jon Oberlander and Richard Tobin (2016). Homing in on Twitter users: Evaluating an Enhanced Geoparser for User Profile Locations. 2016. In the Proceedings of the 10th Language Resources and Evaluation Conference (LREC), 23-28 May 2016. [[pdf](http://www.lrec-conf.org/proceedings/lrec2016/pdf/129_Paper.pdf)] +Beatrice Alex, Clare Llewellyn, Claire Grover, Jon Oberlander and Richard Tobin (2016). 
Homing in on Twitter users: Evaluating an Enhanced Geoparser for User Profile Locations. 2016. In the Proceedings of the 10th Language Resources and Evaluation Conference (LREC), 23-28 May 2016. [[pdf](https://www.lrec-conf.org/proceedings/lrec2016/pdf/129_Paper.pdf)] -Beatrice Alex, Kate Byrne, Claire Grover and Richard Tobin (2015). Adapting the Edinburgh Geoparser for Historical Georeferencing. International Journal for Humanities and Arts Computing, 9(1), pp. 15-35, March 2015.[[pdf](http://www.euppublishing.com/doi/pdfplus/10.3366/ijhac.2015.0136)] +Beatrice Alex, Kate Byrne, Claire Grover and Richard Tobin (2015). Adapting the Edinburgh Geoparser for Historical Georeferencing. International Journal for Humanities and Arts Computing, 9(1), pp. 15-35, March 2015.[[pdf](https://www.euppublishing.com/doi/pdfplus/10.3366/ijhac.2015.0136)] Rosa Filgueira, Claire Grover, Vasilios Karaiskos, Beatrice Alex, Sarah Van Eyndhoven, Lisa Gotthard, and Melissa Terras (2021). Extending defoe for the efficient analysis of historical texts at scale. In 2021 IEEE 17th International Conference on eScience (eScience), pp. 21-29. diff --git a/en/lessons/georeferencing-qgis.md b/en/lessons/georeferencing-qgis.md index a6208bdaf0..b2c49bbb4f 100755 --- a/en/lessons/georeferencing-qgis.md +++ b/en/lessons/georeferencing-qgis.md @@ -19,11 +19,11 @@ exclude_from_check: - review-ticket abstract: "In this lesson, you will learn how to georeference historical maps so that they may be added to a GIS as a raster layer." -previous: vector-layers-qgis -next: geocoding-qgis +previous: /en/lessons/vector-layers-qgis +next: /en/lessons/geocoding-qgis series_total: 5 lessons sequence: 4 -redirect_from: /lessons/georeferencing-qgis +redirect_from: /lessons/georeferencing-qgis/ avatar_alt: Map of a moutnaintop city doi: 10.46430/phen0027 --- @@ -45,7 +45,7 @@ georeferencing is one of our most commonly used tools. 
The technique uses a series of control points to give a two-dimensional object like a paper map the real world coordinates it needs to align with the three-dimensional features of the earth in GIS software (in [Intro to -Google Maps and Google Earth](/lessons/googlemaps-googleearth) we saw an 'overlay' which is a Google +Google Maps and Google Earth](/en/lessons/googlemaps-googleearth) we saw an 'overlay' which is a Google Earth shortcut version of georeferencing). Georeferencing a historical map requires a knowledge of both the @@ -57,7 +57,7 @@ Entering control points in a GIS is easy, but behind the scenes, georeferencing uses complex transformation and compression processes. These are used to correct the distortions and inaccuracies found in many historical maps and stretch the maps so that they fit geographic -coordinates. In cartography this is known as [rubber-sheeting](http://en.wikipedia.org/wiki/Rubbersheeting) because +coordinates. In cartography this is known as [rubber-sheeting](https://en.wikipedia.org/wiki/Rubbersheeting) because it treats the map as if it were made of rubber and the control points as if they were tacks 'pinning' the historical document to a three dimensional surface like the globe. @@ -78,12 +78,12 @@ GDAL and check the box beside it, and click OK. - At this point, you need to shut down and relaunch QGIS. For the purposes of this example, and to keep things as simple as possible, don't reload your existing project but instead start a new project. 
-- Set up the [Coordinate Reference System](http://en.wikipedia.org/wiki/Spatial_reference_system) (CRS) correctly (see - [Installing QGIS 2.0 and adding Layers](/lessons/qgis-layers) for a reminder) +- Set up the [Coordinate Reference System](https://en.wikipedia.org/wiki/Spatial_reference_system) (CRS) correctly (see + [Installing QGIS 2.0 and adding Layers](/en/lessons/qgis-layers) for a reminder) - Save this new project (under File menu, select Save Project) and call it 'georeferencing.' - Add the 'coastline\_polygon' layer (see [Installing QGIS 2.0 and - adding Layers](/lessons/qgis-layers) for a reminder) + adding Layers](/en/lessons/qgis-layers) for a reminder) ## Open the Necessary GIS Layers @@ -123,7 +123,7 @@ referred to as 'Lots' in PEI. Hence the file name {% include figure.html filename="geo41.png" caption="Figure 4" %} For more information on adding and visualizing layers see [Installing -QGIS 2.0 and adding Layers](/lessons/qgis-layers). +QGIS 2.0 and adding Layers](/en/lessons/qgis-layers). {% include figure.html filename="geo51.png" caption="Figure 5" %} @@ -179,7 +179,7 @@ Some tips for choosing control points: made. - Check that your control points did not change location over time. Roads were often re-routed, and even houses and other buildings were - moved, especially [in Atlantic Canada](http://books.google.ca/books?id=TqCNZYXWXAUC&dq=tilting&source=gbs_navlinks_s)! + moved, especially [in Atlantic Canada](https://books.google.ca/books?id=TqCNZYXWXAUC&dq=tilting&source=gbs_navlinks_s)! *Add your first control point:* @@ -247,13 +247,13 @@ compress the image. Most of these settings can be left as default: linear transformation type, nearest neighbour resampling method, and LZW compression. 
(The -[world file](http://en.wikipedia.org/wiki/World_file) is not necessary, unless you want to georeference the +[world file](https://en.wikipedia.org/wiki/World_file) is not necessary, unless you want to georeference the same image again in another GIS or if someone else needs to georeference the image and does not have access to your GIS data, coordinate reference system, etc.) The target SRS is not important, but you could use this feature to give the new raster a different reference system. -- Assign a folder for your new georeferenced raster file. [Tif](http://en.wikipedia.org/wiki/Tagged_Image_File_Format) is +- Assign a folder for your new georeferenced raster file. [Tif](https://en.wikipedia.org/wiki/Tagged_Image_File_Format) is the default format for rasters georeferenced in QGIS. - Be aware that a Tif file is going to be much larger than your original map, even with LZW compression, so make sure you have adequate space if you are @@ -307,7 +307,7 @@ use this feature to give the new raster a different reference system. Now that you have a newly georeferenced map in your GIS you can explore the layer, adjust the transparency, contrast and brightness, and go back -through [Creating New Vector Layers in QGIS](/lessons/vector-layers-qgis) to digitize some of the +through [Creating New Vector Layers in QGIS](/en/lessons/vector-layers-qgis) to digitize some of the historical information that you have created. For instance, this georeferenced map of PEI shows the locations of all homes in 1863, including the name of the head of household. By assigning points on the @@ -323,4 +323,4 @@ over a DEM (digital elevation model) to give it a hillshade terrain or 3D effect and perform a 'fly-over' of PEI homes in the nineteenth century. 
-*This lesson is part of the [Geospatial Historian](http://geospatialhistorian.wordpress.com/).* +*This lesson is part of the [Geospatial Historian](https://geospatialhistorian.wordpress.com/).* diff --git a/en/lessons/geospatial-data-analysis.md b/en/lessons/geospatial-data-analysis.md index e4513d7797..8313e0b87e 100644 --- a/en/lessons/geospatial-data-analysis.md +++ b/en/lessons/geospatial-data-analysis.md @@ -35,7 +35,7 @@ This tutorial will introduce scholars to some of these techniques for processing Specifically, this tutorial is going to use a membership list--with addresses--from a para-religious organization in America (PTL Ministries) and downloadable geographic data to assess population characteristics that could provide insights into an organization that is often characterized as more rural and less wealthy, alongside a host of other characteristics. The tutorial will then visualize and analyze this data to assess possible insights. This process will provide the basic tools and understandings that will allow scholars to assess other events and organizations that have geographic data. From this, you should be able to discover or challenge understandings of historical events using geospatial analysis. ## Pre-requisites -The work for this lesson will be done in R and R Studio, an open source statistical package used by data scientists, statisticians and other researchers. We are using R, because it is a widely-used open source tool that will allow us to both visualize and analyze our data using a multitude of methods that can be expanded upon quite easily. Some background knowledge of the software and statistics will be helpful. For introductions to R, I recommend the [r-basics](/lessons/r-basics-with-tabular-data) tutorial and the more comprehensive [Computational Historical Thinking](https://dh-r.lincolnmullen.com) as starting points. 
There are many other services such as this [MOOC](https://www.coursera.org/learn/r-programming) and [DataCamp](https://www.datacamp.com/) that can introduce beginners to R's broader functionality. [UCLA](http://www.ats.ucla.edu/stat/r/default.htm) also has a nice introduction.[^1] While this tutorial will attempt to step through the entire process in R, basic knowledge of R is needed. The tutorial also assumes users will have some knowledge about the event you are observing which you will use later as a means to test and contest assumptions. +The work for this lesson will be done in R and R Studio, an open source statistical package used by data scientists, statisticians and other researchers. We are using R, because it is a widely-used open source tool that will allow us to both visualize and analyze our data using a multitude of methods that can be expanded upon quite easily. Some background knowledge of the software and statistics will be helpful. For introductions to R, I recommend the [r-basics](/en/lessons/r-basics-with-tabular-data) tutorial and the more comprehensive [Computational Historical Thinking](https://dh-r.lincolnmullen.com) as starting points. There are many other services such as this [MOOC](https://www.coursera.org/learn/r-programming) and [DataCamp](https://www.datacamp.com/) that can introduce beginners to R's broader functionality. [UCLA](https://www.ats.ucla.edu/stat/r/default.htm) also has a nice introduction.[^1] While this tutorial will attempt to step through the entire process in R, basic knowledge of R is needed. The tutorial also assumes users will have some knowledge about the event you are observing which you will use later as a means to test and contest assumptions. ## Lesson Goals @@ -119,7 +119,7 @@ The number of variables in `County_Aggregate_Data` should now increase as all of The next step is to merge our list with our `SpatialDataFrame` so we can perform our analysis. 
While we are using a membership list, it can be any list that is geographic in nature. For example, you may have a list of events that happened during a particular time period; or a list of places an individual chooses to visit. This type of data will come in two basic formats. The first is information such as locations, address, or incident locations--which will be converted to geographic coordinates. The second will be a table that lists the same information alongside the county (or geographic region) where it occurred. We can handle either. ## Geocoding -In the first case we have raw addresses of the members of our organization which will necessitate some additional steps. The address will need be transformed into geographical points in a process called [geocoding](https://en.wikipedia.org/wiki/Geocoding). This will create geographic points--from addresses--that can be linked to spatial regions in our downloaded census data so that we can analyze it to help us discover trends related to geographic location of these addresses. R can do some of this work but if you have a large number of addresses, you will need to use an external service because the free services R uses (such as google) will cap how many address you can geocode in a day. One popular outside service is hosted by [Texas A&M Geocoding Services](http://geoservices.tamu.edu/Services/Geocode/) and can handle large batches at a reasonable price. In the end, our address will be transformed into a list of latitudes and longitudes. This is the data R needs. +In the first case we have raw addresses of the members of our organization which will necessitate some additional steps. The address will need be transformed into geographical points in a process called [geocoding](https://en.wikipedia.org/wiki/Geocoding). 
This will create geographic points--from addresses--that can be linked to spatial regions in our downloaded census data so that we can analyze it to help us discover trends related to geographic location of these addresses. R can do some of this work but if you have a large number of addresses, you will need to use an external service because the free services R uses (such as google) will cap how many address you can geocode in a day. One popular outside service is hosted by [Texas A&M Geocoding Services](https://geoservices.tamu.edu/Services/Geocode/) and can handle large batches at a reasonable price. In the end, our address will be transformed into a list of latitudes and longitudes. This is the data R needs. If you have less than 2,500 addresses this can be handled in R using Google's geocoder. In R, you must first gather the address from whatever dataset you have, and then transform it. In our example, the data has already been geocoded, but below is an example of the commands used when processing a list of address and turning them into a list of geographic coordinates: @@ -174,7 +174,7 @@ Now we have a large dataframe called `County_Aggregate_Data` which has our count ```r religion <- read.csv("./data/Religion/Churches.csv", as.is=TRUE) ``` -Depending on the state of the data you may need to do some data transformations in order to merge it back with the DataFrame. For complex transformations, see tutorials in R on working with data such as [Data Wrangling and Management in R tutorial](/en/lessons/data-wrangling-and-management-in-r) [data transforms](http://r4ds.had.co.nz/transform.html). In essence, you need to have a common field in both datasets to merge upon. Often this is a geographic id for the county and state represented by `GEOID`. It could also be the unique FIPS Code given by the US Census. Below I am using state and county `GEOID`. 
In this example, we are converting one data frame's common fields to numeric so that they match the variable type of the other dataframe: +Depending on the state of the data you may need to do some data transformations in order to merge it back with the DataFrame. For complex transformations, see tutorials in R on working with data such as [Data Wrangling and Management in R tutorial](/en/lessons/data-wrangling-and-management-in-r) [data transforms](https://r4ds.had.co.nz/transform.html). In essence, you need to have a common field in both datasets to merge upon. Often this is a geographic id for the county and state represented by `GEOID`. It could also be the unique FIPS Code given by the US Census. Below I am using state and county `GEOID`. In this example, we are converting one data frame's common fields to numeric so that they match the variable type of the other dataframe: ```r religion$STATEFP <- religion$STATE @@ -192,7 +192,7 @@ This will bring in all additional fields into our `SpatialDataFrame`. Now we have a large `SpatialDataFrame` called `County_Aggregate_Data` which has our geocoded count data, our external count data and our census data by county. It is now time to begin to look at the data distribution and assess if everything appears correct and is in a format that will allow for some visualization and data analysis. We have some inherent complexity to our data because it is considered "count data." As such, we should be cognizant that our data is not measuring individuals directly but rather relationships between counties. We are attempting to discover if counties with certain traits lead to higher membership in our datasets. These realities can help us gather some assumptions on the individuals in these regions. ## Visualizing -Because we are analyzing geospatial data, it is often best to begin with geographic visuals. 
There are many options here, but I find it easiest to start with the qtm function from the TMAP library which creates [choropleth](https://en.wikipedia.org/wiki/Choropleth_map) maps simply. We could also use [GGPlot2][(http://strimas.com/r/tidy-sf/](http://web.archive.org/web/20190922234254/http://strimas.com/r/tidy-sf/)) which which should be installed using the development version. +Because we are analyzing geospatial data, it is often best to begin with geographic visuals. There are many options here, but I find it easiest to start with the qtm function from the TMAP library which creates [choropleth](https://en.wikipedia.org/wiki/Choropleth_map) maps simply. We could also use [GGPlot2](https://web.archive.org/web/20190922234254/https://strimas.com/r/tidy-sf/), which should be installed using the development version. Now, we are going to prepare the map and look at some census data. First on our list should be membership numbers relative to population (relative membership distribution). One of the most commonly used and clearest ways to display this information is by number of members per 10,000 people. We will then do the math to create a relative population variable(number of members per 10,000 people). We do this because we have to ensure we are taking into account the variability of populations within the census regions that we are analyzing otherwise we will get misleading visualization in densely populated counties that represent general population trends rather than variable relationships. If we did not take this step, we would undoubtedly see a map that highlights urban areas rather than areas where membership is strongest. 
@@ -298,7 +298,7 @@ Through this process, we have gathered and transformed geospatial data into a us ## Other Models and Visualizations -There are many other models and visualizations available that can bring insight but they also add some complexity which demand further statistical understandings. For example, You can also create more complex scatterplots that can provide further insights. [Plot.ly](https://plot.ly/r/) offers interactive scatter plots that can be customized and shared.[^8]. While statistical modeling usually focuses on a particular model's predictive insight, well-fit models also provide insight into the data they represent. In particular, the Poisson regression is frequently used to create [models of count data](http://www.theanalysisfactor.com/regression-models-for-count-data/) which is how population data is often represented. [Geographically Weighted Regressions](https://rstudio-pubs-static.s3.amazonaws.com/44975_0342ec49f925426fa16ebcdc28210118.html) also have particular advantages with this type of data. But assessing fit has some complexity. [Decision trees](hhttps://www.analyticsvidhya.com/blog/2016/04/complete-tutorial-tree-based-modeling-scratch-in-python/) could also be useful for historical data because they give an understandable graphical representation of the the leading factors that caused inclusion in a group or list. Principal component analysis, [correspondence analysis](/en/lessons/correspondence-analysis-in-R) and other clustering methods can also be helpful, especially when there is limited knowledge or insight into the event being analyzed yet there is an abundance of data associated with the event. I recommend background reading or discussions with a data scientist or statistician when exploring some of these modeling options as understanding the configuration and parameters of the individual models is essential to ensuring the results are trustworthy and significant. 
+There are many other models and visualizations available that can bring insight but they also add some complexity which demands further statistical understandings. For example, you can also create more complex scatterplots that can provide further insights. [Plot.ly](https://plot.ly/r/) offers interactive scatter plots that can be customized and shared.[^8] While statistical modeling usually focuses on a particular model's predictive insight, well-fit models also provide insight into the data they represent. In particular, the Poisson regression is frequently used to create [models of count data](https://www.theanalysisfactor.com/regression-models-for-count-data/) which is how population data is often represented. [Geographically Weighted Regressions](https://rstudio-pubs-static.s3.amazonaws.com/44975_0342ec49f925426fa16ebcdc28210118.html) also have particular advantages with this type of data. But assessing fit has some complexity. [Decision trees](https://www.analyticsvidhya.com/blog/2016/04/complete-tutorial-tree-based-modeling-scratch-in-python/) could also be useful for historical data because they give an understandable graphical representation of the leading factors that caused inclusion in a group or list. Principal component analysis, [correspondence analysis](/en/lessons/correspondence-analysis-in-R) and other clustering methods can also be helpful, especially when there is limited knowledge or insight into the event being analyzed yet there is an abundance of data associated with the event. I recommend background reading or discussions with a data scientist or statistician when exploring some of these modeling options as understanding the configuration and parameters of the individual models is essential to ensuring the results are trustworthy and significant. @@ -312,13 +312,13 @@ There are many other models and visualizations available that can bring insight [^3]: This is often leveraged in the field of public health. 
See for example, [Spatial Analysis and Correlates of County-Level Diabetes Prevalence](https://www.cdc.gov/pcd/issues/2015/14_0404.htm). Other fields such as criminal justice also rely on similar analytics although criminal justice tends to look at smaller census areas within regions. See, for example, `https://www.ncjrs.gov/pdffiles1/nij/grants/204432.pdf` -[^4]: Count data typically has large numbers of zero values which can add some complexity that will not be covered here. There are more complex ways to minimize this using more complex regression models. See, for example [Regression Models with Count Data](https://stats.idre.ucla.edu/stata/seminars/regression-models-with-count-data/). For general description of what normal distributions, which work well without modification look like see normal [distributions](http://www.statisticshowto.com/probability-and-statistics/normal-distributions/) +[^4]: Count data typically has large numbers of zero values which can add some complexity that will not be covered here. There are more complex ways to minimize this using more complex regression models. See, for example [Regression Models with Count Data](https://stats.idre.ucla.edu/stata/seminars/regression-models-with-count-data/). For general description of what normal distributions, which work well without modification look like see normal [distributions](https://www.statisticshowto.com/probability-and-statistics/normal-distributions/) -[^5]: There are different strategies to dealing with this type of data. See for example, [The Excess-zero Problem in Soil Animal Count Data](http://www.sciencedirect.com/science/article/pii/S0031405608000073) or [Data Transformations](http://www.biostathandbook.com/transformation.html). +[^5]: There are different strategies to dealing with this type of data. 
See for example, [The Excess-zero Problem in Soil Animal Count Data](https://www.sciencedirect.com/science/article/pii/S0031405608000073) or [Data Transformations](https://www.biostathandbook.com/transformation.html). -[^6]: For details on ggmap and and integration with Google Maps or other maps services see the [ggmap overview](http://stat405.had.co.nz/ggmap.pdf). For another broader discussions on google map making that utilizes a few of the libraries in this tutorial see [R and Google Map Making](https://rpubs.com/nickbearman/r-google-map-making). For a discussion of the sf library and it relationship to sp see [Simple Features for R](https://cran.r-project.org/web/packages/sf/vignettes/sf1.html). While sp has been the library spatial analysis library of choice, it is being superseded by sf. +[^6]: For details on ggmap and integration with Google Maps or other maps services see the [ggmap overview](https://stat405.had.co.nz/ggmap.pdf). For another broader discussion on google map making that utilizes a few of the libraries in this tutorial see [R and Google Map Making](https://rpubs.com/nickbearman/r-google-map-making). For a discussion of the sf library and its relationship to sp see [Simple Features for R](https://cran.r-project.org/web/packages/sf/vignettes/sf1.html). While sp has been the spatial analysis library of choice, it is being superseded by sf. -[^7]: We are setting Coordinate Reference System(CRS) to EPSG 4326 which is the most common mapping system used int the U.S. It is used by Google which is the origins of our data. EPSG 3857 is also used by google. For more on CRS see [Coordinate Reference Systems & Spatial Projections](https://www.earthdatascience.org/courses/earth-analytics/spatial-data-r/intro-to-coordinate-reference-systems/). Also see [coordinate systems reference in R](http://web.archive.org/web/20200225021219/https://www.nceas.ucsb.edu/~frazier/RSpatialGuides/OverviewCoordinateReferenceSystems.pdf). 
+[^7]: We are setting Coordinate Reference System(CRS) to EPSG 4326 which is the most common mapping system used in the U.S. It is used by Google which is the origin of our data. EPSG 3857 is also used by google. For more on CRS see [Coordinate Reference Systems & Spatial Projections](https://www.earthdatascience.org/courses/earth-analytics/spatial-data-r/intro-to-coordinate-reference-systems/). Also see [coordinate systems reference in R](https://web.archive.org/web/20200225021219/https://www.nceas.ucsb.edu/~frazier/RSpatialGuides/OverviewCoordinateReferenceSystems.pdf). [^8]: These plots are a bit more complex and requires an extra library, but they have some advantages. They work well with complex datasets because they have the ability to model more than two relationships by altering the color or size of the data points(we did this earlier on the choropleths by altering font size). Moreover, they are interactive which allows you to explore extra information about data points after the plot is created without wrecking the visual makeup of the plot. Here is an example that looks at the relationship between income and membership but also adds urban status to the visual using color. I am also adjusting point size based on population so I can take a look at more populated areas alongside the other data: diff --git a/en/lessons/getting-started-with-markdown.md b/en/lessons/getting-started-with-markdown.md index 2902de8d04..4327d734d6 100755 --- a/en/lessons/getting-started-with-markdown.md +++ b/en/lessons/getting-started-with-markdown.md @@ -16,7 +16,7 @@ topics: [data-management] abstract: "In this lesson, you will be introduced to Markdown, a plain text-based syntax for formatting documents. You will find out why it is used, how to format Markdown files, and how to preview Markdown-formatted documents on the web." 
exclude_from_check: - reviewers -redirect_from: /lessons/getting-started-with-markdown +redirect_from: /lessons/getting-started-with-markdown/ avatar_alt: Ornate decorated characters from a typographical manual doi: 10.46430/phen0046 --- @@ -34,11 +34,11 @@ Since Programming Historian lessons are submitted as Markdown files, I have incl ## What is Markdown? -Developed in 2004 by [John Gruber](http://daringfireball.net/projects/markdown/ "Markdown on Daring Fireball"), Markdown refers to both (1) a way of formatting text files, as well as (2) a Perl utility to convert Markdown files into HTML. In this lesson, we'll focus on the first part and learn to write files using the Markdown syntax. +Developed in 2004 by [John Gruber](https://daringfireball.net/projects/markdown/ "Markdown on Daring Fireball"), Markdown refers to both (1) a way of formatting text files, as well as (2) a Perl utility to convert Markdown files into HTML. In this lesson, we'll focus on the first part and learn to write files using the Markdown syntax. Plain text files have many advantages over other formats. For one, they are readable on virtually all devices. They have also withstood the test of time better than other file types -- if you've ever tried to open a document saved in a legacy word processor format, you'll be familiar with the compatibility challenges involved. -By following Markdown syntax, you'll be able to produce files that are both legible in plain text and ready to be styled on other platforms. Many blogging engines, static site generators, and sites like [GitHub](http://github.com "GitHub") also support Markdown, and will render these files into HTML for display on the web. Additionally, tools like Pandoc can convert files into and out of Markdown. For more on Pandoc, visit the lesson on [Sustainable authorship in plain text using Pandoc and Markdown](/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown) by Dennis Tenen and Grant Wythoff. 
+By following Markdown syntax, you'll be able to produce files that are both legible in plain text and ready to be styled on other platforms. Many blogging engines, static site generators, and sites like [GitHub](https://github.com "GitHub") also support Markdown, and will render these files into HTML for display on the web. Additionally, tools like Pandoc can convert files into and out of Markdown. For more on Pandoc, visit the lesson on [Sustainable authorship in plain text using Pandoc and Markdown](/en/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown) by Dennis Tenen and Grant Wythoff. ## Markdown Syntax Markdown files are saved with the extension `.md`, and can be opened in a text editor such as TextEdit, Notepad, Sublime Text, or Vim. Many websites and publishing platforms also offer web-based editors and/or extensions for entering text using Markdown syntax. @@ -231,13 +231,13 @@ Reference-style links are handy for footnotes and may keep your plain text docum You may then add the URL to another part of the document: -`[1]: http://programminghistorian.org/ "The Programming Historian"` +`[1]: https://programminghistorian.org/ "The Programming Historian"` **This renders as:** One example is the [Programming Historian][1] website. -[1]: http://programminghistorian.org/ "The Programming Historian" +[1]: https://programminghistorian.org/ "The Programming Historian" ### Images @@ -305,7 +305,7 @@ To specify the alignment of each column, colons `:` can be added to the header r ## Markdown Limitations -While Markdown is becoming increasingly popular, particularly for styling documents that are viewable on the web, many people and publishers still expect traditional Word documents, PDFs, and other file formats. This can be mitigated somewhat with command line conversion tools such as [Pandoc](https://pandoc.org/); however, certain word processor features like track changes are not supported yet. 
Please visit the Programming Historian lesson on [Sustainable authorship in plain text using Pandoc and Markdown](/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown) for more information about Pandoc. +While Markdown is becoming increasingly popular, particularly for styling documents that are viewable on the web, many people and publishers still expect traditional Word documents, PDFs, and other file formats. This can be mitigated somewhat with command line conversion tools such as [Pandoc](https://pandoc.org/); however, certain word processor features like track changes are not supported yet. Please visit the Programming Historian lesson on [Sustainable authorship in plain text using Pandoc and Markdown](/en/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown) for more information about Pandoc. ## Conclusion diff --git a/en/lessons/getting-started-with-mysql-using-r.md b/en/lessons/getting-started-with-mysql-using-r.md index 56d9e53009..0c060821e8 100755 --- a/en/lessons/getting-started-with-mysql-using-r.md +++ b/en/lessons/getting-started-with-mysql-using-r.md @@ -16,7 +16,7 @@ slug: getting-started-with-mysql-using-r activity: transforming topics: [data-manipulation, distant-reading, r, data-visualization] abstract: "This lesson will help you store large amounts of historical data in a structured manner, search and filter that data, and visualize some of the data as a graph." -redirect_from: /lessons/getting-started-with-mysql-using-r +redirect_from: /lessons/getting-started-with-mysql-using-r/ avatar_alt: A hand holding a newspaper doi: 10.46430/phen0076 --- @@ -29,7 +29,7 @@ R can perform analysis and data storage without the use of a relational database - When data is stored in a relational database already. - Working with the data of different entities that are related to one another. 
An example would be a database of soldiers of two different armies that fought a battle where we wanted to know what squad, platoon, company and brigade each soldier was part of. -A further short discussion of this is on [Jason A. French's blog](http://www.jason-french.com/blog/2014/07/03/using-r-with-mysql-databases/)[^2]. +A further short discussion of this is on [Jason A. French's blog](https://www.jason-french.com/blog/2014/07/03/using-r-with-mysql-databases/)[^2]. By the end of this lesson you will be able to install a database system on your computer, create a database table, store information in the table and then query the data. At the conclusion of the lesson we'll use a query of the database to make a graph. @@ -48,13 +48,13 @@ MySQL is a relational database used to store and query information. This lesson - Store records to the table. - Query the table. -In this tutorial you will make a database of newspaper stories that contain words from a search of a newspaper archive. The program will store the title, date published and URL of each story in a database. We'll use another program to query the database and look for historically significant patterns. Sample data will be provided from the [Welsh Newspapers Online](http://newspapers.library.wales) newspaper archive. We are working toward having a list of stories we can query for information. At the end of the lesson, we will run a query to generate a graph of the number of newspaper stories in the database to see if there is a pattern that is significant. +In this tutorial you will make a database of newspaper stories that contain words from a search of a newspaper archive. The program will store the title, date published and URL of each story in a database. We'll use another program to query the database and look for historically significant patterns. Sample data will be provided from the [Welsh Newspapers Online](https://newspapers.library.wales) newspaper archive. 
We are working toward having a list of stories we can query for information. At the end of the lesson, we will run a query to generate a graph of the number of newspaper stories in the database to see if there is a pattern that is significant. # Required Software R, R Studio, MySQL Server and MySQL Workbench are the pieces of software required for this lesson. Notes on installing these software packages are below. ## R -In their lesson [Basic Text Processing in R](/lessons/basic-text-processing-in-r)[^3], Taylor Arnold and Lauren Tilton provide an excellent summary of the knowledge of R required for this lesson. Only basic knowledge of R is assumed. Taryn Dewar's lesson ['R Basics with Tabular Data']( /lessons/r-basics-with-tabular-data)[^4] +In their lesson [Basic Text Processing in R](/en/lessons/basic-text-processing-in-r)[^3], Taylor Arnold and Lauren Tilton provide an excellent summary of the knowledge of R required for this lesson. Only basic knowledge of R is assumed. Taryn Dewar's lesson ['R Basics with Tabular Data'](/en/lessons/r-basics-with-tabular-data)[^4] covers how to install R and become familiar with it. ### Download R @@ -179,7 +179,7 @@ SET PASSWORD=PASSWORD('your_new_password_you_just_wrote_down_in_step_3.5'); 3.6. Restart the machine. After restarting the machine you may need to repeat step *3.3 Start the MySQL server* above. ###### MySQL Workbench downloads -Click on this link: [http://dev.mysql.com/downloads/workbench/](http://dev.mysql.com/downloads/workbench/). Scroll down and click to **Select Operating System** that matches your computer. If necessary, **Select OS Version**. Once you have done that click the blue **Download** button. On the download page, scroll down, you have the option of starting the download by clicking **No thanks, just start my download.** +Click on this link: [https://dev.mysql.com/downloads/workbench/](https://dev.mysql.com/downloads/workbench/). 
Scroll down and click to **Select Operating System** that matches your computer. If necessary, **Select OS Version**. Once you have done that click the blue **Download** button. On the download page, scroll down, you have the option of starting the download by clicking **No thanks, just start my download.** Once the file is downloaded, double click on the downloaded file to install it. Once the installation of MySQL Workbench is done, as per the instructions on the screen, drag the icon to the Applications folder on the left. (See below) @@ -408,7 +408,7 @@ You have successfully connected to the database using a configuration file. # Storing data in a table with SQL -In this section of the lesson we'll create a SQL statement to insert a row of data into the database table about this [newspaper story](http://newspapers.library.wales/view/4121281/4121288/94/). We'll insert the record first in MySQL workbench and later we'll do it in R. +In this section of the lesson we'll create a SQL statement to insert a row of data into the database table about this [newspaper story](https://newspapers.library.wales/view/4121281/4121288/94/). We'll insert the record first in MySQL workbench and later we'll do it in R. 1. In MySQL Workbench, click the icon labelled SQL+ to create a new SQL tab for executing queries. 2. Paste this statement below into the query window. This will insert a record into the table. @@ -438,7 +438,7 @@ LEFT(RTRIM('http://newspapers.library.wales/view/4121281/4121288/94/'),99), | search_term_used) | " | | VALUES('THE LOST LUSITANIA.', | The value to be inserted into the story_title field | | '1915-05-21', | story_date_published field | -| LEFT(RTRIM('http://newspapers.library.wales/view/4121281/4121288/94/'),99), | story_url field. This field is a VARCHAR(99) so it has a maximum length of 99 characters. Inserting a URL longer than 99 characters would cause an error and so two functions are used to control for that. 
RTRIM() trims trailing spaces to the right of the URL. LEFT(value,99) returns only the leftmost 99 characters of the trimmed URL. This URL is much shorter than that and so these functions are here for an example only. | +| LEFT(RTRIM('https://newspapers.library.wales/view/4121281/4121288/94/'),99), | story_url field. This field is a VARCHAR(99) so it has a maximum length of 99 characters. Inserting a URL longer than 99 characters would cause an error and so two functions are used to control for that. RTRIM() trims trailing spaces to the right of the URL. LEFT(value,99) returns only the leftmost 99 characters of the trimmed URL. This URL is much shorter than that and so these functions are here for an example only. | | 'German+Submarine'); | search_term_used field | @@ -837,10 +837,10 @@ Below is what the plot should look like: # Going further with MySQL -If you wanted to put a database on a website, using MySQL as the database and the PHP language to build the pages of the site is one way to do this. An example of this type of website is one I built to [search issues of the Equity newspaper](http://www.jeffblackadar.ca/graham_fellowship/corpus_entities_equity/). Larry Ullman's book *PHP and MySQL for Dynamic Web Sites* covers how to set up and connect to a database using MySQL and PHP in a hacker resistant way. +If you wanted to put a database on a website, using MySQL as the database and the PHP language to build the pages of the site is one way to do this. An example of this type of website is one I built to [search issues of the Equity newspaper](https://www.jeffblackadar.ca/graham_fellowship/corpus_entities_equity/). Larry Ullman's book *PHP and MySQL for Dynamic Web Sites* covers how to set up and connect to a database using MySQL and PHP in a hacker resistant way. 
For examples of using SQL to sort and group data as well as perform calculations, see: -[MySQL by Examples for Beginners](http://web.archive.org/web/20171228130133/https://www.ntu.edu.sg/home/ehchua/programming/sql/MySQL_Beginner.html) or MySQL's [Examples of Common Queries](https://dev.mysql.com/doc/refman/5.7/en/examples.html). +[MySQL by Examples for Beginners](https://web.archive.org/web/20171228130133/https://www.ntu.edu.sg/home/ehchua/programming/sql/MySQL_Beginner.html) or MySQL's [Examples of Common Queries](https://dev.mysql.com/doc/refman/5.7/en/examples.html). # Conclusion @@ -850,7 +850,7 @@ I hope that you now have the knowledge to set up a database table, connect to it # Credits -I completed this lesson thanks to the support of the [George Garth Graham Undergraduate Digital History Research Fellowship](http://grahamresearchfellow.org/). +I completed this lesson thanks to the support of the [George Garth Graham Undergraduate Digital History Research Fellowship](https://grahamresearchfellow.org/). Thank you to Dr. Amanda Visconti for her guidance and support during the preparation of this lesson. @@ -862,10 +862,10 @@ Ullman, L. 2005. *PHP and MySQL for Dynamic Web Sites, 2nd ed.* Berkeley, Calif: [^1]: Lincoln Mullen, "Natural Language Processing," RPubs, [https://rpubs.com/lmullen/nlp-chapter](https://rpubs.com/lmullen/nlp-chapter). -[^2]: Jason A. French, "Using R With MySQL Databases," blog (3 July 2014), [http://www.jason-french.com/blog/2014/07/03/using-r-with-mysql-databases/](http://www.jason-french.com/blog/2014/07/03/using-r-with-mysql-databases/). +[^2]: Jason A. French, "Using R With MySQL Databases," blog (3 July 2014), [https://www.jason-french.com/blog/2014/07/03/using-r-with-mysql-databases/](https://www.jason-french.com/blog/2014/07/03/using-r-with-mysql-databases/). 
-[^3]: Taylor Arnold and Lauren Tilton, "Basic Text Processing in R," Programming Historian (27 March 2017), [/lessons/basic-text-processing-in-r](/lessons/basic-text-processing-in-r). +[^3]: Taylor Arnold and Lauren Tilton, "Basic Text Processing in R," Programming Historian (27 March 2017), [/lessons/basic-text-processing-in-r](/en/lessons/basic-text-processing-in-r). -[^4]: Taryn Dewar, "R Basics with Tabular Data," Programming Historian (05 September 2016), [/lessons/r-basics-with-tabular-data](/lessons/r-basics-with-tabular-data). +[^4]: Taryn Dewar, "R Basics with Tabular Data," Programming Historian (05 September 2016), [/lessons/r-basics-with-tabular-data](/en/lessons/r-basics-with-tabular-data). The R program I used to gather the sample data is [here](https://github.com/jeffblackadar/getting-started-with-mysql/blob/master/newspaper-search-and-store.R). diff --git a/en/lessons/googlemaps-googleearth.md b/en/lessons/googlemaps-googleearth.md index d91672e3c2..68987fabda 100755 --- a/en/lessons/googlemaps-googleearth.md +++ b/en/lessons/googlemaps-googleearth.md @@ -19,10 +19,10 @@ topics: [mapping] abstract: "Google My Maps and Google Earth provide an easy way to start creating digital maps. With a Google Account you can create and edit personal maps by clicking on My Places." -next: qgis-layers +next: /en/lessons/qgis-layers series_total: 5 lessons sequence: 1 -redirect_from: /lessons/googlemaps-googleearth +redirect_from: /lessons/googlemaps-googleearth/ avatar_alt: An old man consulting a large globe with a compass doi: 10.46430/phen0028 --- @@ -63,13 +63,13 @@ or Quantum GIS. 
- Log in to your Google Account if you aren't already logged in (follow the basic instructions to create an account if necessary) -{% include figure.html filename="geo1.png" caption="Figure 1" %} +{% include figure.html filename="en-or-googlemaps-googleearth-01.png" caption="Figure 1" %} - Click on the question mark at bottom right and click Take a Tour for an introduction to how My Maps works -{% include figure.html filename="geo2.png" caption="Figure 2" %} +{% include figure.html filename="en-or-googlemaps-googleearth-02.png" caption="Figure 2" %} - At the upper left corner, a menu box appears, titled 'Untitled Map'. By clicking on the title you can rename as 'My test map' or a title @@ -86,42 +86,42 @@ or Quantum GIS. modern place names and avoid risking that Google with choose the wrong Constantinople. -{% include figure.html filename="geo3.png" caption="Figure 3" %} +{% include figure.html filename="en-or-googlemaps-googleearth-03.png" caption="Figure 3" %} -{% include figure.html filename="geo4.png" caption="Figure 4" %} +{% include figure.html filename="en-or-googlemaps-googleearth-04.png" caption="Figure 4" %} - Next, you can Import a Dataset. Click the Import button under the untitled layer. -{% include figure.html filename="geo5.png" caption="Figure 5" %} +{% include figure.html filename="en-or-googlemaps-googleearth-05.png" caption="Figure 5" %} - A new window will pop up and give you the option of importing a CSV (comma separated value), XLXS (Microsoft Excel) file, KML (Google's spatial file formate) or GPX (common GPS file formate). These are two common spreadsheet formats; CSV is simple and universal, XLXS is the MS Excel format. You can also work with a Google spreadsheet from your Drive account. -{% include figure.html filename="geo6.png" caption="Figure 6" %} +{% include figure.html filename="en-or-googlemaps-googleearth-06.png" caption="Figure 6" %} - Download this sample data and located it on your computer: [UK - Global Fat Supply CSV file][]. 
If you open the file in Excel or + Global Fat Supply CSV file](/assets/googlemaps-googleearth/UKGlobalFatSupply1894-1896.csv.zip). If you open the file in Excel or another spreadsheet program, you'll find a simple two column dataset with a list of different kinds of fats and the associated list of places. This data was created using British import tables from 1896. -{% include figure.html filename="geo7.png" caption="Figure 7" %} +{% include figure.html filename="en-or-googlemaps-googleearth-07.png" caption="Figure 7" %} - Drag the file into the box provided by Google Maps. - You will then be promted to choose which column Google should use to identify a the location. Choose Place. -{% include figure.html filename="geo8.png" caption="Figure 8" %} +{% include figure.html filename="en-or-googlemaps-googleearth-08.png" caption="Figure 8" %} - You will then be promoted again to choose which column should be used for the label. Choose 'Commodity'. - You should now have a global map of the major exporters of fat to Britain during the mid-1890s. -{% include figure.html filename="geo9.png" caption="Figure 9: Click to see full-size image" %} +{% include figure.html filename="en-or-googlemaps-googleearth-09.png" caption="Figure 9" %} - You can now explore the data in more detail and change the Style to distinguish between the different types of fats. @@ -130,9 +130,9 @@ or Quantum GIS. Commodities. On the left hand side, the legend will show the amount of occurrences of each style in brackets, e.g. 'Flax Seeds (4)'. -{% include figure.html filename="geo10.png" caption="Figure 10" %} +{% include figure.html filename="en-or-googlemaps-googleearth-10.png" caption="Figure 10" %} -{% include figure.html filename="geo11.png" caption="Figure 11" %} +{% include figure.html filename="en-or-googlemaps-googleearth-11.png" caption="Figure 11" %} - Continue to play with the options. - This feature provides a powerful tool to display historical @@ -142,7 +142,7 @@ or Quantum GIS. 
300 features. -{% include figure.html filename="geo12.png" caption="Figure 12" %} +{% include figure.html filename="en-or-googlemaps-googleearth-12.png" caption="Figure 12" %} ### Creating Vector Layers @@ -159,7 +159,7 @@ a problem as you scale up your digital mapping research, but it is not a problem when you are starting out. In Google Maps you can add a label, a text description, and links to a website or photo. More information about creating historical vectors in a full GIS is available in -[Creating New Vector Layers in QGIS 2.0][]. +[Creating New Vector Layers in QGIS 2.0](/en/lessons/vector-layers-qgis). - To add a layer, you can either click on the layer that has been created for you in the menu box, with the name 'Untitled Layer'. @@ -168,7 +168,7 @@ about creating historical vectors in a full GIS is available in 'Untitled Layer' which you can name as 'Layer 2′. It should look like this: -{% include figure.html filename="geo13.png" caption="Figure 13" %} +{% include figure.html filename="en-or-googlemaps-googleearth-13.png" caption="Figure 13" %} - Note that to the right of Layer there is a checkbox – unchecking this box turns off (i.e. it doesn't appear on the map) a layer and @@ -195,14 +195,14 @@ about creating historical vectors in a full GIS is available in top of the window. Click on the spot on the map where you want the Placemark to appear. -{% include figure.html filename="geo14.png" caption="Figure 14" %} +{% include figure.html filename="en-or-googlemaps-googleearth-14.png" caption="Figure 14" %} - A box will pop up and give you the opportunity to label the Placemark and add a description into the text box. We added Charlottetown and included that it was founded in 1765 in the description box. -{% include figure.html filename="geo15.png" caption="Figure 15" %} +{% include figure.html filename="en-or-googlemaps-googleearth-15.png" caption="Figure 15" %} - Add a few more points, including labels and descriptions. 
@@ -216,7 +216,7 @@ about creating historical vectors in a full GIS is available in Placemark. Labels menu allows you to control whether the name or description of your Placemark appears besides it on the actual map. -{% include figure.html filename="geo16.png" caption="Figure 16" %} +{% include figure.html filename="en-or-googlemaps-googleearth-16.png" caption="Figure 16" %} - Now we will add some lines and shapes (called polygons in GIS software). Adding lines and polygons is a very similar process. We @@ -227,7 +227,7 @@ about creating historical vectors in a full GIS is available in - Click the 'add line or shape' icon box directly to the right of the Markers symbol: -{% include figure.html filename="geo17.png" caption="Figure 17" %} +{% include figure.html filename="en-or-googlemaps-googleearth-17.png" caption="Figure 17" %} - Pick a road and click with your mouse along it, tracing the route for a while. Hit "enter" when you want to finish the line. @@ -238,7 +238,7 @@ about creating historical vectors in a full GIS is available in find the road you have drawn in Layer 2 in the menu box, and click to the right of the name of the road. -{% include figure.html filename="geo18.png" caption="Figure 18" %} +{% include figure.html filename="en-or-googlemaps-googleearth-18.png" caption="Figure 18" %} - To create a polygon (a shape) you can connect the dots of the line to create an enclosed formation. To do this, start drawing and @@ -247,9 +247,9 @@ about creating historical vectors in a full GIS is available in shapes, such as the outline of a city (see examples below). Feel free to experiment with creating lines and polygons. 
-{% include figure.html filename="geo19.png" caption="Figure 19" %} +{% include figure.html filename="en-or-googlemaps-googleearth-19.png" caption="Figure 19" %} -{% include figure.html filename="geo20.png" caption="Figure 20" %} +{% include figure.html filename="en-or-googlemaps-googleearth-20.png" caption="Figure 20" %} - Like placemarks and lines, you can change the name and description of a polygon. You can also change the colour and line width by @@ -290,9 +290,9 @@ about creating historical vectors in a full GIS is available in you can keep going and learn about Google Earth and in lesson 2, Quantum GIS. -{% include figure.html filename="geo21.png" caption="Figure 21" %} +{% include figure.html filename="en-or-googlemaps-googleearth-21.png" caption="Figure 21" %} -{% include figure.html filename="geo22.png" caption="Figure 22" %} +{% include figure.html filename="en-or-googlemaps-googleearth-22.png" caption="Figure 22" %} ## Google Earth @@ -313,13 +313,13 @@ exported. See the red arrows in the following image for the location of these layers. -{% include figure.html filename="geo23.png" caption="Figure 23: Click to see full-size image" %} +{% include figure.html filename="en-or-googlemaps-googleearth-23.png" caption="Figure 23" %} - Note that under the 'Layer' heading on the lower left side of the window margin, Google provides a number of ready-to-go layers that can be turned on by selecting the corresponding checkbox. -{% include figure.html filename="geo24.png" caption="Figure 24" %} +{% include figure.html filename="en-or-googlemaps-googleearth-24.png" caption="Figure 24" %} - Google Earth also contains some scanned historical maps and aerial photographs (in GIS these types of maps, which are made up of @@ -333,23 +333,23 @@ exported. number of historical maps. See if there are any maps included in the Rumsey Collection that might be useful for your research or teaching. 
(You can find many more digitized, but not georeferenced - maps at [www.davidrumsey.com][].) + maps at [www.davidrumsey.com](https://www.davidrumsey.com/).) -{% include figure.html filename="geo25.png" caption="Figure 25" %} +{% include figure.html filename="en-or-googlemaps-googleearth-25.png" caption="Figure 25" %} - You might need to zoom in to see all of the Map icons. Can you find the World Globe from 1812? -{% include figure.html filename="geo26.png" caption="Figure 26" %} +{% include figure.html filename="en-or-googlemaps-googleearth-26.png" caption="Figure 26" %} - Once you click on an icon an information panel pops up. Click on the map thumbnail to see the map tacked onto the digital globe. We will learn to properly georeference maps in [Georeferencing in QGIS - 2.0][]. + 2.0](/en/lessons/georeferencing-qgis). -{% include figure.html filename="geo27.png" caption="Figure 27" %} +{% include figure.html filename="en-or-googlemaps-googleearth-27.png" caption="Figure 27" %} -{% include figure.html filename="geo28.png" caption="Figure 28: Click to see full-size image" %} +{% include figure.html filename="en-or-googlemaps-googleearth-28.png" caption="Figure 28" %} ## KML: Keyhole Markup Language files @@ -364,10 +364,10 @@ exported. two platforms and bring your map data into Quantum GIS or ArcGIS. - For example, you can import the data you created in Google Maps Engine Lite. If you created a map in the exercise above, it can be - found by clicking "Open Map" on the [Maps Engine Lite][] home page. + found by clicking "Open Map" on the [Maps Engine Lite](https://mapsengine.google.com) home page. Click on the folder icon on the left hand side of the legend beneath the map title and click "export to KML". (You can also download and - explore Dan Macfarlane's [Seaway map][] for this part of the + explore Dan Macfarlane's [Seaway map](/assets/googlemaps-googleearth/seaway.zip) for this part of the exercise). 
**Bringing your KML file into Google Earth** @@ -381,7 +381,7 @@ Updates to Google Earth since the publication of this lesson in 2013, mean that - Double click on the KML file in your Download folder. - Find the data in the Temporary Folder in Google Earth. -{% include figure.html filename="geo29.png" caption="Figure 29: Click to see full-size image" %} +{% include figure.html filename="en-or-googlemaps-googleearth-29.png" caption="Figure 29" %} - You can now explore these map features in 3D, or you can add new lines, points and polygons using the various icons along the top @@ -397,7 +397,7 @@ Updates to Google Earth since the publication of this lesson in 2013, mean that tour" icon is selected, recording options will show up on the bottom left of the window). -{% include figure.html filename="geo30.png" caption="Figure 30" %} +{% include figure.html filename="en-or-googlemaps-googleearth-30.png" caption="Figure 30" %} - Try adding a new feature to Dan's Seaway data. We've created a polygon (in GIS terminology a polygon is a closed shape of any type @@ -405,9 +405,9 @@ Updates to Google Earth since the publication of this lesson in 2013, mean that in the next image. Find Lake St. Clair (east of Detroit) and try adding a polygon. -{% include figure.html filename="geo31.png" caption="Figure 31: Click to see full-size image" %} +{% include figure.html filename="en-or-googlemaps-googleearth-31.png" caption="Figure 31" %} -{% include figure.html filename="geo32.png" caption="Figure 32" %} +{% include figure.html filename="en-or-googlemaps-googleearth-32.png" caption="Figure 32" %} - Label the new feature Lake St. Clair. You can then drag the new feature onto Dan's Seaway data and add it to the collection. You can @@ -416,19 +416,18 @@ Updates to Google Earth since the publication of this lesson in 2013, mean that QGIS. Find the save option by right-clicking on the Seaway collection and choose Save Place As or Email. 
-{% include figure.html filename="geo33.png" caption="Figure 33" %} +{% include figure.html filename="en-or-googlemaps-googleearth-33.png" caption="Figure 33" %} -{% include figure.html filename="geo34.png" caption="Figure 34" %} +{% include figure.html filename="en-or-googlemaps-googleearth-34.png" caption="Figure 34" %} -{% include figure.html filename="geo35.png" caption="Figure 35" %} +{% include figure.html filename="en-or-googlemaps-googleearth-35.png" caption="Figure 35" %} ## Adding Scanned Historical Maps Within Google Earth, you can upload a digital copy of a historical map. This could be a map that has been scanned, or an image obtained that is already in a digital format (for tips on finding historical maps online -see: [Mobile Mapping and Historical GIS in the Field][]). The main -purpose for uploading a digital map, from a historical perspective, is +see: [Mobile Mapping and Historical GIS in the Field](https://niche-canada.org/2011/12/14/mobile-mapping-and-historical-gis-in-the-field)). The main purpose for uploading a digital map, from a historical perspective, is to place it over top of a Google Earth image in the browser. This is known as an overlay. Performing an overlay allows for useful comparisons of change over time. @@ -443,19 +442,19 @@ of change over time. on the top toolbar. and then adjusting the time-scale slider that will appear. 
-{% include figure.html filename="geo36.png" caption="Figure 36" %} +{% include figure.html filename="en-or-googlemaps-googleearth-36.png" caption="Figure 36" %} -{% include figure.html filename="geo37.png" caption="Figure 37" %} +{% include figure.html filename="en-or-googlemaps-googleearth-37.png" caption="Figure 37" %} - Once you have identified the images you plan to use, click on the 'Add Image Overlay' icon on the top toolbar.\ -{% include figure.html filename="geo38.png" caption="Figure 38" %} +{% include figure.html filename="en-or-googlemaps-googleearth-38.png" caption="Figure 38" %} - A new window will appear. Begin by giving it a different title if you wish (the default is 'Untitled Image Overlay'). -{% include figure.html filename="geo39.png" caption="Figure 39: Click to see full-size image" %} +{% include figure.html filename="en-or-googlemaps-googleearth-39.png" caption="Figure 39" %} - To the right of the Link field, click the Browse button to select from your files the map you wish to be the overlaying image. @@ -466,7 +465,7 @@ of change over time. - There are fluorescent green markers in the middle and at the edges of the uploaded map. These can be used to stretch, shrink, and move the map so that it aligns properly with the satellite image. This is - a simple form of georeferencing (see [Georeferencing in QGIS 2.0][]). + a simple form of georeferencing (see [Georeferencing in QGIS 2.0](/en/lessons/georeferencing-qgis)). The image below shows the above steps using an old map of the town of Aultsville overlaid on top of Google satellite imagery from 2008 in which the remains of the town's roads and building foundations in @@ -474,7 +473,7 @@ of change over time. Villages flooded out by the St. Lawrence Seaway and Power Project). 
-{% include figure.html filename="geo40.png" caption="Figure 40: Click to see full-size image" %} +{% include figure.html filename="en-or-googlemaps-googleearth-40.png" caption="Figure 40" %} - Back in the New Image Overlay window, note that there are a range of options (Description, View, Altitude, Refresh, Location) that you @@ -496,55 +495,4 @@ of change over time. **You have learned how to use Google Maps and Earth. Make sure you save your work!** -*This lesson is part of the [Geospatial Historian][].* - - [Google Maps Engine Lite]: https://mapsengine.google.com - [geo1]: /images/googlemaps-googleearth/geo1.png - [geo2]: /images/googlemaps-googleearth/geo2.png - [geo3]: /images/googlemaps-googleearth/geo3.png - [geo4]: /images/googlemaps-googleearth/geo4.png - [geo5]: /images/googlemaps-googleearth/geo5.png - [geo6]: /images/googlemaps-googleearth/geo6.png - [UK Global Fat Supply CSV file]: /assets/googlemaps-googleearth/UK.Global.Fat.Supply.1894-1896.-.Sheet1.csv.zip - [geo7]: /images/googlemaps-googleearth/geo7.png - [geo8]: /images/googlemaps-googleearth/geo8.png - [geo9]: /images/googlemaps-googleearth/geo9.png - [geo10]: /images/googlemaps-googleearth/geo10.png - [geo11]: /images/googlemaps-googleearth/geo11.png - [geo12]: /images/googlemaps-googleearth/geo12.png - [Creating New Vector Layers in QGIS 2.0]: /lessons/vector-layers-qgis - [geo13]: /images/googlemaps-googleearth/geo13.png - [geo14]: /images/googlemaps-googleearth/geo14.png - [geo15]: /images/googlemaps-googleearth/geo15.png - [geo16]: /images/googlemaps-googleearth/geo16.png - [geo17]: /images/googlemaps-googleearth/geo17.png - [geo18]: /images/googlemaps-googleearth/geo18.png - [geo19]: /images/googlemaps-googleearth/geo19.png - [geo20]: /images/googlemaps-googleearth/geo20.png - [geo21]: /images/googlemaps-googleearth/geo21.png - [geo22]: /images/googlemaps-googleearth/geo22.png - [geo23]: /images/googlemaps-googleearth/geo23.png - [geo24]: /images/googlemaps-googleearth/geo24.png - 
[www.davidrumsey.com]: http://www.davidrumsey.com/ - [geo25]: /images/googlemaps-googleearth/geo25.png - [geo26]: /images/googlemaps-googleearth/geo26.png - [Georeferencing in QGIS 2.0]: /lessons/georeferencing-qgis - [geo27]: /images/googlemaps-googleearth/geo27.png - [geo28]: /images/googlemaps-googleearth/geo28.png - [Maps Engine Lite]: https://mapsengine.google.com/map/ - [Seaway map]: /assets/googlemaps-googleearth/seaway.zip - [geo29]: /images/googlemaps-googleearth/geo29.png - [geo30]: /images/googlemaps-googleearth/geo30.png - [geo31]: /images/googlemaps-googleearth/geo31.png - [geo32]: /images/googlemaps-googleearth/geo32.png - [geo33]: /images/googlemaps-googleearth/geo33.png - [geo34]: /images/googlemaps-googleearth/geo34.png - [geo35]: /images/googlemaps-googleearth/geo35.png - [Mobile Mapping and Historical GIS in the Field]: http://niche-canada.org/2011/12/14/mobile-mapping-and-historical-gis-in-the-field/ - "Mobile Mapping and Historical GIS in the Field" - [geo36]: /images/googlemaps-googleearth/geo36.png - [geo37]: /images/googlemaps-googleearth/geo37.png - [geo38]: /images/googlemaps-googleearth/geo38.png - [geo39]: /images/googlemaps-googleearth/geo39.png - [geo40]: /images/googlemaps-googleearth/geo40.png - [Geospatial Historian]: http://geospatialhistorian.wordpress.com/ +*This lesson is part of the [Geospatial Historian](https://geospatialhistorian.wordpress.com/).* diff --git a/en/lessons/gravity-model.md b/en/lessons/gravity-model.md index 934b5894bf..b0ea5fcb74 100644 --- a/en/lessons/gravity-model.md +++ b/en/lessons/gravity-model.md @@ -110,7 +110,7 @@ While gravity models can be used in a range of different migration and trade stu 1 - Adam Crymble, Adam Dennett, Tim Hitchcock, "Modelling regional imbalances in English plebeian migration to late eighteenth-century London", *Economic History Review*, 71, 3 (2018), pp. 747-771: (Paywall until July 2019). 
-2 - Adam Crymble, Louise Falcini, Tim Hitchcock, "Vagrant Lives: 14,789 Vagrants Processed by the County of Middlesex, 1777-1786", *Journal of Open Humanities Data*, vol. 1, no. 1 (2015), . +2 - Adam Crymble, Louise Falcini, Tim Hitchcock, "Vagrant Lives: 14,789 Vagrants Processed by the County of Middlesex, 1777-1786", *Journal of Open Humanities Data*, vol. 1, no. 1 (2015), . The Vagrancy Act of 1744 gave communities in England and Wales the right to expel outsiders back from whence they came. This was an important right because welfare was distributed locally at the time, and it was paid for by local taxes with the intention of supporting local people. That meant that a large influx of poor outsiders could financially cripple communities that attracted a lot of migration (such as those in London). This restriction on internal migration was only really used against the poor, and constables and local magistrates had tremendous powers of discretion over who they labelled a "vagrant" and who they left alone. As of the time of writing, a version of this law is still on the books in England, and it is still used by the police to arrest people who are begging or who they otherwise feel need to be removed from a situation. People in the late eighteenth century who were arrested under the 1744 act are therefore evidence of internal migration between the various counties of England and London. The question is: were any counties sending more or fewer vagrants to London than we would expect? 
@@ -122,7 +122,7 @@ A sample of the primary sources that detail these individuals' journeys can be s As part of the "[Vagrant Lives](https://web.archive.org/web/20190213131016/http://www.migrants.adamcrymble.org/the-project/)" project, the original vagrancy lists were converted into a scholarly dataset and published as: -* Adam Crymble, Louise Falcini, Tim Hitchcock, "Vagrant Lives: 14,789 Vagrants Processed by the County of Middlesex, 1777-1786", *Journal of Open Humanities Data*, vol. 1, no. 1 (2015), . +* Adam Crymble, Louise Falcini, Tim Hitchcock, "Vagrant Lives: 14,789 Vagrants Processed by the County of Middlesex, 1777-1786", *Journal of Open Humanities Data*, vol. 1, no. 1 (2015), . Readers are invited to download and explore this [published dataset](https://zenodo.org/record/1217600) and its documentation to understand the types of primary sources being modelled in this example. @@ -239,7 +239,7 @@ In probability statistics, there are a number of different [probability distribu As it happens, our vagrants are best suited to a negative binomial distribution. The reasons for this are that they represent count data (1, 2, 53 vagrants) that must be whole numbers (no 0.5 vagrants) and cannot be negative (no -9 vagrants). Earlier gravity modelling conducted in the 1980s tended to use a [Poisson Distribution](https://en.wikipedia.org/wiki/Poisson_distribution) for modelling human migration. The best approach for gravity models is still a point of academic debate, with some scholars opting for a Negative Binomial approach, and others sticking with the Poisson distribution.[^7] It is possible that another probability distribution entirely is most appropriate for your own data. If you were modelling trade surpluses or deficits (which could be + or -), your data may not follow a negative binomial distribution, and the author recommends speaking to a statistician about the most appropriate option. 
-What this means for us in this example is that the formula changes slightly. In particular, we no longer solve for $y$, but for the [natural logarithm](https://en.wikipedia.org/wiki/Natural_logarithm) ($ln$) of the [population mean](http://www.statisticshowto.com/population-mean/) ($μ$). You can read more about this type of formula in Michael L. Zwilling's work[^8]. +What this means for us in this example is that the formula changes slightly. In particular, we no longer solve for $y$, but for the [natural logarithm](https://en.wikipedia.org/wiki/Natural_logarithm) ($ln$) of the [population mean](https://www.statisticshowto.com/population-mean/) ($μ$). You can read more about this type of formula in Michael L. Zwilling's work[^8]. **Multivariate Regression Model:** @@ -801,7 +801,7 @@ With thanks to Angela Kedgley, Sarah Lloyd, Tim Hitchcock, Joe Cozens, Katrina N [^5]: For English speakers, the author recommends Eugene O'Loughlin, 'How To...Perform Simple Linear Regression by Hand', *YouTube* (23 December 2015): . [^6]: "Chapter 326: Negative Binomial Regression", *NCSS Stats Software* (n.d.): [^7]: Flowerdew, R. and Aitkin, M., ‘A method of fitting the gravity model based on the Poisson distribution’, *Journal of Regional Science*, 22 (1982), pp. 191–202; Flowerdew, R. and Lovett, A., ‘Fitting constrained Poisson regression models to interurban migration flows’, *Geographical Analysis*, 20 (1988), pp. 297–307; Congdon, P., ‘Approaches to modeling overdispersion in the analysis of migration’, *Environment and Planning* A, 25 (1993), pp. 1481–510; Flowerdew, R., ‘Modelling migration with Poisson regression’, in J. Stillwell, O. Duke-Williams, and A. Dennett, eds., *Technologies for migration and commuting analysis: spatial interaction data applications* (Hershey, Pa., 2010), pp. 261–79. -[^8]: Michael L. Zwilling, "Negative Binomial Regression", *The Mathematica Journal*, vol. 15 (2013): . +[^8]: Michael L. 
Zwilling, "Negative Binomial Regression", *The Mathematica Journal*, vol. 15 (2013): . [^9]: Crymble, A, A. Dennett, and T. Hitchcock, "Modelling regional imbalances in English plebeian migration to late eighteenth-century London", *Economic History Review*, vol. 71, no. 3 (2018), 747-771. [^10]: For example, see: Grigg, D.B. "E.G. Ravenstein and the 'laws of migration", *Journal of Historical Geography*, vol. 3, no. 1 (1977), pp. 44-54. [^11]: Crymble, A, A. Dennett, and T. Hitchcock, "Modelling regional imbalances in English plebeian migration to late eighteenth-century London", *Economic History Review*, vol. 71, no. 3 (2018), 753-754. diff --git a/en/lessons/image-classification-neural-networks.md b/en/lessons/image-classification-neural-networks.md index 8cddec5da0..08d30fb0b2 100644 --- a/en/lessons/image-classification-neural-networks.md +++ b/en/lessons/image-classification-neural-networks.md @@ -18,7 +18,7 @@ topics: [machine-learning] abstract: This lesson provides a beginner-friendly introduction to convolutional neural networks (CNNs) for image classification. The tutorial provides a conceptual understanding of how neural networks work by using Google's Teachable Machine to train a model on paintings from the ArtUK database. This lesson also demonstrates how to use Javascript to embed the model in a live website. avatar_alt: Cherub behind a devil mask. From Poets' Wit and Humour. Selected by William Henry Willis, London (1882). lesson-partners: [Jisc, The National Archives] -partnership-url: /jisc-tna-partnership +partnership-url: /en/jisc-tna-partnership doi: 10.46430/phen0108 --- @@ -53,7 +53,7 @@ Machine learning can be divided into two forms: supervised and unsupervised lear For this tutorial, we will download a dataset of paintings from [ArtUK](https://artuk.org/), which provides access to works that meet the UK's requirements for "[public ownership](https://artuk.org/footer/faq)." 
Approximately, [80% of the UK's publicly owned art is not on display](https://artuk.org/about/provide-free-digital-access-to-the-uks-art). ArtUK combats this by providing the general public access to these materials. -The ArtUK website allows you to view artworks by [topic](https://artuk.org/discover/topics), and we will use these topics to train our image classifier. You can [download a `.zip` file containing the images here](/assets/image-classification-neural-networks/dataset.zip). Save the `.zip` file in your `projects` folder and unzip it. Inside, you will find a folder called "dataset" with two additional folders: `training` and `testing`. Once you have downloaded all the files, go ahead and launch a live server on the `projects` folder. In most cases, you can view the server using the localhost address of "http://127.0.0.1". +The ArtUK website allows you to view artworks by [topic](https://artuk.org/discover/topics), and we will use these topics to train our image classifier. You can [download a `.zip` file containing the images here](/assets/image-classification-neural-networks/dataset.zip). Save the `.zip` file in your `projects` folder and unzip it. Inside, you will find a folder called "dataset" with two additional folders: `training` and `testing`. Once you have downloaded all the files, go ahead and launch a live server on the `projects` folder. In most cases, you can view the server using the localhost address of `http://127.0.0.1`. # Understanding Neural Networks @@ -92,7 +92,7 @@ In most hidden layers, the neural network takes the values from previous layers, How do the neurons in hidden layers help solve mathematical problems and classification tasks? Let's go through a simple example. Let's assume that we are interested in solving the following equation: `x+y=7.5`. In this scenario, we know that the output should be 7.5, but we do not know the inputs. We can begin by simply guessing numbers such as 3 and 2. 
Putting them into our equation gives us an answer of 5. However, we know that we need to get an answer of 7.5 so one of the things that we can do is multiply the inputs by a number. We can start by multiplying our original guesses by 2. The amount that we multiply each number is known as a weight: `(3x2)+(2x2)=10`. Now we have overshot our output, so we need to adjust the weights down. A neural network uses the "error" value to adjust the weights of our network accordingly, in a process called "back propagation." Let's try 1.5: `(3x1.5)+(2x1.5)=7.5`. We now have the correct result despite not knowing the original inputs and simply choosing two random values. This is exactly how a neural network works! -One thing to note is that the output of a neuron to the next layer is rarely the value originally calculated. Instead, it is sent to an activation function to prevent network collapse. Recall from [earlier](#understanding-artificial-neurons) that an activation function in a biological neuron has a threshold that stops all neurons from firing at the same time. You can think of network collapse as removing any redundancy in neurons. For instance, if a neuron adds two different input values then outputs them to another neuron which, in turn, adds up the first neuron's output, we can reduce the number of neurons by programming the first to perform the whole calculation. While this may seem more efficient, it diminishes our network's flexibility. +One thing to note is that the output of a neuron to the next layer is rarely the value originally calculated. Instead, it is sent to an activation function to prevent network collapse. Recall from [earlier](#understanding-neural-networks) that an activation function in a biological neuron has a threshold that stops all neurons from firing at the same time. You can think of network collapse as removing any redundancy in neurons. 
For instance, if a neuron adds two different input values then outputs them to another neuron which, in turn, adds up the first neuron's output, we can reduce the number of neurons by programming the first to perform the whole calculation. While this may seem more efficient, it diminishes our network's flexibility. The activation function in an artificial neuron stops network collapse by introducing non-linearity. There are numerous types of activation functions. The simplest non-linear functions are "step functions." In these functions, a certain threshold (sometimes a group of thresholds) is chosen and the values to the left of the threshold output a single value, while the values to the right of the threshold output several values. The most popular activation functions are variations of [rectified linear unit](https://perma.cc/BT2H-UDG2) (ReLU). In its simplest form, a ReLU activation function outputs `0` for values that are less than zero, and the input value itself if it's higher than zero. diff --git a/en/lessons/installing-omeka.md b/en/lessons/installing-omeka.md index 66aee7add7..73c3321438 100755 --- a/en/lessons/installing-omeka.md +++ b/en/lessons/installing-omeka.md @@ -14,8 +14,8 @@ review-ticket: https://github.com/programminghistorian/ph-submissions/issues/6 activity: presenting topics: [website] abstract: "This lesson will teach you how to install your own copy of Omeka." -next: creating-an-omeka-exhibit -redirect_from: /lessons/installing-omeka +next: /en/lessons/creating-an-omeka-exhibit +redirect_from: /lessons/installing-omeka/ avatar_alt: A figure working at a machine with gear diagrams doi: 10.46430/phen0052 --- @@ -28,7 +28,7 @@ doi: 10.46430/phen0052 ## Introduction -[Omeka.net](http://omeka.net), as described in [the previous lesson](up-and-running-with-omeka.html), is a useful service for Omeka beginners, but there are a few reasons why you might want to install your own copy of Omeka. 
Reasons include: +[Omeka.net](https://omeka.net), as described in [the previous lesson](/en/lessons/up-and-running-with-omeka), is a useful service for Omeka beginners, but there are a few reasons why you might want to install your own copy of Omeka. Reasons include: * **Upgrades**. By installing Omeka yourself, you can use the latest versions of Omeka as soon as they're released, without having to wait for Omeka.net to upgrade their system. * **Plugins and themes**. You can install any plugin or theme you want, without being restricted to those provided by Omeka.net. @@ -37,24 +37,24 @@ doi: 10.46430/phen0052 * **Price**. There are many low-cost Virtual Private Servers (VPSs) now, some of which cost only $5 per month. * **Storage**. Many shared hosting providers now offer unlimited storage. This is useful if you have a large media library. -In this tutorial, we'll be entering a few commands on the command line. This tutorial assumes no prior knowledge of the command line, but if you want a concise primer, consult the [Programming Historian introduction to BASH](/lessons/intro-to-bash). There are other ways of installing Omeka, of course, some using exclusively GUI tools. Some hosting providers even offer "[one-click installs](https://omeka.org/classic/docs/GettingStarted/Hosting_Suggestions/)" via their control panels. Many of those methods, however, will install older versions of Omeka which are then harder to upgrade and maintain. The method outlined below may not be the easiest way to install Omeka, but it will give you some good practice with using the command line, which is a skill that will be useful if you want to manually upgrade your install, or manually install other web frameworks. (For example, this installation method is very similar to WordPress's ["Five-Minute Install"](https://codex.wordpress.org/Installing_WordPress).) There are four steps to this process, and it should take about an hour. 
+In this tutorial, we'll be entering a few commands on the command line. This tutorial assumes no prior knowledge of the command line, but if you want a concise primer, consult the [Programming Historian introduction to BASH](/en/lessons/intro-to-bash). There are other ways of installing Omeka, of course, some using exclusively GUI tools. Some hosting providers even offer "[one-click installs](https://omeka.org/classic/docs/GettingStarted/Hosting_Suggestions/)" via their control panels. Many of those methods, however, will install older versions of Omeka which are then harder to upgrade and maintain. The method outlined below may not be the easiest way to install Omeka, but it will give you some good practice with using the command line, which is a skill that will be useful if you want to manually upgrade your install, or manually install other web frameworks. (For example, this installation method is very similar to WordPress's ["Five-Minute Install"](https://codex.wordpress.org/Installing_WordPress).) There are four steps to this process, and it should take about an hour. ## Step 1: Set Up Your Host -First, sign up for an account with a hosting provider that gives you SSH access. There are two main types of hosting providers: VPS and shared. A VPS host gives you root access, which means you have more control over the server, but your storage space is often limited. For small archives of 20GB or less, this is the best solution, but for large archives, shared hosting plans might be better suited. [DigitalOcean](https://www.digitalocean.com/signup/) is an easy-to-use and inexpensive VPS host, and [Amazon Web Services](http://aws.amazon.com/free/) (AWS) hosts similar virtual servers on their Elastic Computing (EC2) platform, geared more toward advanced users. Both [HostGator](http://www.hostgator.com/) and [DreamHost](http://www.dreamhost.com) offer inexpensive shared hosting with unlimited storage. 
+First, sign up for an account with a hosting provider that gives you SSH access. There are two main types of hosting providers: VPS and shared. A VPS host gives you root access, which means you have more control over the server, but your storage space is often limited. For small archives of 20GB or less, this is the best solution, but for large archives, shared hosting plans might be better suited. [DigitalOcean](https://www.digitalocean.com/signup/) is an easy-to-use and inexpensive VPS host, and [Amazon Web Services](https://aws.amazon.com/free/) (AWS) hosts similar virtual servers on their Elastic Computing (EC2) platform, geared more toward advanced users. Both [HostGator](https://www.hostgator.com/) and [DreamHost](https://www.dreamhost.com) offer inexpensive shared hosting with unlimited storage. -If you open an account with a VPS provider, you'll first want to create a virtual server with their interface. (If you’re using shared hosting, this is already done for you.) On DigitalOcean, VPS instances are called "droplets," and you can create one by simply logging in and clicking "Create Droplet." On AWS EC2, a VPS is called an "instance," and you can create one by logging into your EC2 console and clicking "Launch Instance." In both cases, **choose an Ubuntu system** to install, since we'll be running Ubuntu Linux commands below. For more detailed help with these steps, check out Digital Ocean's guide [How To Create Your First DigitalOcean Droplet Virtual Server](https://web.archive.org/web/20170608220025/https://www.digitalocean.com/community/tutorials/how-to-create-your-first-digitalocean-droplet-virtual-server), and Amazon's guide [Launch an Amazon EC2 Instance](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-launch-instance_linux.html). +If you open an account with a VPS provider, you'll first want to create a virtual server with their interface. (If you’re using shared hosting, this is already done for you.) 
On DigitalOcean, VPS instances are called "droplets," and you can create one by simply logging in and clicking "Create Droplet." On AWS EC2, a VPS is called an "instance," and you can create one by logging into your EC2 console and clicking "Launch Instance." In both cases, **choose an Ubuntu system** to install, since we'll be running Ubuntu Linux commands below. For more detailed help with these steps, check out Digital Ocean's guide [How To Create Your First DigitalOcean Droplet Virtual Server](https://web.archive.org/web/20170608220025/https://www.digitalocean.com/community/tutorials/how-to-create-your-first-digitalocean-droplet-virtual-server), and Amazon's guide [Launch an Amazon EC2 Instance](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-launch-instance_linux.html). Now that you have a running server, connect to it with an SSH client. This is sometimes as simple as opening a terminal and typing `ssh user@hostname`, where `user` is the username provided by your VPS and `hostname` is your server address. Consult your host's documentation for instructions for logging on via SSH. 
Here is a sampling of guides for VPS hosts: * [Digital Ocean: How To Connect To Your Droplet with SSH](https://www.digitalocean.com/docs/droplets/how-to/connect-with-ssh) - * [Amazon Web Services: Connecting to Your Linux Instance Using SSH](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) + * [Amazon Web Services: Connecting to Your Linux Instance Using SSH](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/AccessingInstancesLinux.html) * [Google Cloud: Connecting to Your Linux Instance](https://cloud.google.com/compute/docs/instances/connecting-to-instance) And here are a few guides for shared hosts: * [DreamHost Wiki: SSH](https://help.dreamhost.com/hc/en-us/articles/216041267-SSH-overview) - * [HostGator: How Do I Get and Use SSH Access?](http://support.hostgator.com/articles/hosting-guide/lets-get-started/how-do-i-get-and-use-ssh-access) + * [HostGator: How Do I Get and Use SSH Access?](https://support.hostgator.com/articles/hosting-guide/lets-get-started/how-do-i-get-and-use-ssh-access) When you're connected, you should see a prompt that looks roughly like this: @@ -139,7 +139,7 @@ Now let's download Omeka directly to the server. This will allow us to avoid the If you get a permissions error here on a VPS, make sure you're logged in as the root user with `su root`. 
Now let's download Omeka with command `wget` like this: - wget http://omeka.org/files/omeka-2.7.zip + wget https://omeka.org/files/omeka-2.7.zip Now let’s first make sure we have the `unzip` command: diff --git a/en/lessons/installing-python-modules-pip.md b/en/lessons/installing-python-modules-pip.md index c2bf486f44..7a69582137 100755 --- a/en/lessons/installing-python-modules-pip.md +++ b/en/lessons/installing-python-modules-pip.md @@ -14,7 +14,7 @@ abstract: "There are many ways to install external python libraries; this tutori exclude_from_check: - editors - review-ticket -redirect_from: /lessons/installing-python-modules-pip +redirect_from: /lessons/installing-python-modules-pip/ avatar_alt: A branch with pears doi: 10.46430/phen0029 --- @@ -154,6 +154,6 @@ python -m pip install XXX Happy installing! [pip]: https://pip.pypa.io/en/stable/ - [curl command]: http://www.thegeekstuff.com/2012/04/curl-examples/ + [curl command]: https://www.thegeekstuff.com/2012/04/curl-examples/ [here]: https://bootstrap.pypa.io/get-pip.py - [StackOverflow page]: http://stackoverflow.com/questions/4750806/how-to-install-pip-on-windows + [StackOverflow page]: https://stackoverflow.com/questions/4750806/how-to-install-pip-on-windows diff --git a/en/lessons/interactive-data-visualization-dashboard.md b/en/lessons/interactive-data-visualization-dashboard.md index aa62c182f0..56207ca274 100644 --- a/en/lessons/interactive-data-visualization-dashboard.md +++ b/en/lessons/interactive-data-visualization-dashboard.md @@ -495,7 +495,7 @@ If you want to host your own server, or someone at your institution can help you ### Setting up in GitHub -You will need to turn the `ph-dash` folder into a GitHub repository. You can do this in the command line by executing `$git init` or using GitHub Desktop (see [Amanda Visconti's _Programming Historian_ lesson](/en/lessons/building-static-sites-with-jekyll-github-pages#github--github-pages-) if you are new to Git or GitHub). 
+You will need to turn the `ph-dash` folder into a GitHub repository. You can do this in the command line by executing `$git init` or using GitHub Desktop (see [Amanda Visconti's _Programming Historian_ lesson](/en/lessons/building-static-sites-with-jekyll-github-pages/#github--github-pages) if you are new to Git or GitHub). Then, install one more library for deployment by entering `$pip install gunicorn`. The [`gunicorn`](https://gunicorn.org/) library allows Render to set up a web server for you. @@ -596,7 +596,7 @@ The final message of this lesson is to encourage you to adapt the code provided ## Endnotes -[^1]: Ann Marie Ward. *Ireland Gender Pay Gap Analysis* (), . +[^1]: Ann Marie Ward. *Ireland Gender Pay Gap Analysis* (), . [^2]: Stephen Lacy et al., “Issues and Best Practices in Content Analysis,” *Journalism & Mass Communication Quarterly* 92, no. 4 (September 28, 2015): 791–802, . diff --git a/en/lessons/interactive-text-games-using-twine.md b/en/lessons/interactive-text-games-using-twine.md index f79ede873f..124b1f6fbe 100644 --- a/en/lessons/interactive-text-games-using-twine.md +++ b/en/lessons/interactive-text-games-using-twine.md @@ -81,7 +81,7 @@ An important first step in creating a meaningful and inclusive environment for g * Only certain types of people play games (students may feel that the games they play, such as mobile games, do not qualify as “real” games) * That games easily create social change by automatically sparking empathy -Playing and analyzing a game is a helpful way to challenge these assumptions. Games with strong rhetorical arguments are a good place to start. I begin many of my game units with Zoe Quinn’s Twine game [*Depression Quest*](http://www.depressionquest.com/), in which you play as someone living with depression. +Playing and analyzing a game is a helpful way to challenge these assumptions. Games with strong rhetorical arguments are a good place to start. 
I begin many of my game units with Zoe Quinn’s Twine game [*Depression Quest*](https://www.depressionquest.com/), in which you play as someone living with depression. {% include figure.html filename="Figure1.jpg" alt="A 'passage' from the game Depression Quest. At the top, a scenario is described. It is written in black lettering. Below, there are four options for the player's next action, preceded by the prompt 'Do you...'. The first option is written in red, and is struck-through indicating that it is unavailable to choose. The following three options are written in blue, indicating that these options are available" caption="Figure 1. Example from *Depression Quest*" %} @@ -128,7 +128,7 @@ In the game I walk through below, the player character will move through her fir You can create a successful Twine game using limited technical elements. However, there are some basics of game design that can turn a technically simple game into a narratively complex game. The game we will make is technically straightforward–it involves narrative and basic, text-based choices that the player can make. However, we will work to create choices that connect to the game's rhetorical goals. ## Creating Your First Story -To create your first game, which Twine will refer to as a "story", go to [Twine](http://twinery.org/) and click "Use it online." If it is your first time using Twine, there will be a basic introduction. Once you have read or skipped this introduction, Twine will take you to your story list. At first, this area will be largely empty. It will populate as you create more stories. To create your first story, click "+Story." +To create your first game, which Twine will refer to as a "story", go to [Twine](https://twinery.org/) and click "Use it online." If it is your first time using Twine, there will be a basic introduction. Once you have read or skipped this introduction, Twine will take you to your story list. At first, this area will be largely empty.
It will populate as you create more stories. To create your first story, click "+Story." {% include figure.html filename="Figure2.jpg" caption="Figure 2. Getting Started with the Browser Version of Twine" %} @@ -404,7 +404,7 @@ If you would like to play with Twine or potentially integrate it into a course, ### Twine Games * [*A Witch’s Word*](https://rainbowstarbird.itch.io/a-witchs-word) by RainbowStarbird -* [*Depression Quest*](http://www.depressionquest.com/) by Zoe Quinn +* [*Depression Quest*](https://www.depressionquest.com/) by Zoe Quinn * [*Play Smarter Not Harder: Developing Your Scholarly Meta*](https://perma.cc/W2PK-FCQT) by Jason Helms * [*Queers in Love at the End of the World*](https://w.itch.io/end-of-the-world) by Anna Anthropy * [*September 7th, 2020*](https://perma.cc/GP6X-RARD) by Cait S. Kirby diff --git a/en/lessons/interrogating-national-narrative-gpt.md b/en/lessons/interrogating-national-narrative-gpt.md index 4501b1eab2..7d529c4554 100644 --- a/en/lessons/interrogating-national-narrative-gpt.md +++ b/en/lessons/interrogating-national-narrative-gpt.md @@ -19,7 +19,7 @@ topics: [python, data-manipulation] abstract: "In this lesson, you will learn how to apply a Generative Pre-trained Transformer language model to a large-scale corpus so that you can locate broad themes and trends within written text." avatar_alt: Illustration of different types of magnifying glasses on a page of text. lesson-partners: [Jisc, The National Archives] -partnership-url: /jisc-tna-partnership +partnership-url: /en/jisc-tna-partnership doi: 10.46430/phen0104 --- diff --git a/en/lessons/intro-to-bash.md b/en/lessons/intro-to-bash.md index 10d27284bf..efd58addbe 100755 --- a/en/lessons/intro-to-bash.md +++ b/en/lessons/intro-to-bash.md @@ -17,8 +17,8 @@ exclude_from_check: activity: transforming topics: [data-manipulation, get-ready] abstract: "This lesson will teach you how to enter commands using a command-line interface, rather than through a graphical interface. 
Command-line interfaces have advantages for computer users who need more precision in their work, such as digital historians. They allow for more detail when running some programs, as you can add modifiers to specify exactly how you want your program to run. Furthermore, they can be easily automated through scripts, which are essentially recipes of text-based commands." -next: research-data-with-unix -redirect_from: /lessons/intro-to-bash +next: /en/lessons/research-data-with-unix +redirect_from: /lessons/intro-to-bash/ avatar_alt: Soldiers in antique armor with spears doi: 10.46430/phen0037 --- @@ -37,11 +37,11 @@ Many of the lessons at the *Programming Historian* require you to enter commands {% include figure.html filename="en-or-intro-to-bash-01.png" caption="Figure 1. GUI of Ian Milligan's Computer" %} -Command-line interfaces have advantages for computer users who need more precision in their work -- such as digital historians. They allow for more detail when running some programs, as you can add modifiers to specify *exactly* how you want your program to run. Furthermore, they can be easily automated through [scripts](http://www.tldp.org/LDP/Bash-Beginners-Guide/html/chap_01.html), which are essentially recipes of text-based commands. +Command-line interfaces have advantages for computer users who need more precision in their work -- such as digital historians. They allow for more detail when running some programs, as you can add modifiers to specify *exactly* how you want your program to run. Furthermore, they can be easily automated through [scripts](https://www.tldp.org/LDP/Bash-Beginners-Guide/html/chap_01.html), which are essentially recipes of text-based commands. -There are two main command-line interfaces, or 'shells,' that many digital historians use. On OS X or many Linux installations, the shell is known as `bash`, or the 'Bourne-again shell.' 
For users on Windows-based systems, the command-line interface is by default `MS-DOS-based`, which uses different commands and [syntax](http://en.wikipedia.org/wiki/Syntax), but can often achieve similar tasks. This tutorial provides a basic introduction to the `bash` terminal, and Windows users can follow along by installing popular shells such as [Cygwin](https://www.cygwin.com/) or Git Bash (see below). +There are two main command-line interfaces, or 'shells,' that many digital historians use. On OS X or many Linux installations, the shell is known as `bash`, or the 'Bourne-again shell.' For users on Windows-based systems, the command-line interface is by default `MS-DOS-based`, which uses different commands and [syntax](https://en.wikipedia.org/wiki/Syntax), but can often achieve similar tasks. This tutorial provides a basic introduction to the `bash` terminal, and Windows users can follow along by installing popular shells such as [Cygwin](https://www.cygwin.com/) or Git Bash (see below). -This lesson uses a **[Unix shell](http://en.wikipedia.org/wiki/Unix_shell)**, which is a command-line interpreter that provides a user interface for the [Unix](http://en.wikipedia.org/wiki/Unix) operating system and for Unix-like systems. This lesson will cover a small number of basic commands. By the end of this tutorial you will be able to navigate through your file system and find files, open them, perform basic data manipulation tasks such as combining and copying files, as well as both reading them and making relatively simple edits. These commands constitute the building blocks upon which more complex commands can be constructed to fit your research data or project. Readers wanting a reference guide that goes beyond this lesson are recommended to read Deborah S. Ray and Eric J. Ray, *Unix and Linux: Visual Quickstart Guide*, 4th edition (2009). 
+This lesson uses a **[Unix shell](https://en.wikipedia.org/wiki/Unix_shell)**, which is a command-line interpreter that provides a user interface for the [Unix](https://en.wikipedia.org/wiki/Unix) operating system and for Unix-like systems. This lesson will cover a small number of basic commands. By the end of this tutorial you will be able to navigate through your file system and find files, open them, perform basic data manipulation tasks such as combining and copying files, as well as both reading them and making relatively simple edits. These commands constitute the building blocks upon which more complex commands can be constructed to fit your research data or project. Readers wanting a reference guide that goes beyond this lesson are recommended to read Deborah S. Ray and Eric J. Ray, *Unix and Linux: Visual Quickstart Guide*, 4th edition (2009). ## Windows Only: Installing Git Bash @@ -59,7 +59,7 @@ When you run it, you will see this window. {% include figure.html filename="en-or-intro-to-bash-03.png" caption="Figure 3. A blank terminal screen on our OS X workstation" %} -You might want to change the default visual appearance of the terminal, as eyes can strain at repeatedly looking at black text on a white background. In the default OS X application, you can open the 'Settings' menu in 'Preferences' under Terminal. Click on the 'Settings' tab and change it to a new colour scheme. We personally prefer something with a bit less contrast between background and foreground, as you'll be staring at this a great deal. 'Novel' is a soothing one as is the popular [Solarized](http://ethanschoonover.com/solarized) suite of colour palettes. For Windows users, a similar effect can be achieved using the Git Bash `Properties` tab. To reach this, right-click anywhere in the top bar and select `Properties`. +You might want to change the default visual appearance of the terminal, as eyes can strain at repeatedly looking at black text on a white background. 
In the default OS X application, you can open the 'Settings' menu in 'Preferences' under Terminal. Click on the 'Settings' tab and change it to a new colour scheme. We personally prefer something with a bit less contrast between background and foreground, as you'll be staring at this a great deal. 'Novel' is a soothing one as is the popular [Solarized](https://ethanschoonover.com/solarized) suite of colour palettes. For Windows users, a similar effect can be achieved using the Git Bash `Properties` tab. To reach this, right-click anywhere in the top bar and select `Properties`. {% include figure.html filename="en-or-intro-to-bash-04.png" caption="Figure 4. The Settings Screen on the OS X Terminal Shell Application" %} @@ -137,7 +137,7 @@ When you want to use two flags, you can just run them together. So, by typing you receive output in a human-readable format; you learn that that 6020 bits is also 5.9KB, that another file is 1 megabyte, and so forth. -These options are *very* important. In other lessons within the *Programming Historian*, you'll see them. [Wget](/lessons/applied-archival-downloading-with-wget), [MALLET](/lessons/topic-modeling-and-mallet), and [Pandoc](/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown) all use the same syntax. Luckily, you do not need to memorize syntax; instead, keep these lessons handy so you can take a quick peek if you need to tweak something. These lessons can all be done in any order. +These options are *very* important. In other lessons within the *Programming Historian*, you'll see them. [Wget](/en/lessons/applied-archival-downloading-with-wget), [MALLET](/en/lessons/topic-modeling-and-mallet), and [Pandoc](/en/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown) all use the same syntax. Luckily, you do not need to memorize syntax; instead, keep these lessons handy so you can take a quick peek if you need to tweak something. These lessons can all be done in any order. 
You've now spent a great deal of time in your home directory. Let's go somewhere else. You can do that through the `cd` or Change Directory command. @@ -161,7 +161,7 @@ This moves us 'up' one directory, putting us back in `/Users/ianmilligan1/`. If will bring you right back to the home directory, right where you started. -Try exploring: visit your documents directory, your pictures, folders you might have on your desktop. Get used to moving in and out of directories. Imagine that you are navigating a [tree structure](http://en.wikipedia.org/wiki/Tree_structure). If you're on the desktop, you won't be able to `cd documents` as it is a 'child' of your home directory, whereas your Desktop is a 'sibling' of the Documents folder. To get to a sibling, you have to go back to the common parent. To do this, you will have to back up to your home directory (`cd ..`) and then go forward again to `cd documents`. +Try exploring: visit your documents directory, your pictures, folders you might have on your desktop. Get used to moving in and out of directories. Imagine that you are navigating a [tree structure](https://en.wikipedia.org/wiki/Tree_structure). If you're on the desktop, you won't be able to `cd documents` as it is a 'child' of your home directory, whereas your Desktop is a 'sibling' of the Documents folder. To get to a sibling, you have to go back to the common parent. To do this, you will have to back up to your home directory (`cd ..`) and then go forward again to `cd documents`. Being able to navigate your file system using the bash shell is very important for many of the lessons at the *Programming Historian*. As you become more comfortable, you'll soon find yourself skipping directly to the directory that you want. In our case, from anywhere on our system, you could type @@ -171,7 +171,7 @@ or, on Windows, something like `cd c:\mallet-2.0.7\` -and be brought to our MALLET directory for [topic modeling](/lessons/topic-modeling-and-mallet). 
+and be brought to our MALLET directory for [topic modeling](/en/lessons/topic-modeling-and-mallet). Finally, try @@ -185,7 +185,7 @@ in Windows. That command will open up your GUI at the current directory. Make su ## Interacting with Files -As well as navigating directories, you can interact with files on the command line: you can read them, open them, run them, and even edit them, often without ever having to leave the interface. There is some debate over why one would do this. The primary reason is the seamless experience of working on the command line: you never have to pick up your mouse or touch your track pad, and, although it has a steep learning curve it can eventually become a sole writing environment. Furthermore, many programs require you to use the command line to operate with them. Since you'll be using programs on the command line, it can often be quicker to make small edits without switching into a separate program. For some of these arguments, see Jon Beltran de Heredia's ["Why, oh WHY, do those #?@! nutheads use vi?"](http://www.viemu.com/a-why-vi-vim.html). +As well as navigating directories, you can interact with files on the command line: you can read them, open them, run them, and even edit them, often without ever having to leave the interface. There is some debate over why one would do this. The primary reason is the seamless experience of working on the command line: you never have to pick up your mouse or touch your track pad, and, although it has a steep learning curve it can eventually become a sole writing environment. Furthermore, many programs require you to use the command line to operate with them. Since you'll be using programs on the command line, it can often be quicker to make small edits without switching into a separate program. For some of these arguments, see Jon Beltran de Heredia's ["Why, oh WHY, do those #?@! nutheads use vi?"](https://www.viemu.com/a-why-vi-vim.html). Here's a few basic ways to interact with files. 
@@ -197,7 +197,7 @@ This creates a directory named, you guessed it, 'ProgHist-Text.' In general, it' But wait! There's a trick to make things a bit quicker. Go up one directory (`cd ..` - which will take you back to the Desktop). To navigate to the `ProgHist-Text` directory you could type `cd ProgHist-Text`. Alternatively, you could type `cd Prog` and then hit tab. You will notice that the interface completes the line to `cd ProgHist-Text`. **Hitting tab at any time within the shell will prompt it to attempt to auto-complete the line based on the files or sub-directories in the current directory. This is case sensitive, however (i.e. in the previous example, `cd prog` would not auto complete to `ProgHist-Text`. Where two or more files have the same characters, the auto-complete will only fill up to the first point of difference. We would encourage using this method throughout the lesson to see how it behaves.** -Now you need to find a basic text file to help us with the example. Why don't you use a book that you know is long, such as Leo Tolstoy's epic *War and Peace*. The text file is availiable via [Project Gutenberg](http://www.gutenberg.org/ebooks/2600). If you have already installed [wget](/lessons/applied-archival-downloading-with-wget), you can just type +Now you need to find a basic text file to help us with the example. Why don't you use a book that you know is long, such as Leo Tolstoy's epic *War and Peace*? The text file is available via [Project Gutenberg](https://www.gutenberg.org/ebooks/2600).
If you have already installed [wget](/en/lessons/applied-archival-downloading-with-wget), you can just type `wget http://www.gutenberg.org/files/2600/2600-0.txt` @@ -267,7 +267,7 @@ and hit enter, a combination of all the .txt files in the current directory are ## Editing Text Files Directly on the Command Line -If you want to read a file in its entirety without leaving the command line, you can fire up [vim](http://en.wikipedia.org/wiki/Vim_%28text_editor%29). Vim is a very powerful text editor, which is perfect for using with programs such as [Pandoc](https://pandoc.org/) to do word processing, or for editing your code without having to switch to another program. Best of all, it comes included with bash on both OS X and Windows. Vim has a fairly steep learning curve, so we will just touch on a few minor points. +If you want to read a file in its entirety without leaving the command line, you can fire up [vim](https://en.wikipedia.org/wiki/Vim_%28text_editor%29). Vim is a very powerful text editor, which is perfect for using with programs such as [Pandoc](https://pandoc.org/) to do word processing, or for editing your code without having to switch to another program. Best of all, it comes included with bash on both OS X and Windows. Vim has a fairly steep learning curve, so we will just touch on a few minor points. Type @@ -277,7 +277,7 @@ You should see vim come to life before you, a command-line based text editor. {% include figure.html filename="en-or-intro-to-bash-06.png" caption="Figure 6. Vim" %} -If you really want to get into Vim, there is a [good Vim guide](http://vimdoc.sourceforge.net/htmldoc/quickref.html) available. +If you really want to get into Vim, there is a [good Vim guide](https://vimdoc.sourceforge.net/htmldoc/quickref.html) available. Using Vim to read files is relatively simple. You can use the arrow keys to navigate around and could theoretically read *War and Peace* through the command line (one should get an achievement for doing that). 
Some quick basic navigational commands are as follows: @@ -313,7 +313,7 @@ To leave vim or to make saves, you have to enter a series of commands. Press `:` If you want to quit, type `:` again and then `q`. It will return you to the command line. As with the rest of bash, you could have also combined the two commands. Pressing `:` and then typing `wq` would have written the file and then quit. Or, if you wanted to exit **without** saving, `q!` would have quit vim and overriden the default preference to save your changes. -Vim is different than you are likely used to and will require more work and practice to become fluent with it. But if you are tweaking minor things in files, it is a good way to get started. As you become more comfortable, you might even find yourself writing term papers with it, by harnessing the [footnoting and formatting power of Pandoc and Markdown](/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown). +Vim is different than you are likely used to and will require more work and practice to become fluent with it. But if you are tweaking minor things in files, it is a good way to get started. As you become more comfortable, you might even find yourself writing term papers with it, by harnessing the [footnoting and formatting power of Pandoc and Markdown](/en/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown). ## Moving, Copying, and Deleting Files diff --git a/en/lessons/intro-to-linked-data.md b/en/lessons/intro-to-linked-data.md index c54f6f1ca1..08cd69f2f2 100755 --- a/en/lessons/intro-to-linked-data.md +++ b/en/lessons/intro-to-linked-data.md @@ -17,7 +17,7 @@ activity: acquiring topics: [lod] abstract: | Introduces core concepts of Linked Open Data, including URIs, ontologies, RDF formats, and a gentle intro to the graph query language SPARQL. 
-redirect_from: /lessons/intro-to-linked-data +redirect_from: /lessons/intro-to-linked-data/ avatar_alt: An old man with a woman on each arm doi: 10.46430/phen0068 --- @@ -42,12 +42,12 @@ This lesson offers a brief and concise introduction to [Linked Open Data](https: The tutorial should take a couple of hours to complete, and you may find it helpful to re-read sections to solidify your understanding. Technical terms have been linked to their corresponding page on Wikipedia, and you are encouraged to pause and read about terms that you find challenging. After having learned some of the key principles of LOD, the best way to improve and solidify that knowledge is to practise. This tutorial provides opportunities to do so. By the end of the course you should understand the basics of LOD, including key terms and concepts. -If you need to learn how to explore LOD using the query language [SPARQL](https://en.wikipedia.org/wiki/SPARQL), I recommend Matthew Lincoln's ['Using SPARQL to access Linked Open Data'](/lessons/graph-databases-and-SPARQL), which follows on practically from the conceptual overview offered in this lesson. +If you need to learn how to explore LOD using the query language [SPARQL](https://en.wikipedia.org/wiki/SPARQL), I recommend Matthew Lincoln's ['Using SPARQL to access Linked Open Data'](/en/lessons/graph-databases-and-SPARQL), which follows on practically from the conceptual overview offered in this lesson. In order to provide readers with a solid grounding in the basic principles of LOD, this tutorial will not be able to offer a comprehensive coverage of all LOD concepts. The following two LOD concepts will *not* be the focus of this lesson: 1. The [semantic web](https://en.wikipedia.org/wiki/Semantic_Web) and [semantic reasoning](https://en.wikipedia.org/wiki/Semantic_reasoner) of [datasets](https://en.wikipedia.org/wiki/Data_set). 
A semantic reasoner would deduce that George VI is the brother or half-brother of Edward VIII, given the fact that a) Edward VIII is the son of George V and b) George VI is the son of George V. This tutorial does not focus on this type of task. -2. Creating and uploading linked open datasets to the [linked data cloud](http://linkeddatacatalog.dws.informatik.uni-mannheim.de/state/). Sharing your LOD is an important principle, which is encouraged below. However, the practicalities of contributing your LOD to the linked data cloud are beyond the scope of this lesson. Some resources that can help you get started with this task are available at the end of this tutorial. +2. Creating and uploading linked open datasets to the [linked data cloud](https://linkeddatacatalog.dws.informatik.uni-mannheim.de/state/). Sharing your LOD is an important principle, which is encouraged below. However, the practicalities of contributing your LOD to the linked data cloud are beyond the scope of this lesson. Some resources that can help you get started with this task are available at the end of this tutorial. ## Linked open data: what is it? @@ -71,7 +71,7 @@ Using the model above in which each person is represented by a unique number, le person=64183282 -And let's make Jack Straw described by the *[Oxford Dictionary of National Biography](http://www.oxforddnb.com)* as 'the enigmatic rebel leader', number `33059614`, making his attribute-value pair look like this: +And let's make Jack Straw described by the *[Oxford Dictionary of National Biography](https://www.oxforddnb.com)* as 'the enigmatic rebel leader', number `33059614`, making his attribute-value pair look like this: person=33059614 @@ -81,7 +81,7 @@ The attribute-value pairs can also store information about other types of entiti place=2655524 -At this point you might be thinking, "that's what a library catalogue does". 
It's true that the key idea here is that of the [authority file](https://en.wikipedia.org/wiki/Authority_control), which is central in library science (an authority file is a definitive list of terms which can be used in a particular context, for example when cataloguing a book). In both of the examples outlined above, we have used authority files to assign the numbers (the unique ids) to the Jacks and to Blackburn. The numbers we used for the two Jack Straws come from the [Virtual International Authority File](https://www.oclc.org/en/viaf.html) (VIAF), which is maintained by a consortium of libraries worldwide to try to address the problem of the myriad ways in which the same person might be referred to. The unique identifier we used for the Blackburn constituency came from [GeoNames](http://www.geonames.org/), a free geographical database. +At this point you might be thinking, "that's what a library catalogue does". It's true that the key idea here is that of the [authority file](https://en.wikipedia.org/wiki/Authority_control), which is central in library science (an authority file is a definitive list of terms which can be used in a particular context, for example when cataloguing a book). In both of the examples outlined above, we have used authority files to assign the numbers (the unique ids) to the Jacks and to Blackburn. The numbers we used for the two Jack Straws come from the [Virtual International Authority File](https://www.oclc.org/en/viaf.html) (VIAF), which is maintained by a consortium of libraries worldwide to try to address the problem of the myriad ways in which the same person might be referred to. The unique identifier we used for the Blackburn constituency came from [GeoNames](https://www.geonames.org/), a free geographical database. But let's try to be more precise by what we mean by Blackburn in this instance. 
Jack Straw represented the parliamentary consitituency (an area represented by a single member of parliament) of Blackburn, which has changed its boundaries over time. The '[Digging Into Linked Parliamentary Data](https://repository.jisc.ac.uk/6544/)' (Dilipad) project (on which I worked), produced unique identifiers for party affiliations and constituencies for each member of parliament. In this example, Jack Straw represented the constituency known as 'Blackburn' in its post-1955 incarnation: @@ -127,19 +127,19 @@ In the previous section we used two different numbers to identify our two differ The problem is that around the world there are many databases that contain people with these numbers, and they're probably all different people. Outside of our immediate context these numbers don't identify unique individuals. Let's try to fix that. Here are these same identifiers but as URIs: - http://viaf.org/viaf/64183282/ + https://viaf.org/viaf/64183282/ - http://viaf.org/viaf/33059614/ + https://viaf.org/viaf/33059614/ Just as the unique number disambiguated our two Jack Straws, the full URI above helps us disambiguate between all of the different authority files out there. In this case, it's clear that we are using VIAF as our authority file. You have already seen this form of disambuguation many times on the web. There are many websites round the world with pages called `/home` or `/faq`. But there is no confusion because the [domain](https://en.wikipedia.org/wiki/Domain_name) (the first part of the [Uniform Resource Locator](https://en.wikipedia.org/wiki/Uniform_Resource_Locator) (URL) - eg. `bbc.co.uk`) is unique and thus all pages that are part of that domain are unique from other `/faq` pages on other websites. In the address `http://www.bbc.co.uk/faqs` it is the `bbc.co.uk` part which makes the subsequent pages unique. This is so obvious to people who use the web all the time that they don't think about it. 
You probably also know that if you want to start a website called `bbc.co.uk` you can't, because that name has already been registered with the appropriate authority, which is the [Domain Name System](https://en.wikipedia.org/wiki/Domain_Name_System). The registration guarantees uniqueness. URIs also have to be unique. While the examples above look like URLs, it is also possible to construct a URI that looks nothing like a URL. We have many ways of uniquely identifying people and things and we rarely think or worry about it. Barcodes, passport numbers, and even your postal address are all designed to be unique. Mobile phone numbers are frequently put up as shop signs precisely because they are unique. All of these could be used as URIs. -When we wanted to create URIs for the entities described by the '[Tobias](http://www.history.ac.uk/projects/digital/tobias)' project, we chose a URL-like structure, and chose to use our institutional webspace, setting aside `data.history.ac.uk/tobias-project/` as a place dedicated to hosting these URIs. By putting it at `data.history.ac.uk` rather than `history.ac.uk`, there was a clear separation between URIs and the pages of the website. For example, one of the URIs from the Tobias project was http://data.history.ac.uk/tobias-project/person/15601. While the format of the abovementioned URIs is the same as a URL, they do not link to web pages (try pasting it of them into a web browser). Many people new to LOD find this confusing. All URLs are URIs but not all URIs are URLs. A URI can describe anything at all, whereas URL describes the location of something on the web. So a URL tells you the location of a web page or a file or something similar. A URI just does the job of identifying something. 
Just as the International Standard Book Number, or [ISBN](https://www.iso.org/standard/36563.html) `978-0-1-873354-6` uniquely identifies a hardback edition of _Baptism, Brotherhood and Belief in Reformation Germany_ by Kat Hill, but doesn't tell you where to get a copy. For that you would need something like a library [shelfmark](https://en.wikipedia.org/wiki/Accession_number_(library_science)), which gives you an exact location on a shelf of a specific library. +When we wanted to create URIs for the entities described by the '[Tobias](https://www.history.ac.uk/projects/digital/tobias)' project, we chose a URL-like structure, and chose to use our institutional webspace, setting aside `data.history.ac.uk/tobias-project/` as a place dedicated to hosting these URIs. By putting it at `data.history.ac.uk` rather than `history.ac.uk`, there was a clear separation between URIs and the pages of the website. For example, one of the URIs from the Tobias project was https://data.history.ac.uk/tobias-project/person/15601. While the format of the abovementioned URIs is the same as a URL, they do not link to web pages (try pasting one of them into a web browser). Many people new to LOD find this confusing. All URLs are URIs but not all URIs are URLs. A URI can describe anything at all, whereas a URL describes the location of something on the web. So a URL tells you the location of a web page or a file or something similar. A URI just does the job of identifying something. Just as the International Standard Book Number, or [ISBN](https://www.iso.org/standard/36563.html) `978-0-1-873354-6` uniquely identifies a hardback edition of _Baptism, Brotherhood and Belief in Reformation Germany_ by Kat Hill, but doesn't tell you where to get a copy. For that you would need something like a library [shelfmark](https://en.wikipedia.org/wiki/Accession_number_(library_science)), which gives you an exact location on a shelf of a specific library. There is a little bit of jargon around URIs.
People talk about whether they are, or are not, [dereferenceable](https://en.wikipedia.org/wiki/Reference_(computer_science)). That just means *can it be turned from an abstract reference into something else?* For example, if you paste a URI into the address bar of a browser, will it return something? The VIAF URI for historian Simon Schama is: - http://viaf.org/viaf/46784579 + https://viaf.org/viaf/46784579 If you put that into the browser you will get back a web page about Simon Schama which contains structured data about him and his publishing history. This is very handy - for one thing, it's not obvious from the URI who or even what is being referred to. Similarly, if we treated a mobile phone number (with international code) as the URI for a person then it should be dereferenceable. Someone might answer the phone, and it might even be Schama. @@ -166,9 +166,9 @@ We're making up examples simply for the purposes of illustration, but if you wan An ontology is more flexible because it is non-hierarchical. It aims to represent the fluidity of the real world, where things can be related to each other in more complex ways than are represented by a hierarchical tree-like structure. Instead, an ontology is more like a spider's web. -Whatever you are looking to represent with LOD, we suggest that you find an existing vocabulary and use it, rather than try to write your own. The main page here has [a list of some of the most popular vocabularies](http://semanticweb.org/wiki/Main_Page.html). +Whatever you are looking to represent with LOD, we suggest that you find an existing vocabulary and use it, rather than try to write your own. The main page here has [a list of some of the most popular vocabularies](https://semanticweb.org/wiki/Main_Page.html). -Since our example above focuses on pianists, it would be a good idea to find an appropriate ontology rather than create our own system. 
In fact there is [an ontology for music](http://web.archive.org/web/20170715094229/http://www.musicontology.com/). As well as a well-developed specification it also has some useful examples of its use. You can have a look at the [Getting started pages](http://web.archive.org/web/20170718143925/http://musicontology.com/docs/getting-started.html) to get a sense of how you might use that particular ontology. +Since our example above focuses on pianists, it would be a good idea to find an appropriate ontology rather than create our own system. In fact there is [an ontology for music](https://web.archive.org/web/20170715094229/https://www.musicontology.com/). As well as a well-developed specification it also has some useful examples of its use. You can have a look at the [Getting started pages](https://web.archive.org/web/20170718143925/https://musicontology.com/docs/getting-started.html) to get a sense of how you might use that particular ontology. Unfortunately I can't find anything that describes the relationship between a teacher and a pupil in the Music Ontology. But the ontology is published openly, so I can use it to describe other features of music and then create my own extension. If I then publish my extension openly, others can use it if they wish and it may become a standard. While the Music Ontology project does not have the relationship I need, the [Linked Jazz project](https://linkedjazz.org/) allows use of 'mentorOf', which sounds like it would work nicely in our case. While this is not an ideal solution, it is one that makes an effort to use what is already out there. @@ -214,19 +214,19 @@ Recognising what serialisation you are looking at means that you can then choose Turtle uses aliases or a shortcuts known as [prefixes](https://www.w3.org/TeamSubmission/turtle/#sec-tutorial), which saves us having to write out full URIs every time. 
Let's go back to the URI we invented in the previous section: - http://data.history.ac.uk/tobias-project/person/15601 + https://data.history.ac.uk/tobias-project/person/15601 We don't want to type this out every time we refer to this person (Jack Straw, you'll remember). So we just have to announce our shortcut: - @prefix toby: <http://data.history.ac.uk/tobias-project/person/> . + @prefix toby: <https://data.history.ac.uk/tobias-project/person/> . Then Jack is `toby:15601`, which replaces the long URI and is easier on the eye. I have chosen 'toby', but could just as easily chosen any string of letters. Let's now move from Jack Straw to William Shakespeare and use Turtle to describe some stuff about his works. We'll need to decide on the authority files to use, a process which, as mentioned above, is best gleaned from looking at other LOD sets. Here we'll use [Dublin Core](https://en.wikipedia.org/wiki/Dublin_Core), a library [metadata](https://en.wikipedia.org/wiki/Metadata) standard, as one of our prefixes, the [Library of Congress Control Number](https://en.wikipedia.org/wiki/Library_of_Congress_Control_Number) authority file for another, and the last one (VIAF) should be familiar to you. Together these three authority files provide unique identifiers for all of the entities I plan to use in this example.: - @prefix lccn: <http://id.loc.gov/authorities/names/> . - @prefix dc: <http://purl.org/dc/elements/1.1/> . - @prefix viaf: <http://viaf.org/viaf/> . + @prefix lccn: <https://id.loc.gov/authorities/names/> . + @prefix dc: <https://purl.org/dc/elements/1.1/> . + @prefix viaf: <https://viaf.org/viaf/> . lccn:n82011242 dc:creator viaf:96994048 . @@ -236,9 +236,9 @@ In the above example, lccn:n82011242 represents Macbeth; dc:creator links Macbet Turtle also allows you to list triples without bothering to repeat each URI when you've only just used it. Let's add the date when scholars think Macbeth was written, using the Dublin Core attribute-value pair: `dc:created 'YYYY'`: - @prefix lccn: <http://id.loc.gov/authorities/names/> . - @prefix dc: <http://purl.org/dc/elements/1.1/> . - @prefix viaf: <http://viaf.org/viaf/> . + @prefix lccn: <https://id.loc.gov/authorities/names/> . + @prefix dc: <https://purl.org/dc/elements/1.1/> . + @prefix viaf: <https://viaf.org/viaf/> . lccn:n82011242 dc:creator viaf:96994048 ; dc:created "1606" .
@@ -257,11 +257,11 @@ You can use a semicolon if the subject is the same but the predicate and object Here we're saying that Shakespeare (96994048) and John Fletcher (12323361) were both the creators of the work *The Two Noble Kinsmen*. -When we looked at ontologies earlier I suggested you have a look at the examples from [the Music Ontology](http://web.archive.org/web/20170718143925/http://musicontology.com/docs/getting-started.html). I hope they didn't put you off. Have a look again now. This is still complicated stuff, but do they make more sense now? +When we looked at ontologies earlier I suggested you have a look at the examples from [the Music Ontology](https://web.archive.org/web/20170718143925/https://musicontology.com/docs/getting-started.html). I hope they didn't put you off. Have a look again now. This is still complicated stuff, but do they make more sense now? One of the most approachable ontologies is Friend of a Friend, or [FOAF](https://en.wikipedia.org/wiki/FOAF_(ontology)). This is designed to describe people, and is perhaps for that reason, fairly intuitive. If, for example, you want to write to tell me that this course is the best thing you've ever read, here is my email address expressed as triples in FOAF: - @prefix foaf: . + @prefix foaf: . :"Jonathan Blaney" foaf:mbox . @@ -286,30 +286,30 @@ The RDF/XML format has the same basic information as Turtle, but is written very Let's move on to a different example to show how RDF/XML combines triples and, at the same time, introduce [Simple Knowledge Organization System](https://en.wikipedia.org/wiki/Simple_Knowledge_Organization_System) (SKOS), which is designed for encoding thesauri or taxonomies. - + Abdication -Here we are saying that the SKOS concept `21250`, abdication, has a preferred label of "abdication". The way it works is that the subject element (including the abdication part, which is an attribute value in XML terms) has the predicate and object nested inside it. 
The nested element is the predicate and [the leaf node](https://en.wikipedia.org/wiki/Tree_(data_structure)#Terminology), is the object. This example is taken from a project to publish a [thesaurus of British and Irish History](http://www.history.ac.uk/projects/digital/tobias). +Here we are saying that the SKOS concept `21250`, abdication, has a preferred label of "abdication". The way it works is that the subject element (including the abdication part, which is an attribute value in XML terms) has the predicate and object nested inside it. The nested element is the predicate and [the leaf node](https://en.wikipedia.org/wiki/Tree_(data_structure)#Terminology), is the object. This example is taken from a project to publish a [thesaurus of British and Irish History](https://www.history.ac.uk/projects/digital/tobias). Just as with Turtle, we can add more triples. So let's declare that the narrower term in our subject hierarchy, one down from *Abdication* is going to be *Abdication crisis (1936)*. - + Abdication - - + + Remember how predicates and objects are nested inside the subject? Here we've done that twice with the same subject, so we can make this less verbose by nesting both sets of predicates and objects inside the one subject: - + Abdication - + -If you're familiar with XML this will be like mother's milk to you. If you're not you might prefer a format like Turtle. But the advantage here is that in creating my RDF/XML you can use the usual tools available with XML, like dedicated XML editors and parsers, to check that your RDF/XML is correctly formatted. If you're not an XML person I recommend Turtle, for which you can use an [online tool](http://www.easyrdf.org/converter) to check your syntax is correct. +If you're familiar with XML this will be like mother's milk to you. If you're not you might prefer a format like Turtle. 
But the advantage here is that in creating my RDF/XML you can use the usual tools available with XML, like dedicated XML editors and parsers, to check that your RDF/XML is correctly formatted. If you're not an XML person I recommend Turtle, for which you can use an [online tool](https://www.easyrdf.org/converter) to check your syntax is correct. ## Querying RDF with SPARQL @@ -317,7 +317,7 @@ For this final section we will interrogate some LOD and see what you can do with The query language we use for LOD is called [SPARQL](https://en.wikipedia.org/wiki/SPARQL). It's one of those recursive acronyms beloved of techie people: *SPARQL Protocol and RDF Query Language*. -As I mentioned at the beginning, *Programming Historian* has [a complete lesson](/lessons/graph-databases-and-SPARQL), by Matthew Lincoln, on using SPARQL. My final section here is just an overview of the basic concepts, and if SPARQL piques your interest, you can get a thorough grounding from Lincoln's tutorial. +As I mentioned at the beginning, *Programming Historian* has [a complete lesson](/en/lessons/graph-databases-and-SPARQL), by Matthew Lincoln, on using SPARQL. My final section here is just an overview of the basic concepts, and if SPARQL piques your interest, you can get a thorough grounding from Lincoln's tutorial. We're going to run our SPARQL queries on [DBpedia](https://en.wikipedia.org/wiki/DBpedia), which is a huge LOD set derived from Wikipedia. As well as being full of information that is very difficult to find through the usual Wikipedia interface, it has several SPARQL "end points" - interfaces where you can type in SPARQL queries and get results from DBpedia's triples. @@ -366,7 +366,7 @@ Back to the results for the query I ran a moment ago: I can see a long list in the column labelled _c_. These are all the attributes Roper has in *DBpedia* and will help us to find other people with these attributes. For example I can see ```http://dbpedia.org/class/yago/Historian110177150```. 
Can I use this to get a list of historians? I'm going to put this into my query but in third place (because that's where it was when I found it in the Lyndal Roper results. My query looks like this: SELECT * WHERE { - ?historian_name ?predicate + ?historian_name ?predicate } I've made a small change here. If this query works at all then I expect my historians to be in the first column, because 'historian' doesn't look like it could be a predicate: it doesn't function like a verb in a sentence; so I'm going to call my first results column 'historian_name' and my second (which I don't know anything about) 'predicate'. @@ -379,8 +379,8 @@ So this works for creating lists, which is useful, but it would much more powerf SELECT ?name WHERE { - ?name ?b . - ?name ?b + ?name ?b . + ?name ?b } It works! I get five results. At the time of writing, there are five British, women historians in *DBpedia*... @@ -389,24 +389,24 @@ It works! I get five results. At the time of writing, there are five British, wo Only five British women historians? Of course there are, in reality, many more than that, as we could easily show by substituting the name of, say, Alison Weir in our first Lyndal Roper query. This brings us to the problem with *Dbpedia* that I mentioned earlier: it's not very consistently marked up with structural information of the type *DBpedia* uses. Our query can list some British women historians but it turns out that we can't use it to generate a meaningful list of people in this category. All we've found is the people in entries in *Wikipedia* that someone has decided to categorise as "British historian" and "woman historian". -With SPARQL on *DBpedia* you have to be careful of the inconsistencies of crowd-sourced material. 
You could use SPARQL in exactly the same way on a more curated dataset, for example the UK government data: [https://data-gov.tw.rpi.edu//sparql]() and expect to get more robust results (there is a brief tutorial for this dataset here: [https://data-gov.tw.rpi.edu/wiki/A\_crash\_course\_in\_SPARQL]()). +With SPARQL on *DBpedia* you have to be careful of the inconsistencies of crowd-sourced material. You could use SPARQL in exactly the same way on a more curated dataset, for example the UK government data: <https://data-gov.tw.rpi.edu//sparql> and expect to get more robust results (there is a brief tutorial for this dataset here: <https://data-gov.tw.rpi.edu/wiki/A_crash_course_in_SPARQL>). -However, despite its inconsistencies, *DBpedia* is a great place to learn SPARQL. This has only been an a brief introduction but there is much more in [Using SPARQL to access Linked Open Data](/lessons/graph-databases-and-SPARQL). +However, despite its inconsistencies, *DBpedia* is a great place to learn SPARQL. This has only been a brief introduction but there is much more in [Using SPARQL to access Linked Open Data](/en/lessons/graph-databases-and-SPARQL). 
## Further reading and resources * Dean Allemang and James Hendler, *Semantic Web for the Working Ontologist*, 2nd edn, Elsevier, 2011 * Tim Berners-Lee [Linked Data](https://www.w3.org/DesignIssues/LinkedData.html) * Bob DuCharme, *Learning SPARQL*, O'Reilly, 2011 -* [Bob DuCharme's blog](http://www.snee.com/bobdc.blog/) is also worth reading +* [Bob DuCharme's blog](https://www.snee.com/bobdc.blog/) is also worth reading * Richard Gartner, *Metadata: Shaping Knowledge from Antiquity to the Semantic Web*, Springer, 2016 * Seth van Hooland and Ruben Verborgh, *Linked Data for Libraries, Archives and Museums*, 2015 * Matthew Lincoln ['Using SPARQL to access Linked Open Data'](/lessons/graph-databases-and-SPARQL) * [Linked Data guides and tutorials](https://web.archive.org/web/20170515070722/http://linkeddata.org/guides-and-tutorials) * Dominic Oldman, Martin Doerr and Stefan Gradmann, 'Zen and the Art of Linked Data: New Strategies for a Semantic Web of Humanist * Knowledge', in *A New Companion to Digital Humanities*, edited by Susan Schreibman et al. -* Max Schmachtenberg, Christian Bizer and Heiko Paulheim, [State of the LOD Cloud 2017](http://linkeddatacatalog.dws.informatik.uni-mannheim.de/state/) +* Max Schmachtenberg, Christian Bizer and Heiko Paulheim, [State of the LOD Cloud 2017](https://linkeddatacatalog.dws.informatik.uni-mannheim.de/state/) * David Wood, Marsha Zaidman and Luke Ruth, *Linked Data: Structured data on the Web*, Manning, 2014 ## Acknowlegements -I'd like to thank my two peer reviewers, Matthew Lincoln and Terhi Nurmikko-Fuller, and my editor, Adam Crymble, for generously spending time helping me to improve this course with numerous suggestions, clarification and corrections. This tutorial is based on one written as part of the 'Thesaurus of British and Irish History as SKOS' [(Tobias) project](http://www.history.ac.uk/projects/digital/tobias), funded by the [AHRC](http://www.ahrc.ac.uk/). 
It has been revised for the *Programming Historian*. +I'd like to thank my two peer reviewers, Matthew Lincoln and Terhi Nurmikko-Fuller, and my editor, Adam Crymble, for generously spending time helping me to improve this course with numerous suggestions, clarification and corrections. This tutorial is based on one written as part of the 'Thesaurus of British and Irish History as SKOS' [(Tobias) project](https://www.history.ac.uk/projects/digital/tobias), funded by the [AHRC](https://www.ahrc.ac.uk/). It has been revised for the *Programming Historian*. diff --git a/en/lessons/intro-to-powershell.md b/en/lessons/intro-to-powershell.md index 9411a8e5e2..b377545978 100755 --- a/en/lessons/intro-to-powershell.md +++ b/en/lessons/intro-to-powershell.md @@ -27,7 +27,7 @@ doi: 10.46430/phen0054 # Introduction -This tutorial will introduce you to the basics of Windows PowerShell, the standard command-line interface for Windows computers. If you are a Mac or Linux user, you should check out the [Bash introduction](/lessons/intro-to-bash) instead. If you are already familiar with using Bash, you may be able to get started with PowerShell just by looking at the [table at the end of this lesson](#quick-reference). +This tutorial will introduce you to the basics of Windows PowerShell, the standard command-line interface for Windows computers. If you are a Mac or Linux user, you should check out the [Bash introduction](/en/lessons/intro-to-bash) instead. If you are already familiar with using Bash, you may be able to get started with PowerShell just by looking at the [table at the end of this lesson](#quick-reference). The tutorial is divided into two main sections. In the first section, "[Getting Started](#getting-started)," you will learn to do basic desktop tasks like creating and opening files and folders using PowerShell. 
In the second section, "[Doing More](#doing-more)," you will get a glimpse of some of the features that make work on the command line particularly efficient, and learn enough of the basics to be able to explore further on your own. You will also [get set up to run Python scripts from the command line](#using-command-line-tools-and-running-python-scripts). @@ -35,7 +35,7 @@ This tutorial was written for PowerShell 5.0. If you are using an earlier versio # What is PowerShell and Why is it Useful? -Windows PowerShell is a **command-line interface** for Windows computers. A command-line interface (CLI) is a program for telling your computer to do tasks using typed commands, rather than by clicking pictures on the desktop as in a **graphical user interface** (GUI). (Technically, PowerShell is more than just the CLI, and you can get a quick overview of its features on [Wikipedia](https://en.wikipedia.org/wiki/Windows_PowerShell).) Using the command line has many advantages. It makes it possible to automate tasks and to do many things with one command. Most importantly, a number of tools of value to humanists can only be run from the command line, including many you can learn about on *The Programming Historian*, like [Mallet](/lessons/topic-modeling-and-mallet), [Pandoc](/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown), or [Wget](/lessons/automated-downloading-with-wget). The command line is also the best place to work with programs you have custom built for your own research using programming languages like Python. +Windows PowerShell is a **command-line interface** for Windows computers. A command-line interface (CLI) is a program for telling your computer to do tasks using typed commands, rather than by clicking pictures on the desktop as in a **graphical user interface** (GUI). (Technically, PowerShell is more than just the CLI, and you can get a quick overview of its features on [Wikipedia](https://en.wikipedia.org/wiki/Windows_PowerShell).) 
Using the command line has many advantages. It makes it possible to automate tasks and to do many things with one command. Most importantly, a number of tools of value to humanists can only be run from the command line, including many you can learn about on *The Programming Historian*, like [Mallet](/en/lessons/topic-modeling-and-mallet), [Pandoc](/en/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown), or [Wget](/en/lessons/automated-downloading-with-wget). The command line is also the best place to work with programs you have custom built for your own research using programming languages like Python. # Getting Started @@ -463,7 +463,7 @@ Use the up-arrow to get your last command back, and add `-ignorewhitespace` to t The most important reason to become familiar with using the command line is not because of the increased precision or the ability to work with multiple files, useful as these features are, but rather because of the many additional tools you gain access to, as mentioned in the introduction. When getting set up to work with some of these tools, you may run into problems because Windows sometimes configures the paths incorrectly. Fixing this problem requires setting environment variables, a topic beyond the scope of this tutorial. Fortunately, there is a lot of support out there, and a little searching will usually turn up the solution you need. Because many lessons on *The Programming Historian* require you to use Python, let's look briefly at getting Python set up. Having done this, you will be less daunted by instructions for setting environment variables for other programs. -If you don't already have Python, or if you wonder why you would want to use it, check out the [Python tutorial](/lessons/introduction-and-installation) right here on *The Programming Historian*. In that tutorial, you will learn to set up Python to run scripts directly in an editor. It will often be more useful to be able to run scripts from the command line. 
In order to do that, we need to set an environment variable. First, you need to know the name of the directory where Python is installed on your computer. Enter `sl C:\` and then use `gci`. You should see a directory named "Python" with the version number at the end. On my computer, the directory is "Python27." Now we tell Windows to create a Path variable pointing to that directory by entering this into PowerShell, replacing "Python27" with the name of the directory on your computer: +If you don't already have Python, or if you wonder why you would want to use it, check out the [Python tutorial](/en/lessons/introduction-and-installation) right here on *The Programming Historian*. In that tutorial, you will learn to set up Python to run scripts directly in an editor. It will often be more useful to be able to run scripts from the command line. In order to do that, we need to set an environment variable. First, you need to know the name of the directory where Python is installed on your computer. Enter `sl C:\` and then use `gci`. You should see a directory named "Python" with the version number at the end. On my computer, the directory is "Python27." Now we tell Windows to create a Path variable pointing to that directory by entering this into PowerShell, replacing "Python27" with the name of the directory on your computer: `[Environment]::SetEnvironmentVariable("Path", "$env:Path;C:\Python27", "User")` diff --git a/en/lessons/intro-to-twitterbots.md b/en/lessons/intro-to-twitterbots.md index 5e0a9c65db..d8e28ed5c6 100755 --- a/en/lessons/intro-to-twitterbots.md +++ b/en/lessons/intro-to-twitterbots.md @@ -18,7 +18,7 @@ topics: [api] abstract: "An Introduction to Twitter Bots with Tracery This lesson explains how to create simple twitterbots using Tracery and the Cheap Bots Done Quick service. Tracery exists in multiple languages and can be integrated into websites, games, bots." 
-redirect_from: /lessons/intro-to-twitterbots +redirect_from: /lessons/intro-to-twitterbots/ avatar_alt: A device with several interlocking gears doi: 10.46430/phen0069 --- @@ -31,10 +31,10 @@ Access to Twitter’s API has recently changed. The Free Tier no longer allows u # An Introduction to Twitter Bots with Tracery -This lesson explains how to create simple twitterbots using the [Tracery generative grammar](http://tracery.io) and the [Cheap Bots Done Quick](http://cheapbotsdonequick.com/) service. Tracery exists in multiple languages and can be integrated into websites, games, bots. You may fork it [on github here](https://github.com/galaxykate/tracery/tree/tracery2). +This lesson explains how to create simple twitterbots using the [Tracery generative grammar](https://tracery.io) and the [Cheap Bots Done Quick](https://cheapbotsdonequick.com/) service. Tracery exists in multiple languages and can be integrated into websites, games, bots. You may fork it [on github here](https://github.com/galaxykate/tracery/tree/tracery2). ## Why bots? -Strictly speaking, a twitter bot is a piece of software for automated controlling a Twitter account. When thousands of these are created and are tweeting more or less the same message, they have the ability to shape discourse on Twitter which then can influence other media discourses. Bots of this kind [can even be seen as credible sources of information](http://www.sciencedirect.com/science/article/pii/S0747563213003129). Projects such as [Documenting the Now](http://www.docnow.io/) are creating tools to allow researchers to create and query archives of social media around current events - and which will naturally contain many bot-generated posts. In this tutorial, I want to demonstrate how one can build a simple twitterbot so that, knowing how they operate, historians may more easily spot the bots in our archives - and perhaps counter with bots of their own. 
+Strictly speaking, a twitter bot is a piece of software for automatically controlling a Twitter account. When thousands of these are created and are tweeting more or less the same message, they have the ability to shape discourse on Twitter which then can influence other media discourses. Bots of this kind [can even be seen as credible sources of information](https://www.sciencedirect.com/science/article/pii/S0747563213003129). Projects such as [Documenting the Now](https://www.docnow.io/) are creating tools to allow researchers to create and query archives of social media around current events - and which will naturally contain many bot-generated posts. In this tutorial, I want to demonstrate how one can build a simple twitterbot so that, knowing how they operate, historians may more easily spot the bots in our archives - and perhaps counter with bots of their own. But I believe also that there is space in digital history and the digital humanities more generally for creative, expressive, artistic work. I belive that there is space for programming historians to use the affordances of digital media to create _things_ that could not otherwise exist to move us, to inspire us, to challenge us. There is room for satire; there is room for comment. With Mark Sample, I believe that there is a need for '[bots of conviction](https://medium.com/@samplereality/a-protest-bot-is-a-bot-so-specific-you-cant-mistake-it-for-bullshit-90fe10b7fbaa)'. @@ -76,11 +76,11 @@ Some suggestions to get you thinking, from individuals on Twitter who responded > @electricarchaeo A bot imagining the reactions of Afghans, Iraqis, Syrians, Yemenis, when their family members are killed by drone attacks. — Cory Taylor (@CoryTaylor_) April 22, 2017 -Given that so much historical data is expressed on the web as [JSON](http://json.org/), a bit of digging should find you data that you can actually fold into your bot. 
+Given that so much historical data is expressed on the web as [JSON](https://json.org/), a bit of digging should find you data that you can actually fold into your bot. My method is that of the bricoleur, the person who adapts and pastes together the bits and bobs of code that he finds; in truth, most programming functions this way. There are many packages available that will interface with Twitter's API, in various languages. There is little 'programming' in this lesson in the sense of writing bots in (for instance) Python. In this introductory lesson, I will show you how to build bots that tell stories, that write poetry, that do wonderful things using Tracery.io as our _generative grammar_, in conjunction with the Cheap Bots Done Quick service to host the bot. For more tutorials on building and hosting Twitter bots with other services, see [the Botwiki tutorial list](https://botwiki.org/tutorials/twitterbots/). -My most successful bot has been [@tinyarchae](http://twitter.com/tinyarchae), a bot that tweets scenes from a horrible dsyfunctional archaeological excavation project. Every archaeological project deals with problems of sexism, abuse, and bad faith; @tinyarchae pushes the stuff of conference whispers to a ridiculous extreme. It is a caricature that contains a kernel of uncomfortable truth. Other bots I have built glitch [archaeological photography](https://twitter.com/archaeoglitch); one is actually useful, in that it is [tweeting out new journal articles in archaeology](https://twitter.com/botarchaeo) and so serves as a research assistant. (For more thoughts on the role bots play in public archaeology, see this [keynote](https://electricarchaeology.ca/2017/04/27/bots-of-archaeology-machines-writing-public-archaeology/) from the [Public Archaeology Twitter Conference](http://web.archive.org/web/20180131161516/https://publicarchaeologyconference.wordpress.com/)). 
+My most successful bot has been [@tinyarchae](https://twitter.com/tinyarchae), a bot that tweets scenes from a horrible dysfunctional archaeological excavation project. Every archaeological project deals with problems of sexism, abuse, and bad faith; @tinyarchae pushes the stuff of conference whispers to a ridiculous extreme. It is a caricature that contains a kernel of uncomfortable truth. Other bots I have built glitch [archaeological photography](https://twitter.com/archaeoglitch); one is actually useful, in that it is [tweeting out new journal articles in archaeology](https://twitter.com/botarchaeo) and so serves as a research assistant. (For more thoughts on the role bots play in public archaeology, see this [keynote](https://electricarchaeology.ca/2017/04/27/bots-of-archaeology-machines-writing-public-archaeology/) from the [Public Archaeology Twitter Conference](https://web.archive.org/web/20180131161516/https://publicarchaeologyconference.wordpress.com/)). # Planning: What will your bot do? @@ -90,7 +90,7 @@ We begin with pad and paper. As a child in elementary school, one activity we of and students would fill in the blanks appropriately. It was silly; and it was fun. Twitterbots are to madlibs what sports cars are to horse and wagons. The blanks that we might fill in could be values in svg vector graphics. They could be numbers in numeric file names (and thus tweet random links to an open database, say). They could be, yes, even nouns and adverbs. Since Twitterbots live on the web, the building blocks that we put together can be more than text (although, for the time being, text will be easiest to work with). -We are going to start by sketching out a _replacement grammar_. The conventions of this grammar were developed by Kate Compton ([@galaxykate](https://twitter.com/galaxykate) on Twitter); it's called [Tracery.io](http://tracery.io). It can be used as a javascript library in webpages, in games, and in bots. 
A replacement grammar works rather similarly to the madlibs you might remember as a child. +We are going to start by sketching out a _replacement grammar_. The conventions of this grammar were developed by Kate Compton ([@galaxykate](https://twitter.com/galaxykate) on Twitter); it's called [Tracery.io](https://tracery.io). It can be used as a javascript library in webpages, in games, and in bots. A replacement grammar works rather similarly to the madlibs you might remember as a child. *In order to make it clear what the _grammar_ is doing, we are going to _not_ create a history bot for the time being. I want to make it clear what the grammar does, and so we will build something surreal to surface how that grammar works.* @@ -173,7 +173,7 @@ Before we move on, there is one last thing to examine. Press the JSON button in } ``` -Every Tracery grammar is actually a JSON object consisting of key/value pairs, which is what Tracery calls symbols and rules. (For more on JSON, please see [this tutorial by Matthew Lincoln](/lessons/json-and-jq)). This is the format we will be using when we actually set our bot up to start tweeting. JSON is finicky. Note how the symbols are wrapped in `"` as are the rules, but the rules are also listed with commas inside `[` and `]`. Remember: +Every Tracery grammar is actually a JSON object consisting of key/value pairs, which is what Tracery calls symbols and rules. (For more on JSON, please see [this tutorial by Matthew Lincoln](/en/lessons/json-and-jq)). This is the format we will be using when we actually set our bot up to start tweeting. JSON is finicky. Note how the symbols are wrapped in `"` as are the rules, but the rules are also listed with commas inside `[` and `]`. Remember: ```JSON { "symbol": ["rule","rule","rule"], @@ -202,9 +202,9 @@ Remember that your bot will be appearing in other people's timelines. 
The potent You can plumb a bot into your own, current, account, but you probably don't want a bot tweeting _as_ you or _for_ you. In which case, set up a new Twitter account. When you set up a new Twitter account, Twitter will want an email address. You can use a brand new email address, or, if you have a Gmail account, you can use the `+tag` trick, ie instead of 'johndoe' at gmail, you use `johndoe+twitterbot` at gmail. Twitter will accept that as a distinct email from your usual email. -Normally, when one is building a Twitterbot, one has to create an app on twitter (at [apps.twitter.com](http://apps.twitter.com)), obtain the consumer secret and key, and the access token and key. Then you have to program in authentication so that Twitter knows that the program trying to access the platform is permitted. +Normally, when one is building a Twitterbot, one has to create an app on twitter (at [apps.twitter.com](https://apps.twitter.com)), obtain the consumer secret and key, and the access token and key. Then you have to program in authentication so that Twitter knows that the program trying to access the platform is permitted. -Fortunately, we do not have to do that, since George Buckenham created the bot hosting site '[Cheap Bots Done Quick](http://cheapbotsdonequick.com/)'. (That website also shows the JSON source grammar for a number of different bots, which can serve as inspiration). Once you've created your bot's Twitter account - and you are logged in to Twitter as the bot account- go to Cheap Bots Done Quick and hit the 'sign in with Twitter' button. The site will redirect you to Twitter to approve authorization, and then bring you back to Cheap Bots Done Quick. +Fortunately, we do not have to do that, since George Buckenham created the bot hosting site '[Cheap Bots Done Quick](https://cheapbotsdonequick.com/)'. (That website also shows the JSON source grammar for a number of different bots, which can serve as inspiration). 
Once you've created your bot's Twitter account - and you are logged in to Twitter as the bot account- go to Cheap Bots Done Quick and hit the 'sign in with Twitter' button. The site will redirect you to Twitter to approve authorization, and then bring you back to Cheap Bots Done Quick. The JSON that describes your bot can be written or pasted into the main white box. Take the JSON from the editor and paste it into the main white box. If there are any errors in your JSON, the output box at the bottom will turn red and the site will try to give you an indication of where things have gone wrong. In most cases, this will be because of an errant comma or quotation mark. If you hit the refresh button to the right of the output box (NOT the browser refresh button!), the site will generate new text from your grammar. @@ -224,7 +224,7 @@ As Cheap Bots Done Quick is a service provided by George Buckenham out of a spir > If you create a bot I deem abusive or otherwise unpleasant (for example, @mentioning people who have not consented, posting insults or using slurs) I will take it down -Other pointers for good bot citizenship are provided by Darius Kazemi, one of the great bot artists, are discussed [here](http://tinysubversions.com/2013/03/basic-twitter-bot-etiquette/). +Other pointers for good bot citizenship are provided by Darius Kazemi, one of the great bot artists, are discussed [here](https://tinysubversions.com/2013/03/basic-twitter-bot-etiquette/). # Going further with Tracery Many bots are a good deal more complicated than what we have described here, but it is enough to get you started. Some surprisingly effective bots can be created using Tracery. @@ -249,7 +249,7 @@ The modifiers `.capitalize` and `.s` are added inside the `#` of the symbol they ## Use Emoji -Emoji can be used to great effect in Twitterbots. You can copy and paste emoji directly into the Cheap Bots Done Quick editor; each emoji should be within quotation marks as any other rule would be. 
Use [this list](http://unicode.org/emoji/charts/full-emoji-list.html) to find the emoji you wish to use, and make sure to copy and paste the emoji from the Twitter column to ensure that your emoji will display. +Emoji can be used to great effect in Twitterbots. You can copy and paste emoji directly into the Cheap Bots Done Quick editor; each emoji should be within quotation marks as any other rule would be. Use [this list](https://unicode.org/emoji/charts/full-emoji-list.html) to find the emoji you wish to use, and make sure to copy and paste the emoji from the Twitter column to ensure that your emoji will display. ## Reusing Generated Symbols with Actions @@ -291,7 +291,7 @@ Tracery reads the origin, and before it gets to the `story` symbol it sees an ac ## Responding to mentions in Cheap Bots Done Quick -[Cheap Bots Done Quick](http://cheapbotsdonequick.com/) has a beta feature that allows your bot to respond to mentions. (Warning: if you create two bots, and one mentions the other, the ensuing 'conversation' can run for a very long time indeed; there is a 5% chance in any exchange that the bot won't respond, thus breaking the conversation). +[Cheap Bots Done Quick](https://cheapbotsdonequick.com/) has a beta feature that allows your bot to respond to mentions. (Warning: if you create two bots, and one mentions the other, the ensuing 'conversation' can run for a very long time indeed; there is a 5% chance in any exchange that the bot won't respond, thus breaking the conversation). To set up a response pattern, click at the bottom of the page to set the button to 'reply to tweets'. In the JSON editing box that appears, you set up the pattern for phrases that your bot will respond to. 
For instance, some of what @tinyarchae watches for: @@ -307,7 +307,7 @@ To set up a response pattern, click at the bottom of the page to set the button "Should|should|Maybe|maybe|if|If":"#shouldanswer#" } ``` -The symbols here can include regular expression (Regex) patterns (see [this lesson](/lessons/understanding-regular-expressions) on regular expressions) . So, in the example above, the final symbol is watching for 'Should' OR 'should' OR 'Maybe' OR 'maybe' OR 'if' OR 'IF'. To respond to everything thrown its way, the symbol would be the simple dot: `.`. The rules can include simple text (as in the response to "hello") or can be another symbol. The rules should be included in your main grammar in the first JSON editing box on the page. Thus, `#shouldanswer#` is in the main @tinyarchae grammar editor box as a line: +The symbols here can include regular expression (Regex) patterns (see [this lesson](/en/lessons/understanding-regular-expressions) on regular expressions) . So, in the example above, the final symbol is watching for 'Should' OR 'should' OR 'Maybe' OR 'maybe' OR 'if' OR 'IF'. To respond to everything thrown its way, the symbol would be the simple dot: `.`. The rules can include simple text (as in the response to "hello") or can be another symbol. The rules should be included in your main grammar in the first JSON editing box on the page. Thus, `#shouldanswer#` is in the main @tinyarchae grammar editor box as a line: ```JSON "shouldanswer":["We asked #name#, who wrote 'An Archaeology of #verb.capitalize#'. 
The answer is #yesno#.","This isn't magic 8 ball, you know.","This is all very meta, isn't it.","#name# says to tell you, '42'."], @@ -318,7 +318,7 @@ At the very bottom of the page, you can test your mentions by writing a sample t {% include figure.html filename="bot-lesson-response.png" caption="Testing your bot's response" %} ## SVG graphics -Since SVG is a text format that describes the geometry of a vector graphic, Tracery can be used to create rather artistic work - the [Tiny Space Adventure](https://twitter.com/TinyAdv) bot draws a starfield, a spaceship, and a plot. Its grammar [may be viewed here](https://pastebin.com/YYtZnzZ0). The key issue with generating svg with Tracery is to get the components correct. The source code for the [softlandscapes bot](http://cheapbotsdonequick.com/source/softlandscapes) can be a useful model. This bot begins by defining the critical text that marks out SVG: +Since SVG is a text format that describes the geometry of a vector graphic, Tracery can be used to create rather artistic work - the [Tiny Space Adventure](https://twitter.com/TinyAdv) bot draws a starfield, a spaceship, and a plot. Its grammar [may be viewed here](https://pastebin.com/YYtZnzZ0). The key issue with generating svg with Tracery is to get the components correct. The source code for the [softlandscapes bot](https://cheapbotsdonequick.com/source/softlandscapes) can be a useful model. This bot begins by defining the critical text that marks out SVG: ``` "origin2": ["#preface##defs##bg##mountains##clouds##ending#"], @@ -337,11 +337,11 @@ Working with SVG can be tricky, as things like backslashes, line endings, quotat Bots that generate SVG are beyond the scope of this lesson, but careful study of existing bots should help you on your way. 
## Music -Strictly speaking, this is no longer about bots, but since music can be notated in text, one can use Tracery to compose music and then use other libraries to convert this notation into Midi files - see [http://www.codingblocks.net/videos/generating-music-in-javascript/](http://www.codingblocks.net/videos/generating-music-in-javascript/) and my [own experiment](https://electricarchaeology.ca/2017/04/07/tracery-continues-to-be-awesome/). +Strictly speaking, this is no longer about bots, but since music can be notated in text, one can use Tracery to compose music and then use other libraries to convert this notation into Midi files - see [https://www.codingblocks.net/videos/generating-music-in-javascript/](https://www.codingblocks.net/videos/generating-music-in-javascript/) and my [own experiment](https://electricarchaeology.ca/2017/04/07/tracery-continues-to-be-awesome/). # Other Bot Tutorials -- Zach Whalen [How to make a Twitter Bot with Google Spreadsheets](http://www.zachwhalen.net/posts/how-to-make-a-twitter-bot-with-google-spreadsheets-version-04/) +- Zach Whalen [How to make a Twitter Bot with Google Spreadsheets](https://www.zachwhalen.net/posts/how-to-make-a-twitter-bot-with-google-spreadsheets-version-04/) - Casey Bergman, Keeping Up With the Scientific Literature using Twitterbots: The FlyPapers Experiment https://caseybergman.wordpress.com/2014/02/24/keeping-up-with-the-scientific-literature-using-twitterbots-the-flypapers-experiment/ also https://github.com/roblanf/phypapers ; in essence this method collects the RSS feed from journal articles, and then uses a service such as [Dlvr.it](https://dlvrit.com/) to push the links to a Twitter account. - Discontinued: Stefan Bohacek has posted the code templates for a number of different kinds of bots at the code remixing site Glitch.com. If you visit his page, you will see a list of different kinds of bots; click on the 'remix' button and then study the readme button carefully. 
Glitch requires a login via a Github or Facebook account. - Finally, I would suggest joining the BotMakers Slack group to find more tutorials, like-minded individuals, and further resources: [Sign up here](https://botmakers.org) diff --git a/en/lessons/introduction-and-installation.md b/en/lessons/introduction-and-installation.md index 8e0beb75ab..930b101a97 100755 --- a/en/lessons/introduction-and-installation.md +++ b/en/lessons/introduction-and-installation.md @@ -20,10 +20,10 @@ designed to get you and your computer set up to start programming. We will focus on installing the relevant software – all free and reputable – and finally we will help you to get your toes wet with some simple programming that provides immediate results." -redirect_from: /lessons/introduction-and-installation +redirect_from: /lessons/introduction-and-installation/ avatar_alt: A curled-up snake doi: 10.46430/phen0009 -next: viewing-html-files +next: /en/lessons/viewing-html-files series_total: 15 lessons sequence: 1 --- @@ -57,7 +57,7 @@ The programming language we will use in this series of lessons is Python, a free, open source language. Unless otherwise noted, we will be using **Python 3** throughout. Version 2 is no longer officially supported, but you might still find it used in older projects or lessons. -[Python 3 has a few differences in formatting](http://sebastianraschka.com/Articles/2014_python_2_3_key_diff.html) (think grammar rules), so beware if you find examples online that still use Python 2. They might not run under current versions of Python. +[Python 3 has a few differences in formatting](https://sebastianraschka.com/Articles/2014_python_2_3_key_diff.html) (think grammar rules), so beware if you find examples online that still use Python 2. They might not run under current versions of Python. Backup Your Work! ----------------- @@ -86,8 +86,8 @@ doesn't work on your platform, please let us know. 
- [Windows Python Installation][] - [Linux Python Installation][] - [Python programming language]: http://www.python.org/ - [Beautiful Soup HTML/XML parser]: http://www.crummy.com/software/BeautifulSoup/ + [Python programming language]: https://www.python.org/ + [Beautiful Soup HTML/XML parser]: https://www.crummy.com/software/BeautifulSoup/ [Komodo Edit]: https://github.com/ActiveState/OpenKomodoIDE [Python Editors]: https://wiki.python.org/python/PythonEditors [Jungle Disk]: https://www.jungledisk.com/ diff --git a/en/lessons/introduction-to-ffmpeg.md b/en/lessons/introduction-to-ffmpeg.md index 53dedf3f11..8bd2124ff9 100644 --- a/en/lessons/introduction-to-ffmpeg.md +++ b/en/lessons/introduction-to-ffmpeg.md @@ -75,7 +75,7 @@ to install FFmpeg and ensure it remains in the most up-to-date version. Homebrew Windows users can use the package manager [Chocolately](https://chocolatey.org/) to install and maintain FFmpeg. Reto Kromer's [Windows installation guide](https://avpres.net/FFmpeg/install_Windows.html) provides all the necessary information to use Chocolately or to install the software from a build. ## For Linux Users -[Linuxbrew](http://linuxbrew.sh/), a program similar to Homebrew, can be used to +[Linuxbrew](https://linuxbrew.sh/), a program similar to Homebrew, can be used to install and maintain FFmpeg in Linux. Reto Kromer also provides a helpful [Linux installation guide](https://avpres.net/FFmpeg/install_Linux.html) that closely resembles the Mac OS installation. Your distribution of Linux may also have its [own package manager](https://www.linode.com/docs/tools-reference/linux-package-management/) already installed that include FFmpeg packages available. Depending on your distribution of Linux (Ubuntu, Fedora, Arch Linux, etc.) these builds can vary, so using Linuxbrew could be useful to ensure that the build is the same regardless of which type of Linux you are using. @@ -111,7 +111,7 @@ that closely resembles the Mac OS installation. 
Your distribution of Linux may a * If you see something like `-bash: ffmpeg: command not found` then something has gone wrong. - * Note: If you are using a package manager it is unlikely that you will encounter this error message. However, if there is a problem after installing with a package manager, it is likely the issue is with the package manager itself as opposed to FFmpeg. Consult the Troubleshooting sections for [Homebrew](https://docs.brew.sh/Troubleshooting), [Chocolatey](https://chocolatey.org/docs/troubleshooting), or [Linuxbrew](http://linuxbrew.sh/) to ensure the package manager is functioning properly on your computer. If you are attempting to install without a package manager and see this error message, cross-reference your method with the FFmpeg Compilation Guide provided above. + * Note: If you are using a package manager it is unlikely that you will encounter this error message. However, if there is a problem after installing with a package manager, it is likely the issue is with the package manager itself as opposed to FFmpeg. Consult the Troubleshooting sections for [Homebrew](https://docs.brew.sh/Troubleshooting), [Chocolatey](https://chocolatey.org/docs/troubleshooting), or [Linuxbrew](https://linuxbrew.sh/) to ensure the package manager is functioning properly on your computer. If you are attempting to install without a package manager and see this error message, cross-reference your method with the FFmpeg Compilation Guide provided above. ## Using FFmpeg in a web browser (without installing) If you do not want to install FFmpeg on your computer but would like to become familiar with using it at the command-line, Brian Grinstead's [videoconverter.js](https://bgrins.github.io/videoconverter.js/demo/) provides a way to run FFmpeg commands and learn its basic functions in the web-browser of your choice. @@ -381,8 +381,8 @@ done ``` * `for file in *.m4v; do` = initiates the for loop. 
This first line basically tells FFmpeg: "for all files in this directory with the extension `.m4v`, perform the following command." - * The `*` is a Bash [wildcard](http://tldp.org/LDP/GNU-Linux-Tools-Summary/html/x11655.htm) attached to a given file-type and specifies them as the input files. - * The word `file` is an arbitrary [variable](http://tldp.org/HOWTO/Bash-Prog-Intro-HOWTO-5.html) which will represent each file as it runs through the loop. + * The `*` is a Bash [wildcard](https://tldp.org/LDP/GNU-Linux-Tools-Summary/html/x11655.htm) attached to a given file-type and specifies them as the input files. + * The word `file` is an arbitrary [variable](https://tldp.org/HOWTO/Bash-Prog-Intro-HOWTO-5.html) which will represent each file as it runs through the loop. * `ffprobe -f lavfi -i movie="$file",signalstats -show_entries frame=pkt_pts_time:frame_tags=lavfi.signalstats.HUEMED -print_format csv > "${file%.m4v}.csv"; done` = the same color metadata extraction command we ran on our two excerpts of *Destination Earth*, with some slight alterations to the syntax to account for its use across multiple files in a directory: * `"$file"` recalls each variable. The enclosing quotation marks ensures that the original filename is retained. 
diff --git a/en/lessons/introduction-to-populating-a-website-with-api-data.md b/en/lessons/introduction-to-populating-a-website-with-api-data.md index c0f4bd632e..952908d080 100644 --- a/en/lessons/introduction-to-populating-a-website-with-api-data.md +++ b/en/lessons/introduction-to-populating-a-website-with-api-data.md @@ -3,7 +3,7 @@ title: | Introduction to Populating a Website with API Data collection: lessons layout: lesson -slug: website-api +slug: introduction-to-populating-a-website-with-api-data date: 2019-05-22 authors: - Go Sugimoto @@ -30,7 +30,7 @@ doi: 10.46430/phen0086 This tutorial offers readers the possibility to quickly learn the basics of APIs without prior knowledge of programming, to start accessing a vast amount of data (often freely) available on the web. In particular, we learn basic [HTML](https://en.wikipedia.org/wiki/HTML) and [PHP](https://en.wikipedia.org/wiki/PHP) to build a simple website to display API query results of cultural heritage collections, using [Europeana API](https://pro.europeana.eu/resources/apis). As the technique is generic, we also create a short template to test it with [Harvard Art Museums API](https://www.harvardartmuseums.org/collections/api). In the tutorial, some other concepts such as [metadata](https://en.wikipedia.org/wiki/Metadata) and [web servers](https://en.wikipedia.org/wiki/Web_server) are explained to understand APIs in a broad context. -[PHP](http://php.net/) is a programming language especially suited for web development, while [HTML](https://en.wikipedia.org/wiki/HTML) is a markup language to create webpages and applications. The exampes in this lesson uses some basic programming, however it is usually also possible to use copy and paste when working with API at a basic level. +[PHP](https://php.net/) is a programming language especially suited for web development, while [HTML](https://en.wikipedia.org/wiki/HTML) is a markup language to create webpages and applications. 
The examples in this lesson use some basic programming, however it is usually also possible to use copy and paste when working with API at a basic level. # Contents The tutorial consists of two parts. The first part provides the basic theory of APIs: @@ -117,7 +117,7 @@ Let’s have a closer look at what you typed into your browser URL box the examp {% include figure.html filename="website-api2.jpg" caption="Untidy JSON data structure (raw data) in Chrome" %} ## Understanding API Data (in JSON) -If your browser does not support a tidy [JSON](https://en.wikipedia.org/wiki/JSON) view (the latest Firefox should have a pre-installed JSON viewer), please copy and paste the entire data to an [online JSON viewer](http://jsonviewer.stack.hu/). It allows us to view the data more easily by expanding (+ button) and collapsing (- button) the data hierarchy. +If your browser does not support a tidy [JSON](https://en.wikipedia.org/wiki/JSON) view (the latest Firefox should have a pre-installed JSON viewer), please copy and paste the entire data to an [online JSON viewer](https://jsonviewer.stack.hu/). It allows us to view the data more easily by expanding (+ button) and collapsing (- button) the data hierarchy. {% include figure.html filename="website-api3.jpg" caption="Online JSON viewer" %} @@ -129,7 +129,7 @@ Now, if you look carefully at the first lines of the data, you may notice someth You read literally: `"apikey"` is your API key. Your API access is `success`ful. We can ignore what `requestNumber` is, but only the first `12` items (records) are returned (to avoid a flood of data) out of the `totalResults` of `1967431`. After that, you have actual data from the collection (i.e. the 12 items). -In order to organise the data, Europeana uses a particular format/structure, called [JSON (JavaScript Object Notation)](http://json.org/). The data are wrapped with curly brackets (which is called Object). It always starts with `{` and ends with `}`.
Inside, the data are represented with pairs of strings. Each pair has two components separated by a colon (`:`). For instance, `"totalResults":1967341`. We call this format [name-value pair](https://en.wikipedia.org/wiki/Attribute%E2%80%93value_pair). In our case, the name is `"totalResults"` and `1967341` is the data value. If there are more than one pair, name-value pairs are separated by comma (`,`). To sum up, the simplest JSON data look like this: +In order to organise the data, Europeana uses a particular format/structure, called [JSON (JavaScript Object Notation)](https://json.org/). The data are wrapped with curly brackets (which is called Object). It always starts with `{` and ends with `}`. Inside, the data are represented with pairs of strings. Each pair has two components separated by a colon (`:`). For instance, `"totalResults":1967341`. We call this format [name-value pair](https://en.wikipedia.org/wiki/Attribute%E2%80%93value_pair). In our case, the name is `"totalResults"` and `1967341` is the data value. If there are more than one pair, name-value pairs are separated by comma (`,`). 
To sum up, the simplest JSON data look like this: ``` { @@ -149,17 +149,17 @@ As there can be a long list of names in a record, let me explain some of the nam | id | Identifier of this item | /9200309/BibliographicResource_3000093757119_source | | country | Country of the data provider | Belgium | | dataProvider | Data provider of this item | Royal Library of Belgium | -| rights | Predefined rights statement (Creative Commons etc) | http://rightsstatements.org/vocab/InC/1.0/ | +| rights | Predefined rights statement (Creative Commons etc) | https://rightsstatements.org/vocab/InC/1.0/ | | title | Title of this item | Stand Not Upon The Order Of Your Going, But Go At Once Shakespeare Macbeth 3-4 Enlist Now | | edmPreview | URL of the preview of this item in Europeana | [https://www.europeana.eu/api/v2/thumbnail-by-url.json?uri=http%3A%2F%2Fuurl.kbr.be%2F1017835%2Fthumbs%2Fs&size=LARGE&type=IMAGE](https://www.europeana.eu/api/v2/thumbnail-by-url.json?uri=http%3A%2F%2Fuurl.kbr.be%2F1017835%2Fthumbs%2Fs&size=LARGE&type=IMAGE) | -| edmIsShownAt | URL (web page) of this item at the website of the data provider | [http://uurl.kbr.be/1017835](http://uurl.kbr.be/1017835) | +| edmIsShownAt | URL (web page) of this item at the website of the data provider | [https://uurl.kbr.be/1017835](https://uurl.kbr.be/1017835) | | edmIsShownBy | URL (media file) of this item at the website of the data provider | [https://www.rijksmuseum.nl/nl/collectie/RP-P-OB-84.508](https://www.rijksmuseum.nl/nl/collectie/RP-P-OB-84.508) | | type | The type of the item | IMAGE | | guid | URL of the item page in Europeana | `http://www.europeana.eu/portal/record/90402/RP_P_OB_84_508.html` | It is outside of the scope of this tutorial to explain the data model of Europeana (Europeana Data Model: EDM), but a short explanation would be handy, because all records are based on it. It consists of different descriptions (i.e. 
metadata) about cultural heritage items, including: -- [Dublin Core](http://dublincore.org/documents/dcmi-terms/) metadata to describe a cultural heritage object (stored in museums, libraries and archives). It includes the description of mostly physical aspects of the object such as title (Mona Lisa), creator (Leonardo da Vinci), size (77 cm × 53 cm), date (1503-1517?), place (France), owner (Louvre museum), and type (painting). In the Europeana API, it is often specified with prefix `dc`. +- [Dublin Core](https://dublincore.org/documents/dcmi-terms/) metadata to describe a cultural heritage object (stored in museums, libraries and archives). It includes the description of mostly physical aspects of the object such as title (Mona Lisa), creator (Leonardo da Vinci), size (77 cm × 53 cm), date (1503-1517?), place (France), owner (Louvre museum), and type (painting). In the Europeana API, it is often specified with prefix `dc`. - Metadata about digital versions of the physical object. It may include URLs where user can view the object (both at the Europeana website and external website), digital formats (jpg), and licensing information ([Creative Commons](https://en.wikipedia.org/wiki/Creative_Commons)). @@ -211,7 +211,7 @@ If you use Skype XAMPP may not work as Skype may use the same port (80 and 443). {% include figure.html filename="website-api4.jpg" caption="Click Start button for Apache LModule, and it is started (User interface may look a bit different depending on your OS)" %} -{% include figure.html filename="website-api5.jpg" caption="Go to [http://localhost/dashboard](http://localhost/dashboard) in your browser to see if Apache is working" %} +{% include figure.html filename="website-api5.jpg" caption="Go to [http://localhost/dashboard](http://localhost/dashboard) in your browser to see if Apache is working" %} If you see the screens like above, everything should be OK.
Go to the installation folder, you will find an "htdocs" folder (for Mac OSX, /Applications/XAMPP/xamppfiles/htdocs). I suggest creating a shortcut on the desktop. We must use this folder to put all the necessary files to create our website, so it is best if it is conveniently located. Right now there are only default files in this folder that XAMPP has prepared for us, so let’s create a brand new PHP file. Inside the "htdocs" folder, create a new text file using your text editor and save it as `helloworld.php`. @@ -225,7 +225,7 @@ print 'Hello World'; ?> ``` -Open your web browser and type [http://localhost/helloworld.php](http://localhost/helloworld.php) in the address bar. When working on PHP code, I suggest keeping the browser open to the web page you are editing, so as soon as you save the file, you can see the outcome. +Open your web browser and type [http://localhost/helloworld.php](http://localhost/helloworld.php) in the address bar. When working on PHP code, I suggest keeping the browser open to the web page you are editing, so as soon as you save the file, you can see the outcome. You should see "Hello World" on a white background in your browser window. Congratulations. You have just made your first PHP program. PHP code should always start with ``. Just like JSON, those lines declare that the file is PHP. `print` means display the following code `'Hello World'` as text. In PHP, you can use either `''` or `""` (single or double quotes) to indicate that the data type is a [string](https://en.wikipedia.org/wiki/String_(computer_science)) (text) (There are [other data types](https://www.w3schools.com/pHp/php_datatypes.asp) such as integer, Boolean, or array, but let’s focus on strings for now). @@ -514,13 +514,13 @@ The point is the API template can be reused and customized, therefore, the most If you can learn a bit of programming, you are no longer restricted by what a website offers by default.
You are now free to build your own tool or system, for example, to select, filter, compare, process, analyse, visualise, and share data in new ways. So, what are you waiting for? Be brave and start your new project. ## Useful APIs -- [The New York Times](http://developer.nytimes.com/) +- [The New York Times](https://developer.nytimes.com/) - [The Digital Public Library of America](https://pro.dp.la/developers/api-codex) - [VIAF](https://www.oclc.org/developer/api/oclc-apis/viaf.en.html) -- [GeoNames](http://www.geonames.org/export/web-services.html) +- [GeoNames](https://www.geonames.org/export/web-services.html) - [Wikipedia](https://www.mediawiki.org/wiki/API:Main_page) - [The Open Library](https://openlibrary.org/developers/api) -- [List of useful APIs for museums](http://museum-api.pbworks.com/w/page/21933420/Museum%C2%A0APIs) +- [List of useful APIs for museums](https://museum-api.pbworks.com/w/page/21933420/Museum%C2%A0APIs) ## Author's Project Using APIs - [James Cook Dynamic Journal (JCDJ)](https://web.archive.org/web/20210414011922/https://jcdj.acdh-dev.oeaw.ac.at/)...Contextualisation of a book from The Open Library diff --git a/en/lessons/introduction-to-stylometry-with-python.md b/en/lessons/introduction-to-stylometry-with-python.md index 228f83276f..d5274971e3 100755 --- a/en/lessons/introduction-to-stylometry-with-python.md +++ b/en/lessons/introduction-to-stylometry-with-python.md @@ -18,7 +18,7 @@ activity: analyzing topics: [distant-reading] review-ticket: https://github.com/programminghistorian/ph-submissions/issues/147 abstract: "In this lesson you will learn to conduct 'stylometric analysis' on texts and determine authorship of disputed texts. The lesson covers three methods: Mendenhall's Characteristic Curves of Composition, Kilgariff's Chi-Squared Method, and John Burrows' Delta Method." 
-redirect_from: /lessons/introduction-to-stylometry-with-python +redirect_from: /lessons/introduction-to-stylometry-with-python/ avatar_alt: A woman reading next to a painting doi: 10.46430/phen0078 --- @@ -53,7 +53,7 @@ Please note that the code in this lesson has been designed to run sequentially. ## Prior Reading -If you do not have experience with the Python programming language or are finding examples in this tutorial difficult, the author recommends you read the lessons on [Working with Text Files in Python](/lessons/working-with-text-files) and [Manipulating Strings in Python](/lessons/manipulating-strings-in-python). Please note, that those lessons were written in Python version 2 whereas this one uses Python version 3. The differences in [syntax](https://en.wikipedia.org/wiki/Syntax) between the two versions of the language can be subtle. If you are confused at any time, follow the examples as written in this lesson and use the other lessons as background material. (More precisely, the code in this tutorial was written using [Python 3.6.4](https://www.python.org/downloads/release/python-364/); the [f-string construct](https://docs.python.org/3/whatsnew/3.6.html#pep-498-formatted-string-literals) in the line `with open(f'data/federalist_{filename}.txt', 'r') as f:`, for example, requires Python 3.6 or a more recent version of the language.) +If you do not have experience with the Python programming language or are finding examples in this tutorial difficult, the author recommends you read the lessons on [Working with Text Files in Python](/en/lessons/working-with-text-files) and [Manipulating Strings in Python](/en/lessons/manipulating-strings-in-python). Please note, that those lessons were written in Python version 2 whereas this one uses Python version 3. The differences in [syntax](https://en.wikipedia.org/wiki/Syntax) between the two versions of the language can be subtle. 
If you are confused at any time, follow the examples as written in this lesson and use the other lessons as background material. (More precisely, the code in this tutorial was written using [Python 3.6.4](https://www.python.org/downloads/release/python-364/); the [f-string construct](https://docs.python.org/3/whatsnew/3.6.html#pep-498-formatted-string-literals) in the line `with open(f'data/federalist_{filename}.txt', 'r') as f:`, for example, requires Python 3.6 or a more recent version of the language.) ## Required materials @@ -61,7 +61,7 @@ This tutorial uses both datasets and software that you will have to download and ### The Dataset ### -To work through this lesson, you will need to download and unzip the archive of the _Federalist Papers_ ([.zip](/assets/introduction-to-stylometry-with-python/stylometry-federalist.zip)) containing the 85 documents that we will use for our analysis. The archive also contains the [original Project Gutenberg ebook](http://www.gutenberg.org/cache/epub/1404/pg1404.txt) version of the _Federalist Papers_ from which these 85 documents have been extracted. When you unzip the archive, it will create a [directory](https://en.wikipedia.org/wiki/Directory_(computing)) called `data` in your current [working directory](https://en.wikipedia.org/wiki/Working_directory). Make sure that you stay in this current working directory and that you save all work here while completing the lesson. +To work through this lesson, you will need to download and unzip the archive of the _Federalist Papers_ ([.zip](/assets/introduction-to-stylometry-with-python/stylometry-federalist.zip)) containing the 85 documents that we will use for our analysis. The archive also contains the [original Project Gutenberg ebook](https://www.gutenberg.org/cache/epub/1404/pg1404.txt) version of the _Federalist Papers_ from which these 85 documents have been extracted. 
When you unzip the archive, it will create a [directory](https://en.wikipedia.org/wiki/Directory_(computing)) called `data` in your current [working directory](https://en.wikipedia.org/wiki/Working_directory). Make sure that you stay in this current working directory and that you save all work here while completing the lesson. ### The Software ### @@ -71,15 +71,15 @@ This lesson uses the following Python language versions and [libraries](https:// * [nltk](https://www.nltk.org/) - Natural Language Toolkit, usually abbreviated `nltk`. * [matplotlib](https://matplotlib.org/) -Some of these modules may not be pre-installed on your computer. Should you encounter error messages such as: "Module not found" or similar, you will have to download and install the missing module(s). This is easiest to accomplish using the `pip` command. Full details are available via the *Programming Historian* lesson on [Installing Python modules with pip](/lessons/installing-python-modules-pip). +Some of these modules may not be pre-installed on your computer. Should you encounter error messages such as: "Module not found" or similar, you will have to download and install the missing module(s). This is easiest to accomplish using the `pip` command. Full details are available via the *Programming Historian* lesson on [Installing Python modules with pip](/en/lessons/installing-python-modules-pip). ## Some Notes about Language Independence -This tutorial applies stylometric analysis to a set of English-language texts using a Python library called `nltk`. Much of the functionality provided by the `nltk` works with other languages. As long as a language provides a clear way to distinguish word boundaries within a word, `nltk` should perform well. Languages such as Chinese for which there is no clear distinction between word boundaries may be problematic. 
I have used `nltk` with French texts without any trouble; other languages that use [diacritics](https://en.wikipedia.org/wiki/Diacritic), such as Spanish and German, should also work well with `nltk`. Please refer to [nltk's documentation](http://www.nltk.org/book/) for details. +This tutorial applies stylometric analysis to a set of English-language texts using a Python library called `nltk`. Much of the functionality provided by the `nltk` works with other languages. As long as a language provides a clear way to distinguish word boundaries within a word, `nltk` should perform well. Languages such as Chinese for which there is no clear distinction between word boundaries may be problematic. I have used `nltk` with French texts without any trouble; other languages that use [diacritics](https://en.wikipedia.org/wiki/Diacritic), such as Spanish and German, should also work well with `nltk`. Please refer to [nltk's documentation](https://www.nltk.org/book/) for details. Only one of the tasks in this tutorial requires language-dependent code. To divide a text into a set of French or Spanish words, you will need to specify the appropriate language as a parameter to `nltk`'s [tokenizer](https://en.wikipedia.org/wiki/Lexical_analysis#Tokenization), which uses English as the default. This will be explained in the tutorial. -Finally, note that some linguistic tasks, such as [part-of-speech tagging](https://en.wikipedia.org/wiki/Part-of-speech_tagging), may not be supported by `nltk` in languages other than English. This tutorial does not cover part-of-speech tagging. Should you need it for your own projects, please refer to the [nltk documentation](http://www.nltk.org/book/) for advice. +Finally, note that some linguistic tasks, such as [part-of-speech tagging](https://en.wikipedia.org/wiki/Part-of-speech_tagging), may not be supported by `nltk` in languages other than English. This tutorial does not cover part-of-speech tagging. 
Should you need it for your own projects, please refer to the [nltk documentation](https://www.nltk.org/book/) for advice. # The *Federalist Papers* - Historical Context @@ -146,7 +146,7 @@ papers = { Python dictionaries are very flexible. For example, we can access a value by *indexing* the dictionary with one of its keys, we can scan the entire dictionary by looping over its list of keys, etc. We will make ample use of this functionality as we move along. -Next, as we are interested in each author's vocabulary, we will define a short Python [function](https://en.wikipedia.org/wiki/Subroutine) that creates a long listing of the words in each of the papers assigned to a single author. This will be stored as a [string](https://en.wikipedia.org/wiki/String_(computer_science)). Open your chosen Python development environment. If you do not know how to do this, you should read Setting up an Integrated Development Environment ([Mac](/lessons/mac-installation)), ([Linux](/lessons/linux-installation)), ([Windows](/lessons/windows-installation)) before continuing. +Next, as we are interested in each author's vocabulary, we will define a short Python [function](https://en.wikipedia.org/wiki/Subroutine) that creates a long listing of the words in each of the papers assigned to a single author. This will be stored as a [string](https://en.wikipedia.org/wiki/String_(computer_science)). Open your chosen Python development environment. If you do not know how to do this, you should read Setting up an Integrated Development Environment ([Mac](/en/lessons/mac-installation)), ([Linux](/en/lessons/linux-installation)), ([Windows](/en/lessons/windows-installation)) before continuing. 
```python # A function that compiles all of the text files associated with a single author into a single string @@ -214,7 +214,7 @@ federalist_by_author_length_distributions[author] = nltk.FreqDist(token_lengths) federalist_by_author_length_distributions[author].plot(15,title=author) ``` -The '%matplotlib inline' declaration below 'import nltk' is required if your development environment is a [Jupyter Notebook](http://jupyter.org/), as it was for me while writing this tutorial; otherwise you may not see the graphs on your screen. If you work in [Jupyter Lab](http://jupyterlab.readthedocs.io/en/stable/getting_started/installation.html), please replace this clause with '%matplotlib ipympl'. +The '%matplotlib inline' declaration below 'import nltk' is required if your development environment is a [Jupyter Notebook](https://jupyter.org/), as it was for me while writing this tutorial; otherwise you may not see the graphs on your screen. If you work in [Jupyter Lab](https://jupyterlab.readthedocs.io/en/stable/getting_started/installation.html), please replace this clause with '%matplotlib ipympl'. The first line in the code snippet above loads the *Natural Language Toolkit module (nltk)*, which contains an enormous number of useful functions and resources for text processing. We will barely touch its basics in this lesson; if you decide to explore text analysis in Python further, I strongly recommend that you start with [nltk's documentation](https://www.nltk.org/). @@ -332,7 +332,7 @@ However, chi-squared is still a coarse method. For one thing, words that appear In some languages, it may be useful to apply parts-of-speech tagging to the word tokens before counting them, so that the same word used as two different parts of speech may count as two different features. For example, in French, very common words like "la" and "le" serve both as articles (in which case they would translate into English as "the") and as pronouns ("it"). 
This lesson does not use part-of-speech tagging because it is rarely useful for stylometric analysis in contemporary English and because `nltk`'s default tagger does not support other languages very well. -Should you need to apply part-of-speech tagging to your own data, you may be able to download taggers for other languages, to work with a third-party tool like [Tree Tagger](http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/), or even to train your own tagger, but these techniques are far beyond the scope of the current lesson. +Should you need to apply part-of-speech tagging to your own data, you may be able to download taggers for other languages, to work with a third-party tool like [Tree Tagger](https://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/), or even to train your own tagger, but these techniques are far beyond the scope of the current lesson. # Third Stylometric Test: John Burrows' Delta Method (Advanced) @@ -622,7 +622,7 @@ Thanks to Stéfan Sinclair and Andrew Piper, in whose seminars at McGill Univers [^17]: Stefan Evert et al., "Understanding and explaining Delta measures for authorship attribution", _Digital Scholarship in the Humanities_, vol. 32, no. suppl_2 (2017), pp. ii4-ii16. -[^18]: José Calvo Tello, “Entendiendo Delta desde las Humanidades,” _Caracteres_, May 27 2016, http://revistacaracteres.net/revista/vol5n1mayo2016/entendiendo-delta/. +[^18]: José Calvo Tello, “Entendiendo Delta desde las Humanidades,” _Caracteres_, May 27 2016, https://revistacaracteres.net/revista/vol5n1mayo2016/entendiendo-delta/. [^19]: Javier de la Rosa and Juan Luis Suárez, “The Life of Lazarillo de Tormes and of His Machine Learning Adversities,” _Lemir_, vol. 20 (2016), pp. 373-438. 
diff --git a/en/lessons/json-and-jq.md b/en/lessons/json-and-jq.md index 578d8588ce..12b52632be 100755 --- a/en/lessons/json-and-jq.md +++ b/en/lessons/json-and-jq.md @@ -14,7 +14,7 @@ review-ticket: https://github.com/programminghistorian/ph-submissions/issues/23 activity: transforming topics: [data-manipulation] abstract: "Working with data from an art museum API and from the Twitter API, this lesson teaches how to use the command-line utility _jq_ to filter and parse complex JSON files into flat CSV files." -redirect_from: /lessons/json-and-jq +redirect_from: /lessons/json-and-jq/ avatar_alt: A grid-like device for drawing lines doi: 10.46430/phen0055 --- @@ -33,7 +33,7 @@ Access to Twitter’s API has recently changed. The Free Tier no longer allows u JSON (JavaScript Object Notation) is a common data sharing format that can describe complex relationships. Many libraries, archives, museums, and social media sites expose their data through JSON-based APIs. -(On accessing APIs, see [downloading structured data with wget](/lessons/applied-archival-downloading-with-wget) and the [series of lessons on working with APIs](/lessons/intro-to-the-zotero-api).) +(On accessing APIs, see [downloading structured data with wget](/en/lessons/applied-archival-downloading-with-wget) and the [series of lessons on working with APIs](/en/lessons/retired/intro-to-the-zotero-api).) However, many tools for data analysis and visualization require input in flat tables (i.e. CSV), and because JSON is such a flexible data format, often with many nested levels of data, there is no one-size-fits-all graphical user interface for transforming JSON into other formats. @@ -44,7 +44,7 @@ By the end of the lesson, you will understand how to combine basic operators to ## What is JSON? 
-[You may find a short and cogent primer on JSON here.](http://www.json.org/) +[You may find a short and cogent primer on JSON here.](https://www.json.org/) In brief, a JSON **object** is a series of key/value pairs, where **keys** are the names for the **values** they are paired with. For example, the tiny JSON object: @@ -219,7 +219,7 @@ If you want to access just the first (or the _n_-th) item in an array, put a dig ``` **IMPORTANT: you access the first element of an array with `0`, not `1`.** -This is because JavaScript, like quite a few other programming languages ([though not all!](http://stackoverflow.com/questions/3135325/why-do-vector-indices-in-r-start-with-1-instead-of-0)), [starts counting at 0](http://skillcrush.com/2013/01/17/why-programmers-start-counting-at-zero/). +This is because JavaScript, like quite a few other programming languages ([though not all!](https://stackoverflow.com/questions/3135325/why-do-vector-indices-in-r-start-with-1-instead-of-0)), [starts counting at 0](https://skillcrush.com/2013/01/17/why-programmers-start-counting-at-zero/). This filter returns just the first element of the `artObjects` array. `.artObjects[1]` would return the second, and so on. @@ -269,7 +269,7 @@ Let's break down this query into its component pieces: 3. `.id` This final command accesses the value stored in the key `id` in the two objects that make it through the `select()` filter. jq can also filter based on regular expressions. -(To learn more about regular expressions, see the Programming Historian lesson ["Understanding Regular Expressions"](/lessons/understanding-regular-expressions).) +(To learn more about regular expressions, see the Programming Historian lesson ["Understanding Regular Expressions"](/en/lessons/understanding-regular-expressions).) For example, let's select only those objects whose primary maker has the particle "van" in their name, and return the artist name and artwork id. 
`test("van")` takes the value returned by the operator `.principalOrFirstMaker` and returns true if that value contains the string `van`: @@ -921,9 +921,9 @@ For fast processing of very large files, or of JSON lines spread across multiple ### Installation on OS X -The easiest way to install jq on OS X is to use the package management system [Homebrew](http://brew.sh/). +The easiest way to install jq on OS X is to use the package management system [Homebrew](https://brew.sh/). This system works via OS X's "Terminal" application, which gives you access to the Bash command line. -[For an introduction to this system, see The Programming Historian's "Introduction to the Bash Command Line".](/lessons/intro-to-bash) +[For an introduction to this system, see The Programming Historian's "Introduction to the Bash Command Line".](/en/lessons/intro-to-bash) Follow the installation instructions for Homebrew itself, and then use this command to install jq: @@ -934,7 +934,7 @@ brew install jq ### Installation on Windows To access the command line easily on Windows, you will need the PowerShell application. -[See the Programming Historian's "Introduction to PowerShell"](/lessons/intro-to-powershell.html) +[See the Programming Historian's "Introduction to PowerShell"](/en/lessons/intro-to-powershell) From PowerShell, you can install the Windows package manager [Chocolatey](https://chocolatey.org/install), and then install jq with the following command: @@ -956,14 +956,14 @@ jq -r '.artObjects[] | [.id, .title, .principalOrFirstMaker, .webImage.url] | @c Alternatively, you can use bash pipes to send text from the output of one function into jq. This can be useful when downloading JSON with a utility like `wget` for retrieving online material. -(See [Automated Downloading with Wget](/lessons/automated-downloading-with-wget) to learn the basics of this other command line program.) 
+(See [Automated Downloading with Wget](/en/lessons/automated-downloading-with-wget) to learn the basics of this other command line program.) ```sh wget -qO- http://programminghistorian.org/assets/json-and-jq/jq_rkm.json | jq -r '.artObjects[] | [.id, .title, .principalOrFirstMaker, .webImage.url] | @csv' ``` Note that you must use the `wget` flag `-qO-` in order to send the output of `wget` into `jq` by way of a shell pipe. -You can read more about command line pipes in ["Introduction to the Bash Command Line"](/lessons/intro-to-bash) (OS X) or ["Introduction to PowerShell"](/lessons/intro-to-powershell.html) (Windows). +You can read more about command line pipes in ["Introduction to the Bash Command Line"](/en/lessons/intro-to-bash) (OS X) or ["Introduction to PowerShell"](/en/lessons/intro-to-powershell) (Windows). ## Further Resources @@ -974,5 +974,5 @@ If basic counting is all you need to do with your JSON data, then jq can help yo For more involved math, however, it would be more sensible to create table(s) with jq and then continue your analysis in Python, R, or even Excel. If you are working with deeply-nested JSON (that is, many objects within objects), or JSON where objects have inconsistent structure, you may need to use features not covered in this lesson, including [if-then-else statements](https://stedolan.github.io/jq/manual/#if-then-else), [recursion](https://stedolan.github.io/jq/manual/#Recursion), and [reduction](https://stedolan.github.io/jq/manual/#Reduce). -If you can't figure out the filter you need to go from your given input to your desired output, using the tag `jq` over at [StackOverflow](http://stackoverflow.com/questions/tagged/jq) can often get you a speedy answer. -Make sure that you try to [follow best practices when describing your problem](http://stackoverflow.com/help/how-to-ask) and provide a [reproducible example](http://stackoverflow.com/help/mcve). 
+If you can't figure out the filter you need to go from your given input to your desired output, using the tag `jq` over at [StackOverflow](https://stackoverflow.com/questions/tagged/jq) can often get you a speedy answer. +Make sure that you try to [follow best practices when describing your problem](https://stackoverflow.com/help/how-to-ask) and provide a [reproducible example](https://stackoverflow.com/help/mcve). diff --git a/en/lessons/jupyter-notebooks.md b/en/lessons/jupyter-notebooks.md index 1abcff34b8..bc907eff8b 100644 --- a/en/lessons/jupyter-notebooks.md +++ b/en/lessons/jupyter-notebooks.md @@ -134,7 +134,7 @@ Note that this isn't the only way to make files appear in the Jupyter Notebook f ### Creating a new notebook - + Inside the *notebooks* folder, create a new Jupyter notebook to use to convert the dates for your research project. Click the "New" button in the upper right of the Jupyter Notebook file browser interface. If you've just installed Anaconda as described above, your only option will be to create a Jupyter notebook using the Python 3 *kernel* (the backend component that actually runs the code written in the notebook), but we'll discuss below how to add kernels for other languages. Click on "Python 3", and Jupyter Notebook will open a new tab with the interface for Jupyter notebooks themselves. By default, the notebook will be named "Untitled"; you can click on that text at the top of the screen to rename it. {% include figure.html filename="jupyter-createnew.png" caption="Creating a new Jupyter notebook" %} @@ -332,7 +332,7 @@ If you're already using Jupyter notebooks for documenting the workflow for your There are many digital humanities "Intro to Python" courses and workshops that use Jupyter notebooks (including [Introduction à Python et au développement web avec Python pour les sciences humaines](https://github.com/PonteIneptique/cours-python) by Thibault Clérice, translated from material by Matthew Munson). 
Jupyter notebooks are also commonly used in text analysis workshops, such as the [word vectors workshop at DH 2018](https://github.com/sul-cidr/dh2018-word-vector-workshops), taught by Eun Seo Jo, Javier de la Rosa, and Scott Bailey. - + Teaching with Jupyter notebooks doesn't always have to involve the time-consuming process of downloading and installing Anaconda, especially if you're envisioning only having one or two lessons that involve notebooks. If your classroom activities with Jupyter notebooks involve using example data that you've already prepared, and if you've already written at least some of the code, you may want to explore running Jupyter notebooks using free cloud computing resources, as long as your students are guaranteed to have reliable internet connectivity in the classroom. Running notebooks in the cloud also provides a consistent environment for all students, sparing you from having to negotiate differences between Windows and Mac, or provide an alternative for students whose laptops lack the hard drive space or memory to run Anaconda effectively. Because the options are evolving quickly, it's best to use your favorite search engine to find a recent list of cloud computing options for Jupyter Notebooks. One project that has seen particular uptake among academic users of notebooks is [MyBinder](https://mybinder.org/), which will take a GitHub repository that contains Jupyter notebook .ipynb files related data (embedded images, data sets you want to use the notebooks on, etc.), and information about necessary packages and dependencies (in a requirements.txt or environment.yml file), and make it launchable using a cloud server. Once you've had MyBinder package up your GitHub repo, you can add a Binder "badge" to the readme file for the repo. Anyone viewing the repo can launch the notebook directly from their browser, without having to download or install anything. 
@@ -393,7 +393,7 @@ From experimenting with code to documenting workflows, from pedagogy to scholarl [^2]: Millman, KJ and Fernando Perez. 2014. "Developing open source scientific practice". In *Implementing Reproducible Research*, Ed. Victoria Stodden, Friedrich Leisch, and Roger D. Peng. https://osf.io/h9gsd/ -[^3]: Sinclair, Stéfan & Geoffrey Rockwell. 2013. "Voyant Notebooks: Literate Programming and Programming Literacy". Journal of Digital Humanities, Vol. 2, No. 3 Summer 2013. http://journalofdigitalhumanities.org/2-3/voyant-notebooks-literate-programming-and-programming-literacy/ +[^3]: Sinclair, Stéfan & Geoffrey Rockwell. 2013. "Voyant Notebooks: Literate Programming and Programming Literacy". Journal of Digital Humanities, Vol. 2, No. 3 Summer 2013. https://journalofdigitalhumanities.org/2-3/voyant-notebooks-literate-programming-and-programming-literacy/ [^4]: Haley Di Pressi, Stephanie Gorman, Miriam Posner, Raphael Sasayama, and Tori Schmitt, with contributions from Roderic Crooks, Megan Driscoll, Amy Earhart, Spencer Keralis, Tiffany Naiman, and Todd Presner. "A Student Collaborator's Bill of Rights". https://humtech.ucla.edu/news/a-student-collaborators-bill-of-rights/ diff --git a/en/lessons/keywords-in-context-using-n-grams.md b/en/lessons/keywords-in-context-using-n-grams.md index fe9846289a..0b143fd755 100755 --- a/en/lessons/keywords-in-context-using-n-grams.md +++ b/en/lessons/keywords-in-context-using-n-grams.md @@ -17,12 +17,12 @@ activity: presenting topics: [python] abstract: | This lesson takes the frequency pairs collected in "Counting Frequencies" and outputs them in HTML. 
-next: output-keywords-in-context-in-html-file -previous: output-data-as-html-file +next: /en/lessons/output-keywords-in-context-in-html-file +previous: /en/lessons/output-data-as-html-file series_total: 15 lessons sequence: 13 python_warning: false -redirect_from: /lessons/keywords-in-context-using-n-grams +redirect_from: /lessons/keywords-in-context-using-n-grams/ avatar_alt: A figure dropping two bottles of alcohol doi: 10.46430/phen0010 --- @@ -35,8 +35,8 @@ doi: 10.46430/phen0010 ## Lesson Goals -Like in [Output Data as HTML File][], this lesson takes the frequency -pairs collected in [Counting Frequencies][] and outputs them in HTML. +Like in [Output Data as HTML File](/en/lessons/output-data-as-html-file), this lesson takes the frequency +pairs collected in [Counting Frequencies](/en/lessons/counting-frequencies) and outputs them in HTML. This time the focus is on keywords in context (KWIC) which creates n-grams from the original document content – in this case a trial transcript from the *Old Bailey Online*. You can use your program to @@ -46,7 +46,7 @@ easy to see at a glance how the keyword is used. Once the KWICs have been created, they are then wrapped in HTML and sent to the browser where they can be viewed. This reinforces what was -learned in [Output Data as HTML File][1], opting for a slightly +learned in [Output Data as HTML File](/en/lessons/output-data-as-html-file), opting for a slightly different output. At the end of this lesson, you will be able to extract all possible @@ -59,7 +59,7 @@ from the Internet, and display them clearly in your browser window. - `obo.py` If you do not have these files from the previous lesson, you can -download programming-historian-7, a [zip file from the previous lesson][] +download programming-historian-7, a [zip file from the previous lesson](/assets/python-lessons7.zip). ## From Text to N-Grams to KWIC @@ -104,7 +104,7 @@ than strings. 
As you already know, Python can easily turn a string into a list using the `split` operation. Once split it becomes simple to retrieve a subsequence of adjacent words in the list by using a *slice*, represented as two indexes separated by a colon. This was introduced -when working with strings in [Manipulating Strings in Python][]. +when working with strings in [Manipulating Strings in Python](/en/lessons/manipulating-strings-in-python). ``` python message9 = "Hello World" @@ -267,11 +267,4 @@ with the Mac / Linux version you may have to open the `obo.py` file and change "file:///Users/username/Desktop/programming-historian/" to the path to the directory on your own computer. -- python-lessons8.py ([zip sync][]) - - [Output Data as HTML File]: /lessons/output-data-as-html-file - [Counting Frequencies]: /lessons/counting-frequencies - [1]: output-data-as-html-file - [zip file from the previous lesson]: /assets/python-lessons7.zip - [Manipulating Strings in Python]: /lessons/manipulating-strings-in-python - [zip sync]: /assets/python-lessons8.zip +- python-lessons8.py [zip sync](/assets/python-lessons8.zip) diff --git a/en/lessons/linear-regression.md b/en/lessons/linear-regression.md index bf24cc0dd6..baf2b5d0c5 100644 --- a/en/lessons/linear-regression.md +++ b/en/lessons/linear-regression.md @@ -19,7 +19,7 @@ abstract: This lesson is the first of a two-part lesson focusing on an indispens mathjax: true avatar_alt: a washburn grinder doi: 10.46430/phen0099 -next: logistic-regression +next: /en/lessons/logistic-regression series_total: 2 lessons sequence: 1 slug: linear-regression @@ -75,7 +75,7 @@ The central goals of these two lessons are: ## Before You Begin - Install the Python 3 version of Anaconda. Installing Anaconda is covered in [Text Mining in Python through the HTRC Feature Reader](/en/lessons/text-mining-with-extracted-features). 
This will install Python 3.7.3 (or higher), the [Scikit-Learn library](https://scikit-learn.org/stable/install.html), [the Pandas library](https://pandas.pydata.org/docs/), the [matplotlib](https://matplotlib.org/) and [seaborn](https://seaborn.pydata.org/) libraries used to generate visualizations, and all the dependencies needed to run a [Jupyter Notebook](https://jupyter.org/). -- It is possible to install all these dependencies without Anaconda (or with a lightweight alternative like [Miniconda](https://docs.conda.io/en/latest/miniconda.html)). For more information, see the section below titled [Alternatives to Anaconda](#alternatives-to-anaconda) +- It is possible to install all these dependencies without Anaconda (or with a lightweight alternative like [Miniconda](https://docs.conda.io/en/latest/miniconda.html)). For more information, see part two's section titled [Alternatives to Anaconda](/en/lessons/logistic-regression#alternatives-to-anaconda). ## Lesson Dataset @@ -699,7 +699,7 @@ Now move on to [Logistic Regression analysis with scikit-learn](/en/lessons/logi [^9]: Ibid. -[^10]: The University of Texas at Austin. _Statistics Online Support: Variable Types_, [http://sites.utexas.edu/sos/variables/](https://perma.cc/GN36-BCPD). +[^10]: The University of Texas at Austin. _Statistics Online Support: Variable Types_, [https://sites.utexas.edu/sos/variables/](https://perma.cc/GN36-BCPD). [^11]: Jarausch, Konrad H., and Kenneth A. Hardy. _Quantitative Methods for Historians: A Guide to Research, Data, and Statistics_. 1991. UNC Press Books, 2016: 122. 
diff --git a/en/lessons/linux-installation.md b/en/lessons/linux-installation.md index 5c35fb9be4..f908079e7d 100755 --- a/en/lessons/linux-installation.md +++ b/en/lessons/linux-installation.md @@ -16,7 +16,7 @@ exclude_from_check: activity: transforming topics: [get-ready, python] abstract: "This lesson will help you set up an integrated development environment for Python on a computer running the Linux operating system." -redirect_from: /lessons/linux-installation +redirect_from: /lessons/linux-installation/ avatar_alt: A band with three musicians doi: 10.46430/phen0011 --- diff --git a/en/lessons/logistic-regression.md b/en/lessons/logistic-regression.md index 79f24d014d..9faf33e37d 100644 --- a/en/lessons/logistic-regression.md +++ b/en/lessons/logistic-regression.md @@ -19,7 +19,7 @@ abstract: This lesson is the second in a two-part lesson focusing on regression mathjax: true avatar_alt: a printing press and folding machine doi: 10.46430/phen0100 -previous: linear-regression +previous: /en/lessons/linear-regression series_total: 2 lessons sequence: 2 slug: logistic-regression @@ -741,7 +741,7 @@ If you are not using Anaconda, you will need to cover the following dependencies 1. Install Python 3 (preferably Python 3.7 or later) 2. Recommended: install and run a virtual environment -3. Install the [scikit-learn library](http://scikit-learn.org/stable/install.html) and its dependencies +3. Install the [scikit-learn library](https://scikit-learn.org/stable/install.html) and its dependencies 4. Install [the Pandas library](https://pandas.pydata.org/docs/) 5. Install the [matplotlib](https://matplotlib.org/) and [seaborn](https://seaborn.pydata.org/) libraries 6. 
Install [Jupyter Notebook](https://jupyter.org/) and its dependencies diff --git a/en/lessons/mac-installation.md b/en/lessons/mac-installation.md index 6fe0537ff5..852b4d827f 100755 --- a/en/lessons/mac-installation.md +++ b/en/lessons/mac-installation.md @@ -17,7 +17,7 @@ exclude_from_check: activity: transforming topics: [get-ready, python] abstract: "This lesson will help you set up an integrated development environment for Python on a computer running a Mac operating system." -redirect_from: /lessons/mac-installation +redirect_from: /lessons/mac-installation/ avatar_alt: A band with three musicians doi: 10.46430/phen0012 --- @@ -142,9 +142,9 @@ Now that you and your computer are up and running, we can move onto some more interesting tasks. If you are working through the Python lessons in order, we suggest you next try '[Understanding Web Pages and HTML][].' - [Time Machine]: http://support.apple.com/kb/ht1427 - [Python website]: http://www.python.org/ - [Beautiful Soup]: http://www.crummy.com/software/BeautifulSoup/ + [Time Machine]: https://support.apple.com/en-gb/104984 + [Python website]: https://www.python.org/ + [Beautiful Soup]: https://www.crummy.com/software/BeautifulSoup/ [other text editing options]: https://wiki.python.org/python/PythonEditors [BBEdit]: https://www.barebones.com/products/bbedit/ [Sublime Text website]: https://www.sublimetext.com/download diff --git a/en/lessons/manipulating-strings-in-python.md b/en/lessons/manipulating-strings-in-python.md index 1c376eb149..19e25f933a 100755 --- a/en/lessons/manipulating-strings-in-python.md +++ b/en/lessons/manipulating-strings-in-python.md @@ -15,12 +15,12 @@ exclude_from_check: activity: transforming topics: [python] abstract: "This lesson is a brief introduction to string manipulation techniques in Python." 
-next: from-html-to-list-of-words-1 -previous: working-with-web-pages +next: /en/lessons/from-html-to-list-of-words-1 +previous: /en/lessons/working-with-web-pages series_total: 15 lessons sequence: 6 python_warning: false -redirect_from: /lessons/manipulating-strings-in-python +redirect_from: /lessons/manipulating-strings-in-python/ avatar_alt: A man playing a guitar doi: 10.46430/phen0013 --- diff --git a/en/lessons/mapping-with-python-leaflet.md b/en/lessons/mapping-with-python-leaflet.md index f93bdfcbee..e538cdc666 100755 --- a/en/lessons/mapping-with-python-leaflet.md +++ b/en/lessons/mapping-with-python-leaflet.md @@ -14,7 +14,7 @@ activity: presenting topics: [mapping] abstract: "This tutorial teaches users how to create a web map based on tabular data." layout: lesson -redirect_from: /lessons/mapping-with-python-leaflet +redirect_from: /lessons/mapping-with-python-leaflet/ avatar_alt: Map of a mountainous terrain doi: 10.46430/phen0070 --- @@ -34,18 +34,18 @@ In this lesson, you will learn how to create a web map based on that data. By t This lesson uses: -- [python](/lessons/?topic=python) ([pip](http://pip.readthedocs.org/en/stable/), [geopy](https://github.com/geopy/geopy), [pandas](http://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe)) -- [leaflet](http://leafletjs.com/) -- [geojson.io (from mapbox)](http://geojson.io/) +- [python](/en/lessons/?topic=python) ([pip](https://pip.readthedocs.org/en/stable/), [geopy](https://github.com/geopy/geopy), [pandas](https://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe)) +- [leaflet](https://leafletjs.com/) +- [geojson.io (from mapbox)](https://geojson.io/) - [javascript](https://www.javascript.com/) and [jquery](https://jquery.com/) Optional: If you wish to follow along with pre-made scripts you can find them in [this directory](https://github.com/programminghistorian/jekyll/tree/gh-pages/assets/mapping-with-python-leaflet). To set up your working environment: 1. 
Create a directory for this project where you will keep all of your scripts and files that you will work from -2. If you have a text editor where you can work from the directory of your project, import that directory. You can use editors such as [TextWrangler](http://www.barebones.com/products/textwrangler/) for OS X, [Notepad++](https://notepad-plus-plus.org/) for Windows, or [Sublime Text](http://www.sublimetext.com/). +2. If you have a text editor where you can work from the directory of your project, import that directory. You can use editors such as [TextWrangler](https://www.barebones.com/products/textwrangler/) for OS X, [Notepad++](https://notepad-plus-plus.org/) for Windows, or [Sublime Text](https://www.sublimetext.com/). If you are using a code editor such as Sublime Text, to import the folder you could drag and drop the folder that you want to work from into your editor window. Once you've done that, the directory will appear on the left hand sidebar as you root folder. If you click on your folder, you'll be able to see the contents of your folder. Importing a folder allows you to easily work with the files in your project. If you need to work with multiple files and directories in directories, this will make it easier to search through these files, switch between them while you're working and keep you organized. -3. (Optional) It is recommended to use a [Python virtual environment](http://docs.python-guide.org/en/latest/dev/virtualenvs/) to store the dependencies and versions required for your specific project. +3. (Optional) It is recommended to use a [Python virtual environment](https://docs.python-guide.org/en/latest/dev/virtualenvs/) to store the dependencies and versions required for your specific project. 
### Getting Data: Download the CSV @@ -55,7 +55,7 @@ We're going to start with a plain comma-separated values (CSV) data file and cre ```curl -O https://programminghistorian.org/assets/mapping-with-python-leaflet/census.csv``` -The original source of this data is from the [Greater London Authority London Datastore](http://data.london.gov.uk/dataset/historic-census-population). +The original source of this data is from the [Greater London Authority London Datastore](https://data.london.gov.uk/dataset/historic-census-population). ## Geocoding with Python @@ -71,19 +71,19 @@ To clarify, we need to figure out how to gather coordinates for a location for e There's a simple way to do this: you can look up a coordinate online in Google Maps and put each coordinate in your spreadsheet manually. But, if you had 5,000 points the task becomes a little bit more daunting. If you're faced with a repetitive task, it might be worthwhile to approach it programmatically. -If you're familiar with _Programming Historian_, you might have already noticed that there are many [lessons available on how to use Python](/lessons/?topic=python). Python is a great beginner programming language because it is easy to read and happens to be used a lot in GIS applications to optimize workflows. One of the biggest advantages to Python is the impressive amount of libraries which act like pluggable tools to use for many different tasks. Knowing that this is a good programmatic approach, we're now going to build a Python script that will automate geocode every address for us. +If you're familiar with _Programming Historian_, you might have already noticed that there are many [lessons available on how to use Python](/en/lessons/?topic=python). Python is a great beginner programming language because it is easy to read and happens to be used a lot in GIS applications to optimize workflows. 
One of the biggest advantages to Python is the impressive amount of libraries which act like pluggable tools to use for many different tasks. Knowing that this is a good programmatic approach, we're now going to build a Python script that will automate geocode every address for us. [Geopy](https://github.com/geopy/geopy) is a Python library that gives you access to the various geocoding APIs. Geopy makes it easy for Python developers to locate the coordinates of addresses, cities, countries, and landmarks across the globe using third-party geocoders and other data sources. Geopy includes geocoders built by OpenStreetMap Nominatim, ESRI ArcGIS, Google Geocoding API (V3), Baidu Maps, Bing Maps API, Yahoo! PlaceFinder, Yandex, IGN France, GeoNames, NaviData, OpenMapQuest, What3Words, OpenCage, SmartyStreets, geocoder.us, and GeocodeFarm geocoder services. -[Pandas](http://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe) is another python library that we will use. It's very popular library amongst scientists and mathematicians to manipulate and analyse data. +[Pandas](https://pandas.pydata.org/pandas-docs/stable/dsintro.html#dataframe) is another python library that we will use. It's very popular library amongst scientists and mathematicians to manipulate and analyse data. -Finally, [Pip](http://pip.readthedocs.org/en/stable/) is a very useful package manager to help you install things like Geopy and Pandas! If you've [already installed Python](/lessons/introduction-and-installation) and [installed pip](/lessons/installing-python-modules-pip), type ```pip list``` to see if you already have the geopy and pandas packages installed. If you do not have pip installed, you can download [get-pip.py](https://bootstrap.pypa.io/get-pip.py), then from your command line go to the directory where get-pip.py is located and run +Finally, [Pip](https://pip.readthedocs.org/en/stable/) is a very useful package manager to help you install things like Geopy and Pandas! 
If you've [already installed Python](/en/lessons/introduction-and-installation) and [installed pip](/en/lessons/installing-python-modules-pip), type ```pip list``` to see if you already have the geopy and pandas packages installed. If you do not have pip installed, you can download [get-pip.py](https://bootstrap.pypa.io/get-pip.py), then from your command line go to the directory where get-pip.py is located and run ```python get-pip.py ``` -For the most up to date instructions, you can visit [pip's installation manual](http://pip.readthedocs.org/en/stable/installing/). +For the most up to date instructions, you can visit [pip's installation manual](https://pip.readthedocs.org/en/stable/installing/). -To install Geopy and Pandas, open your [command line (using this lesson as a guideline if necessary)](/lessons/intro-to-bash) and install the Geopy and Pandas libraries: +To install Geopy and Pandas, open your [command line (using this lesson as a guideline if necessary)](/en/lessons/intro-to-bash) and install the Geopy and Pandas libraries: On OS X or Linux, the following commands will install the necessary packages: @@ -94,7 +94,7 @@ pip install pytz pip install geopy pip install pandas ``` -Note: We are installing numpy, python-dateutil, and pytz because pandas [requires them](http://pandas.pydata.org/pandas-docs/stable/install.html#dependencies). +Note: We are installing numpy, python-dateutil, and pytz because pandas [requires them](https://pandas.pydata.org/pandas-docs/stable/install.html#dependencies). For Windows, you may need to install [Microsoft Visual C++ Compiler for Python](https://wiki.python.org/moin/WindowsCompilers). Set the environmental variables to recognize python and pip from the command line: @@ -136,7 +136,7 @@ def main(): We are first using pandas' pre-existing read_csv() function to open the CSV file. We pass the filepath to our data file in the first parameter, 'census-historic-population-borough.csv'. 
If it was in a folder called 'data', you would put 'data/census-historic-population-borough.csv'. The second parameter, ```index_col=None```, will number the rows to generate the index without using any column. If we use ```index_col=0```, it indexes the first column in your data as the row name. The third parameter, ```header=0```, indicates that there is a header row, which is the first line of the spreadsheet (Note: Python uses "0" instead of "1" to indicate the first value in an index). The fourth parameter ```sep=","``` is where you indicate delimiter symbol that is used to split data into fields. Since are using a comma separated values data format, we need to indicate that we are using a comma to split our data. -There are many other parameters you can use. A full list is available in the pandas documentation on the [read_csv() function](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html). +There are many other parameters you can use. A full list is available in the pandas documentation on the [read_csv() function](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html). Next, we anticipate that when we geocode the csv we will get points in the format of (latitude, longitude). If we only want the latitude value of the point in a csv column, we will define a function to isolate that value. The same can be done for our longitude value. @@ -164,7 +164,7 @@ Next, select the geolocator you want to use. Here we're creating two geolocator | request limit | 1 request/s or timeout | | performance test on census data | 33.5s | -You can also choose a different geolocator from the list found in [the geopy documentation](http://geopy.readthedocs.org/). GoogleV3 is a geocoder compatible with geopy, it is a reliable geolocator choice because of their large geographic data coverage. However, since July 2018 an API key is required, and you need to enable billing in Google Cloud to use it. 
For more information about choosing geolocators, you can follow the discussion in the [geopy repository on Github](https://github.com/geopy/geopy/issues/90). +You can also choose a different geolocator from the list found in [the geopy documentation](https://geopy.readthedocs.org/). GoogleV3 is a geocoder compatible with geopy, it is a reliable geolocator choice because of their large geographic data coverage. However, since July 2018 an API key is required, and you need to enable billing in Google Cloud to use it. For more information about choosing geolocators, you can follow the discussion in the [geopy repository on Github](https://github.com/geopy/geopy/issues/90). To use a geolocator, import them and assign a variable name (in this case we use the name geolocator): @@ -186,7 +186,7 @@ def main(): geolocator = Nominatim() ``` -Finally, using pandas you want to create a column in your spreadsheet called 'latitude'. The script will read the existing 'Area_Name' data column, run the geopy [geolocator](http://geopy.readthedocs.io/en/latest/#module-geopy.geocoders) on the column using pandas' [apply function](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.apply.html), and generate a latitude coordinate in that column. The same transformation will occur in the 'longitude' column. Once this is finished it will output a new CSV file with those two columns: +Finally, using pandas you want to create a column in your spreadsheet called 'latitude'. The script will read the existing 'Area_Name' data column, run the geopy [geolocator](https://geopy.readthedocs.io/en/latest/#module-geopy.geocoders) on the column using pandas' [apply function](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.apply.html), and generate a latitude coordinate in that column. The same transformation will occur in the 'longitude' column. 
Once this is finished it will output a new CSV file with those two columns: ```python import geopy @@ -300,7 +300,7 @@ To address the timeout error, you could add the parameter ```timeout```, which s Now that you have a spreadsheet full of coordinate data, we can convert the CSV spreadsheet into a format that web maps like, like GeoJSON. GeoJSON is a web mapping standard of JSON data. There are a couple of ways to make GeoJSON: -The easiest, recommended way is to use a UI tool developed by Mapbox called [geojson.io](http://geojson.io). All you have to do is click and drag your csv file into the data window (the right side of the screen, next to the map), and it will automatically format your data into GeoJSON for you. You can select the 'GeoJSON' option under 'Save.' Save your GeoJSON file as `census.geojson`. +The easiest, recommended way is to use a UI tool developed by Mapbox called [geojson.io](https://geojson.io). All you have to do is click and drag your csv file into the data window (the right side of the screen, next to the map), and it will automatically format your data into GeoJSON for you. You can select the 'GeoJSON' option under 'Save.' Save your GeoJSON file as `census.geojson`. {% include figure.html filename="webmap-01-geojsonio.gif" caption="Drag and Drop GeoJSON creation!" %} @@ -381,7 +381,7 @@ Which you can now run by using the command: python geocoder-helpercolumn.py census_country.csv ``` -Turn your clean data into GeoJSON by saving it as `census.geojson` and test it out at [geojson.io](http://geojson.io). Remember, drag the new CSV you created (`census_country.csv` into the window to create that beautiful JSON). Do the results look better now? Good! +Turn your clean data into GeoJSON by saving it as `census.geojson` and test it out at [geojson.io](https://geojson.io). Remember, drag the new CSV you created (`census_country.csv` into the window to create that beautiful JSON). Do the results look better now? Good! 
## Using Leaflet to Create a Web Map @@ -397,9 +397,9 @@ SimpleHTTPServer is a Python module. If you want to change the server to port 80 ```python -m SimpleHTTPServer 8080``` or ```python3 -m http.server 8080``` (for Python3) -In your browser go to http://localhost:8080 and you should see the files you've been working with so far. +In your browser go to http://localhost:8080 and you should see the files you've been working with so far. -Now in your text editor open a new document and save it as an html file (mymap.html). If you want to do a quick test, copy and paste the text below, refresh your http://localhost:8080 and open the html file in your browser. +Now in your text editor open a new document and save it as an html file (mymap.html). If you want to do a quick test, copy and paste the text below, refresh your http://localhost:8080 and open the html file in your browser. ```html @@ -655,7 +655,7 @@ window.onload = function () { }; ``` -What we've done here is edit the [onEachFeature function](http://leafletjs.com/SlavaUkraini/reference-1.2.0.html#geojson-oneachfeature), which gets called for each feature (in this case, each marker popup) to add additional information about each marker contained in our `census.geojson` data. +What we've done here is edit the [onEachFeature function](https://leafletjs.com/reference.html#geojson-oneachfeature), which gets called for each feature (in this case, each marker popup) to add additional information about each marker contained in our `census.geojson` data.
To add attribute information from our `census.geojson` file, we use the convention `feature.properties.ATTRIBUTE_NAME` to access the population data. In this case, we are adding `feature.properties.Pop_2001`, `feature.properties.Pop_1981`, and `feature.properties.Pop_1801`, and adding a bit of styling with html for readability. {% include figure.html filename="webmap-06-exercise02.jpg" caption="Exercise 02" %} @@ -826,4 +826,4 @@ Congratulations! You now have some hands-on experience geocoding using common Py If you want to explore other web mapping features with Leaflet, there are a number of additional [plugins](https://leafletjs.com/plugins.html) to try out. Of particular interest may be ability to create [time based visualizations](https://github.com/skeate/Leaflet.timeline) and do [heat-mapping](https://github.com/pa7/heatmap.js). -Also, check out the Programming Historian Lesson [Using Javascript to Create Maps of Correspondence](/lessons/using-javascript-to-create-maps) that goes in depth on how to analyze correspondence using geospatial software, and using some of the same tools as this lesson. +Also, check out the Programming Historian Lesson [Using Javascript to Create Maps of Correspondence](/en/lessons/using-javascript-to-create-maps) that goes in depth on how to analyze correspondence using geospatial software, and using some of the same tools as this lesson. diff --git a/en/lessons/naive-bayesian.md b/en/lessons/naive-bayesian.md index 13536eedc7..c2f6dab904 100755 --- a/en/lessons/naive-bayesian.md +++ b/en/lessons/naive-bayesian.md @@ -14,7 +14,7 @@ exclude_from_check: activity: analyzing topics: [distant-reading] abstract: "This lesson shows how to use machine learning to extract interesting documents out of a digital archive." 
-redirect_from: /lessons/naive-bayesian +redirect_from: /lessons/naive-bayesian/ avatar_alt: A man peers through a geometric tool doi: 10.46430/phen0038 --- @@ -93,7 +93,7 @@ not in the learner code itself).* ## The Old Bailey Digital Archive -The [Old Bailey digital archive](http://www.oldbaileyonline.org/) +The [Old Bailey digital archive](https://www.oldbaileyonline.org/) contains 197,745 criminal trials held at the Old Bailey, aka the Central Criminal Court in London. The trials were held between 1674 and 1913, and since the archive provides the full transcript of each trial, many @@ -1244,7 +1244,7 @@ closely examining the data first). They are also useful in that they can help us determine whether something is skewing our results in a way we don't wish, something we may be able to correct for with different weighting or different selection of features (see the section on -[Tuning](#Tuning) below). +[Tuning](#tuning) below). ### The meanings of misclassification @@ -1460,25 +1460,25 @@ big problem. Happy hunting! 
- [A Naive Bayesian in the Old Bailey]: http://digitalhistoryhacks.blogspot.com/2008/05/naive-bayesian-in-old-bailey-part-1.html - [Old Bailey digital archive]: http://www.oldbaileyonline.org/ - [A zip file of the scripts]: /assets/naive-bayesian/baileycode.zip - [another zip file]: https://doi.org/10.5281/zenodo.13284 - [BeautifulSoup]: http://www.crummy.com/software/BeautifulSoup/ - [search interface]: http://www.oldbaileyonline.org/forms/formMain.jsp - [classification]: http://en.wikipedia.org/wiki/Statistical_classification - [clustering]: http://home.deib.polimi.it/matteucc/Clustering/tutorial_html/ - ["ff0000," the HTML code for red]: http://www.paulgraham.com/spam.html - [an explanation of Bayes' rule and conditional probabilities]: http://www.yudkowsky.net/rational/bayes - [topic modeling]: /lessons/topic-modeling-and-mallet - [logarithms]: http://betterexplained.com/articles/using-logs-in-the-real-world/ - [priors]: http://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_introbayes_sect004.htm - [Introduction to the Bash Command Line]: /lessons/intro-to-bash - [Automated Downloading with wget]: /lessons/automated-downloading-with-wget - [Understanding Regular Expressions]: /lessons/understanding-regular-expressions - [Intro to Beautiful Soup]: /lessons/intro-to-beautiful-soup - [documentation for developers]: http://www.oldbaileyonline.org/static/DocAPI.jsp - [Old Bailey search page]: http://www.oldbaileyonline.org/forms/formMain.jsp - [pypy]: http://pypy.org/ - [Snowball Stemmer]: http://snowball.tartarus.org/ - [a more detailed explanation of TF-IDF]: http://stevenloria.com/finding-important-words-in-a-document-using-tf-idf/ +- [A Naive Bayesian in the Old Bailey](https://digitalhistoryhacks.blogspot.com/2008/05/naive-bayesian-in-old-bailey-part-1.html) +- [Old Bailey digital archive](https://www.oldbaileyonline.org/) +- [A zip file of the scripts](/assets/naive-bayesian/baileycode.zip) +- [another zip 
file](https://doi.org/10.5281/zenodo.13284) +- [BeautifulSoup](https://www.crummy.com/software/BeautifulSoup/) +- [search interface](https://www.oldbaileyonline.org/forms/formMain.jsp) +- [classification](https://en.wikipedia.org/wiki/Statistical_classification) +- [clustering](https://home.deib.polimi.it/matteucc/Clustering/tutorial_html/) +- ["ff0000," the HTML code for red](https://www.paulgraham.com/spam.html) +- [an explanation of Bayes' rule and conditional probabilities](https://www.yudkowsky.net/rational/bayes) +- [topic modeling](/en/lessons/topic-modeling-and-mallet) +- [logarithms](https://betterexplained.com/articles/using-logs-in-the-real-world/) +- [priors](https://support.sas.com/documentation/cdl/en/statug/63033/HTML/default/viewer.htm#statug_introbayes_sect004.htm) +- [Introduction to the Bash Command Line](/en/lessons/intro-to-bash) +- [Automated Downloading with wget](/en/lessons/automated-downloading-with-wget) +- [Understanding Regular Expressions](/en/lessons/understanding-regular-expressions) +- [Intro to Beautiful Soup](/en/lessons/intro-to-beautiful-soup) +- [documentation for developers](https://www.oldbaileyonline.org/static/DocAPI.jsp) +- [Old Bailey search page](https://www.oldbaileyonline.org/forms/formMain.jsp) +- [pypy](https://pypy.org/) +- [Snowball Stemmer](https://snowball.tartarus.org/) +- [a more detailed explanation of TF-IDF](https://stevenloria.com/finding-important-words-in-a-document-using-tf-idf/) diff --git a/en/lessons/normalizing-data.md b/en/lessons/normalizing-data.md index 70ff98079b..1cc2bec31f 100755 --- a/en/lessons/normalizing-data.md +++ b/en/lessons/normalizing-data.md @@ -17,12 +17,12 @@ exclude_from_check: activity: transforming topics: [python] abstract: "In this lesson, we will make the list we created in the 'From HTML to a List of Words' lesson easier to analyze by normalizing this data." 
-next: counting-frequencies -previous: from-html-to-list-of-words-2 +next: /en/lessons/counting-frequencies +previous: /en/lessons/from-html-to-list-of-words-2 series_total: 15 lessons sequence: 9 python_warning: false -redirect_from: /lessons/normalizing-data +redirect_from: /lessons/normalizing-data/ avatar_alt: Tall woman dragging a short young man doi: 10.46430/phen0014 --- @@ -258,12 +258,12 @@ to make sure you have the correct code. - python-lessons4.zip ([zip sync][]) - [From HTML to a List of Words (2)]: /lessons/from-html-to-list-of-words-2 - [web page]: http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33 - [From HTML to a List of Words (1)]: /lessons/from-html-to-list-of-words-1 - [Manipulating Strings in Python]: /lessons/manipulating-strings-in-python - [Unicode]: http://unicode.org/ - [Python support]: https://web.archive.org/web/20180502053841/http://www.diveintopython.net/xml_processing/unicode.html - [Dive into Python]: https://web.archive.org/web/20180416143856/http://www.diveintopython.net/regular_expressions/index.html - [zip]: /assets/python-lessons3.zip - [zip sync]: /assets/python-lessons4.zip +- [From HTML to a List of Words (2)](/en/lessons/from-html-to-list-of-words-2) +- [web page](https://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33) +- [From HTML to a List of Words (1)](/en/lessons/from-html-to-list-of-words-1) +- [Manipulating Strings in Python](/en/lessons/manipulating-strings-in-python) +- [Unicode](https://unicode.org/) +- [Python support](https://web.archive.org/web/20180502053841/https://www.diveintopython.net/xml_processing/unicode.html) +- [Dive into Python](https://web.archive.org/web/20180416143856/https://www.diveintopython.net/regular_expressions/index.html) +- [zip](/assets/python-lessons3.zip) +- [zip sync](/assets/python-lessons4.zip) diff --git a/en/lessons/ocr-tutorial.md b/en/lessons/ocr-tutorial.md index c385e89f23..14378b1b27 100755 --- a/en/lessons/ocr-tutorial.md +++ 
b/en/lessons/ocr-tutorial.md @@ -1,7 +1,7 @@ --- title: OCR Tutorial published: false -redirect_from: /lessons/ocr-tutorial +redirect_from: /lessons/ocr-tutorial/ --- {% include toc.html %} @@ -22,7 +22,7 @@ and Going through a text file line by line and correcting OCR errors one at a time is hugely error-prone, as any proof reader will tell you. If you are dealing with a narrative, a monograph, a diary, or something like that, a great deal of that kind of proofing will be unavoidable; however, if what you have is an ordered collection of primary source documents, a legal code say, or a cartulary, you are far better served by creating an ordered data structure out of it __first__. You will wind up with data that is useful in a variety of contexts, even before your army of street urchins starts correcting specific OCR typos. -This is where a scripting language like Python comes very much in handy. For our project we wanted to prepare some of the documents from a 12th century collection of *imbreviatura* from the Italian scribe known as [Giovanni Scriba](http://www.worldcat.org/oclc/17591390) so that they could be marked up by historians for subsequent NLP analysis or potentially for other purposes as well. The pages of the 1935 published edition look like this. +This is where a scripting language like Python comes very much in handy. For our project we wanted to prepare some of the documents from a 12th century collection of *imbreviatura* from the Italian scribe known as [Giovanni Scriba](https://www.worldcat.org/oclc/17591390) so that they could be marked up by historians for subsequent NLP analysis or potentially for other purposes as well. The pages of the 1935 published edition look like this. 
![GS page 110](gs_pg110.png) @@ -89,7 +89,7 @@ You will note that some of this metadata is page-bound and some of it is charter IL CIRTOL.'RE DI G:OV.I\N( sca:FR 339 342 NI .\ßlO CHIAUDANO 9LtTTIA MORESCO -These strings are not regular enough to reliably find with regular expressions; however, if you know what the strings are supposed to look like, you can compose some kind of string similarity algorithm to test each string against an exemplar and measure the likelihood that it is a page header. Fortunately, I didn't have to compose such an algorithm, Vladimir Levenshtein did it for us in 1965 (see: ). A computer language can encode this algorithm in any number of ways, here's an effective Python function that will work for us: +These strings are not regular enough to reliably find with regular expressions; however, if you know what the strings are supposed to look like, you can compose some kind of string similarity algorithm to test each string against an exemplar and measure the likelihood that it is a page header. Fortunately, I didn't have to compose such an algorithm, Vladimir Levenshtein did it for us in 1965 (see: ). A computer language can encode this algorithm in any number of ways, here's an effective Python function that will work for us: ```python @@ -251,7 +251,7 @@ While it's important in itself for us to have our OCR output reliably divided up ## A very brief review of regular expressions as they are implemented in python -L.T. O'Hara's [introduction](/lessons/cleaning-ocrd-text-with-regular-expressions.html) to using python flavored regular expressions is invaluable. In this context we should review a couple of basic facts about Python's implementation of regular expressions, the `re` module, which is part of Python's standard library. +L.T. O'Hara's [introduction](/en/lessons/cleaning-ocrd-text-with-regular-expressions) to using python flavored regular expressions is invaluable. 
In this context we should review a couple of basic facts about Python's implementation of regular expressions, the `re` module, which is part of Python's standard library. 1. `re.compile()` creates a regular expression object that has a number of methods. You should be familiar with `.match()`, and `.search()`, but also `.findall()` and `.finditer()` 2. Bear in mind the difference between `.match()` and `.search()`: `.match()` will only match at the __beginning__ of a line, whereas `.search()` will match anywhere in the line __but then it stops__, it'll __only__ return the first match it finds. @@ -466,7 +466,7 @@ Print out our resulting dictionary using `pprint(charters)` and you'll see somet } ``` -Printing out your Python dictionary as a literal string is not a bad thing to do. For a text this size, the resulting file is perfectly manageable, can be mailed around usefully and read into a python repl session very simply using `eval()`, or pasted directly into a Python module file. On the other hand, if you want an even more reliable way to serialize it in an exclusively Python context, look into [`Pickle`](https://docs.python.org/2/library/pickle.html). If you need to move it to some other context, JavaScript for example, or some `RDF` triple stores, Python's [`json`](https://docs.python.org/2/library/json.html#module-json) module will translate effectively. If you have to get some kind of XML output, I will be very sorry for you, but the [`lxml`](http://lxml.de/) python module may ease the pain a little. +Printing out your Python dictionary as a literal string is not a bad thing to do. For a text this size, the resulting file is perfectly manageable, can be mailed around usefully and read into a python repl session very simply using `eval()`, or pasted directly into a Python module file. On the other hand, if you want an even more reliable way to serialize it in an exclusively Python context, look into [`Pickle`](https://docs.python.org/2/library/pickle.html). 
If you need to move it to some other context, JavaScript for example, or some `RDF` triple stores, Python's [`json`](https://docs.python.org/2/library/json.html#module-json) module will translate effectively. If you have to get some kind of XML output, I will be very sorry for you, but the [`lxml`](https://lxml.de/) python module may ease the pain a little. ## Order from disorder, huzzah. Now that we have an ordered data structure, we can do many things with it. As a very simple example, lets just print it out as html for display on a web-site: @@ -525,7 +525,7 @@ fout.write("""""") Drop the resulting file on a web browser, and you've got a nicely formated electronic edition. Being able to do this with your, mostly uncorrected, OCR output is not a trivial advantage. If you're serious about creating a clean, error free, electronic edition of anything, you've got to do some serious proofreading. Having a source text formatted for reading is crucial; moreover, if your proofreader can change the font, spacing, color, layout, and so forth at will, you can increase their accuracy and productivity substantially. With this example in a modern web browser, tweaking those parameters with some simple css declarations is easy. Also, with some ordered HTML to work with, you might crowd-source the OCR error correction, instead of hiring that army of illiterate street urchins. -Beyond this though, there's lots you can do with an ordered data set, including feeding it back through a markup tool like the [brat](http://brat.nlplab.org) as we did for the ChartEx project. Domain experts can then start adding layers of semantic tagging even if you don't do any further OCR error correction. +Beyond this though, there's lots you can do with an ordered data set, including feeding it back through a markup tool like the [brat](https://brat.nlplab.org) as we did for the ChartEx project. 
Domain experts can then start adding layers of semantic tagging even if you don't do any further OCR error correction. The bits of code above are in no way a turn-key solution for cleaning arbitrary OCR output. There is no such magic wand. The Google approach to scanning the contents of research libraries threatens to drown us in an ocean of bad data. Worse, it elides a fundamental fact of digital scholarship: digital sources are hard to get. Reliable, flexible, and useful digital texts require careful redaction and persistent curation. Google, Amazon, Facebook, *et alia* do not have to concern themselves with the quality of their data, just its quantity. Historians, on the other hand, must care first for the integrity of their sources. diff --git a/en/lessons/ocr-with-google-vision-and-tesseract.md b/en/lessons/ocr-with-google-vision-and-tesseract.md index fb06fac67b..bb44a2ca13 100644 --- a/en/lessons/ocr-with-google-vision-and-tesseract.md +++ b/en/lessons/ocr-with-google-vision-and-tesseract.md @@ -166,7 +166,7 @@ These three documents are copyright-free and available on [archive.org](https:// # OCR with Tesseract -Tesseract takes image files as input. If you have PDFs, you can transform them into .tiff files using any image editing tool, [ImageMagick](https://imagemagick.org/) for instance. The process of converting PDFs to TIFFs using ImageMagick is detailed in the _Programming Historian_ lesson [OCR and Machine Translation](/en/lessons/OCR-and-Machine-Translation#converting-pdfs-to-tiffs-with-imagemagick). +Tesseract takes image files as input. If you have PDFs, you can transform them into .tiff files using any image editing tool, [ImageMagick](https://imagemagick.org/) for instance. The process of converting PDFs to TIFFs using ImageMagick is detailed in the _Programming Historian_ lesson [OCR and Machine Translation](/en/lessons/retired/OCR-and-Machine-Translation#converting-pdfs-to-tiffs-with-imagemagick). Alternatively, you can use OCRmyPDF. 
This software is based on Tesseract but works with PDFs. More information can be found in the _Programming Historian_ lesson [Working with batches of PDF files](/en/lessons/working-with-batches-of-pdf-files). @@ -638,7 +638,7 @@ def new_file_layout(filename, input_dir, store_dir): lim_p[0].save(new_filepath, "PDF" ,resolution=100.0, save_all=True, append_images=lim_p[1:]) ``` -The following function executes the above and OCRs the new PDF with the `vision_method` defined [in the previous section](#google-vision-2). +The following function executes the above and OCRs the new PDF with the `vision_method` defined [in the previous section](#running-google-vision). ``` def combined_method_I(filename, input_dir, store_dir, output_dir): @@ -676,7 +676,7 @@ batch_combined_method_I(input_dir_cm1, store_dir_cm1, output_dir_cm1) ### Tesseract + Google Vision: Method Two -The second combined method uses the text region coordinates provided by Tesseract to create text output. We will be extracting any words that fall within the defined regions from the JSON response files we generated earlier using the `JSON_OCR` function as explained in the [Google Vision section](#google-vision-2). +The second combined method uses the text region coordinates provided by Tesseract to create text output. We will be extracting any words that fall within the defined regions from the JSON response files we generated earlier using the `JSON_OCR` function as explained in the [Google Vision section](#google-vision). First, we'll create a function that will output a dictionary which contains the coordinates of each text region, as well as the height and width of each page. The height and width are necessary for converting the pixel coordinates provided by Tesseract to the normalised coordinates provided by Google Vision. 
diff --git a/en/lessons/output-data-as-html-file.md b/en/lessons/output-data-as-html-file.md index 6a0995f991..a6917d5146 100755 --- a/en/lessons/output-data-as-html-file.md +++ b/en/lessons/output-data-as-html-file.md @@ -17,12 +17,12 @@ activity: transforming topics: [python, website] abstract: "This lesson takes the frequency pairs created in the 'Counting Frequencies' lesson and outputs them to an HTML file." -next: keywords-in-context-using-n-grams -previous: creating-and-viewing-html-files-with-python +next: /en/lessons/keywords-in-context-using-n-grams +previous: /en/lessons/creating-and-viewing-html-files-with-python series_total: 15 lessons sequence: 12 python_warning: false -redirect_from: /lessons/output-data-as-html-file +redirect_from: /lessons/output-data-as-html-file/ avatar_alt: A woman wearing an elaborate dress accompanied by two putti doi: 10.46430/phen0015 --- @@ -36,7 +36,7 @@ doi: 10.46430/phen0015 ## Lesson Goals This lesson takes the frequency pairs created in [Counting -Frequencies][] and outputs them to an HTML file. +Frequencies](/en/lessons/counting-frequencies) and outputs them to an HTML file. Here you will learn how to output data as an HTML file using Python. You will also learn about string formatting. The final result is an HTML @@ -49,7 +49,7 @@ appears. - `obo.py` If you do not have these files from the previous lesson, you can -download programming-historian-6, a [zip file from the previous lesson][] +download programming-historian-6, a [zip file from the previous lesson](/assets/python-lessons6.zip). ## Building an HTML wrapper @@ -60,7 +60,7 @@ around something so that it can be used by another program is sometimes called a *wrapper*. What we're going to do now is develop an HTML wrapper for the output of our code that computes word frequencies. We're also going to add some helpful, dynamic *metadata* to supplement the frequency -data collected in [Counting Frequencies][]. 
+data collected in [Counting Frequencies](/en/lessons/counting-frequencies). ## Metadata @@ -303,8 +303,4 @@ with the Mac / Linux version you may have to open the `obo.py` file and change "file:///Users/username/Desktop/programming-historian/" to the path to the directory on your own computer. -- python-lessons7.zip [zip sync][] - - [Counting Frequencies]: /lessons/counting-frequencies - [zip file from the previous lesson]: /assets/python-lessons6.zip - [zip sync]: /assets/python-lessons7.zip +- python-lessons7.zip [zip sync](/assets/python-lessons7.zip) diff --git a/en/lessons/output-keywords-in-context-in-html-file.md b/en/lessons/output-keywords-in-context-in-html-file.md index c366927919..b3344d2cba 100755 --- a/en/lessons/output-keywords-in-context-in-html-file.md +++ b/en/lessons/output-keywords-in-context-in-html-file.md @@ -16,12 +16,12 @@ exclude_from_check: activity: presenting topics: [python] abstract: "This lesson builds on 'Keywords in Context (Using N-grams)', where n-grams were extracted from a text. Here, you will learn how to output all of the n-grams of a given keyword in a document downloaded from the Internet, and display them clearly in your browser window."
-next: downloading-multiple-records-using-query-strings -previous: keywords-in-context-using-n-grams +next: /en/lessons/downloading-multiple-records-using-query-strings +previous: /en/lessons/keywords-in-context-using-n-grams series_total: 15 lessons sequence: 14 python_warning: false -redirect_from: /lessons/output-keywords-in-context-in-html-file +redirect_from: /lessons/output-keywords-in-context-in-html-file/ avatar_alt: A monkey dancing with a lion and a bear doi: 10.46430/phen0016 --- diff --git a/en/lessons/preserving-your-research-data.md b/en/lessons/preserving-your-research-data.md index 707ec3d84e..7730992b81 100755 --- a/en/lessons/preserving-your-research-data.md +++ b/en/lessons/preserving-your-research-data.md @@ -16,7 +16,7 @@ exclude_from_check: activity: sustaining topics: [data-management] abstract: "This lesson will suggest ways in which historians can document and structure their research data so as to ensure it remains useful in the future." -redirect_from: /lessons/preserving-your-research-data +redirect_from: /lessons/preserving-your-research-data/ avatar_alt: A large barrel doi: 10.46430/phen0039 --- @@ -165,7 +165,7 @@ documented), though existing schema such as [Markdown][] are available (Markdown files are saved as .md). An excellent Markdown cheat sheet is available on GitHub ) for those who wish to follow – or adapt – this existing schema. Notepad++ - is recommended for Windows users, though + is recommended for Windows users, though by no means essential, for working with .md files. Mac or Unix users may find [Komodo Edit][] or [Text Wrangler][] helpful. @@ -213,7 +213,7 @@ blogging services. 
WordPress URLs follow the format: - *website name*/*year(4 digits)*/*month (2 digits)*/*day (2 digits)*/*words-of-title-separated-by-hyphens* -- +- A similar style is used by news agencies such as a The Guardian newspaper: @@ -221,7 +221,7 @@ newspaper: - *website name*/*section subdivision*/*year (4 digits)*/*month (3 characters)*/*day (2 digits)*/*words-describing-content-separated-by-hyphens* -- +- . In archival catalogues, URLs structured by a single data element are @@ -229,12 +229,12 @@ often used. The British Cartoon Archive structures its online archive using the format: - *website name*/record/*reference number* -- +- And the Old Bailey Online uses the format: - *website name*/browse.jsp?ref=*reference number* -- +- What we learn from these examples is that a combination of semantic description and data elements make consistent and predictable data @@ -415,11 +415,11 @@ blog (17 October 2013) Hitchcock, Tim, 'Judging a book by its URLs', Historyonics blog (3 January 2014) - + Howard, Sharon, 'Unclean, unclean! What historians can do about sharing our messy research data', Early Modern Notes blog (18 May 2013) - + Noble, William Stafford, A Quick Guide to Organizing Computational Biology Projects.PLoSComputBiol 5(7): e1000424 (2009) @@ -432,7 +432,7 @@ Information Management: Organising Humanities Material' (2011) Pennock, Maureen, 'The Twelve Principles of Digital Preservation (and a cartridge in a repository…)', British Library Collection Care blog (3 September 2013) - + Pritchard, Adam, 'Markdown Cheatsheet' (2013) @@ -443,8 +443,8 @@ Digital Era', The American Historical Review 108:3 (2003), 735-762. 
UK Data Archive, 'Documenting your Data' - [PRINCE2]: http://en.wikipedia.org/wiki/PRINCE2 - [platform agnostic]: http://en.wikipedia.org/wiki/Cross-platform - [Markdown]: http://en.wikipedia.org/wiki/Markdown + [PRINCE2]: https://en.wikipedia.org/wiki/PRINCE2 + [platform agnostic]: https://en.wikipedia.org/wiki/Cross-platform + [Markdown]: https://en.wikipedia.org/wiki/Markdown [Komodo Edit]: https://github.com/ActiveState/OpenKomodoIDE [Text Wrangler]: https://www.barebones.com/products/textwrangler/ diff --git a/en/lessons/qgis-layers.md b/en/lessons/qgis-layers.md index 4b7d5faa3e..61c54cac64 100755 --- a/en/lessons/qgis-layers.md +++ b/en/lessons/qgis-layers.md @@ -19,11 +19,11 @@ topics: [mapping] abstract: "In this lesson you will install QGIS software, download geospatial files like shapefiles and GeoTIFFs, and create a map out of a number of vector and raster layers." -next: vector-layers-qgis -previous: googlemaps-googleearth +next: /en/lessons/vector-layers-qgis +previous: /en/lessons/googlemaps-googleearth series_total: 5 lessons sequence: 2 -redirect_from: /lessons/qgis-layers +redirect_from: /lessons/qgis-layers/ avatar_alt: Elevation view view of a mountain range doi: 10.46430/phen0031 --- @@ -104,7 +104,7 @@ making the downloads quick! 1. [coastline.SHP.zip](/assets/qgis-layers/coastline.SHP.zip) 2. [lot_town.SHP.zip](/assets/qgis-layers/lot_town.SHP.zip) 3. [hydronetwork.SHP.zip](/assets/qgis-layers/hydronetwork.SHP.zip) -4. +4. 5. [nat_parks.SHP.zip](/assets/qgis-layers/nat_parks.SHP.zip) 6. [PEI Highways][] 7. 
[PEI Places][] @@ -462,16 +462,16 @@ save your work!** *This lesson is part of the [Geospatial Historian][].* - [QGIS Download page]: http://qgis.org/en/site/forusers/download.html - [KyngChaos Qgis download page]: http://www.kyngchaos.com/software/qgis - [Download Archive]: http://www.kyngchaos.com/software/archive + [QGIS Download page]: https://qgis.org/en/site/forusers/download.html + [KyngChaos Qgis download page]: https://www.kyngchaos.com/software/qgis + [Download Archive]: https://www.kyngchaos.com/software/archive [PEI Highways]: /assets/qgis-layers/PEI_highway.zip [PEI Places]: /assets/qgis-layers/PEI_placenames.zip - [Coordinate Reference System]: http://en.wikipedia.org/wiki/Spatial_reference_system + [Coordinate Reference System]: https://en.wikipedia.org/wiki/Spatial_reference_system [NRCan's website]: https://perma.cc/B4UW-R4FK - [Double Stereographic projection]: http://www.gov.pe.ca/gis/index.php3?number=77865&lang=E - [Tutorial: Working with Projections in QGIS]: http://web.archive.org/web/20180807132308/http://qgis.spatialthoughts.com/2012/04/tutorial-working-with-projections-in.html - [defined]: http://www.gislounge.com/geodatabases-explored-vector-and-raster-data/ - [aerial photos]: http://en.wikipedia.org/wiki/Orthophoto + [Double Stereographic projection]: https://www.gov.pe.ca/gis/index.php3?number=77865&lang=E + [Tutorial: Working with Projections in QGIS]: https://web.archive.org/web/20180807132308/https://qgis.spatialthoughts.com/2012/04/tutorial-working-with-projections-in.html + [defined]: https://www.gislounge.com/geodatabases-explored-vector-and-raster-data/ + [aerial photos]: https://en.wikipedia.org/wiki/Orthophoto [PEI_CumminsMap1927.tif]: /assets/qgis-layers/PEI_CumminsMap1927_compLZW.tif - [Geospatial Historian]: http://geospatialhistorian.wordpress.com/ + [Geospatial Historian]: https://geospatialhistorian.wordpress.com/ diff --git a/en/lessons/r-basics-with-tabular-data.md b/en/lessons/r-basics-with-tabular-data.md index 
5c43e01574..eb678d48d2 100755 --- a/en/lessons/r-basics-with-tabular-data.md +++ b/en/lessons/r-basics-with-tabular-data.md @@ -14,7 +14,7 @@ review-ticket: https://github.com/programminghistorian/ph-submissions/issues/19 activity: transforming topics: [data-manipulation, r] abstract: "This lesson teaches a way to quickly analyze large volumes of tabular data, making research faster and more effective." -redirect_from: /lessons/r-basics-with-tabular-data +redirect_from: /lessons/r-basics-with-tabular-data/ avatar_alt: An ornate illustrated character R doi: 10.46430/phen0056 --- @@ -37,11 +37,11 @@ This tutorial presumes no prior knowledge of R. It will go through some of the b R is ideal for analyzing larger data sets that would take too long to compute manually. Once you understand how to write some of the basic functions and how to import your own data files, you can analyze and visualize the data quickly and efficiently. -While R is a great tool for tabular data, you may find using other approaches to analyse non-tabular sources (such as newspaper transcriptions) more useful. If you are interested in studying these types of sources, take a look at some of the other great lessons of the [Programming Historian](/lessons/). +While R is a great tool for tabular data, you may find using other approaches to analyse non-tabular sources (such as newspaper transcriptions) more useful. If you are interested in studying these types of sources, take a look at some of the other great lessons of the [Programming Historian](/en/lessons/). ## Installing R -R is a programming language and environment for working with data. It can be run using the R console as well as on the [command-line](/lessons/intro-to-bash) or the [R Studio Interface](https://www.rstudio.com/). This tutorial will focus on using the R console. To get started with R, download the program from [The Comprehensive R Archive Network](https://cran.r-project.org/). R is compatible with Linux, Mac, and Windows. 
+R is a programming language and environment for working with data. It can be run using the R console as well as on the [command-line](/en/lessons/intro-to-bash) or the [R Studio Interface](https://www.rstudio.com/). This tutorial will focus on using the R console. To get started with R, download the program from [The Comprehensive R Archive Network](https://cran.r-project.org/). R is compatible with Linux, Mac, and Windows. When you first open the R console, it will open in a window that looks like this: @@ -534,10 +534,10 @@ For more information on R, visit the [R Manual](https://cran.r-project.org/doc/m There are also a number of other R tutorials online including: -* [R: A self-learn tutorial](http://web.archive.org/web/20191015004305/https://www.nceas.ucsb.edu/files/scicomp/Dloads/RProgramming/BestFirstRTutorial.pdf) - this tutorial goes through a series of functions and provides exercises to practice skills. +* [R: A self-learn tutorial](https://web.archive.org/web/20191015004305/https://www.nceas.ucsb.edu/files/scicomp/Dloads/RProgramming/BestFirstRTutorial.pdf) - this tutorial goes through a series of functions and provides exercises to practice skills. * [DataCamp Introduction to R](https://www.datacamp.com/courses/free-introduction-to-r) - this is a free online course that gives you feedback on your code to help identify errors and learn how to write code more efficiently. -Finally, a great resource for digital historians is Lincoln Mullen's [Digital History Methods in R](http://dh-r.lincolnmullen.com/). It is a draft of a book written specifically on how to use R for digital history work. +Finally, a great resource for digital historians is Lincoln Mullen's [Digital History Methods in R](https://dh-r.lincolnmullen.com/). It is a draft of a book written specifically on how to use R for digital history work. 
## Endnotes diff --git a/en/lessons/research-data-with-unix.md b/en/lessons/research-data-with-unix.md index 12c4231cd3..f66b39d80d 100755 --- a/en/lessons/research-data-with-unix.md +++ b/en/lessons/research-data-with-unix.md @@ -16,8 +16,8 @@ exclude_from_check: activity: transforming topics: [data-manipulation] abstract: "This lesson will look at how research data, when organised in a clear and predictable manner, can be counted and mined using the Unix shell." -previous: intro-to-bash -redirect_from: /lessons/research-data-with-unix +previous: /en/lessons/intro-to-bash +redirect_from: /lessons/research-data-with-unix/ avatar_alt: A diagram of a miner sorting ore into an apparatus doi: 10.46430/phen0040 --- @@ -28,27 +28,27 @@ doi: 10.46430/phen0040 ## Introduction -This lesson will look at how research data, when organised in a clear and predictable manner, can be counted and mined using the Unix shell. The lesson builds on the lessons "[Preserving Your Research Data: Documenting and Structuring Data](/lessons/preserving-your-research-data)" and "[Introduction to the Bash Command Line](../lessons/intro-to-bash)". Depending on your confidence with the Unix shell, it can also be used as a standalone lesson or refresher. +This lesson will look at how research data, when organised in a clear and predictable manner, can be counted and mined using the Unix shell. The lesson builds on the lessons "[Preserving Your Research Data: Documenting and Structuring Data](/en/lessons/preserving-your-research-data)" and "[Introduction to the Bash Command Line](/en/lessons/intro-to-bash)". Depending on your confidence with the Unix shell, it can also be used as a standalone lesson or refresher. Having accumulated research data for one project, a historian might ask different questions of that same data when returning to it during a subsequent project. 
If this data is spread across multiple files - a series of tabulated data, a set of transcribed text, a collection of images - it can be counted and mined using simple Unix commands. The Unix shell gives you access to a range of powerful commands that can transform how you count and mine research data. This lesson will introduce you to a series of commands that use counting and mining of tabulated data, though they only scratch the surface of what the Unix shell can do. By learning just a few simple commands you will be able to undertake tasks that are impossible in Libre Office Calc, Microsoft Excel, or other similar spreadsheet programs. These commands can be easily extended for use with non-tabulated data. -This lesson will also demonstrate that the options for manipulating, counting and mining data available to you will often depend on the amount of metadata, or descriptive text, contained in the filenames of the data you are using as much as the range of Unix commands you have learnt to use. Thus, even if it is not a prerequisite of working with the Unix shell, taking the time to structure your research data and filenaming conventions in a consistent and predictable manner is certainly a significant step towards getting the most out of Unix commands and being able to count and mine your research data. For the value of taking the time to make your data consistent and predictable beyond matters of preservation, see "[Preserving Your Research Data: Documenting and Structuring Data](../lessons/preserving-your-research-data)". +This lesson will also demonstrate that the options for manipulating, counting and mining data available to you will often depend on the amount of metadata, or descriptive text, contained in the filenames of the data you are using as much as the range of Unix commands you have learnt to use. 
Thus, even if it is not a prerequisite of working with the Unix shell, taking the time to structure your research data and filenaming conventions in a consistent and predictable manner is certainly a significant step towards getting the most out of Unix commands and being able to count and mine your research data. For the value of taking the time to make your data consistent and predictable beyond matters of preservation, see "[Preserving Your Research Data: Documenting and Structuring Data](/en/lessons/preserving-your-research-data)". _____ ## Software and setup -Windows users will need to install Git Bash. This can be installed by downloading the most recent installer at the [git for windows webpage](http://msysgit.github.io/). Instructions for installation are available at [Open Hatch](https://web.archive.org/web/20190318191709/https://openhatch.org/missions/windows-setup/install-git-bash). +Windows users will need to install Git Bash. This can be installed by downloading the most recent installer at the [git for windows webpage](https://msysgit.github.io/). Instructions for installation are available at [Open Hatch](https://web.archive.org/web/20190318191709/https://openhatch.org/missions/windows-setup/install-git-bash). -OS X and Linux users will need to use their terminal shells to follow this lesson, as discussed in "[Introduction to the Bash Command Line](../lessons/intro-to-bash)." +OS X and Linux users will need to use their terminal shells to follow this lesson, as discussed in "[Introduction to the Bash Command Line](/en/lessons/intro-to-bash)." This lesson was written using Git Bash 1.9.0 and the Windows 7 operating system. Equivalent file paths for OS X/Linux have been included where possible. Nonetheless, as commands and flags can change slightly between operating systems OS X/Linux users are referred to Deborah S. Ray and Eric J. 
Ray, "[*Unix and Linux: Visual Quickstart Guide*](https://www.worldcat.org/title/unix-and-linux/oclc/308171076&referer=brief_results)", 4th edition (2009) which covers interoperability in greater detail. The files used in this lesson are available on "[Figshare](https://doi.org/10.6084/m9.figshare.1172094)". The data contains the metadata for journal articles categorised under 'History' in the British Library ESTAR database. The data is shared under a CC0 copyright waiver. -Download the required files, save them to your computer, and unzip them. If you do not have default software installed to interact with .zip files, we recommend [7-zip](http://www.7-zip.org/) for this purpose. On Windows, we recommend unzipping the folder provided to your c: drive so the files are at `c:\proghist\`. However, any location will work fine, but you may have to adjust your commands as you are following along with this lesson if you use a different location. +Download the required files, save them to your computer, and unzip them. If you do not have default software installed to interact with .zip files, we recommend [7-zip](https://www.7-zip.org/) for this purpose. On Windows, we recommend unzipping the folder provided to your c: drive so the files are at `c:\proghist\`. However, any location will work fine, but you may have to adjust your commands as you are following along with this lesson if you use a different location.
    April 2025 update: The paths indicated for Windows no longer correspond to those used by recent versions of Git (version 2.49.0 at the time of writing). Whenever the path c:\proghist\... is mentioned, you will need to replace it with c/Users/USERNAME/proghist/.... @@ -70,9 +70,9 @@ Type `ls` and then hit enter. This prints, or displays, a list that includes two The files in this directory are the dataset `2014-01_JA.csv` that contains journal article metadata and a file containing documentation about `2014-01_JA.csv` called `2014-01_JA.txt`. -The subdirectory is named `derived_data`. It contains four [.tsv](http://en.wikipedia.org/wiki/Tab-separated_values) files derived from `2014-01_JA.csv`. Each of these includes all data where a keyword such as `africa` or `america` appears in the 'Title' field of `2014-01_JA.csv`. The `derived_data` directory also includes a subdirectory called `results`. +The subdirectory is named `derived_data`. It contains four [.tsv](https://en.wikipedia.org/wiki/Tab-separated_values) files derived from `2014-01_JA.csv`. Each of these includes all data where a keyword such as `africa` or `america` appears in the 'Title' field of `2014-01_JA.csv`. The `derived_data` directory also includes a subdirectory called `results`. -*Note: [CSV](http://en.wikipedia.org/wiki/Comma-separated_values) files are those in which the units of data (or cells) are separated by commas (comma-separated-values) and TSV files are those in which they are separated by tabs. Both can be read in simple text editors or in spreadsheet programs such as Libre Office Calc or Microsoft Excel.* +*Note: [CSV](https://en.wikipedia.org/wiki/Comma-separated_values) files are those in which the units of data (or cells) are separated by commas (comma-separated-values) and TSV files are those in which they are separated by tabs. 
Both can be read in simple text editors or in spreadsheet programs such as Libre Office Calc or Microsoft Excel.* Before you begin working with these files, you should move into the directory in which they are stored. Navigate to `c:\proghist\data\derived_data` on Windows or `~/users/USERNAME/proghist/data/derived_data` on OS X. @@ -80,7 +80,7 @@ Now that you are here you can count the contents of the files. The Unix command for counting is `wc`. Type `wc -w 2014-01-31_JA_africa.tsv` and hit enter. The flag `-w` combined with `wc` instructs the computer to print a word count, and the name of the file that has been counted, into the shell. -As was seen in "[Introduction to the Bash Command Line](../lessons/intro-to-bash)", flags such as `-w` are an essential part of getting the most out of the Unix shell as they give you better control over commands. +As was seen in "[Introduction to the Bash Command Line](/en/lessons/intro-to-bash)", flags such as `-w` are an essential part of getting the most out of the Unix shell as they give you better control over commands. If your research is more concerned with the number of entries (or lines) than the number of words, you can use the line count flag. Type `wc -l 2014-01-31_JA_africa.tsv` and hit enter. Combined with `wc` the flag `-l` prints a line count and the name of the file that has been counted. @@ -106,7 +106,7 @@ Strings need not be numbers. `grep -c revolution 2014-01-31_JA_america.tsv 2014- You can also use `grep` to create subsets of tabulated data. Type `grep -i revolution 2014-01-31_JA_america.tsv 2014-02-02_JA_britain.tsv > YEAR-MONTH-DAY_JA_america_britain_i_revolution.tsv` (where `YEAR-MONTH-DAY` is the date you are completing this lesson) and hit enter. This command looks in both of the defined files and exports any lines containing `revolution` (without regard to case) to the specified .tsv file. 
-The data has not been saved to to the `results` directory because it isn't strictly a result; it is derived data. Depending on your research project it may be easier to save this to another subdirectory. For now have a look at this file to verify its contents and when you are happy, delete it using the `rm` command. *Note: the `rm` common is very powerful and should be used with caution. Please refer to "[Introduction to the Bash Command Line](../lessons/intro-to-bash)" for instructions on how to use this command correctly.* +The data has not been saved to the `results` directory because it isn't strictly a result; it is derived data. Depending on your research project it may be easier to save this to another subdirectory. For now have a look at this file to verify its contents and when you are happy, delete it using the `rm` command. *Note: the `rm` command is very powerful and should be used with caution. Please refer to "[Introduction to the Bash Command Line](/en/lessons/intro-to-bash)" for instructions on how to use this command correctly.* Finally, you can use another flag, `-v`, to exclude data elements when using the `grep` command. Type `grep -iv revolution 2014*_JA_a*.tsv > 2014_JA_iv_revolution.csv` and hit enter. This query looks in the defined files (three in total) and exports all lines that do not contain `revolution` or `Revolution` to `c:\proghist\data\derived_data\2014_JA_iv_revolution.csv`.
diff --git a/en/lessons/retired/OCR-and-Machine-Translation.md b/en/lessons/retired/OCR-and-Machine-Translation.md index c1e4c5997c..e18a35f01a 100644 --- a/en/lessons/retired/OCR-and-Machine-Translation.md +++ b/en/lessons/retired/OCR-and-Machine-Translation.md @@ -18,8 +18,8 @@ abstract: This lesson covers how to convert images of text into text files and t avatar_alt: An image of a tree with the Latin phrase Labor Omnia Vincit Improbus doi: 10.46430/phen0091 redirect_from: - - /lessons/OCR-and-Machine-Translation - - /en/lessons/OCR-and-Machine-Translation + - /lessons/OCR-and-Machine-Translation/ + - /en/lessons/OCR-and-Machine-Translation/ retired: true retirement-reason: | Yandex, the translation software used in this lesson, has been deprecated. To successfully follow this lesson, many steps require significant adaptations, especially if users are working on a non-Mac operating system. @@ -80,7 +80,7 @@ With ImageMagick installed, we can now convert our files from PDF to TIFF and ma The command does several things that significantly increase the OCR accuracy rate. The `density` and `depth` commands both make sure the file has the appropriate dots per inch [(DPI)](https://en.wikipedia.org/wiki/Dots_per_inch) for OCR. The `strip`, `background`, and `alpha` commands make sure that the file has the right background. Most importantly, this command converts the PDF into a TIFF image file. If you are not using a PDF, you should still use the above command to ensure the image is ready for OCR. -After these changes, your image may still have problems. For example, there may be a skew or uneven brightness. Fortunately, [ImageMagick](https://imagemagick.org/index.php) is a powerful tool that can help you clean image files. For other ImageMagick options that can improve OCR quality, review this helpful [collection of scripts](http://www.fmwconcepts.com/imagemagick/textcleaner/index.php). 
Because OCR is a command line tool, you can write a script that will loop over over all of your images (hundreds or thousands) at once. You will learn how to write these kinds of scripts later in the lesson. +After these changes, your image may still have problems. For example, there may be a skew or uneven brightness. Fortunately, [ImageMagick](https://imagemagick.org/index.php) is a powerful tool that can help you clean image files. For other ImageMagick options that can improve OCR quality, review this helpful [collection of scripts](https://www.fmwconcepts.com/imagemagick/textcleaner/index.php). Because OCR is a command line tool, you can write a script that will loop over all of your images (hundreds or thousands) at once. You will learn how to write these kinds of scripts later in the lesson. # OCR This lesson will use the OCR program [Tesseract](https://github.com/tesseract-ocr/tesseract), the most popular OCR program for Digital Humanities projects. Google maintains Tesseract as free software and released it under the Apache License, Version 2.0. Tesseract supports over 100 different languages, but if you have a particularly difficult or unique script (calligraphy or other handwriting) it might be worth training your own OCR model. For typewritten documents, you need a program that will recognize several similar fonts and correctly identify imperfect letters. Tesseract 4.1 does just that. Google has already trained Tesseract to recognize a variety of fonts for dozens of languages.
The following commands will install Tesseract as well as the Russian language package, which you will need for the rest of the lesson: diff --git a/en/lessons/retired/OCR-with-Tesseract-and-ScanTailor.md b/en/lessons/retired/OCR-with-Tesseract-and-ScanTailor.md index 0f23f2302e..d8346117dc 100755 --- a/en/lessons/retired/OCR-with-Tesseract-and-ScanTailor.md +++ b/en/lessons/retired/OCR-with-Tesseract-and-ScanTailor.md @@ -7,9 +7,9 @@ retired_date: 2017-05-10 layout: lesson retired: true redirect_from: - - /lessons/ocr-with-tesseract-and-scantailor - - /lessons/deprecated/ocr-with-tesseract-and-scantailor - - /lessons/deprecated-OCR-with-Tesseract-and-ScanTailor + - /lessons/ocr-with-tesseract-and-scantailor/ + - /lessons/deprecated/ocr-with-tesseract-and-scantailor/ + - /lessons/deprecated-OCR-with-Tesseract-and-ScanTailor/ doi: 10.46430/phen0042 --- @@ -17,7 +17,7 @@ doi: 10.46430/phen0042 ## Lesson Goals -The goal of this lesson is to teach how to do OCR (Optical Character Recognition) for printed or typewritten text. After this lesson you will able to convert printed and typewritten texts into digital text files. In order to be able to go through all the steps the OCR'ing process demands, you'll need to have access to either a scanner or digital camera, a computer with internet access, some patience, and a lot of curiosity! We will make a few easy commands using the Command line, but if you have never used it before, you might want to look at the lesson [Introduction to the Bash Command Line] (/lessons/intro-to-bash). I use Windows, but these instructions should function also in Mac. +The goal of this lesson is to teach how to do OCR (Optical Character Recognition) for printed or typewritten text. After this lesson you will be able to convert printed and typewritten texts into digital text files.
In order to be able to go through all the steps the OCR'ing process demands, you'll need to have access to either a scanner or digital camera, a computer with internet access, some patience, and a lot of curiosity! We will make a few easy commands using the Command line, but if you have never used it before, you might want to look at the lesson [Introduction to the Bash Command Line](/en/lessons/intro-to-bash). I use Windows, but these instructions should function also in Mac. ## Why OCR? @@ -31,10 +31,14 @@ OCR'ing takes a lot of time. You'll spend numerous hours with your documents, an The OCR process can be divided into four stages: -* [Prework](#prework) -* [Image Preprocessing](#imagepreprocessing) -* [OCR](#ocr) -* [OCR Cleaning](#ocrcleaning) +- [Introduction to OCR](#introduction-to-ocr) + - [Lesson Goals](#lesson-goals) + - [Why OCR?](#why-ocr) + - [Stages of the Lesson](#stages-of-the-lesson) + - [Prework](#prework) + - [Image Preprocessing](#image-preprocessing) + - [OCR](#ocr) + - [OCR Cleaning](#ocr-cleaning) In this lesson, [Prework](#prework) means scanning or taking digital photographs of the printed or typewritten texts. The [Image Preprocessing](#imagepreprocessing) part means the work you do to the acquired images so that they will be more readable for the OCR program. [OCR](#ocr) is the process where a program converts the images of the letters, which are not understandable to a computer, into letters that are computer readable. The result of the OCR will not be without misspellings and other errors. Therefore, you'll need to [Clean the OCR](#ocrcleaning) output so that you'll be able to use it for your purposes. @@ -43,7 +47,7 @@ Before starting, try to doublecheck that no-one else has done the work already. Ok, are you ready? Let's start then! -## Prework +## Prework So, now that you have the material you want to convert into text form, there are two options for doing the prework. 
The preferable option is to scan the texts, if you have a good scanner. If you don't, you can take digital photographs of the documents. Scanning is a better option, because when the paper lays on the scanner's glass, it will be more or less straight. When you are taking photographs of a big book, the flexure of the paper might cause reduced readability later in the process. @@ -60,11 +64,11 @@ I use Zotero for keeping track of the metadata of my sources, and here is an exa {% include figure.html filename="OCR02.png" caption="My metadata in Zotero" %} -## Image Preprocessing +## Image Preprocessing So, now that we have the text as an image, we will move on to image preprocessing. -For image preprocessing I use a free and open source program called *Scan Tailor*. You can download Scan Tailor [here] (http://scantailor.org/downloads/). +For image preprocessing I use a free and open source program called *Scan Tailor*. You can download Scan Tailor [here](https://scantailor.org/downloads/). When you have downloaded Scan Tailor, open the text scans in the program by clicking "New project", browsing and selecting the file where you saved your text scans, and finally selecting the images you want to preprocess. Scan Tailor is a relatively heavy program, which means that it reserves quite a lot of the computer's capacity. For that reason it is better to divide large files into smaller entities, and preprocess them in parts. When starting a new project you can choose which images you want to select for preprocessing. When you have selected the scans for preprocessing, click "OK". @@ -108,12 +112,12 @@ If you feel you need more instructions on how to use Scan Tailor, they have good Save the preprocessed images to a place where it is easy to access them by using the command line. -## OCR +## OCR OK, let's move on! For OCR we will use a free and open source program called Tesseract. You can install Tesseract [here](https://code.google.com/p/tesseract-ocr/wiki/ReadMe). 
-If you are going to OCR other languages than English, you will also need to install the [language package](https://code.google.com/p/tesseract-ocr/downloads/list) for that language, and unpack it by using [7-zip](http://www.7-zip.org/). +If you are going to OCR other languages than English, you will also need to install the [language package](https://code.google.com/p/tesseract-ocr/downloads/list) for that language, and unpack it by using [7-zip](https://www.7-zip.org/). Now that we have Tesseract, we can proceed to doing the actual OCR! @@ -184,14 +188,14 @@ The batched text file will appear in the file. Now you have the OCR raw data! Congratulations! -## OCR Cleaning +## OCR Cleaning So, now as you look at the OCR'd text, you will notice soon, that the text is not perfect. Some words are probably not separated and some letters are not correctly written. What remains to be done, is to go through the text, and correct the errors. This is the moment when you will thank yourself if you have been able to do a good quality job in the previous phases, because you will probably have pretty clear data. This is also the moment when you will need to decide how pure text you want to have: are you willing to pay less attention in this phase and accept some blurred words, or is the clarity of the text so essential that you will check all the output? You might be interested in reading further on what to do with the raw OCR data. Fortunately, there are great lessons for that: -[Cleaning the OCR results with regular expressions](/lessons/cleaning-ocrd-text-with-regular-expressions) or [generating an ordered data set from the OCR results](/lessons/generating-an-ordered-data-set-from-an-OCR-text-file) or [preserving your data](/lessons/preserving-your-research-data). 
+[Cleaning the OCR results with regular expressions](/en/lessons/cleaning-ocrd-text-with-regular-expressions) or [generating an ordered data set from the OCR results](/en/lessons/generating-an-ordered-data-set-from-an-OCR-text-file) or [preserving your data](/en/lessons/preserving-your-research-data). Before OCR'ing all of your sources, try with a small sample of materials to do the whole process from scratch to the analysis part you are planning to do. When doing the latter steps you might realize some details that you will need to take into account when doing the previous stages. diff --git a/en/lessons/retired/counting-frequencies-from-zotero-items.md b/en/lessons/retired/counting-frequencies-from-zotero-items.md index 21c10c32dd..f485533b0b 100755 --- a/en/lessons/retired/counting-frequencies-from-zotero-items.md +++ b/en/lessons/retired/counting-frequencies-from-zotero-items.md @@ -12,16 +12,16 @@ activity: analyzing topics: [api] abstract: "This lesson will show you how to get information from Zotero HTML items, save the content from those items, and count the frequencies of words." -previous: creating-new-items-in-zotero +previous: /en/lessons/retired/creating-new-items-in-zotero exclude_from_check: - reviewers retired: true retirement-reason: | This lesson relied on the Python library libZotero, which is no longer maintained, and which now returns several errors when used. [See further discussion about this retirement decision.](https://github.com/programminghistorian/jekyll/issues/225) redirect_from: - - /lessons/counting-frequencies-from-zotero-items - - /lessons/deprecated/counting-frequencies-from-zotero-items - - /lessons/retired/counting-frequencies-from-zotero-items + - /lessons/counting-frequencies-from-zotero-items/ + - /lessons/deprecated/counting-frequencies-from-zotero-items/ + - /lessons/retired/counting-frequencies-from-zotero-items/ doi: 10.46430/phen0025 --- @@ -255,6 +255,6 @@ Word Frequencies ... 
``` - [Counting Frequencies]: /lessons/counting-frequencies - [zip]: /assets/python-lessons5.zip - [Lesson on the Zotero API]: /lessons/intro-to-the-zotero-api +- [Counting Frequencies](/en/lessons/counting-frequencies) +- [zip](/assets/python-lessons5.zip) +- [Lesson on the Zotero API](/en/lessons/retired/intro-to-the-zotero-api) diff --git a/en/lessons/retired/creating-new-items-in-zotero.md b/en/lessons/retired/creating-new-items-in-zotero.md index 781f5e165d..52cc14164a 100755 --- a/en/lessons/retired/creating-new-items-in-zotero.md +++ b/en/lessons/retired/creating-new-items-in-zotero.md @@ -11,8 +11,8 @@ difficulty: 1 activity: transforming topics: [api] abstract: "In this lesson, you will create a new item in a Zotero library and add some basic metadata such as title and date." -next: counting-frequencies-from-zotero-items -previous: intro-to-the-zotero-api +next: /en/lessons/retired/counting-frequencies-from-zotero-items +previous: /en/lessons/retired/intro-to-the-zotero-api categories: [zotero, api] exclude_from_check: - reviewers @@ -20,8 +20,8 @@ retired: true retirement-reason: | This lesson relied on the Python library libZotero, which is no longer maintained, and which now returns several errors when used.
[See further discussion about this retirement decision.](https://github.com/programminghistorian/jekyll/issues/225) redirect_from: - - /lessons/creating-new-items-in-zotero - - /lessons/deprecated/creating-new-items-in-zotero + - /lessons/creating-new-items-in-zotero/ + - /lessons/deprecated/creating-new-items-in-zotero/ doi: 10.46430/phen0026 --- diff --git a/en/lessons/retired/getting-started-with-github-desktop.md b/en/lessons/retired/getting-started-with-github-desktop.md index 5d4224dbbc..0eb0f44d70 100755 --- a/en/lessons/retired/getting-started-with-github-desktop.md +++ b/en/lessons/retired/getting-started-with-github-desktop.md @@ -15,13 +15,13 @@ activity: sustaining topics: [data-management] abstract: "In this lesson you will be introduced to the basics of version control, understand why it is useful and implement basic version control for a plain text document using git and GitHub." redirect_from: - - /lessons/getting-started-with-github-desktop - - /en/lessons/getting-started-with-github-desktop + - /lessons/getting-started-with-github-desktop/ + - /en/lessons/getting-started-with-github-desktop/ retired: true retirement-reason: | This lesson is for an old version of GitHub Desktop that is now no longer maintained or supported by GitHub. The new version and its documentation can be found at - We also recommend the Software Carpentry tutorial on version control at + We also recommend the Software Carpentry tutorial on version control at doi: 10.46430/phen0051 --- @@ -104,7 +104,7 @@ Dropbox, Google Drive and other services offer some form of version control in t Using version control has become pretty well established in some scientific disciplines, though its adoption is still far from universal. In the humanities and social sciences the use of version control systems like Git are much less common. 
The projects below show some possible ways of using Git in academic setting: -* [The Programming Historian](https://github.com/programminghistorian/jekyll) which uses GitHub in the work-flow of managing the [journal](https://github.com/programminghistorian/jekyll/issues), [lessons](/author-guidelines) and [producing the site.](/posts/how-we-moved-to-github) +* [The Programming Historian](https://github.com/programminghistorian/jekyll) which uses GitHub in the work-flow of managing the [journal](https://github.com/programminghistorian/jekyll/issues), [lessons](/en/author-guidelines) and [producing the site.](/posts/how-we-moved-to-github) * [Python Programming for the Humanities](https://github.com/fbkarsdorp/python-course) is a tutorial introducing the Python programming language. * [ProfHacker](https://www.chronicle.com/search?q=github) has posts on various projects on using GitHub in an academic context. @@ -136,11 +136,11 @@ Although there are many benefits to writing our documents in plain text files we Markdown is a way of including formatting into a plain text document. You may have come across HTML or LaTeX in the past. These markup languages also express information about the formatting and structure of plain text. Markdown, however, tries to minimize the syntax. This makes it easier to focus on the content of writing without the markup getting in the way---hence the name 'markdown.' -The Markdown syntax won't be covered in this lesson in order to keep the length short. However, it makes sense to explore Markdown once you have gone through this lesson and are comfortable with the basics of version control. GitHub integrates its own version of Markdown syntax. If you add Markdown syntax to documents you version control with GitHub Desktop these will be rendered on the GitHub website. Arguably the best way to learn Markdown is to begin using it. 
The [Getting Started with Markdown](/lessons/getting-started-with-markdown) lesson by Sarah Simpkin provides an overview of Markdown while the lesson [Sustainable Authorship in Plain Text using Pandoc and Markdown](/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown) by Dennis Tenen and Grant Wythoff explains how to utilise Markdown in combination with Pandoc to use plain text for your academic writing. +The Markdown syntax won't be covered in this lesson in order to keep the length short. However, it makes sense to explore Markdown once you have gone through this lesson and are comfortable with the basics of version control. GitHub integrates its own version of Markdown syntax. If you add Markdown syntax to documents you version control with GitHub Desktop these will be rendered on the GitHub website. Arguably the best way to learn Markdown is to begin using it. The [Getting Started with Markdown](/en/lessons/getting-started-with-markdown) lesson by Sarah Simpkin provides an overview of Markdown while the lesson [Sustainable Authorship in Plain Text using Pandoc and Markdown](/en/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown) by Dennis Tenen and Grant Wythoff explains how to utilise Markdown in combination with Pandoc to use plain text for your academic writing. ### Text Editors -To write in plain text we want to use a text editor. There are a huge number of free and paid text editors available. Some of these are very straightforward and simple to use while others have a learning curve and potential uses beyond simple text editing. In the long run using a more advanced and extendable text editor like Vim or Emacs may save you time but for now we can start with a simpler editor. [Atom](https://atom.io/) is a good option for getting started. Atom is a text editor built by GitHub and includes syntax highlighting for Markdown alongside integration with GitHub. 
It is free and open source, a full 'flight manual', including installation instructions, is available [here](http://flight-manual.atom.io/). +To write in plain text we want to use a text editor. There are a huge number of free and paid text editors available. Some of these are very straightforward and simple to use while others have a learning curve and potential uses beyond simple text editing. In the long run using a more advanced and extendable text editor like Vim or Emacs may save you time but for now we can start with a simpler editor. [Atom](https://atom.io/) is a good option for getting started. Atom is a text editor built by GitHub and includes syntax highlighting for Markdown alongside integration with GitHub. It is free and open source, a full 'flight manual', including installation instructions, is available [here](https://flight-manual.atom.io/). If you don't want to install any new software then you can use your system's included text editor: TextEdit for Mac and Notepad for windows. If you decide to use Markdown beyond this tutorial then you will benefit from a text editor which includes syntax highlighting for Markdown alongside other features useful for writing. @@ -205,7 +205,7 @@ There are differences between using version control for code and text which will It is important that you use meaningful commit summaries and messages. Writing good commit messages requires some prior thought. Messages that make sense to you as an explanation of changes when you make a commit may no longer make sense to you in the future. If you are going to use version control in collaboration with other people it is especially important that other people can understand your commit messages. Version control as a system for managing changes to documents works best when active thought goes into using the software. It is therefore particularly important when collaborating with other that there is a shared understanding and approach to using version control. 
-One way of addressing this is to try to follow a 'commit style'. One influential [suggestion](http://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html) for a commit style has been made by Tim Pope. The style suggestions made by Tim Pope are partly ['built in'](https://github.com/blog/926-shiny-new-commit-styles) to the GitHub Desktop commit message interface but understanding the format will help ensure a consistent approach. The following commit message paraphrases Tim Pope's suggested format to focus on commits relating to text rather than code: +One way of addressing this is to try to follow a 'commit style'. One influential [suggestion](https://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html) for a commit style has been made by Tim Pope. The style suggestions made by Tim Pope are partly ['built in'](https://github.com/blog/926-shiny-new-commit-styles) to the GitHub Desktop commit message interface but understanding the format will help ensure a consistent approach. The following commit message paraphrases Tim Pope's suggested format to focus on commits relating to text rather than code: ``` Capitalized, short (50 chars or less) summary @@ -241,7 +241,7 @@ A potentially useful parallel to writing good commit messages is the messages in The benefits of using version control rely to a large degree on using the system effectively. This means thinking about when to make commits and how to best convey the changes in that commit in a message. Focusing on making both your messages and your commits 'atomic' will make it easier to 'move' through different stages of your repositories history. A good repository will allow you to easily understand changes that were made at different stages, will be understood by other people and will help you reflect on the changes you make to a document. -There is some difference between how you would manage a repository primarily focused on code and one focused on text. 
Both, however, benefit from clear and logical organisation. This is something that is important to do with your research data regardless of whether you are version controlling it and/or making it public. For a useful introduction to managing research data see James Baker's lesson [Preserving Your Research Data](/lessons/preserving-your-research-data). +There is some difference between how you would manage a repository primarily focused on code and one focused on text. Both, however, benefit from clear and logical organisation. This is something that is important to do with your research data regardless of whether you are version controlling it and/or making it public. For a useful introduction to managing research data see James Baker's lesson [Preserving Your Research Data](/en/lessons/preserving-your-research-data). ### Publishing Your Repository @@ -326,9 +326,9 @@ This may seem like a convoluted approach to dealing with conflicts but it is ver ## Version Control in a Plain Text Workflow -So far we have only implemented version control with a very basic document. Learning more about Markdown and writing in plain text will allow you to use version control in more the sorts of documents you would use in your day-to-day work. Version controlling a Markdown document will allow you to learn the Markdown syntax while reinforcing your understanding of version controlling documents. [Sustainable Authorship in Plain Text using Pandoc and Markdown](/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown) by Dennis Tenen and Grant Wythoff will provide you with an understanding of how you could use plain text for academic writing using Pandoc and Markdown. Pandoc allows you to convert Markdown formated Plain Text files into numerous different formats including HTML, PDF and Word. The combination of Markdown, Pandoc and Version Control will provide a powerful, sustainable and flexible approach to academic writing. 
+So far we have only implemented version control with a very basic document. Learning more about Markdown and writing in plain text will allow you to use version control in more of the sorts of documents you would use in your day-to-day work. Version controlling a Markdown document will allow you to learn the Markdown syntax while reinforcing your understanding of version controlling documents. [Sustainable Authorship in Plain Text using Pandoc and Markdown](/en/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown) by Dennis Tenen and Grant Wythoff will provide you with an understanding of how you could use plain text for academic writing using Pandoc and Markdown. Pandoc allows you to convert Markdown formatted Plain Text files into numerous different formats including HTML, PDF and Word. The combination of Markdown, Pandoc and Version Control will provide a powerful, sustainable and flexible approach to academic writing. -The workflow introduced in this lesson can also be used as a foundation to create static websites hosted on GitHub. Once you are comfortable using GitHub Desktop, you may wish to proceed to Amanda Visconti's lesson, [Building a Static Website with Jekyll and GitHub Pages](../lessons/building-static-sites-with-jekyll-github-pages). +The workflow introduced in this lesson can also be used as a foundation to create static websites hosted on GitHub. Once you are comfortable using GitHub Desktop, you may wish to proceed to Amanda Visconti's lesson, [Building a Static Website with Jekyll and GitHub Pages](/en/lessons/building-static-sites-with-jekyll-github-pages). ## Further Resources @@ -340,5 +340,5 @@ GitHub Desktop offers an easy way of getting started with GitHub and version con * [Pro Git](https://git-scm.com/book/en/v2): A book on Git. Begins with the basics and later covers more advanced usage of Git.
* For [students](https://education.github.com/pack) and [researchers](https://github.com/blog/1840-improving-github-for-science) GitHub offers free private repositories. These repositories may be useful for early drafts of work or for managing notes which are never intended to becoming public. Note: it might not be a good idea to store things which are very sensitive on GitHub even in a private repository. * [ProfHacker](https://www.chronicle.com/search?q=github) has posts on various projects on using GitHub in an academic context. -* [GitHub, Academia, and Collaborative Writing](https://www.hastac.org/blogs/harrisonm/2013/10/12/github-academia-and-collaborative-writing) discusses using GitHub for collaborative writing. -* [Introduction to the Bash Command Line](/lessons/intro-to-bash) introduces the command line which will be useful preparation for using GitHub on the command line. +* [GitHub, Academia, and Collaborative Writing](https://web.archive.org/web/20131215095438/https://www.hastac.org/blogs/harrisonm/2013/10/12/github-academia-and-collaborative-writing) discusses using GitHub for collaborative writing. +* [Introduction to the Bash Command Line](/en/lessons/intro-to-bash) introduces the command line which will be useful preparation for using GitHub on the command line. diff --git a/en/lessons/retired/graph-databases-and-SPARQL.md b/en/lessons/retired/graph-databases-and-SPARQL.md index 72412d4761..01bd456fff 100755 --- a/en/lessons/retired/graph-databases-and-SPARQL.md +++ b/en/lessons/retired/graph-databases-and-SPARQL.md @@ -17,8 +17,8 @@ topics: [lod] abstract: "This lesson explains why many cultural institutions are adopting graph databases, and how researchers can access these data though the query language called SPARQL." 
categories: [lessons] redirect_from: -- /lessons/graph-databases-and-SPARQL -- /en/lessons/graph-databases-and-SPARQL + - /lessons/graph-databases-and-SPARQL/ + - /en/lessons/graph-databases-and-SPARQL/ retired: true retirement-reason: | The British Museum has failed to maintain their collections database in a consistent and reliably-accessible manner. Although the SPARQL syntax and commands remain correct, the URLs they attempt to connect to have become too unreliable to use in a working lesson. @@ -53,7 +53,7 @@ great if you come looking for information about particular objects. However, it makes it difficult to aggregate information about every artist or donor that happens to be described in the dataset as well. -[api]: /lessons/intro-to-the-zotero-api.html +[api]: /en/lessons/retired/intro-to-the-zotero-api RDF databases are well-suited to expressing complex relationships between many entities, like people, places, events, and concepts tied to individual @@ -71,15 +71,15 @@ Vocabulary Program][getty], has also released their series of authoritative databases on geographic place names, terms for describing art and architecture, and variant spellings of artist names, as LOD. -[getty]: http://vocab.getty.edu +[getty]: https://vocab.getty.edu -[bm]: http://collection.britishmuseum.org +[bm]: https://collection.britishmuseum.org -[Europeana]: http://labs.europeana.eu/api/linked-open-data-introduction +[Europeana]: https://labs.europeana.eu/api/linked-open-data-introduction -[saam]: http://americanart.si.edu +[saam]: https://americanart.si.edu -[yale]: http://britishart.yale.edu/collections/using-collections/technology/linked-open-data +[yale]: https://britishart.yale.edu/collections/using-collections/technology/linked-open-data SPARQL is the language used to query these databases. This language is particularly powerful because it does not presuppose the perspectives that users @@ -137,7 +137,7 @@ maximum flexibility in deciding how they wish to query it. 
SPARQL lets us translate heavily interlinked, graph data into normalized, tabular data with rows and columns you can open in programs like Excel, or -import into a visualization suite such as [plot.ly](http://plot.ly) or +import into a visualization suite such as [plot.ly](https://plot.ly) or [Palladio]. It is useful to think of a SPARQL query as a [Mad @@ -284,13 +284,13 @@ SPARQL endpoint is a web address that accepts SPARQL queries and returns results. The BM endpoint is like many others: if you navigate to it in a web browser, it presents you with a text box for composing queries. -[bms]: http://collection.britishmuseum.org/sparql +[bms]: https://collection.britishmuseum.org/sparql {% include figure.html filename="sparql03.png" caption="The BM SPARQL endpoint webpage. For all the queries in this tutorial, make sure that you have left the 'Include inferred' and 'Expand results over equivalent URIs' boxes unchecked." %} When starting to explore a new RDF database, it helps to look at the relationships that stem from a single [example -object](http://collection.britishmuseum.org/id/object/PPA82633). +object](https://collection.britishmuseum.org/id/object/PPA82633). (For each of the following queries, click on the "Run query" link below to see the results. You can then run it as @@ -304,7 +304,7 @@ WHERE { } ``` -[Run query](http://collection.britishmuseum.org/sparql?query=SELECT+*%0D%0AWHERE+%7B%0D%0A++%3Chttp%3A%2F%2Fcollection.britishmuseum.org%2Fid%2Fobject%2FPPA82633%3E+%3Fp+%3Fo+.%0D%0A++%7D&_implicit=false&_equivalent=false&_form=%2Fsparql) +[Run query](https://collection.britishmuseum.org/sparql?query=SELECT+*%0D%0AWHERE+%7B%0D%0A++%3Chttp%3A%2F%2Fcollection.britishmuseum.org%2Fid%2Fobject%2FPPA82633%3E+%3Fp+%3Fo+.%0D%0A++%7D&_implicit=false&_equivalent=false&_form=%2Fsparql) By calling `SELECT ?p ?o` we're asking the database to return the values of `?p` and `?o` as described in the `WHERE {}` command. 
This query returns every @@ -485,14 +485,14 @@ authorities. One endpoint that does, however, is [Europeana's][eursparql]. They have created links between the objects in their database and records about individuals in -[DBPedia](http://wiki.dbpedia.org/) and [VIAF](https://viaf.org/), places in -[GeoNames](http://sws.geonames.org/), and concepts in the Getty Art & +[DBPedia](https://wiki.dbpedia.org/) and [VIAF](https://www.oclc.org/en/viaf.html), places in +[GeoNames](https://sws.geonames.org/), and concepts in the Getty Art & Architecture thesaurus. SPARQL allows you to insert `SERVICE` statements that instruct the database to "phone a friend" and run a portion of the query on an outside dataset, using the results to complete the query on the local dataset. While this lesson will go into the data models in Europeana and DBpedia in depth, the following query illustrates how a `SELECT` statement works. You may run it yourself by copying and pasting the query text into the [Europeana endpoint][eursparql]. -[eursparql]: http://sparql.europeana.eu/ +[eursparql]: https://sparql.europeana.eu/ ``` PREFIX edm: @@ -554,12 +554,12 @@ and languages. Parsing the XML verson of this output may be done with a tool like Beautiful Soup ([see its _Programming Historian_ -lesson](/lessons/intro-to-beautiful-soup.html)) or [Open -Refine](http://openrefine.org/). To quickly convert JSON results from a SPARQL +lesson](/en/lessons/intro-to-beautiful-soup)) or [Open +Refine](https://openrefine.org/). To quickly convert JSON results from a SPARQL endpoint into a tabular format, I recommend the free command line utility -[jq](http://stedolan.github.io/jq/download/). (For a tutorial on using command +[jq](https://stedolan.github.io/jq/download/). (For a tutorial on using command line programs, see ["Introduction to the Bash Command -Line"](/lessons/intro-to-bash.html).) The following query will convert the +Line"](/en/lessons/intro-to-bash).) 
The following query will convert the special JSON RDF format into a CSV file, which you may load into your preferred program for further analysis and visualization: @@ -579,14 +579,14 @@ to load data from the BM endpoint you must use the address aggregation query we used above to count artworks by type and clicking on "Run query". Palladio should display a preview table. -[Palladio]: http://palladio.designhumanities.org/ +[Palladio]: https://palladio.designhumanities.org/ {% include figure.html filename="sparql10.png" caption="Palladio's SPARQL query interface." %} After previewing the data returned by the endpoint, click on the "Load data" button at the bottom of the screen to begin manipulating it. (See this [_Programming Historian_ -lesson](/lessons/creating-network-diagrams-from-historical-sources.html#visualize-network-data-in-palladio) +lesson](/en/lessons/creating-network-diagrams-from-historical-sources#visualize-network-data-in-palladio) for a more in-depth tutorial on Palladio.) 
For example, we might make a [query that returns links to the images of prints made between 1580 and 1600](https://collection.britishmuseum.org/sparql?query=%23+Return+object+links+and+creation+date%0D%0APREFIX+bmo%3A+%3Chttp%3A%2F%2Fcollection.britishmuseum.org%2Fid%2Fontology%2F%3E%0D%0APREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0D%0APREFIX+ecrm%3A+%3Chttp%3A%2F%2Ferlangen-crm.org%2Fcurrent%2F%3E%0D%0APREFIX+xsd%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2001%2FXMLSchema%23%3E%0D%0ASELECT+DISTINCT+%3Fobject+%3Fdate+%3Fimage%0D%0AWHERE+%7B%0D%0A%0D%0A++%23+We%27ll+use+our+previous+command+to+search+only+for+objects+of+type+%22print%22%0D%0A++%3Fobject+bmo%3APX_object_type+%3Fobject_type+.%0D%0A++%3Fobject_type+skos%3AprefLabel+%22print%22+.%0D%0A%0D%0A++%23+We+need+to+link+though+several+nodes+to+find+the+creation+date+associated%0D%0A++%23+with+an+object%0D%0A++%3Fobject+ecrm%3AP108i_was_produced_by+%3Fproduction+.%0D%0A++%3Fproduction+ecrm%3AP9_consists_of+%3Fdate_node+.%0D%0A++%3Fdate_node+ecrm%3AP4_has_time-span+%3Ftimespan+.%0D%0A++%3Ftimespan+ecrm%3AP82a_begin_of_the_begin+%3Fdate+.%0D%0A%0D%0A++%23+Yes%2C+we+need+to+connect+quite+a+few+dots+to+get+to+the+date+node%21+Now+that%0D%0A++%23+we+have+it%2C+we+can+filter+our+results.+Because+we+are+filtering+a+date%2C+we%0D%0A++%23+must+attach+the+xsd%3Adate+tag+to+our+date+strings+so+that+SPARQL+knows+how+to%0D%0A++%23+parse+them.%0D%0A%0D%0A++FILTER%28%3Fdate+%3E%3D+%221580-01-01%22%5E%5Exsd%3Adate+%26%26+%3Fdate+%3C%3D+%221600-01-01%22%5E%5Exsd%3Adate%29%0D%0A++%0D%0A++%3Fobject+bmo%3APX_has_main_representation+%3Fimage+.%0D%0A%7D%0D%0ALIMIT+100#query=%23+Return+object+links+and+creation+date%0APREFIX+bmo%3A+%3Chttp%3A%2F%2Fwww.researchspace.org%2Fontology%2F%3E%0APREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0APREFIX+xsd%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2001%2FXMLSchema%23%3E%0APREFIX+ecrm%3A+%3Chttp%3A%2F%2Fwww.cidoc-crm.org%2Fcidoc-crm%2F%3E%0ASELECT+DISTINCT+%3F
object+%3Fdate+%3Fimage%0AWHERE+%7B%0A++%0A++%23+We'll+use+our+previous+command+to+search+only+for+objects+of+type+%22print%22%0A++%3Fobject+bmo%3APX_object_type+%3Fobject_type+.%0A++%3Fobject_type+skos%3AprefLabel+%22print%22+.%0A%0A++%23+We+need+to+link+though+several+nodes+to+find+the+creation+date+associated%0A++%23+with+an+object%0A++%3Fobject+ecrm%3AP108i_was_produced_by+%3Fproduction+.%0A++%3Fproduction+ecrm%3AP9_consists_of+%3Fdate_node+.%0A++%3Fdate_node+ecrm%3AP4_has_time-span+%3Ftimespan+.%0A++%3Ftimespan+ecrm%3AP82a_begin_of_the_begin+%3Fdate+.%0A%0A++%0A++%23+Yes%2C+we+need+to+connect+quite+a+few+dots+to+get+to+the+date+node!+Now+that%0A++%23+we+have+it%2C+we+can+filter+our+results.+Because+we+are+filtering+a+date%2C+we%0A++%23+must+attach+the+xsd%3Adate+tag+to+our+date+strings+so+that+SPARQL+knows+how+to%0A++%23+parse+them.%0A%0A++FILTER(%3Fdate+%3E%3D+%221580-01-01%22%5E%5Exsd%3Adate+%26%26+%3Fdate+%3C%3D+%221600-01-01%22%5E%5Exsd%3Adate)%0A++%0A++%3Fobject+bmo%3APX_has_main_representation+%3Fimage+.%0A%7D%0ALIMIT+100), @@ -614,12 +614,12 @@ searching, or doing other mathematical operations more complex than counting. 
For a more complete rundown of the commands available in SPARQL, see these links: -- [Wikibooks SPARQL tutorial](http://en.wikibooks.org/wiki/XQuery/SPARQL_Tutorial) +- [Wikibooks SPARQL tutorial](https://en.wikibooks.org/wiki/XQuery/SPARQL_Tutorial) - [Full W3C Overview of SPARQL](https://www.w3.org/TR/sparql11-overview/) Both the Europeana and Getty Vocabularies LOD sites also offer extensive, and quite complex example queries which can be good sources for understanding how to search their data: -- [Europeana SPARQL how-to](http://labs.europeana.eu/api/linked-open-data-SPARQL-endpoint) -- [Getty Vocabularies Example Queries](http://vocab.getty.edu/queries) +- [Europeana SPARQL how-to](https://labs.europeana.eu/api/linked-open-data-SPARQL-endpoint) +- [Getty Vocabularies Example Queries](https://vocab.getty.edu/queries) diff --git a/en/lessons/retired/intro-to-augmented-reality-with-unity.md b/en/lessons/retired/intro-to-augmented-reality-with-unity.md index f1b8958e5a..abe2a2d7c8 100755 --- a/en/lessons/retired/intro-to-augmented-reality-with-unity.md +++ b/en/lessons/retired/intro-to-augmented-reality-with-unity.md @@ -21,8 +21,8 @@ retirement-reason: | [See further discussion about this retirement decision.](https://github.com/programminghistorian/jekyll/issues/717) redirect_from: - - /lessons/intro-to-augmented-reality-with-unity - - /lessons/deprecated/intro-to-augmented-reality-with-unity + - /lessons/intro-to-augmented-reality-with-unity/ + - /lessons/deprecated/intro-to-augmented-reality-with-unity/ doi: 10.46430/phen0053 --- @@ -45,19 +45,19 @@ In this introductory tutorial, you will learn how to: ## How can Humanists use Augmented Reality? 
-Novel applications of AR continue to surface within a variety of industries: [museums](https://www.youtube.com/watch?v=gx_UQxx54lo) are integrating AR content into their displays, [companies](http://www.gizmag.com/ikea-augmented-reality-catalog-app/28703/) are promoting AR apps in lieu of print or even web-based catalogs, and [engineering firms](https://www.youtube.com/watch?v=bXqe2zSepQ4) are creating AR applications showcasing their efforts to promote sustainability. [Predicted to grow](http://www.digi-capital.com/news/2015/04/augmentedvirtual-reality-to-hit-150-billion-disrupting-mobile-by-2020/#.VbetCU1VhHw) into a $120 billion industry within the next five years, augmented reality is an exciting new medium that humanists cannot afford to ignore. Indeed, many scholars within the growing field of digital humanities are beginning to explore how AR can be utilized as a viable medium of scholarly engagement within public spaces, objects, images, and texts. +Novel applications of AR continue to surface within a variety of industries: [museums](https://www.youtube.com/watch?v=gx_UQxx54lo) are integrating AR content into their displays, [companies](https://www.gizmag.com/ikea-augmented-reality-catalog-app/28703/) are promoting AR apps in lieu of print or even web-based catalogs, and [engineering firms](https://www.youtube.com/watch?v=bXqe2zSepQ4) are creating AR applications showcasing their efforts to promote sustainability. [Predicted to grow](https://www.digi-capital.com/news/2015/04/augmentedvirtual-reality-to-hit-150-billion-disrupting-mobile-by-2020/#.VbetCU1VhHw) into a $120 billion industry within the next five years, augmented reality is an exciting new medium that humanists cannot afford to ignore. Indeed, many scholars within the growing field of digital humanities are beginning to explore how AR can be utilized as a viable medium of scholarly engagement within public spaces, objects, images, and texts. 
{% include figure.html filename="new-ar-dev-1.png" caption="Augmented reality can be used to overlay digital information onto existing texts such as historical markers. This modified image is based on a photograph by Nicholas Henderson." %} -Since at least 2010, [digital artists](https://manifestarblog.wordpress.com/about/) have been creating AR applications for social advocacy and cultural intervention. For example, Tamiko Thiel's AR project [Clouding Green](http://www.tamikothiel.com/AR/clouding-green.html) reveals the carbon footprint of specific technology companies. Projects such as Thiel's capitalize on AR's unique rhetorical affordance to provide compelling, site-specific interactions between physical and digital spaces. +Since at least 2010, [digital artists](https://manifestarblog.wordpress.com/about/) have been creating AR applications for social advocacy and cultural intervention. For example, Tamiko Thiel's AR project [Clouding Green](https://www.tamikothiel.com/AR/clouding-green.html) reveals the carbon footprint of specific technology companies. Projects such as Thiel's capitalize on AR's unique rhetorical affordance to provide compelling, site-specific interactions between physical and digital spaces. -At the [Trace Initiative](http://web.archive.org/web/20180421163517/http://english.ufl.edu/trace_arcs/), a digital humanities organization in the University of Florida English Department, we seek to build upon the work of these artists by promoting the creation and circulation of humanities-focused mobile AR applications. We released our first AR application [to the Google Play store](https://play.google.com/store/apps/details?id=com.Trace.Dollars&hl=en) in spring 2016. 
+At the [Trace Initiative](https://web.archive.org/web/20180421163517/https://english.ufl.edu/trace_arcs/), a digital humanities organization in the University of Florida English Department, we seek to build upon the work of these artists by promoting the creation and circulation of humanities-focused mobile AR applications. We released our first AR application [to the Google Play store](https://play.google.com/store/apps/details?id=com.Trace.Dollars&hl=en) in spring 2016. The augmented reality software used in this tutorial relies on image-recognition technology, meaning that it requires some kind of visual trigger (a logo, painting, etc.) to know when to display digital content. In the example application depicted in the image above, the application is programmed to only display the digital image of John C. Calhoun if the camera "recognizes" the specific historical marker with which it is associated. For this lesson, we will augment the cover of a physical book with a digital overlay that displays a picture of the author. You could use the technical skills gained throughout this tutorial to create digital overlays for a variety of texts such as historical documents or signs. For example, you might create an application that allows readers to scan the pages of a book or document and access historical context or critique related to that specific page. Humanities scholars could also use this tutorial to create site-specific AR applications to educate visitors about cultural aspects of a location that have been excluded from its historical presentation. ## A Note About AR Creation Platforms -Unity is a very powerful and complex application used to create desktop, console, and mobile games. It is not designed exclusively for augmented reality development.As a result, this lesson has many detailed, albeit necessary, steps for navigating and operating the Unity interface. 
Although some of the steps might not be directly related to augmented reality development, they are certainly transferrable to other tutorials on Programming Historian or elsewhere that utilize Unity. If you would prefer to gain some familiarity with the Unity Editor prior to completing this lesson, I would suggest consulting [Unity's beginner tutorial videos](https://learn.unity.com/tutorial/live-sessions-on-unity-interface-and-essentials) and the online [Unity manual](http://docs.unity3d.com/Manual/LearningtheInterface.html). +Unity is a very powerful and complex application used to create desktop, console, and mobile games. It is not designed exclusively for augmented reality development. As a result, this lesson has many detailed, albeit necessary, steps for navigating and operating the Unity interface. Although some of the steps might not be directly related to augmented reality development, they are certainly transferrable to other tutorials on Programming Historian or elsewhere that utilize Unity. If you would prefer to gain some familiarity with the Unity Editor prior to completing this lesson, I would suggest consulting [Unity's UI Components tutorial](https://learn.unity.com/course/user-interface-ui/tutorial/ui-components) and the online [Unity manual](https://web.archive.org/web/20181204043602/https://docs.unity3d.com/Manual/LearningtheInterface.html). Within Unity, you can access additional functionality by importing "extensions." It is common to import extensions into Unity to gain access to additional functionality and/or pre-designed game components such as characters or game props. The extension used within this lesson is called "Vuforia," and it will provide the code necessary to create AR applications within the Unity game engine. I discuss how to download Unity and import the Vuforia extension in the section labelled "Software Requirements."
@@ -113,7 +113,7 @@ Next, you will need to import the augmented reality package you just downloaded ### Java Development Kit -Download and install the [Java Development Kit](http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html) for your operating system. +Download and install the [Java Development Kit](https://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html) for your operating system. {% include figure.html filename="ar-dev-1-9.png" caption="Download the .exe file for your operating system." %} @@ -225,7 +225,7 @@ This cover of *Of Mice and Men* has sufficient visual complexity; however, it is {% include figure.html filename="ar-dev-11.png" caption="Photo courtesy of Mark Skwarek." %} -If you are taking a picture of your book cover, make sure that there are no extraneous features present in the image. In the case of the *Of Mice and Men* image above, this would be anything beyond the edge of the cover. If your image contains such extraneous features, either take another picture or open it in a photo editor such as [Gimp](http://www.gimp.org/) and +If you are taking a picture of your book cover, make sure that there are no extraneous features present in the image. In the case of the *Of Mice and Men* image above, this would be anything beyond the edge of the cover. If your image contains such extraneous features, either take another picture or open it in a photo editor such as [Gimp](https://www.gimp.org/) and crop out these features. [Consult this video tutorial](https://www.youtube.com/watch?v=2rGGpOTSpbc) for help on cropping and resizing images in Gimp. Make sure that your image file is under 2.5 mb and that it is a .jpg or .png file. {% include figure.html filename="ar-dev-12.png" caption="Crop out the area around the book." 
%} @@ -304,7 +304,7 @@ To adjust your perspective in 3D space, hold the Alt button (Option on Mac) on y {% include figure.html filename="ar-dev-5.gif" caption="Position your author image on top of the book cover." %} -Because Unity is optimized for 3D environments, it is sometimes difficult to work with 2D game objects such as images. If you are new to Unity, do not be alarmed if you cannot find your images or if you feel disoriented while manipulating them in your scene view. If you want to learn more about using Unity's transform tools, I would suggest checking out [this short video tutorial by Info Gamer](https://www.youtube.com/watch?v=2Ariq8vc5Vc) and reading up on [Transforms in the Unity Manual](http://docs.unity3d.com/Manual/Transforms.html). +Because Unity is optimized for 3D environments, it is sometimes difficult to work with 2D game objects such as images. If you are new to Unity, do not be alarmed if you cannot find your images or if you feel disoriented while manipulating them in your scene view. If you want to learn more about using Unity's transform tools, I would suggest checking out [this short video tutorial by Info Gamer](https://www.youtube.com/watch?v=2Ariq8vc5Vc) and reading up on [Transforms in the Unity Manual](https://web.archive.org/web/20171101025409/https://docs.unity3d.com/Manual/Transforms.html). If you cannot find your author image in the scene view, try the following steps: @@ -329,7 +329,7 @@ If your overlay does not appear, double check the "Database Load Behaviour" comp ### Android -Before you can install your own applications on your Android device, you will need to [enable USB debugging](http://developer.android.com/tools/device.html). To do this, go to "Setting" > About Device" and tap the "Build number" seven times. Return to the previous screen and you should now see a "Developer Options" tab. Click it and make sure the option for "USB debugging" is checked. 
+Before you can install your own applications on your Android device, you will need to [enable USB debugging](https://developer.android.com/tools/device.html). To do this, go to "Settings" > "About Device" and tap the "Build number" seven times. Return to the previous screen and you should now see a "Developer Options" tab. Click it and make sure the option for "USB debugging" is checked. {% include figure.html filename="ar-dev-25.png" caption="Tap the 'Build Number' seven times." %} diff --git a/en/lessons/retired/intro-to-beautiful-soup.md b/en/lessons/retired/intro-to-beautiful-soup.md index 6c9c4096ae..738b6cd475 100644 --- a/en/lessons/retired/intro-to-beautiful-soup.md +++ b/en/lessons/retired/intro-to-beautiful-soup.md @@ -15,9 +15,9 @@ and other markup languages." exclude_from_check: - review-ticket - reviewers -redirect_from: - - /lessons/intro-to-beautiful-soup - - /en/lessons/intro-to-beautiful-soup +redirect_from: + - /lessons/intro-to-beautiful-soup/ + - /en/lessons/intro-to-beautiful-soup/ retired: true retirement-reason: | The underlying website has changed and no longer produces the HTML referenced in the lesson. @@ -187,7 +187,7 @@ The Congressional database that we’re using is not an easy one to scrape because the URL for the search results remains the same regardless of what you’re searching for. While this can be bypassed programmatically, it is easier for our purposes to go -to , search for +to , search for Congress number 43, and to save a copy of the results page. @@ -653,11 +653,11 @@ for tr in trs: You’ve done it! You have created a CSV file from all of the data in the table, creating useful data from the confusion of the html page.
- [Working with Text Files]: /lessons/working-with-text-files - [Command Line Bootcamp]: http://praxis.scholarslab.org/resources/bash/ - [Opening lines of Beautiful Soup]: http://www.crummy.com/software/BeautifulSoup/bs4/doc/ - [installing python modules]: /lessons/installing-python-modules-pip - [urllib3]: http://urllib3.readthedocs.org/en/latest/ - [Automated Downloading with Wget]: /lessons/automated-downloading-with-wget - [Downloading Multiple Records Using Query Strings]: /lessons/downloading-multiple-records-using-query-strings - [Document Object Model]: https://en.wikipedia.org/wiki/Document_Object_Model +- [Working with Text Files](/en/lessons/working-with-text-files) +- [Command Line Bootcamp](https://praxis.scholarslab.org/resources/bash/) +- [Opening lines of Beautiful Soup](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) +- [installing python modules](/en/lessons/installing-python-modules-pip) +- [urllib3](https://urllib3.readthedocs.org/en/latest/) +- [Automated Downloading with Wget](/en/lessons/automated-downloading-with-wget) +- [Downloading Multiple Records Using Query Strings](/en/lessons/downloading-multiple-records-using-query-strings) +- [Document Object Model](https://en.wikipedia.org/wiki/Document_Object_Model) diff --git a/en/lessons/retired/intro-to-the-zotero-api.md b/en/lessons/retired/intro-to-the-zotero-api.md index fe5c392722..77d578c1a5 100755 --- a/en/lessons/retired/intro-to-the-zotero-api.md +++ b/en/lessons/retired/intro-to-the-zotero-api.md @@ -12,7 +12,7 @@ activity: acquiring topics: [api] abstract: "In this lesson, you’ll learn how to use python with the Zotero API to interact with your Zotero library." 
-next: creating-new-items-in-zotero +next: /en/lessons/retired/creating-new-items-in-zotero categories: [lessons, api] exclude_from_check: - reviewers @@ -20,8 +20,8 @@ retired: true retirement-reason: | This lesson relied on the Python library libZotero, which is no longer maintained, and which now returns several errors when used. [See further discussion about this retirement decision.](https://github.com/programminghistorian/jekyll/issues/225) redirect_from: - - /lessons/intro-to-the-zotero-api - - /lessons/deprecated/intro-to-the-zotero-api + - /lessons/intro-to-the-zotero-api/ + - /lessons/deprecated/intro-to-the-zotero-api/ doi: 10.46430/phen0030 --- @@ -201,7 +201,7 @@ Now that we have worked through retrieving information using the Zotero API, we can continue to use it to interact with the items stored in our library. - [Zotero]: http://zotero.org + [Zotero]: https://zotero.org [Quick Start Guide]: https://www.zotero.org/support/quick_start_guide [libZotero GitHub library]: https://github.com/fcheslack/libZotero [Installing Python Modules with pip]: /lessons/installing-python-modules-pip diff --git a/en/lessons/scalable-reading-of-structured-data.md b/en/lessons/scalable-reading-of-structured-data.md index e9020ab142..ed2e8a5350 100644 --- a/en/lessons/scalable-reading-of-structured-data.md +++ b/en/lessons/scalable-reading-of-structured-data.md @@ -22,7 +22,7 @@ topics: [api] abstract: In this lesson, you will be introduced to 'scalable reading' and how to apply this workflow to your analysis of structured data. 
avatar_alt: Drawing of honeycomb lesson-partners: [Jisc, The National Archives] -partnership-url: /jisc-tna-partnership +partnership-url: /en/jisc-tna-partnership doi: 10.46430/phen0103 --- @@ -70,7 +70,7 @@ Below, the three steps are explained in general terms as well as specifically us If you want to reproduce the analysis we present below, using not only the overall conceptual framework but also the code, we assume that you already have a dataset containing Twitter data in a JSON format. If you don't have a dataset you can acquire one in the following ways: -1. Using one of Twitter’s APIs, e.g., their freely available so-called "Essential" API which we used to retrieve the dataset used in the example (see more about APIs this section to the [Introduction to Populating a Website with API Data](/en/lessons/introduction-to-populating-a-website-with-api-data#what-is-application-programming-interface-api)). This link will take you to [Twitter's API options](https://developer.twitter.com/en/docs/twitter-api/getting-started/about-twitter-api). You can use the 'rtweet' package, with your own Twitter account to access the Twitter API through R as described below. +1. Using one of Twitter’s APIs, e.g., their freely available so-called "Essential" API which we used to retrieve the dataset used in the example (see more about APIs in the lesson [Introduction to Populating a Website with API Data](/en/lessons/introduction-to-populating-a-website-with-api-data)). This link will take you to [Twitter's API options](https://developer.twitter.com/en/docs/twitter-api/getting-started/about-twitter-api). You can use the 'rtweet' package, with your own Twitter account to access the Twitter API through R as described below. 2. Using the [Beginner's Guide to Twitter Data](/en/lessons/beginners-guide-to-twitter-data) from the _Programming Historian_. But rather than choosing a CSV output, choose a JSON.
In R, you work with packages, each adding numerous functionalities to the core functions of R. Packages are often community-created code, made available for reuse. When using packages you are standing on the shoulders of other coders. In this example the relevant packages are the following: rtweet, tidyverse, lubridate and jsonlite. To install packages in R see this section of lesson [Basic Text Processing in R](/en/lessons/basic-text-processing-in-r#package-set-up). To use the packages in R they have to be loaded with the `library()` function as below: diff --git a/en/lessons/sentiment-analysis-syuzhet.md b/en/lessons/sentiment-analysis-syuzhet.md index 9cd7160f40..65f5b3c423 100644 --- a/en/lessons/sentiment-analysis-syuzhet.md +++ b/en/lessons/sentiment-analysis-syuzhet.md @@ -1,661 +1,661 @@ ---- -title: "Sentiment Analysis with 'syuzhet' using R" -slug: sentiment-analysis-syuzhet -original: analisis-de-sentimientos-r -layout: lesson -collection: lessons -date: 2021-03-23 -translation_date: 2023-04-01 -authors: -- Jennifer Isasi -translator: -- Adam Crymble -editors: -- Maria José Afanador-Llach -reviewers: -- Riva Quiroga -translation-editor: -- Rolando Rodriguez -translation-reviewer: -- Shuang Du -- Andrew Janco -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/478 -difficulty: 2 -activity: analyzing -topics: [distant-reading, r, data-visualization] -abstract: This lesson teaches you how to obtain and analyse narrative texts for patterns of sentiment and emotion. 
-avatar_alt: Engraving of three faces expressing different emotions -doi: 10.46430/phen0110 ---- - -{% include toc.html %} - - -# Lesson Objectives - -This lesson introduces you to the [`syuzhet`](https://perma.cc/9DNJ-ZWPW) [sentiment analysis](https://perma.cc/A92Q-PM4D) algorithm, written by [Matthew Jockers](https://perma.cc/9PF8-3GZ4) using the [R programming language](https://perma.cc/W78Z-FUAX), and applies it to a single narrative text to demonstrate its research potential. The term 'syuzhet' is Russian (сюже́т) and translates roughly as 'plot', or the order in which events in the narrative are presented to the reader, which may be different than the actual time sequence of events (the '[fabula](https://perma.cc/M7C9-XT99)'). The `syuzhet` package similarly considers sentiment analysis in a time-series-friendly manner, allowing you to explore the developing sentiment in a text across the pages. - -To make the lesson useful for scholars working with non-English texts, this tutorial uses a Spanish-language novel, *[Miau](https://perma.cc/G6V3-JCWS)* by [Benito Pérez Galdós](https://perma.cc/9P3P-2FQP) (1888) as its case study. This allows you to learn the steps necessary to work with everything from accented characters to thinking through the intellectual problems of applying English language algorithms to non-English texts. You do not need to know Spanish to follow the lesson (though you will if you want to read the original novel). Some steps in the following instructions may not be necessary if you are working with English-language texts, but those steps should be self-evident. - -Although the lesson is not intended for advanced R users, it is expected that you will have some knowledge of R, including an expectation that you already have [R installed](https://www.r-project.org/) and that you know how to load R packages. The author recommends downloading [RStudio](https://www.rstudio.com/) as a user-friendly environment for working in R. 
If you have not used R before, you may first want to try working through some of the following introductory R lessons: - -* Taylor Arnold and Lauren Tilton, '[Basic Text Processing in R](/en/lessons/basic-text-processing-in-r)', *Programming Historian* 6 (2017), https://doi.org/10.46430/phen0061 -* Taryn Dewar, '[R Basics with Tabular Data](/en/lessons/r-basics-with-tabular-data)', *Programming Historian* 5 (2016), https://doi.org/10.46430/phen0056 -* Nabeel Siddiqui, '[Data Wrangling and Management in R](/en/lessons/data-wrangling-and-management-in-r)', *Programming Historian* 6 (2017), https://doi.org/10.46430/phen0063 - -You may also be interested in other sentiment analysis lessons: - -* Zoë Wilkinson Saldaña, '[Sentiment Analysis for Exploratory Data Analysis](/en/lessons/sentiment-analysis),' *Programming Historian* 7 (2018), https://doi.org/10.46430/phen0079 -* Matthew Jockers, '[Introduction to the Syuzhet Package](https://perma.cc/9BN2-F3N3)' (2020). - -At the end of the lesson you will be able to: - -* Develop appropriate research questions that apply sentiment analysis to literary or narrative texts -* Use the R programming language, RStudio, and the `syuzhet` package with the [NRC Word-Emotion Association Lexicon](https://perma.cc/A8M5-2SDG) to generate sentiment scores for words in texts of various languages -* Critically interpret the results of your sentiment analysis -* Visualise the results through a range of graphs (bar, word cloud) to aid interpretation - -This lesson was written and tested using version 4.2.x of R using a Mac and on 4.0.x using a Windows machine. - -> Generally, R works the same on Windows, Mac, and Linux operating systems. However, when working on a Windows machine with non-English texts or those containing accents or special characters, you will need to include some extra instructions to apply [UTF-8](https://perma.cc/5HY2-HHN2) character encoding to ensure special characters are properly interpreted. 
Where this is a necessary step, it is shown below. - -
    -Translator's Note for Educators: - -A number of steps in this tutorial require loading / running time that may exceed 15 to 30 minutes during which participants have to wait. This may affect your ability to use the tutorial in a time-limited live event such as a workshop. Note also that to use this tutorial in a workshop setting, participants will need the ability to install software on their machine. -
    - -# Background Information - -This section introduces the concepts and the software that you will use to perform a sentiment analysis of a text. It also introduces the case study document, the novel *Miau* by Benito Pérez Galdós, and the ways you can apply sentiment analysis meaningfully to a text such as *Miau*. - -## Sentiment Analysis - -Sentiment analysis, also known as opinion mining, is an umbrella term for a number of processes for automatically calculating the degree of negativity or positivity in a text. It has been used for some time in the fields of marketing and politics to better understand the public mood;[^1] however, its adoption in literary studies is more recent and as of yet no one method dominates use.[^2] Some approaches to sentiment analysis also enable you to measure the presence of a number of different emotions in a text, as will be the case for the example in this tutorial. - -What is the difference between 'emotion' and 'sentiment'? The two words are often used interchageably in English but refer to different concepts. - -According to Antonio R. Damasio, 'emotions' are the biologically rooted, instinctive reactions of our bodies to environmental stimuli.[^3] There is no universally agreed list of basic emotions, however a common model includes six: anger (or rage), joy, disgust (or revulsion), fear, sadness, and surprise -- though for Damasio the last of those falls into a category he would describe as a '[secondary emotion](https://perma.cc/Y675-4C52)'. In the case of the automated system that you will use, the secondary emotions 'anticipation' and 'trust' are also options for analysis. - -'Sentiment', on the other hand, is both the action of and effect of feeling an emotion. 
In other words, as Óscar Pereira Zazo notes, 'when an object, a person, a situation, or a thought brings us joy, it begins a process that can lead to the feeling of being joyful or happy'.[^4] Sentiment analysis suggests that you can measure the intensity of this effect (either positive, negative, or neutral) on the manifestation of an emotion. - -This lesson distiguishes between the two terms as described above. The effect (sentiment) will be measured as it evolves across the pages of the text, while the emotions will be measured by looking at word use more generally. - -## NRC Word-Emotion Association Lexicon - -Many sentiment analysis algorithms depend upon pre-compiled lexicons or dictionaries that assign numerical sentiment scores to words or phrases based on findings from previous linguistic research. The R package `syuzhet` has been designed to allow you to choose from four of these sentiment lexicons: [Bing](https://perma.cc/G9RV-RA82), [Afinn](https://perma.cc/GZB2-J2RH), [Stanford](https://perma.cc/TK8L-44ZW), and the [NRC Word-Emotion Association Lexicon](https://perma.cc/A8M5-2SDG).[^5] This lesson uses the NRC lexicon, as it is the only one of the four that can currently be used with non-English texts. - -This lexicon, which includes positive and negative sentiment values as well as eight emotional categories, was developed by Saif M. Mohammad, a scientist at the National Research Council Canada (NRC). The dataset that forms the lexicon has been manually annotated using the [Maximum Difference Scaling](https://perma.cc/KWW4-AFJ4) technique, or MaxDiff, to determine the most negative or positive sets of words relative to other words -- a sort of ranking of sentiment intensity of words.[^6] This particular lexicon has 14,182 unigrams (words) classified as either positive or negative. It also classifies a word's connection to various emotions: anger, anticipation, disgust, fear, joy, sadness, surprise, and trust. 
Using automatic translation, which may lack linguistic nuance in unpredictable ways, it is available in more than one hundred languages. - -The license on the dataset allows free use of the NRC lexicon for research purposes. All data is available for download. - -The [NRC Word-Emotion Association Lexicon](https://perma.cc/A8M5-2SDG) website outlines the different categories and classifications in the dataset. It also provides a number of resources that can help you to better understand how the lexicon was built, including links to published research, more information on obtaining values for individual words, the organisation of the dataset, and how to extend it. - -## The `syuzhet` R Package - -The [R package](https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html) `syuzhet` was released in 2015 by Matthew Jockers; at the time of writing it is still being actively maintained (we use version 1.0.6, the November 2020 release, in this lesson). - -If you intend to use the software on non-English texts, you should be aware that the package has been developed and tested in English, and it has not been received without controversy, including from [Annie Swafford](https://perma.cc/TYT3-5DTU) who challenged some of the algorithm's assumptions about text and the use of `syuzhet` in a research setting. This included concerns about incorrectly splitting sentences involving quotation marks, and problems with using a sentiment lexicon designed for modern English on a historic text that uses the same words in slightly different ways. Assigning concrete values of measurement to literary texts, which are by their nature quite subjective, is always challenging and potentially problematic. 
A series of archived blog entries by Jockers outline [his thoughts on the method and address some of the criticisms](https://web.archive.org/web/20190708100723/http://www.matthewjockers.net/page/2/) about the degree to which sentiment can accurately be measured when sometimes even humans disagree on a passage of text's effects on the reader. - - -> Some Research Warnings: The lexicon assigns values to individual words which are used as the basis for conducting the quantitative analysis. Those values were assigned by humans working in North America and may carry English-language and North American cultural biases. Researchers must therefore take several things into account before applying this methodology in their work: -> -> - The Spanish lexicon (and other non-English versions) is a direct translation carried out via machine translation. In the author's opinion, these systems are already fairly reliable when translating between English and Spanish but less so for other languages that NRC claims to be operable with, including Basque, for example. -> - The sentiment and emotion scores of each word need to be understood in cultural and temporal context. A term that the people building the NRC lexicon labelled positive may be negative in other contexts. This type of approach is therefore inherently coarse in its ability to reflect a *true* reading of the texts as conducted by a subject specialist through close reading. -> - The author does not recommend the use of this methodology in texts that are significantly metaphorical or symbolic. -> - This particular method does not properly handle negation. For example, it will wrongly classify 'I am not happy' as positive because it looks at individual words only. 
Research by Richard Socher (2014) has attempted to improve issues of negation in sentiment analysis, and may be worth exploring for those with a genuine research need.[^7] -> Following the spirit of adaptability of *Programming Historian* lessons in other languages, the author has decided to use `syuzhet` in its original form; however, at the end of the lesson you will be introduced to some advanced functions that will help you use your own sentiment dictionary with the package. - -As this tutorial works with emotion of a Spanish text, Table 1 provides a simple translation matrix of the key emotion names for ease of reference. - - -Table 1: Emotion categories in English and Spanish - -| English | Spanish | -| -------- | ------- | -| anger | enfado | -| anticipation | anticipación | -| disgust | disgusto | -| fear | miedo | -| joy | alegría | -| sadness | tristeza | -| surprise | sorpresa | -| trust | confianza | -| negative | negativo | -| positive | positivo | - - -## A Brief Example - -Before diving into the full analysis of our text *Miau*, we offer a short example of sentiment analysis in action, using `syuzhet` together with the NRC lexicon, focusing on the outputs instead of the code. This analysis uses R and prompts you to [tokenise](https://perma.cc/243B-E9M7) the text into a list of single-word strings (unigrams) that are then analysed one at a time. Sentence-level analysis is also possible in sentiment analysis, but is not the focus of this tutorial. - -Consider the analysis of the final passage from *Miau*: - -> **Spanish Original**: Retumbó el disparo en la soledad de aquel abandonado y tenebroso lugar; Villaamil, dando terrible salto, hincó la cabeza en la movediza tierra, y rodó seco hacia el abismo, sin que el conocimiento le durase más que el tiempo necesario para poder decir: «Pues... sí...». 
-> -> **Rough English Translation**: The shot boomed out in the solitude of that abandoned and gloomy space; Villaamil, taking a terrible leap, bowed his head to the moving earth and rolled towards the abyss, his awareness lasting no longer than the time necessary to say: 'Well...yes...'. -> -> *Miau* by Benito Pérez Galdós. - -This passage will be transformed into a list of words: - -```R -example: - -> [1] "retumbó" "el" "disparo" "en" "la" "soledad" -> [7] "de" "aquel" "abandonado" "y" "tenebroso" "lugar" -> [13] "villaamil" "dando" "terrible" "salto" "hincó" "la" ... -``` - -Using the sentiment analysis function, you then calculate the eight emotions as classified by NRC, as well as the positive and negative scores of each word. The following is the result for the first few words in this short passage: - -```R -print(example_2, row.names = example) - -> anger anticipation disgust fear joy sadness surprise trust negative positive -> retumbó 0 0 0 0 0 0 0 0 0 0 -> el 0 0 0 0 0 0 0 0 0 0 -> disparo 3 0 0 2 0 2 1 0 3 0 -> en 0 0 0 0 0 0 0 0 0 0 -> la 0 0 0 0 0 0 0 0 0 0 -> solitude 0 0 0 2 0 2 0 0 2 0 -> de 0 0 0 0 0 0 0 0 0 0 -> aquel 0 0 0 0 0 0 0 0 0 0 -> abandonado 2 0 0 1 0 2 0 0 3 0 -> y 0 0 0 0 0 0 0 0 0 0 -> tenebroso 0 0 0 0 0 0 0 0 0 0 -> lugar 0 0 0 0 0 0 0 0 0 0 -> villaamil 0 0 0 0 0 0 0 0 0 0 -> dando 0 0 0 0 0 0 0 0 0 1 -> terrible 2 1 2 2 0 2 0 0 2 0 -> salto 0 0 0 0 0 0 0 0 0 0 -> hincó 0 0 0 0 0 0 0 0 0 0 -> la 0 0 0 0 0 0 0 0 0 0 -... -``` - -
    -Translator's Note: -R will not translate these into English for you, but to make the tutorial easier to follow for English speakers, the same output would look like the following if the passage was in English (notice that when translating word-by-word the results are slightly different than when translating whole passages, as above): -
    - -```R -print(example_2, row.names = example) - -> anger anticipation disgust fear joy sadness surprise trust negative positive -> boomed 0 0 0 0 0 0 0 0 0 0 -> the 0 0 0 0 0 0 0 0 0 0 -> shot 3 0 0 2 0 2 1 0 3 0 -> in 0 0 0 0 0 0 0 0 0 0 -> the 0 0 0 0 0 0 0 0 0 0 -> solitude 0 0 0 2 0 2 0 0 2 0 -> of 0 0 0 0 0 0 0 0 0 0 -> that 0 0 0 0 0 0 0 0 0 0 -> abandoned 2 0 0 1 0 2 0 0 3 0 -> and 0 0 0 0 0 0 0 0 0 0 -> gloomy 0 0 0 0 0 0 0 0 0 0 -> place 0 0 0 0 0 0 0 0 0 0 -> villaamil 0 0 0 0 0 0 0 0 0 0 -> taking 0 0 0 0 0 0 0 0 0 1 -> terrible 2 1 2 2 0 2 0 0 2 0 -> leap 0 0 0 0 0 0 0 0 0 0 -> bowed 0 0 0 0 0 0 0 0 0 0 -> his 0 0 0 0 0 0 0 0 0 0 -... -``` - -The results are returned in a [data frame](https://perma.cc/ER4M-WRRC). Using this scoring system, every word in our human languages has a default value of 0 indicating no connection to the corresponding emotion. Any words not in the NRC lexicon will be treated by the code as if they have values of 0 for all categories. A score greater than 0 indicates both that the word is present in the NRC lexicon and that the researchers responsible for that lexicon have assigned it a value indicating the strength of its connection to one of the emotional categories. - -In this example we can see that the words 'disparo' (shot), 'soledad' (solitude), 'abandonado' (abandoned), and 'terrible' (terrible) have a negative score associated with them (second-to-last column), while 'dando' (taking) is judged as a positive word (last column). - -We are also able to see which emotions each word is connected to: 'disparo' (shot) is associated with *anger* (3), *fear* (2), *sadness* (2), and *surprise* (1). Higher numbers mean greater strength of the connection to that emotion. - -The possibilities of exploring, analysing, and visualising these results depend on your programming skills, but also your research needs. 
To help you reach your potential with sentiment analysis, this lesson shows you how to analyse data and build an understanding of the results through various visualisations. - -## Appropriate Research Questions - -As already stated, in this lesson, you will analyse the Spanish novel *Miau* by [Benito Pérez Galdós](https://perma.cc/9P3P-2FQP), published in 1888. Pérez Galdós is known for his Spanish realist novels; this particular story takes place in Madrid at the end of the nineteenth century and satirises the government administration of the day. In a kind of tragic comedy, we witness the final days of Ramón Villaamil after he becomes unemployed, as his family tries to stretch their meagre budget while keeping up the pretence of wealthy living. Villaamil's spiral of misfortune and his inability to find a new job end in tragedy. - -From a research standpoint, the question is: Can we observe the emotional downward spiral of this plot through an automatic extraction of sentiment in the text? Does a human reader's interpretation of the negative experiences of Villaamil match the results of the algorithm? And if so, what words within the novel are used most to signal the emotional trajectory of the story? - - -# Obtaining Sentiment and Emotion Scores - -The process of conducting the sentiment analysis is a four-stage affair. First, code must be installed and loaded into the R environment of your choice. Then, you must load and pre-process the text you want to analyse. Then you conduct your analysis. Finally, you turn your attention to interpreting the results. - -## Install and Load Relevant R Packages - -Before processing the text, you must first install and load the correct R code packages. In this case, that includes [`syuzhet`](https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html). 
You will also be visualising the results, which will require a number of other R packages: [`RColorBrewer`](https://cran.r-project.org/web/packages/RColorBrewer/index.html), [`wordcloud`](https://perma.cc/GM67-HBH3), [`tm`](https://perma.cc/T2JG-LEBJ) and [`NLP`](https://perma.cc/NS79-H5DH). - -To install and load these packages, copy and execute the sample code below in your chosen R coding environment. The first few lines will install the packages (only needed if you haven't already got the packages installed). The second set of lines will load them so that you can use them in your programme. The installation of these packages may take a few minutes. - -```R -# Install the Packages -install.packages("syuzhet") -install.packages("RColorBrewer") -install.packages("wordcloud") -install.packages("tm") - -# Load the Packages -library(syuzhet) -library(RColorBrewer) -library(wordcloud) -library(tm) -``` - -## Load and Prepare the Text - -Next, download a machine readable copy of the novel: [*Miau*](/assets/analisis-de-sentimientos-r/galdos_miau.txt) and make sure to save it as a .txt file. When you open the file you will see that the novel is in [plain text](https://perma.cc/Z5WH-V9SW) format, which is essential for this particular analysis using R. - -With the text at hand, you first need to load it into R as one long string so that you can work with it programmatically. Make sure to replace `FILEPATH` with the location of the novel on your own computer (don't just type 'FILEPATH'). This loading process is slightly different on Mac/Linux and Windows machines: - -### On Mac and Linux - -You can [find the FILEPATH](https://perma.cc/ZXZ8-FZHG) using your preferred method. 
The final format on my computer is `/Users/Isasi/Desktop/miau.txt` - -On a Mac/Linux machine, use the function `get_text_as_string`, which is part of the `syuzhet` package: - -```R -text_string <- get_text_as_string("FILEPATH") -``` - -### On Windows - -You can [find the FILEPATH](https://perma.cc/N9R4-HEJY) using your preferred method. The final format on my computer is `C:\\Users\\Isasi\\Desktop\\miau.txt` - -The Windows operating system cannot directly read characters with tildes, accents, or from extended alphabet sets, all of which are commonly used in languages such as Spanish, French, and Portuguese. Therefore we must first alert the software that our novel uses the [UTF-8](https://perma.cc/5HY2-HHN2) set of characters (which includes accents and many other non-English characters). We do this using the `scan` function. - -> Note that when typing your filepath, you may need to escape the backslashes (`\`) in the filepath. To do this, just add a second backslash each time it appears in the path. (E.g. "`C:\\...`") - -```R -text_string <- scan(file = "FILEPATH", fileEncoding = "UTF-8", what = character(), sep = "\n", allowEscapes = T) -``` ---- - -Now that the data has loaded, you have to format it in the way the sentiment analysis algorithm expects to receive it. In this particular case, that is as a [list](https://perma.cc/LPV9-XGX8) containing either single words or sentences (here you will focus on individual words only). - -This means you need an intermediate step between loading the text and extracting the sentiment values. To meet this need, we will divide the character string into a list of words, sometimes also referred to as [unigrams](https://perma.cc/FX4C-ZLYB) or [tokens](https://perma.cc/V6UY-KKVK). - -To do this you can use the package's built-in `get_tokens()` function to generate a new data object containing each individual word as a list. This function also removes spaces and punctuation from the original text. 
This approach to tokenisation uses [regular expressions](https://perma.cc/W7YD-K3R7) and is not appropriate in all use cases. It will, for example, split hyphenated words into two. Depending on your text, you should consider the implications of your chosen method of tokenisation, as you can use any method you like as long as the output is in the same format as in the example below. - -```R -text_words <- get_tokens(text_string) -head(text_words) - -> [1] "miau" "por" "b" "pérez" "galdós" "14" -``` - -Now you can use the `length()` function to count how many words are in the original text: - -```R -length(text_words) - -> [1] 97254 -``` - -If you want to analyse the text by sentence, use the `get_sentences()` function and follow the same process except for creating the word cloud below: - -```R -> sentence_vector <- get_sentences(text_string) -length(sentence_vector) -[1] 6022 -``` - - -## Extracting Data with the NRC Sentiment Lexicon - -Now you can use the `get_nrc_sentiment` function to obtain the sentiment scores for each word in the novel. The default vocabulary for the software is English. Since this text is in Spanish, you will use the `lang` argument to set the vocabulary to Spanish. This would not be necessary if working on an English text. Then you will create a new data object to store the extracted data so that you can work with it further. This `get_nrc_sentiment` function searches for the presence of the eight emotions and two sentiments against each word in your list, and assigns each a number greater than 0 if the word is found within the NRC's lexicon. Depending on the speed of your computer and the nature of your text, this process may take between 15 and 30 minutes. - -```R -sentiment_scores <- get_nrc_sentiment(text_words, lang="spanish") -``` -You can also use this package with [a range of other languages](https://perma.cc/9BN2-F3N3), though the 2020 release only works on languages with Latin-based alphabets. 
Other languages that can be substituted for `spanish` in the above line of code are: `basque`, `catalan`, `danish`, `dutch`, `english`, `esperanto`, `finnish`, `french`, `german`, `irish`, `italian`, `latin`, `portuguese`, `romanian`, `swedish`, and `welsh`. We can hope that the functionality will improve in future to include more languages. - -Some users reported getting a warning message when the code finished running. At the time of writing this is a warning that the `syuzhet` codebase may need to be updated in future, but should not affect your ability to use it at present. The warning was that "spread_() was deprecated in tidyr 1.2.0. Please use spread() instead. The deprecated feature was likely used in the syuzhet package. Please report the issue to the authors." In this case, only Matthew Jockers can fix the error, as it is an issue with the code he created, not with your instructions to run it. - -When the process finishes, you may want to verify the contents of the new data object. To avoid printing thousands of lines of text, you can use the `head()` function to show only the first six unigrams. If you are following the example, you should see the following (which is lacking in context at this point). - -```R -head(sentiment_scores) - -> anger anticipation disgust fear joy sadness surprise trust negative positive -> 1 0 0 0 0 0 0 0 0 0 0 -> 2 0 0 0 0 0 0 0 0 0 0 -> 3 0 0 0 0 0 0 0 0 0 0 -> 4 0 0 0 0 0 0 0 0 0 0 -> 5 0 0 0 0 0 0 0 0 0 0 -> 6 0 0 0 0 0 0 0 0 0 0 -``` - -### Summary of the Text - -More interesting is a summary of the values associated with each of the eight emotions and two sentiments, which can be displayed using the `summary()` function. This can be very useful when comparing various texts, and can allow you to see different measures, such as the average relative value of each of the emotions and the two sentiments. 
For example, we can see that the novel *Miau* on average ([mean](https://perma.cc/5NKH-2TYV)), uses more positive (0.05153) language than negative (0.04658), according to the algorithm. However, it seems that terms associated with sadness (0.02564) are also more prevalent than those associated with joy (0.01929). - -This summary output also shows a number of other calculations, many of which have a value of 0, including the [median](https://perma.cc/KB36-B855). Words that are not found in the sentiment lexicon (NRC) will automatically be treated as if they have a value of 0. Because there are a lot of categories and the story is quite complex, it is not surprising that no one emotion or sentiment has distinctively high statistical values. This makes the minimum, maximum, and mean the most useful measures from this summary output. - -```R -summary(sentiment_scores) - -> anger anticipation disgust fear -> Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000 -> 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 -> Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000 -> Mean :0.01596 Mean :0.02114 Mean :0.01263 Mean :0.02243 -> 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 -> Max. :5.00000 Max. :3.00000 Max. :6.00000 Max. :5.00000 -> joy sadness surprise trust -> Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000 -> 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 -> Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000 -> Mean :0.01929 Mean :0.02564 Mean :0.01035 Mean :0.03004 -> 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 -> Max. :5.00000 Max. :7.00000 Max. :2.00000 Max. :3.00000 -> negative positive -> Min. :0.00000 Min. :0.00000 -> 1st Qu.:0.00000 1st Qu.:0.00000 -> Median :0.00000 Median :0.00000 -> Mean :0.04658 Mean :0.05153 -> 3rd Qu.:0.00000 3rd Qu.:0.00000 -> Max. :7.00000 Max. 
:5.00000 -``` - -# Interpreting the Results - -You now have the quantitative results of your sentiment analysis of a text. Now, what can you do with these numbers? This section introduces three different visualisations of the data: bar charts, word counts, and word clouds, which offer quick but different ways of making sense of the outputs and telling a story or forming an argument about what you've discovered. - -## Bar Chart by Emotion - -To quickly get a sense of which emotions have a major presence in the text, a bar chart is both a simple and effective format for displaying your data (Figure 1). The built-in [`barplot()`](https://perma.cc/5DXU-CYS9) function can be paired with the summary data of each of the emotions: *anger*, *anticipation*, *disgust*, *fear*, *joy*, *sadness*, *surprise*, and *trust*. These are stored in columns 1 to 8 of our data table. This approach of displaying the data uses the `prop.table()` function with the results of each of the emotion words to present the results.[^8] - -```R -barplot( - colSums(prop.table(sentiment_scores[, 1:8])), - space = 0.2, - horiz = FALSE, - las = 1, - cex.names = 0.7, - col = brewer.pal(n = 8, name = "Set3"), - main = "'Miau' by Benito Pérez Galdós, 1907 edition", - sub = "Analysis by Dr Jennifer Isasi", - xlab="emotions", ylab = NULL) -``` - -The rest of the parameters that you can see in the code are optional and have been added to help you learn how to customise the graph outputs. They include indicating the space between the bars (`space = 0.2`), that the chart should include vertical not horizontal bars (`horiz=FALSE`), and that the values on the axis should increase in units of 1 (`las=1`). We also reduce the font size of the labels (`cex.names = 0.7`) to make sure they fit nicely on the screen. Thanks to the [`RColorBrewer`](https://perma.cc/BHK9-AY7S) package that we installed and loaded at the beginning of the lesson, we can automatically colour the columns. 
In this case we've used the `brewer.pal` colour palette from `Set3`, and specified we need 8 colours (`n=8`) – one colour per column. You can learn more about `RColorBrewer` and its options on [the documentation page for that package](https://perma.cc/4EHL-P8E9). Finally, we add a title and subtitle to the graph using the `main` and `sub` parameters, along with the word `emotions` on the X axis. We have not added a label to the Y axis, but you could do so if you wished by following the model above. - -{% include figure.html filename="tr-en-analisis-de-sentimientos-r-1.png" alt="Bar chart showing the calculated scores of eight emotions measured in the novel ‘Miau’ (1907) by Pérez Galdós. The emotions are anger, anticipation, disgust, fear, joy, sadness, surprise, and trust. The ‘trust’ bar is the tallest, followed by ‘sadness’ and ‘fear’, while ‘disgust’ and ‘surprise’ are the shortest. This is included because it shows the relative outputs of the sentiment analysis algorithm across these eight emotions." caption="Figure 1: Bar chart showing the calculated scores of eight emotions measured in the novel 'Miau' by Pérez Galdós." %} - -If you are not interested in modifying these parameters, you could create a bar chart with default styling using the following code: - -```R -barplot(colSums(prop.table(sentiment_scores[, 1:8]))) -``` - -> Make sure you have enough space in the display window for the graph to draw properly, including space for the labels. - -This information already indicates to us that the *sadness* and *fear* emotions are more prevalent than those of *disgust* or *surprise*. But what words does Galdós use to express *fear*? And how often does each emotionally charged word appear in the novel? - -## Counting Words by Emotion - -One of the measures you can calculate using sentiment analysis is the frequency of words appearing in the text and how those words relate to each emotional category. 
To start with, you need to create a data object with all of the words that have a value greater than 0 -- in this case you will start with those corresponding to the *sadness* column. In order to select only that column, use the dollar symbol `$` after the name of your `sentiment_scores` variable to specify the name of the column you want to work with: *sadness*. - - -```R -sad_words <- text_words[sentiment_scores$sadness> 0] -``` - -The contents of `sad_words` do not tell you much on their own, since they only offer you the list of relevant words without any further context. To also obtain the number of appearances of each 'sadness' word, you can generate a table. To get a quick look at some of the top entries, use the `unlist`, `table`, and `sort` functions along with the `decreasing` argument to display the matches in descending order (if you want ascending order, change TRUE to FALSE); you can create a new table object to print the first twelve words in the list, along with their frequency, using the following code (see Table 2 for translations of the Spanish words): - -```R -sad_word_order <- sort(table(unlist(sad_words)), decreasing = TRUE) -head(sad_word_order, n = 12) - -> muy nada pobre tarde -> 271 156 64 58 -> mal caso malo salir -> 57 50 39 35 -> madre insignificante ay culpa -> 33 29 24 22 -``` - -Table 2: English translations of the Spanish words in the preceding code output block - -| Spanish | English | -| ------- | ------- | -| muy | very | -| nada | nothing | -| pobre | poor | -| tarde | late | -| mal | bad | -| caso | case | -| malo | bad | -| salir | to leave | -| madre | mother | -| insignificante | insignificant | -| ay | ow! 
| -| culpa | fault | - - -If you want to know how many unique words are connected to sadness, you can use the `length` function on the newly created `sad_word_order` variable: - -```R -length(sad_word_order) - -> [1] 349 -``` - -You can repeat the same operation with the rest of the emotion categories, or those that you are interested in, as well as those with positive or negative sentiment scores. To make sure you understand how to adapt the code, try to obtain the results for the emotion 'joy' and compare them with 'sadness'.[^9] - -Depending on the type of analysis that you want to conduct, this may be an efficient approach. For the purposes of this introductory lesson, you are next going to generate a word cloud to help visualise the terms associated with each emotional category (for demonstration purposes, you will use four). - - -## An Emotional Word Cloud - -In order to create a word cloud of terms that correspond with each emotion in *Miau*, you are going to first collect all words with an emotion score greater than 0. Similarly to the previous example, you use the `$` symbol to specify which column of data (which emotion) you are interested in, indicating that you want entries with a value greater than 0. - -The code differs slightly by operating system: if you are working on a machine running Windows, you will also have to indicate to the programme that your text contains accented characters. - -### On Mac and Linux - -```R -cloud_emotions_data <- c( - paste(text_words[sentiment_scores$sadness> 0], collapse = " "), - paste(text_words[sentiment_scores$joy > 0], collapse = " "), - paste(text_words[sentiment_scores$anger > 0], collapse = " "), - paste(text_words[sentiment_scores$fear > 0], collapse = " ")) -``` - -### On Windows - -Windows needs an additional step to indicate the text is in UTF-8 format, which is done using the `iconv` function. 
- -```R -cloud_emotions_data <- c( - paste(text_words[sentiment_scores$sadness> 0], collapse = " "), - paste(text_words[sentiment_scores$joy > 0], collapse = " "), - paste(text_words[sentiment_scores$anger > 0], collapse = " "), - paste(text_words[sentiment_scores$fear > 0], collapse = " ")) - -cloud_emotions_data <- iconv(cloud_emotions_data, "latin1", "UTF-8") -``` - -Once you have collected the data for the four target emotions, you can organise it into four separate `documents` to use as the basis for creating each of your four word clouds: - -```R -cloud_corpus <- Corpus(VectorSource(cloud_emotions_data)) -``` - -Next, you transform the corpus into a term-document matrix using the `TermDocumentMatrix()` function. Then you specify that you want the data organised as a matrix using the `as.matrix()` function. - -To see the first few entries of this output, use the `head` function: - -```R -cloud_tdm <- TermDocumentMatrix(cloud_corpus) -cloud_tdm <- as.matrix(cloud_tdm) -head(cloud_tdm) - -> Docs -> Terms 1 2 3 4 -> abandonado 4 0 4 0 -> abandonar 1 0 0 0 -> abandonará 2 0 0 0 -> abandonaré 1 0 0 0 -> abandonarías 1 0 0 0 -> abandono 3 0 3 0 - -``` - -Now, rename the numbered columns with the relevant emotion words so that the output is more human-readable. Again, you can see the state of your dataset with the `head` function: - -```R -colnames(cloud_tdm) <- c('sadness', 'happiness', 'anger', 'joy') -head(cloud_tdm) - -> Docs -> Terms sadness happiness anger trust -> abandonado 4 0 4 4 -> abandonar 1 0 0 1 -> abandonará 2 0 0 2 -> abandonaré 1 0 0 1 -> abandonarías 1 0 0 1 -> abandono 3 0 3 3 -``` - -Finally, you can visualise these results as a word cloud. The font size of a word in a word cloud is linked to the frequency of its appearance in the document. We can also control a number of other aspects of the word cloud's presentation. 
- -To start, use the `set.seed()` function to ensure that while following along your outputs will look the same as in the example (if you don't do this your output will have a randomised pattern and may not match the screenshots herein - which may not be important for your own research results but is helpful when following along). - -To generate the cloud itself, use the [comparison.cloud](https://perma.cc/6QRY-5KBG) function from the R `wordcloud` package. In this example, you will indicate that the object `cloud_tdm` will have a non-random word order. You will also specify the colour scheme of each group of words, the title size, and general scale of the visualisation. To make the cloud readable, you will also specify a maximum number of terms. These parameters are all adjustable. - -```R -set.seed(757) # this can be set to any integer -comparison.cloud(cloud_tdm, random.order = FALSE, - colors = c("green", "red", "orange", "blue"), - title.size = 1, max.words = 50, scale = c(2.5, 1), rot.per = 0.4) -``` - -You should get an image similar to Figure 2 although with the location of the words altered since it is generated according to the size of the canvas. - -{% include figure.html filename="tr-en-analisis-de-sentimientos-r-2.png" alt="Word Cloud of most frequent words corresponding to sadness, happiness, anger, and joy in the novel ‘Miau’ by Pérez Galdós. The words are colour-coded to show that they correspond with one of the four emotions, and use a cartesian coordinate system so that all words most closely associated with happiness are in the top left quadrant, sadness in the top right, and so on. Words that are most prevalent in the text appear closest to the centre of the graph. The word ‘muy’ (Spanish for ‘very’) is the largest word, and is associated with sadness. This is included because it shows which words are prevalent, and which emotions they are most closely associated with according to the sentiment analysis algorithm." 
caption="Figure 2: Word Cloud of most frequent words corresponding to sadness, happiness, anger, and joy in the novel 'Miau' by Pérez Galdós." %} - -What does the word cloud suggest to you? Surely the connection of 'very' (muy) to the sadness emotion and of 'money' (dinero) to the anger emotion needs further consideration. These less obvious results are exactly what many scholars warn about when thinking about sentiment analysis, and demonstrate why a researcher must always ask if the outcomes of the analysis make sense before trying to draw any research conclusions from them. As noted, the sentiment analysis lexicon used in this tutorial has been automatically translated from English, and is thus not perfect when used on Spanish-language text. - -## Visualising Emotion and Sentiment Across the Progression of a Text - -To complement the isolated readings of emotions as above, you can also study the fluctuation of positive and negative sentiment across the text (Figure 3). R provides a way to both normalise and visualise this time-series sentiment analysis data. Since the sentiment analysis algorithm assigns both positive and negative sentiment scores, you need to generate data between a range of -1 (most negative moments) and 1 (most positive moments); 0 is considered neutral. To calculate these scores, you multiply the values in the `negative` column of the original `sentiment_scores` data table by -1 and then add the result to the values in the `positive` column. - - -```R -sentiment_valence <- (sentiment_scores$negative *-1) + sentiment_scores$positive -``` - -Finally, you can generate a graph with the `simple_plot()` function, which is built into the `syuzhet` package, and which offers you a choice of two different graphs; the first presents the various measurements calculated by the algorithm, and the second is a normalisation of those measures. 
The horizontal axis (X axis) presents the text in 100 normalised fragments and the vertical axis (Y axis) shows the strength of the sentiment in the text. Depending on the computing power of your machine, the graph may take 20 to 30 minutes to finish rendering. - -```R -simple_plot(sentiment_valence) -``` - -> Make sure your graph display window is sized large enough to actually draw the graph. If it isn't you will see the error message: `Error in plot.new() : figure margins too large.` - -{% include figure.html filename="tr-en-analisis-de-sentimientos-r-3.png" alt="A pair of line charts that show the rough emotional intensity of positive and negative sentiment across the whole novel. The graphs use a line graph with a solid curving line moving left-to-right to represent the beginning, middle, and end. In this particular novel, a simplified chart shows that the sentiment rises through the first quarter of the story, before diving in the middle and staying low until the end, representing quite a depressing story. A less simplified version shows that the sentiment picks up a few times later in the novel, but dips well into negative sentiment a number of times. This is included because it shows the emotional intensity of the novel over time." caption="Figure 3: Evolution of the use of positive and negative sentiment through the novel 'Miau' by Pérez Galdós" %} - -Based on Figure 3, you might conclude that the novel *Miau* begins with fairly neutral language, transitions into moments of happiness early on, and moves into some quite negative description in the remaining pages, ending on a negative note, as indicated by the sample sentence we drew upon earlier in the lesson in which Villaamil dies. Anyone who has read the novel will know well the protagonist's despair, so in this case the analysis matches a traditional reading of the text, which answers our research question about whether or not the automated sentiment analysis reflects a close reading of the text. 
- - -# Save Your Data - -If you want to save the data so that you can come back to it later, you can archive it in comma-separated values ([CSV](https://perma.cc/64FY-NTSU)) format, using the function `write.csv()`. This will save your main data table, `sentiment_scores`, which contains the results of the eight emotions and two sentiments we generated, to a CSV file. You can also add the keyword associated with each row in the left-most column to act as helpful labels. - - -```R -write.csv(sentiment_scores, file = "analysis_sent_miau.csv", row.names = text_words) -``` - -Now you have all of the tools and knowledge you need to start to analyse your own texts and compare them with each other. - -# Loading your own Sentiment Lexicon - -While the above introduction provides you with many tools for exploring sentiment analysis, this tutorial has not presented an exhaustive list of possibilities. - -You may be working on a project in which you have already created a sentiment dictionary that you would like to use. Or perhaps you need to be able to customise a vocabulary and its corresponding sentiment scores to apply to a particular cultural or temporal context related to your research. Maybe you're looking to improve upon the automatically translated results of the NRC lexicon used here. In each of those cases, as of mid 2022, you can also load your own lexicon dataset into the software using the `custom` method to repeat some of the calculations and visualisations used in this lesson. - -To load your own sentiment lexicon, you first have to create or modify a dataframe containing at minimum a column of words and a column containing the corresponding scores for those words, which the author recommends saving in a CSV file format. 
- -Try this example: - -```R -|word|value| -|---|---| -|amor|1| -|cólera|-1| -|alfombra|0| -|catástrofe|-2| -``` - -Next, to load your saved data from a CSV file, use the `read.csv` function, which will create a new dataset that you can access in R just as you have in the above examples (change 'FILEPATH' to the full location of your CSV file): - -```R -personalised_vocabulary <- read.csv("FILEPATH") -method <- "custom" -sentiments_sentences <- get_sentiment(sentences_vector, method = method, lexicon = personalised_vocabulary) -``` - -
    -Warning: If you get an error message 'incomplete final line found by readTableHeader', this indicates that your CSV file is not formatted properly and lacks an 'end of line' character at the end of the file. The easiest way to correct this is to open your CSV file in a text editor (not MS Word), scroll to the end of the file, press return, and re-save the file. A fuller explanation of this error is available on Stack Overflow. -
    - -If you want to visualise sentiment across the progression of a text, you can use the `plot` function, which uses the same graphing parameters that you've already learned: - -```R -plot(sentiments_sentences, - type = "l", - main = "'Miau' by Benito Pérez Galdós, 1907 edition", - sub = "Analysis by Dr Jennifer Isasi", - xlab="emotions", ylab = " " - ) -``` - -Keep in mind that this form of customised analysis is limited, and that you may not be able to perform all of the same operations that we introduced above. For example, following the model example with your own dictionary, as you would not have information about emotions you would not be able to make a word cloud in the same way. - - -# Works Cited - -* Arnold, Taylor, and Lauren Tilton. 'Basic Text Processing in R', *Programming Historian* 6 (2017), https://doi.org/10.46430/phen0061 -* Damasio, Antonio R. *El error de Descartes: La razón de las emociones* (Andres Bello, 1999). -* Dewar, Taryn. 'R Basics with Tabular Data', *Programming Historian* 5 (2016), https://doi.org/10.46430/phen0056. -* Gottschalk, Louis, and Goldine Gleser. *The Measurement of Psychological States through the Content Analysis of Verbal Behaviour* (University of California, 1969). -* Heuser, Ryan, Franco Moretti, Erik Steiner. 'The Emotions of London' *Stanford Literary Lab*, Pamphlet 13 (2016) 1-10. -* Hu, Minqing, and Bing Liu, 'Mining and Summarizing Customer Reviews.', *Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery & Data Mining* (KDD-2004), 2004. -* Jockers, Matthew. 'Introduction to the Syuzhet Package' *CRAN* (2020), [https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html](https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html -). -* Jockers, Matthew. 'Some thoughts on Annie's thoughts...about Syuzhet' *Matthew L. 
Jockers* (2015), [http://www.matthewjockers.net/page/2/](https://web.archive.org/web/20190708100723/http://www.matthewjockers.net/page/2/). -* Leemans, Inger, Janneke M. van der Zwaan, Isa Maks, Erika Kujpers, Kristine Steenberge. 'Mining Embodied Emotions: A Comparative Analysis of Sentiment and Emotion in Dutch Texts, 1600-1800' *Digital Humanities Quarterly* 11 (2017). -* Liu, Bing. *Sentiment Analysis and Opinion Mining* (Morgan & Claypool, 2012). -* Meder, Theo, Dong Nguyen, Rilana Gravel. ‘The Apocalypse on Twitter’ *Digital Scholarship in the Humanities* 31 (2016), 398-410. -* Mohammad, Saif. 'NRC Word-Emotion Association Lexicon', *National Research Council Canada* (2010), [https://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm](https://perma.cc/A8M5-2SDG). -* Mohammad, Saif, and Peter D. Turney. 'Crowdsourcing a Word–Emotion Association Lexicon' *Computational Intelligence* 29 (2013): 436-465, doi: 10.1111/j.1467-8640.2012.00460.x. -* Nguyen, Thein Hai, Kiyoaki Shirai, Julien Velcin. 'Sentiment Analysis on Social Media for Stock Movement Prediction' *Expert Systems with Applications* 42 (2015), 9603-9611. -* Nielsen, Finn Årup. 'AFINN Sentiment Lexicon' (2009-2011). -* Pereira Zazo, Óscar. *El analisis de la comunicación en español* (Kendal Hunt, 2015). -* Pérez Galdós, Benito. *Miau* (La Guirnalda, 1888). -* Pérez Galdós, Benito. *Miau* (Sucesores de Hernando, 1907). -* Rodríguez Aldape, Fernando Manuel. *Cuantificación del Interés de un usuario en un tema mediante minería de texto y análisis de sentimiento.* (MA Thesis, Universidad Autónoma de Nuevo León, 2013). -* Schmidt, Thomas, Manuel Burghardt, Christian Wolff. 'Towards Multimodal Sentiment Analysis of Historic Plays: A Case Study with Text and Audio for Lessing's Emilia Galotti' *4th Conference of the Association of Digital Humanities in the Nordic Countries* (2019). -* Siddiqui, Nabeel. 'Data Wrangling and Management in R', *Programming Historian* 6 (2017), https://doi.org/10.46430/phen0063. 
-* Sprugnoli, Rachele, Sara Tonelli, Alessandro Marchetti, Giovanni Moretti. 'Towards Sentiment Analysis for Historical Texts' *Digital Scholarship in the Humanities* 31 (2016): 762-772. -* Stone, Philip, Dexter Dunphy, Marshall Smith. ‘The General Inquirer: A Computer Approach to Content Analysis’ (M.I.T. Press, 1966). -* Swafford, Annie. 'Problems with the Syuzhet Package' *Anglophile in Academia* (2015), [https://annieswafford.wordpress.com/2015/03/02/syuzhet/](https://perma.cc/TYT3-5DTU). -* Wilkinson Saldaña, Zoë. 'Sentiment Analysis for Exploratory Data Analysis,' *Programming Historian* 7 (2018), https://doi.org/10.46430/phen0079 - - - -# Notes - -[^1]: For example, see: Louis Gottschalk, Goldine Gleser (1969) *The Measurement of Psychological States through the Content Analysis of Verbal Behaviour* (University of California); Philip Stone, Dexter Dunphy, Marshall Smith (1966) ‘The General Inquirer: A Computer Approach to Content Analysis’ (M.I.T. Press); Bing Liu, (2012) *Sentiment Analysis and Opinion Mining* (Morgan & Claypool); Thein Hai Nguyen, Kiyoaki Shirai, Julien Velcin (2015). ‘Sentiment Analysis on Social Media for Stock Movement Prediction’ *Expert Systems with Applications* 42: 9603-9611; Theo Meder, Dong Nguyen, Rilana Gravel (2016). ‘The Apocalypse on Twitter’ *Digital Scholarship in the Humanities* 31 (2): 398-410. -[^2]: For some examples in English, see: Inger Leemans, Janneke M. van der Zwaan, Isa Maks, Erika Kujpers, Kristine Steenberge (2017). 'Mining Embodied Emotions: A Comparative Analysis of Sentiment and Emotion in Dutch Texts, 1600-1800' *Digital Humanities Quarterly* 11 (4); Rachele Sprugnoli, Sara Tonelli, Alessandro Marchetti, Giovanni Moretti (2016). 'Towards Sentiment Analysis for Historical Texts' *Digital Scholarship in the Humanities* 31 (4): 762-772; Thomas Schmidt, Manuel Burghardt, Christian Wolff (2019). 
'Towards Multimodal Sentiment Analysis of Historic Plays: A Case Study with Text and Audio for Lessing's Emilia Galotti' *4th Conference of the Association of Digital Humanities in the Nordic Countries*; Ryan Heuser, Franco Moretti, Erik Steiner (2016). 'The Emotions of London' *Stanford Literary Lab*, Pamphlet 13: 1-10. -[^3]: Antonio R. Damasio, *El Error de Descartes: La razón de las emociones*. (Barcelona: Andres Bello, 1999). -[^4]: Óscar Pereira Zazo, *El analisis de la comunicación en español* (Iowa: Kendal Hunt, 2015), 32. -[^5]: 'Bing': Minqing Hu and Bing Liu, 'Mining and summarizing customer reviews.', *Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery & Data Mining* (KDD-2004), 2004; 'Afinn': Finn Årup Nielsen, 'AFINN Sentiment Lexicon' (2009-2011); 'NRC': Saif Mohammad, '[NRC Word-Emotion Association Lexicon](https://perma.cc/A8M5-2SDG)', *National Research Council Canada* (2010). -[^6]: Saif Mohammad and Peter D. Turney, 'Crowdsourcing a Word–Emotion Association Lexicon', *Computational intelligence* 29 (2013): 436-465, doi: 10.1111/j.1467-8640.2012.00460.x -[^7]: Richard Socher, 'Recursive Deep Learning for Natural Language Processing and Computer Vision' PhD diss., (Stanford University, 2014). -[^8]: Thanks to Mounika Puligurthi, intern at the University of Texas (UT) Digital Scholarship Office (during the spring of 2019), for her help interpreting this calculation. -[^9]: There are more words assigned to the emotion *sadness* than to *joy*, both in total number of words (2,061 vs 1,552) and in unique words (349 vs 263). The word 'Mother' appears under both sadness and joy with a value of 33 points. What do you think the significance of that classification decision is? 
+--- +title: "Sentiment Analysis with 'syuzhet' using R" +slug: sentiment-analysis-syuzhet +original: analisis-de-sentimientos-r +layout: lesson +collection: lessons +date: 2021-03-23 +translation_date: 2023-04-01 +authors: +- Jennifer Isasi +translator: +- Adam Crymble +editors: +- Maria José Afanador-Llach +reviewers: +- Riva Quiroga +translation-editor: +- Rolando Rodriguez +translation-reviewer: +- Shuang Du +- Andrew Janco +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/478 +difficulty: 2 +activity: analyzing +topics: [distant-reading, r, data-visualization] +abstract: This lesson teaches you how to obtain and analyse narrative texts for patterns of sentiment and emotion. +avatar_alt: Engraving of three faces expressing different emotions +doi: 10.46430/phen0110 +--- + +{% include toc.html %} + + +# Lesson Objectives + +This lesson introduces you to the [`syuzhet`](https://perma.cc/9DNJ-ZWPW) [sentiment analysis](https://perma.cc/A92Q-PM4D) algorithm, written by [Matthew Jockers](https://perma.cc/9PF8-3GZ4) using the [R programming language](https://perma.cc/W78Z-FUAX), and applies it to a single narrative text to demonstrate its research potential. The term 'syuzhet' is Russian (сюже́т) and translates roughly as 'plot', or the order in which events in the narrative are presented to the reader, which may be different than the actual time sequence of events (the '[fabula](https://perma.cc/M7C9-XT99)'). The `syuzhet` package similarly considers sentiment analysis in a time-series-friendly manner, allowing you to explore the developing sentiment in a text across the pages. + +To make the lesson useful for scholars working with non-English texts, this tutorial uses a Spanish-language novel, *[Miau](https://perma.cc/G6V3-JCWS)* by [Benito Pérez Galdós](https://perma.cc/9P3P-2FQP) (1888) as its case study. 
This allows you to learn the steps necessary to work with everything from accented characters to thinking through the intellectual problems of applying English language algorithms to non-English texts. You do not need to know Spanish to follow the lesson (though you will if you want to read the original novel). Some steps in the following instructions may not be necessary if you are working with English-language texts, but those steps should be self-evident. + +Although the lesson is not intended for advanced R users, it is expected that you will have some knowledge of R, including an expectation that you already have [R installed](https://www.r-project.org/) and that you know how to load R packages. The author recommends downloading [RStudio](https://www.rstudio.com/) as a user-friendly environment for working in R. If you have not used R before, you may first want to try working through some of the following introductory R lessons: + +* Taylor Arnold and Lauren Tilton, '[Basic Text Processing in R](/en/lessons/basic-text-processing-in-r)', *Programming Historian* 6 (2017), https://doi.org/10.46430/phen0061 +* Taryn Dewar, '[R Basics with Tabular Data](/en/lessons/r-basics-with-tabular-data)', *Programming Historian* 5 (2016), https://doi.org/10.46430/phen0056 +* Nabeel Siddiqui, '[Data Wrangling and Management in R](/en/lessons/data-wrangling-and-management-in-r)', *Programming Historian* 6 (2017), https://doi.org/10.46430/phen0063 + +You may also be interested in other sentiment analysis lessons: + +* Zoë Wilkinson Saldaña, '[Sentiment Analysis for Exploratory Data Analysis](/en/lessons/sentiment-analysis),' *Programming Historian* 7 (2018), https://doi.org/10.46430/phen0079 +* Matthew Jockers, '[Introduction to the Syuzhet Package](https://perma.cc/9BN2-F3N3)' (2020). 
+ +At the end of the lesson you will be able to: + +* Develop appropriate research questions that apply sentiment analysis to literary or narrative texts +* Use the R programming language, RStudio, and the `syuzhet` package with the [NRC Word-Emotion Association Lexicon](https://perma.cc/A8M5-2SDG) to generate sentiment scores for words in texts of various languages +* Critically interpret the results of your sentiment analysis +* Visualise the results through a range of graphs (bar, word cloud) to aid interpretation + +This lesson was written and tested using version 4.2.x of R using a Mac and on 4.0.x using a Windows machine. + +> Generally, R works the same on Windows, Mac, and Linux operating systems. However, when working on a Windows machine with non-English texts or those containing accents or special characters, you will need to include some extra instructions to apply [UTF-8](https://perma.cc/5HY2-HHN2) character encoding to ensure special characters are properly interpreted. Where this is a necessary step, it is shown below. + +
    +Translator's Note for Educators: + +A number of steps in this tutorial require loading / running time that may exceed 15 to 30 minutes during which participants have to wait. This may affect your ability to use the tutorial in a time-limited live event such as a workshop. Note also that to use this tutorial in a workshop setting, participants will need the ability to install software on their machine. +
    + +# Background Information + +This section introduces the concepts and the software that you will use to perform a sentiment analysis of a text. It also introduces the case study document, the novel *Miau* by Benito Pérez Galdós, and the ways you can apply sentiment analysis meaningfully to a text such as *Miau*. + +## Sentiment Analysis + +Sentiment analysis, also known as opinion mining, is an umbrella term for a number of processes for automatically calculating the degree of negativity or positivity in a text. It has been used for some time in the fields of marketing and politics to better understand the public mood;[^1] however, its adoption in literary studies is more recent and as of yet no one method dominates use.[^2] Some approaches to sentiment analysis also enable you to measure the presence of a number of different emotions in a text, as will be the case for the example in this tutorial. + +What is the difference between 'emotion' and 'sentiment'? The two words are often used interchangeably in English but refer to different concepts. + +According to Antonio R. Damasio, 'emotions' are the biologically rooted, instinctive reactions of our bodies to environmental stimuli.[^3] There is no universally agreed list of basic emotions, however a common model includes six: anger (or rage), joy, disgust (or revulsion), fear, sadness, and surprise -- though for Damasio the last of those falls into a category he would describe as a '[secondary emotion](https://perma.cc/Y675-4C52)'. In the case of the automated system that you will use, the secondary emotions 'anticipation' and 'trust' are also options for analysis. + +'Sentiment', on the other hand, is both the action of and effect of feeling an emotion. 
In other words, as Óscar Pereira Zazo notes, 'when an object, a person, a situation, or a thought brings us joy, it begins a process that can lead to the feeling of being joyful or happy'.[^4] Sentiment analysis suggests that you can measure the intensity of this effect (either positive, negative, or neutral) on the manifestation of an emotion. + +This lesson distinguishes between the two terms as described above. The effect (sentiment) will be measured as it evolves across the pages of the text, while the emotions will be measured by looking at word use more generally. + +## NRC Word-Emotion Association Lexicon + +Many sentiment analysis algorithms depend upon pre-compiled lexicons or dictionaries that assign numerical sentiment scores to words or phrases based on findings from previous linguistic research. The R package `syuzhet` has been designed to allow you to choose from four of these sentiment lexicons: [Bing](https://perma.cc/G9RV-RA82), [Afinn](https://perma.cc/GZB2-J2RH), [Stanford](https://perma.cc/TK8L-44ZW), and the [NRC Word-Emotion Association Lexicon](https://perma.cc/A8M5-2SDG).[^5] This lesson uses the NRC lexicon, as it is the only one of the four that can currently be used with non-English texts. + +This lexicon, which includes positive and negative sentiment values as well as eight emotional categories, was developed by Saif M. Mohammad, a scientist at the National Research Council Canada (NRC). The dataset that forms the lexicon has been manually annotated using the [Maximum Difference Scaling](https://perma.cc/KWW4-AFJ4) technique, or MaxDiff, to determine the most negative or positive sets of words relative to other words -- a sort of ranking of sentiment intensity of words.[^6] This particular lexicon has 14,182 unigrams (words) classified as either positive or negative. It also classifies a word's connection to various emotions: anger, anticipation, disgust, fear, joy, sadness, surprise, and trust. 
Using automatic translation, which may lack linguistic nuance in unpredictable ways, it is available in more than one hundred languages. + +The license on the dataset allows free use of the NRC lexicon for research purposes. All data is available for download. + +The [NRC Word-Emotion Association Lexicon](https://perma.cc/A8M5-2SDG) website outlines the different categories and classifications in the dataset. It also provides a number of resources that can help you to better understand how the lexicon was built, including links to published research, more information on obtaining values for individual words, the organisation of the dataset, and how to extend it. + +## The `syuzhet` R Package + +The [R package](https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html) `syuzhet` was released in 2015 by Matthew Jockers; at the time of writing it is still being actively maintained (we use version 1.0.6, the November 2020 release, in this lesson). + +If you intend to use the software on non-English texts, you should be aware that the package has been developed and tested in English, and it has not been received without controversy, including from [Annie Swafford](https://perma.cc/TYT3-5DTU) who challenged some of the algorithm's assumptions about text and the use of `syuzhet` in a research setting. This included concerns about incorrectly splitting sentences involving quotation marks, and problems with using a sentiment lexicon designed for modern English on a historic text that uses the same words in slightly different ways. Assigning concrete values of measurement to literary texts, which are by their nature quite subjective, is always challenging and potentially problematic. 
A series of archived blog entries by Jockers outline [his thoughts on the method and address some of the criticisms](https://web.archive.org/web/20190708100723/https://www.matthewjockers.net/page/2/) about the degree to which sentiment can accurately be measured when sometimes even humans disagree on a passage of text's effects on the reader. + + +> Some Research Warnings: The lexicon assigns values to individual words which are used as the basis for conducting the quantitative analysis. Those values were assigned by humans working in North America and may carry English-language and North American cultural biases. Researchers must therefore take several things into account before applying this methodology in their work: +> +> - The Spanish lexicon (and other non-English versions) is a direct translation carried out via machine translation. In the author's opinion, these systems are already fairly reliable when translating between English and Spanish but less so for other languages that NRC claims to be operable with, including Basque, for example. +> - The sentiment and emotion scores of each word need to be understood in cultural and temporal context. A term that the people building the NRC lexicon labelled positive may be negative in other contexts. This type of approach is therefore inherently coarse in its ability to reflect a *true* reading of the texts as conducted by a subject specialist through close reading. +> - The author does not recommend the use of this methodology in texts that are significantly metaphorical or symbolic. +> - This particular method does not properly handle negation. For example, it will wrongly classify 'I am not happy' as positive because it looks at individual words only. 
Research by Richard Socher (2014) has attempted to improve issues of negation in sentiment analysis, and may be worth exploring for those with a genuine research need.[^7] +> Following the spirit of adaptability of *Programming Historian* lessons in other languages, the author has decided to use `syuzhet` in its original form; however, at the end of the lesson you will be introduced to some advanced functions that will help you use your own sentiment dictionary with the package. + +As this tutorial works with emotion of a Spanish text, Table 1 provides a simple translation matrix of the key emotion names for ease of reference. + + +Table 1: Emotion categories in English and Spanish + +| English | Spanish | +| -------- | ------- | +| anger | enfado | +| anticipation | anticipación | +| disgust | disgusto | +| fear | miedo | +| joy | alegría | +| sadness | tristeza | +| surprise | sorpresa | +| trust | confianza | +| negative | negativo | +| positive | positivo | + + +## A Brief Example + +Before diving into the full analysis of our text *Miau*, we offer a short example of sentiment analysis in action, using `syuzhet` together with the NRC lexicon, focusing on the outputs instead of the code. This analysis uses R and prompts you to [tokenise](https://perma.cc/243B-E9M7) the text into a list of single-word strings (unigrams) that are then analysed one at a time. Sentence-level analysis is also possible in sentiment analysis, but is not the focus of this tutorial. + +Consider the analysis of the final passage from *Miau*: + +> **Spanish Original**: Retumbó el disparo en la soledad de aquel abandonado y tenebroso lugar; Villaamil, dando terrible salto, hincó la cabeza en la movediza tierra, y rodó seco hacia el abismo, sin que el conocimiento le durase más que el tiempo necesario para poder decir: «Pues... sí...». 
+> +> **Rough English Translation**: The shot boomed out in the solitude of that abandoned and gloomy space; Villaamil, taking a terrible leap, bowed his head to the moving earth and rolled towards the abyss, his awareness lasting no longer than the time necessary to say: 'Well...yes...'. +> +> *Miau* by Benito Pérez Galdós. + +This passage will be transformed into a list of words: + +```R +example: + +> [1] "retumbó" "el" "disparo" "en" "la" "soledad" +> [7] "de" "aquel" "abandonado" "y" "tenebroso" "lugar" +> [13] "villaamil" "dando" "terrible" "salto" "hincó" "la" ... +``` + +Using the sentiment analysis function, you then calculate the eight emotions as classified by NRC, as well as the positive and negative scores of each word. The following is the result for the first few words in this short passage: + +```R +print(example_2, row.names = example) + +> anger anticipation disgust fear joy sadness surprise trust negative positive +> retumbó 0 0 0 0 0 0 0 0 0 0 +> el 0 0 0 0 0 0 0 0 0 0 +> disparo 3 0 0 2 0 2 1 0 3 0 +> en 0 0 0 0 0 0 0 0 0 0 +> la 0 0 0 0 0 0 0 0 0 0 +> soledad 0 0 0 2 0 2 0 0 2 0 +> de 0 0 0 0 0 0 0 0 0 0 +> aquel 0 0 0 0 0 0 0 0 0 0 +> abandonado 2 0 0 1 0 2 0 0 3 0 +> y 0 0 0 0 0 0 0 0 0 0 +> tenebroso 0 0 0 0 0 0 0 0 0 0 +> lugar 0 0 0 0 0 0 0 0 0 0 +> villaamil 0 0 0 0 0 0 0 0 0 0 +> dando 0 0 0 0 0 0 0 0 0 1 +> terrible 2 1 2 2 0 2 0 0 2 0 +> salto 0 0 0 0 0 0 0 0 0 0 +> hincó 0 0 0 0 0 0 0 0 0 0 +> la 0 0 0 0 0 0 0 0 0 0 +... +``` + +
    +Translator's Note: +R will not translate these into English for you, but to make the tutorial easier to follow for English speakers, the same output would look like the following if the passage was in English (notice that when translating word-by-word the results are slightly different than when translating whole passages, as above): +
    + +```R +print(example_2, row.names = example) + +> anger anticipation disgust fear joy sadness surprise trust negative positive +> boomed 0 0 0 0 0 0 0 0 0 0 +> the 0 0 0 0 0 0 0 0 0 0 +> shot 3 0 0 2 0 2 1 0 3 0 +> in 0 0 0 0 0 0 0 0 0 0 +> the 0 0 0 0 0 0 0 0 0 0 +> solitude 0 0 0 2 0 2 0 0 2 0 +> of 0 0 0 0 0 0 0 0 0 0 +> that 0 0 0 0 0 0 0 0 0 0 +> abandoned 2 0 0 1 0 2 0 0 3 0 +> and 0 0 0 0 0 0 0 0 0 0 +> gloomy 0 0 0 0 0 0 0 0 0 0 +> place 0 0 0 0 0 0 0 0 0 0 +> villaamil 0 0 0 0 0 0 0 0 0 0 +> taking 0 0 0 0 0 0 0 0 0 1 +> terrible 2 1 2 2 0 2 0 0 2 0 +> leap 0 0 0 0 0 0 0 0 0 0 +> bowed 0 0 0 0 0 0 0 0 0 0 +> his 0 0 0 0 0 0 0 0 0 0 +... +``` + +The results are returned in a [data frame](https://perma.cc/ER4M-WRRC). Using this scoring system, every word in our human languages has a default value of 0 indicating no connection to the corresponding emotion. Any words not in the NRC lexicon will be treated by the code as if they have values of 0 for all categories. Any word with a score greater than 0 indicates that it is both present in the NRC lexicon, and that it has been assigned a value by the researchers responsible for that lexicon indicating the strength of its connection to one of the emotional categories. + +In this example we can see that the words 'disparo' (shot), 'soledad' (solitude), 'abandonado' (abandoned), and 'terrible' (terrible) have a negative score associated with them (second-to-last column), while 'dando' (taking) is judged as a positive word (last column). + +We are also able to see which emotions each word is connected to: 'disparo' (shot) is associated with *anger* (3), *fear* (2), *sadness* (2), and *surprise* (1). Higher numbers mean greater strength of the connection to that emotion. + +The possibilities of exploring, analysing, and visualising these results depend on your programming skills, but also your research needs. 
To help you reach your potential with sentiment analysis, this lesson shows you how to analyse data and build understanding of the results through various visualisations. + +## Appropriate Research Questions + +As already stated, in this lesson, you will analyse the Spanish novel *Miau* by [Benito Pérez Galdós](https://perma.cc/9P3P-2FQP), published in 1888. Known for his Spanish realist novels, this particular Pérez Galdós story takes place in Madrid at the end of the nineteenth century and satirises the government administration of the day. In a kind of tragic comedy, we witness the final days of Ramón Villaamil after becoming unemployed, while his family is trying to stretch their meagre budget while keeping up the pretence of wealthy living. Villaamil's spiral of misfortune and his inability to find a new job ends in tragedy. + +From a research standpoint, the question is: Can we observe the emotional downward spiral of this plot through an automatic extraction of sentiment in the text? Does a human reader's interpretation of the negative experiences of Villaamil match the results of the algorithm? And if so, what words within the novel are used most to signal the emotional trajectory of the story? + + +# Obtaining Sentiment and Emotion Scores + +The process of conducting the sentiment analysis is a four stage affair. First, code must be installed and loaded into the R environment of your choice. Then, you must load and pre-process the text you want to analyse. Then you conduct your analysis. Finally, you turn your attention to interpreting the results. + +## Install and Load Relevant R Packages + +Before processing the text, you must first install and load the correct R code packages. In this case, that includes [`syuzhet`](https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html). 
You will also be visualising the results, which will require a number of other R packages: [`RColorBrewer`](https://cran.r-project.org/web/packages/RColorBrewer/index.html), [`wordcloud`](https://perma.cc/GM67-HBH3), [`tm`](https://perma.cc/T2JG-LEBJ) and [`NLP`](https://perma.cc/NS79-H5DH). + +To install and load these packages, copy and execute the sample code below in your chosen R coding environment. The first few lines will install the packages (only needed if you haven't already got the packages installed). The second set of lines will load them so that you can use them in your programme. The installation of these packages may take a few minutes. + +```R +# Install the Packages +install.packages("syuzhet") +install.packages("RColorBrewer") +install.packages("wordcloud") +install.packages("tm") + +# Load the Packages +library(syuzhet) +library(RColorBrewer) +library(wordcloud) +library(tm) +``` + +## Load and Prepare the Text + +Next, download a machine readable copy of the novel: [*Miau*](/assets/analisis-de-sentimientos-r/galdos_miau.txt) and make sure to save it as a .txt file. When you open the file you will see that the novel is in [plain text](https://perma.cc/Z5WH-V9SW) format, which is essential for this particular analysis using R. + +With the text at hand, you first need to load it into R as one long string so that you can work with it programmatically. Make sure to replace `FILEPATH` with the location of the novel on your own computer (don't just type 'FILEPATH'). This loading process is slightly different on Mac/Linux and Windows machines: + +### On Mac and Linux + +You can [find the FILEPATH](https://perma.cc/ZXZ8-FZHG) using your preferred method. 
The final format on my computer is `/Users/Isasi/Desktop/miau.txt` + +On a Mac/Linux machine, use the function `get_text_as_string`, which is part of the `syuzhet` package: + +```R +text_string <- get_text_as_string("FILEPATH") +``` + +### On Windows + +You can [find the FILEPATH](https://perma.cc/N9R4-HEJY) using your preferred method. The final format on my computer is `C:\\Users\\Isasi\\Desktop\\miau.txt` + +The Windows operating system cannot directly read characters with tildes, accents, or from extended alphabet sets, all of which are commonly used in languages such as Spanish, French, and Portuguese. Therefore we must first alert the software that our novel uses the [UTF-8](https://perma.cc/5HY2-HHN2) set of characters (which includes accents and many other non-English characters). We do this using the `scan` function. + +> Note that when typing your filepath, you may need to escape the backslashes (`\`) in the filepath. To do this, just add a second backslash each time it appears in the path. (E.g. "`C:\\...`") + +```R +text_string <- scan(file = "FILEPATH", fileEncoding = "UTF-8", what = character(), sep = "\n", allowEscapes = T) +``` +--- + +Now that the data has loaded, you have to format it in the way the sentiment analysis algorithm expects to receive it. In this particular case, that is as a [list](https://perma.cc/LPV9-XGX8) containing either single words or sentences (here you will focus on individual words only). + +This means you need an intermediate step between loading the text and extracting the sentiment values. To meet this need, we will divide the character string into a list of words, sometimes also referred to as [unigrams](https://perma.cc/FX4C-ZLYB) or [tokens](https://perma.cc/V6UY-KKVK). + +To do this you can use the package's built-in `get_tokens()` function to generate a new data object containing each individual word as a list. This function also removes spaces and punctuation from the original text. 
This approach to tokenisation uses [regular expressions](https://perma.cc/W7YD-K3R7) and is not always appropriate in all use cases. It will, for example, split hyphenated words into two. Depending on your text, you should consider the implications of your chosen method of tokenisation as you can use any method you like as long as the output is in the same format as in the example below. + +```R +text_words <- get_tokens(text_string) +head(text_words) + +> [1] "miau" "por" "b" "pérez" "galdós" "14" +``` + +Now you can use the `length()` function to count how many words are in the original text: + +```R +length(text_words) + +> [1] 97254 +``` + +If you want to analyse the text by sentence, use the `get_sentences()` function and follow the same process except for creating the word cloud below: + +```R +sentence_vector <- get_sentences(text_string) +length(sentence_vector) +> [1] 6022 +``` + + +## Extracting Data with the NRC Sentiment Lexicon + +Now you can use the `get_nrc_sentiment` function to obtain the sentiment scores for each word in the novel. The default vocabulary for the software is English. Since this text is in Spanish, you will use the `lang` argument to set the vocabulary to Spanish. This would not be necessary if working on an English text. Then you will create a new data object to store the extracted data so that you can work with it further. This `get_nrc_sentiment` function searches for the presence of the eight emotions and two sentiments against each word in your list, and assigns each a number greater than 0 if the word is found within the NRC's lexicon. Depending on the speed of your computer and the nature of your text, this process may take between 15 and 30 minutes. + +```R +sentiment_scores <- get_nrc_sentiment(text_words, lang="spanish") +``` +You can also use this package with [a range of other languages](https://perma.cc/9BN2-F3N3), though the 2020 release only works on languages with Latin-based alphabets. 
Other languages that can be substituted for `spanish` in the above line of code are: `basque`, `catalan`, `danish`, `dutch`, `english`, `esperanto`, `finnish`, `french`, `german`, `irish`, `italian`, `latin`, `portuguese`, `romanian`, `swedish`, and `welsh`. We can hope that the functionality will improve in future to include more languages. + +Some users reported getting a warning message when the code finished running. At the time of writing this is a warning that the `syuzhet` codebase may need to be updated in future, but should not affect your ability to use it at present. The warning was that "spread_() was deprecated in tidyr 1.2.0. Please use spread() instead. The deprecated feature was likely used in the syuzhet package. Please report the issue to the authors." In this case, only Matthew Jockers can fix the error, as it is an issue with the code he created, not with your instructions to run it. + +When the process finishes, you may want to verify the contents of the new data object. To avoid printing thousands of lines of text, you can use the `head()` function to show only the first six unigrams. If you are following the example, you should see the following (which is lacking in context at this point). + +```R +head(sentiment_scores) + +> anger anticipation disgust fear joy sadness surprise trust negative positive +> 1 0 0 0 0 0 0 0 0 0 0 +> 2 0 0 0 0 0 0 0 0 0 0 +> 3 0 0 0 0 0 0 0 0 0 0 +> 4 0 0 0 0 0 0 0 0 0 0 +> 5 0 0 0 0 0 0 0 0 0 0 +> 6 0 0 0 0 0 0 0 0 0 0 +``` + +### Summary of the Text + +More interesting is a summary of the values associated with each of the eight emotions and two sentiments, which can be displayed using the `summary()` function. This can be very useful when comparing various texts, and can allow you to see different measures, such as the average relative value of each of the emotions and the two sentiments. 
For example, we can see that the novel *Miau* on average ([mean](https://perma.cc/5NKH-2TYV)), uses more positive (0.05153) language than negative (0.04658), according to the algorithm. However, it seems that terms associated with sadness (0.02564) are also more prevalent than those associated with joy (0.01929). + +This summary output also shows a number of other calculations, many of which have a value of 0, including the [median](https://perma.cc/KB36-B855). Words that are not found in the sentiment lexicon (NRC) will automatically be treated as if they have a value of 0. Because there are a lot of categories and the story is quite complex, it is not surprising that no one emotion or sentiment has distinctively high statistical values. This makes the minimum, maximum, and mean the most useful measures from this summary output. + +```R +summary(sentiment_scores) + +> anger anticipation disgust fear +> Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000 +> 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 +> Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000 +> Mean :0.01596 Mean :0.02114 Mean :0.01263 Mean :0.02243 +> 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 +> Max. :5.00000 Max. :3.00000 Max. :6.00000 Max. :5.00000 +> joy sadness surprise trust +> Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :0.00000 +> 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 +> Median :0.00000 Median :0.00000 Median :0.00000 Median :0.00000 +> Mean :0.01929 Mean :0.02564 Mean :0.01035 Mean :0.03004 +> 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 +> Max. :5.00000 Max. :7.00000 Max. :2.00000 Max. :3.00000 +> negative positive +> Min. :0.00000 Min. :0.00000 +> 1st Qu.:0.00000 1st Qu.:0.00000 +> Median :0.00000 Median :0.00000 +> Mean :0.04658 Mean :0.05153 +> 3rd Qu.:0.00000 3rd Qu.:0.00000 +> Max. :7.00000 Max. 
:5.00000 +``` + +# Interpreting the Results + +You now have the quantitative results of your sentiment analysis of a text. Now, what can you do with these numbers? This section introduces three different visualisations of the data: bar charts, word counts, and word clouds, which offer quick but different ways of making sense of the outputs and telling a story or forming an argument about what you've discovered. + +## Bar Chart by Emotion + +To quickly get a sense of which emotions have a major presence in the text, a bar chart is both a simple and effective format for displaying your data (Figure 1). The built-in [`barplot()`](https://perma.cc/5DXU-CYS9) function can be paired with the summary data of each of the emotions: *anger*, *anticipation*, *disgust*, *fear*, *joy*, *sadness*, *surprise*, and *trust*. These are stored in columns 1 to 8 of our data table. This approach of displaying the data uses the `prop.table()` function with the results of each of the emotion words to present the results.[^8] + +```R +barplot( + colSums(prop.table(sentiment_scores[, 1:8])), + space = 0.2, + horiz = FALSE, + las = 1, + cex.names = 0.7, + col = brewer.pal(n = 8, name = "Set3"), + main = "'Miau' by Benito Pérez Galdós, 1907 edition", + sub = "Analysis by Dr Jennifer Isasi", + xlab="emotions", ylab = NULL) +``` + +The rest of the parameters that you can see in the code are optional and have been added to help you learn how to customise the graph outputs. They include indicating the space between the bars (`space = 0.2`), that the chart should include vertical not horizontal bars (`horiz=FALSE`), and that the axis labels should always be drawn horizontally (`las=1`). We also reduce the font size of the labels (`cex.names = 0.7`) to make sure they fit nicely on the screen. Thanks to the [`RColorBrewer`](https://perma.cc/BHK9-AY7S) package that we installed and loaded at the beginning of the lesson, we can automatically colour the columns. 
In this case we've used the `brewer.pal` colour palette from `Set3`, and specified we need 8 colours (`n=8`) – one colour per column. You can learn more about `RColorBrewer` and its options on [the documentation page for that package](https://perma.cc/4EHL-P8E9). Finally, we add a title and subtitle to the graph using the `main` and `sub` parameters, along with the word `emotions` on the X axis. We have not added a label to the Y axis, but you could do so if you wished by following the model above. + +{% include figure.html filename="tr-en-analisis-de-sentimientos-r-1.png" alt="Bar chart showing the calculated scores of six emotions and two sub-emotions measured in the novel ‘Miau’ (1907) by Pérez Galdós. The emotions are anger, anticipation, disgust, fear, joy, sadness, surprise, and trust. The ‘trust’ bar is the tallest, followed by ‘sadness’ and ‘fear’, while ‘disgust’ and ‘surprise’ are the shortest. This is included because it shows the relative outputs of the sentiment analysis algorithm across these eight emotions." caption="Figure 1: Bar chart showing the calculated scores of six emotions and two sub-emotions measured in the novel 'Miau' by Pérez Galdós." %} + +If you are not interested in modifying these parameters, you could create a bar chart with default styling using the following code: + +```R +barplot(colSums(prop.table(sentiment_scores[, 1:8]))) +``` + +> Make sure you have enough space in the display window for the graph to draw properly, including space for the labels. + +This information already indicates to us that the *sadness* and *fear* emotions are more prevalent than those of *disgust* or *surprise*. But what words does Galdós use to express *fear*? And how often does each emotionally charged word appear in the novel? + +## Counting Words by Emotion + +One of the measures you can calculate using sentiment analysis is the frequency of words appearing in the text and how those words relate to each emotional category. 
To start with, you need to create a data object with all of the words that have a value greater than 0 -- in this case you will start with those corresponding to the *sadness* column. In order to select only that column, use the dollar symbol `$` after the name of your `sentiment_scores` variable to specify the name of the column you want to work with: *sadness*. + + +```R +sad_words <- text_words[sentiment_scores$sadness> 0] +``` + +The contents of `sad_words` does not tell you much on its own, since it only offers you the list of relevant words without any further context. To also obtain the number of appearances of each 'sadness' word, you can generate a table. To get a quick look of some of the top entries, use the `unlist` and `table` functions along with the `decreasing` argument to display the matches in descending order (if you want ascending order, change TRUE to FALSE); you can create a new table object to print the first twelve words in the list, along with their frequency using the following code (see Table 2 for translations of the Spanish words): + +```R +sad_word_order <- sort(table(unlist(sad_words)), decreasing = TRUE) +head(sad_word_order, n = 12) + +> muy nada pobre tarde +> 271 156 64 58 +> mal caso malo salir +> 57 50 39 35 +> madre insignificante ay culpa +> 33 29 24 22 +``` + +Table 2: English translations of the Spanish words in the preceding code output block + +| Spanish | English | +| ------- | ------- | +| muy | very | +| nada | nothing | +| pobre | poor | +| tarde | late | +| mal | bad | +| caso | case | +| malo | bad | +| salir | to leave | +| madre | mother | +| insignificante | insignificant | +| ay | ow! 
| +| culpa | fault | + + +If you want to know how many unique words are connected to sadness, you can use the `length` function on the newly created `sad_word_order` variable: + +```R +length(sad_word_order) + +> [1] 349 +``` + +You can repeat the same operation with the rest of the emotion categories, or those that you are interested in, as well as those with positive or negative sentiment scores. To make sure you understand how to adapt the code, try to obtain the results for the emotion 'joy' and compare them with 'sadness'.[^9] + +Depending on the type of analysis that you want to conduct, this may be an efficient approach. For the purposes of this introductory lesson, you are next going to generate a word cloud to help visualise the terms associated with each emotional category (for demonstration purposes, you will use four). + + +## An Emotional Word Cloud + +In order to create a word cloud of terms that correspond with each emotion in *Miau*, you are going to first collect all words with an emotion score greater than 0. Similarly to the previous example, you use the `$` symbol to specify which column of data (which emotion) you are interested in, indicating that you want entries with a value greater than 0. + +If working on a machine running Windows, you will have to indicate to the programme if your text contains accented characters using the following approach: + +### On Mac and Linux + +```R +cloud_emotions_data <- c( + paste(text_words[sentiment_scores$sadness> 0], collapse = " "), + paste(text_words[sentiment_scores$joy > 0], collapse = " "), + paste(text_words[sentiment_scores$anger > 0], collapse = " "), + paste(text_words[sentiment_scores$fear > 0], collapse = " ")) +``` + +### On Windows + +Windows needs an additional step to indicate the text is in UTF-8 format, which is done using the `iconv` function. 
+ +```R +cloud_emotions_data <- c( + paste(text_words[sentiment_scores$sadness> 0], collapse = " "), + paste(text_words[sentiment_scores$joy > 0], collapse = " "), + paste(text_words[sentiment_scores$anger > 0], collapse = " "), + paste(text_words[sentiment_scores$fear > 0], collapse = " ")) + +cloud_emotions_data <- iconv(cloud_emotions_data, "latin1", "UTF-8") +``` + +Once you have collected the data for the four target emotions, you can organise it into four separate `documents` to use as the basis for creating each of your four word clouds: + +```R +cloud_corpus <- Corpus(VectorSource(cloud_emotions_data)) +``` + +Next, you transform the corpus into a term-document matrix using the `TermDocumentMatrix()` function. Then you specify that you want the data organised as a matrix using the `as.matrix()` function. + +To see the first few entries of this output, use the `head` function: + +```R +cloud_tdm <- TermDocumentMatrix(cloud_corpus) +cloud_tdm <- as.matrix(cloud_tdm) +head(cloud_tdm) + +> Docs +> Terms 1 2 3 4 +> abandonado 4 0 4 0 +> abandonar 1 0 0 0 +> abandonará 2 0 0 0 +> abandonaré 1 0 0 0 +> abandonarías 1 0 0 0 +> abandono 3 0 3 0 + +``` + +Now, rename the numbered columns with the relevant emotion words so that the output is more human-readable. Again, you can see the state of your dataset with the `head` function: + +```R +colnames(cloud_tdm) <- c('sadness', 'happiness', 'anger', 'joy') +head(cloud_tdm) + +> Docs +> Terms sadness happiness anger trust +> abandonado 4 0 4 4 +> abandonar 1 0 0 1 +> abandonará 2 0 0 2 +> abandonaré 1 0 0 1 +> abandonarías 1 0 0 1 +> abandono 3 0 3 3 +``` + +Finally, you can visualise these results as a word cloud. The font size of a word in a word cloud is linked to the frequency of its appearance in the document. We can also control a number of other aspects of the word cloud's presentation. 
+ +To start, use the `set.seed()` function to ensure that while following along your outputs will look the same as in the example (if you don't do this your output will have a randomised pattern and may not match the screenshots herein - which may not be important for your own research results but is helpful when following along). + +To generate the cloud itself, use the [comparison.cloud](https://perma.cc/6QRY-5KBG) function from the R `wordcloud` package. In this example, you will indicate that the object `cloud_tdm` will have a non-random word order. You will also specify the colour scheme of each group of words, the title size, and general scale of the visualisation. To make the cloud readable, you will also specify a maximum number of terms. These parameters are all adjustable. + +```R +set.seed(757) # this can be set to any integer +comparison.cloud(cloud_tdm, random.order = FALSE, + colors = c("green", "red", "orange", "blue"), + title.size = 1, max.words = 50, scale = c(2.5, 1), rot.per = 0.4) +``` + +You should get an image similar to Figure 2 although with the location of the words altered since it is generated according to the size of the canvas. + +{% include figure.html filename="tr-en-analisis-de-sentimientos-r-2.png" alt="Word Cloud of most frequent words corresponding to sadness, happiness, anger, and joy in the novel ‘Miau’ by Pérez Galdós. The words are colour-coded to show that they correspond with one of the four emotions, and use a cartesian coordinate system so that all words most closely associated with happiness are in the top left quadrant, sadness in the top right, and so on. Words that are most prevalent in the text appear closest to the centre of the graph. The word ‘muy’ (Spanish for ‘very’) is the largest word, and is associated with sadness. This is included because it shows which words are prevalent, and which emotions they are most closely associated with according to the sentiment analysis algorithm." 
caption="Figure 2: Word Cloud of most frequent words corresponding to sadness, happiness, anger, and joy in the novel 'Miau' by Pérez Galdós." %} + +What does the word cloud suggest to you? Surely the connection of 'very' (muy) to the sadness emotion and of 'money' (dinero) to the anger emotion needs further consideration. These less obvious results are exactly what many scholars warn about when thinking about sentiment analysis, and demonstrate why a researcher must always ask if the outcomes of the analysis make sense before trying to draw any research conclusions from them. As noted, the sentiment lexicon used in this tutorial has been automatically translated from English, and is thus not perfect when used on Spanish-language text. + +## Visualising Emotion and Sentiment Across the Progression of a Text + +To complement the isolated readings of emotions as above, you can also study the fluctuation of positive and negative sentiment across the text (Figure 3). R provides a way to both normalise and visualise this time-series sentiment analysis data. Since the sentiment analysis algorithm assigns both positive and negative sentiment scores, you need to generate data between a range of -1 (most negative moments) and 1 (most positive moments); 0 is considered neutral. To calculate these scores, you multiply the values in the `negative` column of the original `sentiment_scores` data table by -1 and then add the result to the values in the `positive` column. + + +```R +sentiment_valence <- (sentiment_scores$negative *-1) + sentiment_scores$positive +``` + +Finally, you can generate a graph with the `simple_plot()` function, which is built into the `syuzhet` package, and which offers you a choice of two different graphs; the first presents the various measurements calculated by the algorithm, and the second is a normalisation of those measures. 
The horizontal axis (X axis) presents the text in 100 normalised fragments and the vertical axis (Y axis) shows the strength of the sentiment in the text. Depending on the computing power of your machine, the graph may take 20 to 30 minutes to finish rendering. + +```R +simple_plot(sentiment_valence) +``` + +> Make sure your graph display window is sized large enough to actually draw the graph. If it isn't you will see the error message: `Error in plot.new() : figure margins too large.` + +{% include figure.html filename="tr-en-analisis-de-sentimientos-r-3.png" alt="A pair of line charts that show the rough emotional intensity of positive and negative sentiment across the whole novel. The graphs use a line graph with a solid curving line moving left-to-right to represent the beginning, middle, and end. In this particular novel, a simplified chart shows that the sentiment rises through the first quarter of the story, before diving in the middle and staying low until the end, representing quite a depressing story. A less simplified version shows that the sentiment picks up a few times later in the novel, but dips well into negative sentiment a number of times. This is included because it shows the emotional intensity of the novel over time." caption="Figure 3: Evolution of the use of positive and negative sentiment through the novel 'Miau' by Pérez Galdós" %} + +Based on Figure 3, you might conclude that the novel *Miau* begins with fairly neutral language, transitions into moments of happiness early on, and moves into some quite negative description in the remaining pages, ending on a negative note, as indicated by the sample sentence we drew upon earlier in the lesson in which Villaamil dies. Anyone who has read the novel will know well the protagonist's despair, so in this case the analysis matches a traditional reading of the text, which answers our research question about whether or not the automated sentiment analysis reflects a close reading of the text. 
+ + +# Save Your Data + +If you want to save the data so that you can come back to it later, you can archive it in comma-separated values ([CSV](https://perma.cc/64FY-NTSU)) format, using the function `write.csv()`. This will save your main data table, `sentiment_scores`, which contains the results of the eight emotions and two sentiments we generated, and puts that into a CSV file. You can also add the keyword associated with each row in the left-most column to act as helpful labels. + + +```R +write.csv(sentiment_scores, file = "analysis_sent_miau.csv", row.names = text_words) +``` + +Now you have all of the tools and knowledge you need to start to analyse your own texts and compare them with each other. + +# Loading your own Sentiment Lexicon + +While the above introduction provides you with many tools for exploring sentiment analysis, this tutorial has not presented an exhaustive list of possibilities. + +You may be working on a project in which you have already created a sentiment dictionary that you would like to use. Or perhaps you need to be able to customise a vocabulary and its corresponding sentiment scores to apply to a particular cultural or temporal context related to your research. Maybe you're looking to improve upon the automatically translated results of the NRC lexicon used here. In each of those cases, as of mid 2022, you can also load your own lexicon dataset into the software using the `custom` method to repeat some of the calculations and visualisations used in this lesson. + +To load your own sentiment lexicon, you first have to create or modify a dataframe containing at minimum a column of words and a column containing the corresponding scores for those words, which the author recommends saving in a CSV file format. 
+ +Try this example: + +```R +|word|value| +|---|---| +|amor|1| +|cólera|-1| +|alfombra|0| +|catástrofe|-2| +``` + +Next, to load your saved data from a CSV file, use the `read.csv` function, which will create a new dataset that you can access in R just as you have in the above examples (change 'FILEPATH' to the full location of your CSV file): + +```R +personalised_vocabulary <- read.csv("FILEPATH") +method <- "custom" +sentiments_sentences <- get_sentiment(sentences_vector, method = method, lexicon = personalised_vocabulary) +``` + +
    +Warning: If you get an error message 'incomplete final line found by readTableHeader', this indicates that your CSV file has not formatted properly and lacks an 'end of line' character at the end of the file. The easiest way to correct this is to open your CSV file in a text editor (not MS Word), scroll to the end of the file, press return, and re-save the file. A fuller explanation of this error is available on Stack Overflow. +
    + +If you want to visualise sentiment across the progression of a text, you can use the `plot` function, which uses the same graphing parameters that you've already learned: + +```R +plot(sentiments_sentences, + type = "l", + main = "'Miau' by Benito Pérez Galdós, 1907 edition", + sub = "Analysis by Dr Jennifer Isasi", + xlab="emotions", ylab = " " + ) +``` + +Keep in mind that this form of customised analysis is limited, and that you may not be able to perform all of the same operations that we introduced above. For example, following the model example with your own dictionary, as you would not have information about emotions you would not be able to make a word cloud in the same way. + + +# Works Cited + +* Arnold, Taylor, and Lauren Tilton. 'Basic Text Processing in R', *Programming Historian* 6 (2017), https://doi.org/10.46430/phen0061 +* Damasio, Antonio R. *El error de Descartes: La razón de las emociones* (Andres Bello, 1999). +* Dewar, Taryn. 'R Basics with Tabular Data', *Programming Historian* 5 (2016), https://doi.org/10.46430/phen0056. +* Gottschalk, Louis, and Goldine Gleser. *The Measurement of Psychological States through the Content Analysis of Verbal Behaviour* (University of California, 1969). +* Heuser, Ryan, Franco Moretti, Erik Steiner. 'The Emotions of London' *Stanford Literary Lab*, Pamphlet 13 (2016) 1-10. +* Hu, Minqing, and Bing Liu, 'Mining and Summarizing Customer Reviews.', *Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery & Data Mining* (KDD-2004), 2004. +* Jockers, Matthew. 'Introduction to the Syuzhet Package' *CRAN* (2020), [https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html](https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html +). +* Jockers, Matthew. 'Some thoughts on Annie's thoughts...about Syuzhet' *Matthew L. 
Jockers* (2015), [https://www.matthewjockers.net/page/2/](https://web.archive.org/web/20190708100723/https://www.matthewjockers.net/page/2/). +* Leemans, Inger, Janneke M. van der Zwaan, Isa Maks, Erika Kujpers, Kristine Steenberge. 'Mining Embodied Emotions: A Comparative Analysis of Sentiment and Emotion in Dutch Texts, 1600-1800' *Digital Humanities Quarterly* 11 (2017). +* Liu, Bing. *Sentiment Analysis and Opinion Mining* (Morgan & Claypool, 2012). +* Meder, Theo, Dong Nguyen, Rilana Gravel. ‘The Apocalypse on Twitter’ *Digital Scholarship in the Humanities* 31 (2016), 398-410. +* Mohammad, Saif. 'NRC Word-Emotion Association Lexicon', *National Research Council Canada* (2010), [https://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm](https://perma.cc/A8M5-2SDG). +* Mohammad, Saif, and Peter D. Turney. 'Crowdsourcing a Word–Emotion Association Lexicon' *Computational Intelligence* 29 (2013): 436-465, doi: 10.1111/j.1467-8640.2012.00460.x. +* Nguyen, Thein Hai, Kiyoaki Shirai, Julien Velcin. 'Sentiment Analysis on Social Media for Stock Movement Prediction' *Expert Systems with Applications* 42 (2015), 9603-9611. +* Nielsen, Finn Årup. 'AFINN Sentiment Lexicon' (2009-2011). +* Pereira Zazo, Óscar. *El analisis de la comunicación en español* (Kendal Hunt, 2015). +* Pérez Galdós, Benito. *Miau* (La Guirnalda, 1888). +* Pérez Galdós, Benito. *Miau* (Sucesores de Hernando, 1907). +* Rodríguez Aldape, Fernando Manuel. *Cuantificación del Interés de un usuario en un tema mediante minería de texto y análisis de sentimiento.* (MA Thesis, Universidad Autónoma de Nuevo León, 2013). +* Schmidt, Thomas, Manuel Burghardt, Christian Wolff. 'Towards Multimodal Sentiment Analysis of Historic Plays: A Case Study with Text and Audio for Lessing's Emilia Galotti' *4th Conference of the Association of Digital Humanities in the Nordic Countries* (2019). +* Siddiqui, Nabeel. 'Data Wrangling and Management in R', *Programming Historian* 6 (2017), https://doi.org/10.46430/phen0063. 
+* Sprugnoli, Rachele, Sara Tonelli, Alessandro Marchetti, Giovanni Moretti. 'Towards Sentiment Analysis for Historical Texts' *Digital Scholarship in the Humanities* 31 (2016): 762-772. +* Stone, Philip, Dexter Dunphy, Marshall Smith. ‘The General Inquirer: A Computer Approach to Content Analysis’ (M.I.T. Press, 1966). +* Swafford, Annie. 'Problems with the Syuzhet Package' *Anglophile in Academia* (2015), [https://annieswafford.wordpress.com/2015/03/02/syuzhet/](https://perma.cc/TYT3-5DTU). +* Wilkinson Saldaña, Zoë. 'Sentiment Analysis for Exploratory Data Analysis,' *Programming Historian* 7 (2018), https://doi.org/10.46430/phen0079 + + + +# Notes + +[^1]: For example, see: Louis Gottschalk, Goldine Gleser (1969) *The Measurement of Psychological States through the Content Analysis of Verbal Behaviour* (University of California); Philip Stone, Dexter Dunphy, Marshall Smith (1966) ‘The General Inquirer: A Computer Approach to Content Analysis’ (M.I.T. Press); Bing Liu, (2012) *Sentiment Analysis and Opinion Mining* (Morgan & Claypool); Thein Hai Nguyen, Kiyoaki Shirai, Julien Velcin (2015). ‘Sentiment Analysis on Social Media for Stock Movement Prediction’ *Expert Systems with Applications* 42: 9603-9611; Theo Meder, Dong Nguyen, Rilana Gravel (2016). ‘The Apocalypse on Twitter’ *Digital Scholarship in the Humanities* 31 (2): 398-410. +[^2]: For some examples in English, see: Inger Leemans, Janneke M. van der Zwaan, Isa Maks, Erika Kujpers, Kristine Steenberge (2017). 'Mining Embodied Emotions: A Comparative Analysis of Sentiment and Emotion in Dutch Texts, 1600-1800' *Digital Humanities Quarterly* 11 (4); Rachele Sprugnoli, Sara Tonelli, Alessandro Marchetti, Giovanni Moretti (2016). 'Towards Sentiment Analysis for Historical Texts' *Digital Scholarship in the Humanities* 31 (4): 762-772; Thomas Schmidt, Manuel Burghardt, Christian Wolff (2019). 
'Towards Multimodal Sentiment Analysis of Historic Plays: A Case Study with Text and Audio for Lessing's Emilia Galotti' *4th Conference of the Association of Digital Humanities in the Nordic Countries*; Ryan Heuser, Franco Moretti, Erik Steiner (2016). 'The Emotions of London' *Stanford Literary Lab*, Pamphlet 13: 1-10. +[^3]: Antonio R. Damasio, *El Error de Descartes: La razón de las emociones*. (Barcelona: Andres Bello, 1999). +[^4]: Óscar Pereira Zazo, *El analisis de la comunicación en español* (Iowa: Kendal Hunt, 2015), 32. +[^5]: 'Bing': Minqing Hu and Bing Liu, 'Mining and summarizing customer reviews.', *Proceedings of the ACM SIGKDD International Conference on Knowledge Discovery & Data Mining* (KDD-2004), 2004; 'Afinn': Finn Årup Nielsen, 'AFINN Sentiment Lexicon' (2009-2011); 'NRC': Saif Mohammad, '[NRC Word-Emotion Association Lexicon](https://perma.cc/A8M5-2SDG)', *National Research Council Canada* (2010). +[^6]: Saif Mohammad and Peter D. Turney, 'Crowdsourcing a Word–Emotion Association Lexicon', *Computational intelligence* 29 (2013): 436-465, doi: 10.1111/j.1467-8640.2012.00460.x +[^7]: Richard Socher, 'Recursive Deep Learning for Natural Language Processing and Computer Vision' PhD diss., (Stanford University, 2014). +[^8]: Thanks to Mounika Puligurthi, intern at the University of Texas (UT) Digital Scholarship Office (during the spring of 2019), for her help interpreting this calculation. +[^9]: There are more words assigned to the emotion *sadness* than to *joy*, both in total number of words (2,061 vs 1,552) and in unique words (349 vs 263). The word 'Mother' appears under both sadness and joy with a value of 33 points. What do you think the significance of that classification decision is? 
diff --git a/en/lessons/sentiment-analysis.md b/en/lessons/sentiment-analysis.md index eded4c83fd..10a4565d61 100755 --- a/en/lessons/sentiment-analysis.md +++ b/en/lessons/sentiment-analysis.md @@ -17,7 +17,7 @@ review-ticket: https://github.com/programminghistorian/ph-submissions/issues/108 activity: analyzing topics: [distant-reading] abstract: "In this lesson you will learn to conduct 'sentiment analysis' on texts and to interpret the results. This is a form of exploratory data analysis based on natural language processing. You will learn to install all appropriate software and to build a reusable program that can be applied to your own texts." -redirect_from: /lessons/sentiment-analysis +redirect_from: /lessons/sentiment-analysis/ avatar_alt: A laughing man and a grouchy man doi: 10.46430/phen0079 --- @@ -26,10 +26,10 @@ doi: 10.46430/phen0079 # Lesson Goals -This lesson uses [sentiment analysis](https://en.wikipedia.org/wiki/Sentiment_analysis) as the basis for an [exploratory data analysis](https://en.wikipedia.org/wiki/Exploratory_data_analysis) of a large textual corpus. It is appropriate for readers with some basic prior experience programming with [Python](https://www.python.org/). If you have no experience with Python or computer programming, the author recommends working through the first few lessons in the [Introduction to Python series](/lessons/introduction-and-installation). By the end of this lesson, you will be able to: +This lesson uses [sentiment analysis](https://en.wikipedia.org/wiki/Sentiment_analysis) as the basis for an [exploratory data analysis](https://en.wikipedia.org/wiki/Exploratory_data_analysis) of a large textual corpus. It is appropriate for readers with some basic prior experience programming with [Python](https://www.python.org/). 
If you have no experience with Python or computer programming, the author recommends working through the first few lessons in the [Introduction to Python series](/en/lessons/introduction-and-installation). By the end of this lesson, you will be able to: * Devise appropriate research questions that use [Natural Language Processing](https://en.wikipedia.org/wiki/Natural_language_processing) (NLP) on a textual corpus. -* Use Python and the [Natural Language Processing Toolkit](http://www.nltk.org/) (NLTK) to generate sentiment scores for a text. +* Use Python and the [Natural Language Processing Toolkit](https://www.nltk.org/) (NLTK) to generate sentiment scores for a text. * Critically evaluate the sentiment analysis scores and adjust [parameters](https://en.wikipedia.org/wiki/Parameter) and methodology as appropriate. * Identify next steps to continue learning about exploratory data analysis and programmatic approaches to qualitative data. @@ -47,7 +47,7 @@ When confronted with a promising yet large corpus, how can one go about determin [Natural Language Processing](https://en.wikipedia.org/wiki/Natural_language_processing) (NLP) covers a broad range of techniques that apply computational analytical methods to textual content, which provide means of categorizing and quantifying text. These NLP approaches, which include sentiment analysis, can help researchers explore their textual data. In the words of Tukey, it can help the researcher to find "clues" about their texts and "indications" that something might be worth investigating further. -In this lesson, we will focus on one tool in the NLP toolkit: [sentiment analysis](https://en.wikipedia.org/wiki/Sentiment_analysis). Sentiment analysis seeks to quantify the emotional intensity of words and phrases within a text. Some sentiment analysis tools can also factor in the emotional weight of other features of language such as punctuation or [emojis](https://en.wikipedia.org/wiki/Emoji). 
Sentiment analysis tools generally process a unit of text (a sentence, paragraph, book, etc) and output quantitative scores or classifications to indicate whether the algorithm considers that text to convey *positive* or *negative* emotion. Some tools can also quantify the *degree of positivity* or *degree of negativity* within a text. Combined with other NLP methods like [topic modeling](/lessons/topic-modeling-and-mallet), sentiment analysis provides a means of characterising the emotions expressed about different topics of conversation. When used in conjunction with [network analysis](/lessons/correspondence-analysis-in-R) it could shed light on the ways that individuals interact with one another. A researcher interested in attitudes towards a political event might use sentiment analysis to characterize how individuals describe that event on social media. Given the right data to input into the tool, it could be possible to make regional comparisons, or to understand how different demographics viewed the event differently. Because the tool can process lots of data sequentially, it is even possible to analyse the sentiment in hundreds of thousands or even millions of speech events. +In this lesson, we will focus on one tool in the NLP toolkit: [sentiment analysis](https://en.wikipedia.org/wiki/Sentiment_analysis). Sentiment analysis seeks to quantify the emotional intensity of words and phrases within a text. Some sentiment analysis tools can also factor in the emotional weight of other features of language such as punctuation or [emojis](https://en.wikipedia.org/wiki/Emoji). Sentiment analysis tools generally process a unit of text (a sentence, paragraph, book, etc) and output quantitative scores or classifications to indicate whether the algorithm considers that text to convey *positive* or *negative* emotion. Some tools can also quantify the *degree of positivity* or *degree of negativity* within a text. 
Combined with other NLP methods like [topic modeling](/en/lessons/topic-modeling-and-mallet), sentiment analysis provides a means of characterising the emotions expressed about different topics of conversation. When used in conjunction with [network analysis](/en/lessons/correspondence-analysis-in-R) it could shed light on the ways that individuals interact with one another. A researcher interested in attitudes towards a political event might use sentiment analysis to characterize how individuals describe that event on social media. Given the right data to input into the tool, it could be possible to make regional comparisons, or to understand how different demographics viewed the event differently. Because the tool can process lots of data sequentially, it is even possible to analyse the sentiment in hundreds of thousands or even millions of speech events. To get you started, this lesson provides an introduction to sentiment analysis that is both practical and critical. Like any computational tool, sentiment analysis has a number of limitations and biases that researchers should take into account. Researchers should be especially cautious about making empirical claims based on the results of sentiment analysis. You may be better served using sentiment analysis in provisional and exploratory situations, as a means for guiding the research process. When wielding these tools both skeptically and effectively, one can accomplish some pretty remarkable detective work. @@ -67,7 +67,7 @@ For researchers, the Enron Scandal resulted in the creation of one of the larges When the organized and redacted [Enron E-mail Dataset](https://www.cs.cmu.edu/~./enron/) was released in 2004, researchers discovered an unprecedented opportunity: direct access to the spontaneous, largely uncensored way employees in a doomed corporation communicated with one another. Suddenly, researchers had access to how people communicated at work at an unprecedented scale. 
This mattered for researchers interested in the special case of the Enron scandal and collapse, but also for researchers interested in a wide spectrum of questions about everyday communication at work. -In the following decade, hundreds of new studies sprouted up from the e-mails pursuing questions as diverse as [social network theory](https://en.wikipedia.org/wiki/Social_network), community and [anomaly detection](https://en.wikipedia.org/wiki/Anomaly_detection), gender and communication within organizations, behavioral change during an organizational crisis, and insularity and community formation. The use of social network theory in the humanities proposes some [fascinating possibilities](http://journals.sagepub.com/doi/abs/10.1177/1749975514542486), but is not without [significant debate](http://www.emeraldinsight.com/doi/abs/10.1108/S0733-558X%282014%290000040001). +In the following decade, hundreds of new studies sprouted up from the e-mails pursuing questions as diverse as [social network theory](https://en.wikipedia.org/wiki/Social_network), community and [anomaly detection](https://en.wikipedia.org/wiki/Anomaly_detection), gender and communication within organizations, behavioral change during an organizational crisis, and insularity and community formation. The use of social network theory in the humanities proposes some [fascinating possibilities](https://journals.sagepub.com/doi/abs/10.1177/1749975514542486), but is not without [significant debate](https://www.emeraldinsight.com/doi/abs/10.1108/S0733-558X%282014%290000040001). In addition to the sheer quantity of messages included (the corpus contains over 600,000 messages), the Enron E-mail Corpus also includes the metadata necessary for researchers to pursue a number of research questions. 
Just as the presence of envelopes with legible sender and recipient addresses would be a wonderful asset for researchers of historic letter correspondences, the presence of sender and recipient e-mail addresses allows researchers to associate e-mails with particular known individuals within the corporation. As some individuals had multiple e-mail addresses, or more than one individual may have shared the same address, the metadata is not fool proof, but it is incredibly insightful. The rest of the tutorial will go through how to apply and interpret sentiment analysis of e-mails in this corpus. @@ -84,17 +84,17 @@ In this tutorial, you will be using [Python](https://www.python.org/) along with To complete the example below, you will need to install the following: * Python 3 (ideally 3.5 or higher) - [Download & install instructions from the Python wiki](https://wiki.python.org/moin/BeginnersGuide/Download) -* NLTK (3.2.5 or higher) - [Download & install instructions from NLTK.org](http://www.nltk.org/install.html) +* NLTK (3.2.5 or higher) - [Download & install instructions from NLTK.org](https://www.nltk.org/install.html) ## Getting Started with NLTK The Natural Language Toolkit (NLTK) is a collection of reusable Python tools (also known as a Python [library](https://en.wikipedia.org/wiki/Library_(computing)) that help researchers apply a set of computational methods to texts. The tools range from methods of breaking up text into smaller pieces, to identifying whether a word belongs in a given language, to sample texts that researchers can use for training and development purposes (such as the complete text of *Moby Dick*). -If you need any help downloading and installing the module for [Python 3](https://www.python.org/download/releases/3.0/), take a look at the [Installing Python Modules with pip lesson](/lessons/installing-python-modules-pip) by Fred Gibbs. 
+If you need any help downloading and installing the module for [Python 3](https://www.python.org/download/releases/3.0/), take a look at the [Installing Python Modules with pip lesson](/en/lessons/installing-python-modules-pip) by Fred Gibbs. In our case, we will be using two NLTK tools in particular: -* The '[VADER Sentiment Analysis](http://www.nltk.org/_modules/nltk/sentiment/vader.html)' tool (generates positive, negative, and neutral sentiment scores for a given input) +* The '[VADER Sentiment Analysis](https://www.nltk.org/_modules/nltk/sentiment/vader.html)' tool (generates positive, negative, and neutral sentiment scores for a given input) * The 'word_tokenize' tokenizer tool (splits a large text into a sequence of smaller units, like sentences or words) To use VADER and word_tokenize, we first need to download and install a little extra data for NLTK. NLTK is a very large toolkit, and several of its tools actually require a second download step to gather the necessary collection of data (often coded lexicons) to function correctly. @@ -109,13 +109,13 @@ nltk.download('punkt') You can save this file as "`installation.py`". If you are unsure how to save and run Python scripts, please review the appropriate tutorial on setting up an 'Integrated Development Environment' using Python, replacing the command '%(python) %f' with '%(python3) %f' when you reach that point in the tutorial. -1. Setting Up an Integrated Development Environment for Python [Windows](/lessons/windows-installation). -2. Setting Up an Integrated Development Environment for Python [Mac](/lessons/mac-installation). -3. Setting Up an Integrated Development Environment for Python [Linux](/lessons/linux-installation). +1. Setting Up an Integrated Development Environment for Python [Windows](/en/lessons/windows-installation). +2. Setting Up an Integrated Development Environment for Python [Mac](/en/lessons/mac-installation). +3. 
Setting Up an Integrated Development Environment for Python [Linux](/en/lessons/linux-installation). If you do know how to run Python scripts, run the file using Python 3. -[*VADER*](http://www.nltk.org/_modules/nltk/sentiment/vader.html "Vader page in the NLTK Documentation") (Valence Aware Dictionary and sEntiment Reasoner) is a sentiment intensity tool added to NLTK in 2014. Unlike other techniques that require training on related text before use, *VADER* is ready to go for analysis without any special setup. *VADER* is unique in that it makes fine-tuned distinctions between varying degrees of positivity and negativity. For example, *VADER* scores "comfort" moderately positively and "euphoria" extremely positively. It also attempts to capture and score textual features common in informal online text such as capitalizations, exclamation points, and emoticons, as shown in the table below: +[*VADER*](https://www.nltk.org/_modules/nltk/sentiment/vader.html "Vader page in the NLTK Documentation") (Valence Aware Dictionary and sEntiment Reasoner) is a sentiment intensity tool added to NLTK in 2014. Unlike other techniques that require training on related text before use, *VADER* is ready to go for analysis without any special setup. *VADER* is unique in that it makes fine-tuned distinctions between varying degrees of positivity and negativity. For example, *VADER* scores "comfort" moderately positively and "euphoria" extremely positively. It also attempts to capture and score textual features common in informal online text such as capitalizations, exclamation points, and emoticons, as shown in the table below: {% include figure.html filename="sentiment-analysis1.png" caption="Vader captures slight gradations in enthusiasm. (Hutto and Gilbert, 2014)" %} @@ -420,4 +420,4 @@ Klimt, B., & Yang, Y. (2004). The Enron corpus: A new dataset for email classifi Tukey, J.W. (1977). *Exploratory Data Analysis*. Addison-Wesley Publishing Company -Quinn, J. (2006, November 14). 
Ex-Enron man goes back into energy. Retrieved January 10, 2018, from http://www.telegraph.co.uk/finance/2950645/Ex-Enron-man-goes-back-into-energy.html +Quinn, J. (2006, November 14). Ex-Enron man goes back into energy. Retrieved January 10, 2018, from https://www.telegraph.co.uk/finance/2950645/Ex-Enron-man-goes-back-into-energy.html diff --git a/en/lessons/shiny-leaflet-newspaper-map-tutorial.md b/en/lessons/shiny-leaflet-newspaper-map-tutorial.md index 1044a0b67e..4b31659ca2 100644 --- a/en/lessons/shiny-leaflet-newspaper-map-tutorial.md +++ b/en/lessons/shiny-leaflet-newspaper-map-tutorial.md @@ -18,7 +18,7 @@ topics: [mapping, website, r, data-visualization] avatar_alt: Reflection of moonlight on a lake abstract: This lesson demonstrates how to build an interactive webmap using R and the Shiny library. In the lesson, you will design and implement a simple application, consisting of a slider which allows a user to select a date range, and display a set of corresponding points, on an interactive map. lesson-partners: [Jisc, The National Archives] -partnership-url: /jisc-tna-partnership +partnership-url: /en/jisc-tna-partnership doi: 10.46430/phen0105 --- @@ -310,7 +310,7 @@ The following code will create a slider with two draggable ends, set by default sliderInput('years', 'Years', min = 1621, max = 2000, value = c(1700, 1750)) ``` -Insert this code between the parentheses of the `sidebarPanel = sidebarPanel( )` command in your script. If you get lost or need to debug, take a look at [the finished code](#Final-code) provided at the end of this lesson. +Insert this code between the parentheses of the `sidebarPanel = sidebarPanel( )` command in your script. If you get lost or need to debug, take a look at [the finished code](#final-code) provided at the end of this lesson. At this point, run the application to see how the slider looks. You should see a grey panel on the left (the sidebar panel), containing the slider widget. 
If you hover over the slider, you'll notice that you can drag each end (to select a range size) and you can also drag the middle (which will move the entire slider over a window of the selected range size). @@ -402,7 +402,7 @@ Pause here and run the application again. All being well, you should see an inte {% include figure.html filename="shiny-leaflet-newspaper-map-tutorial-5.png" alt="Figure 5. Screenshot of the application with Leaflet map and slider input widget." caption="Figure 5. Screenshot of the application with Leaflet map and slider input widget." %} -To do this, use the command `addCircleMarkers()`, which adds a graphical layer of circles to the `leaflet` map, with coordinates taken from a geographic data object. Using the `%>%` pipe, add the following after the `addCircleMarkers()` function (see the [final code](#Final-code) if you're not sure where this should go): +To do this, use the command `addCircleMarkers()`, which adds a graphical layer of circles to the `leaflet` map, with coordinates taken from a geographic data object. Using the `%>%` pipe, add the following after the `addCircleMarkers()` function (see the [final code](#final-code) if you're not sure where this should go): ``` %>% diff --git a/en/lessons/simulating-historical-communication-networks-python.md b/en/lessons/simulating-historical-communication-networks-python.md index 082d1ec712..fd197f2cb7 100644 --- a/en/lessons/simulating-historical-communication-networks-python.md +++ b/en/lessons/simulating-historical-communication-networks-python.md @@ -36,11 +36,11 @@ The model we will build together is relatively basic, featuring only simple inte The model we build here will not be sufficiently complex to give genuinely valuable perspectives on this case study on its own, but it will highlight some key properties of ABMs, and various ways to implement them. Crucially, by the end of this lesson, you will be able to extend the model further with more complex functionalities. 
-In the [first part](#Part-1:-Introduction-to-Simulations-and-Agent-based-Modeling), you will learn what historical simulation methods are all about, their methodological and epistemological quirks, and how to start applying Agent-Based Modeling to your own research. +In the [first part](#part-1-introduction-to-simulations-and-agent-based-modeling), you will learn what historical simulation methods are all about, their methodological and epistemological quirks, and how to start applying Agent-Based Modeling to your own research. -In the [second part](#Part-2:-Programming-Agent-based-Models-with-Mesa), you will follow a step-by-step guide to building your first Agent-Based Model, using the Python package mesa. This will be accompanied by further comments and reflections on the methodology of Agent-Based Modeling. +In the [second part](#part-2-programming-agent-based-models-with-mesa), you will follow a step-by-step guide to building your first Agent-Based Model, using the Python package mesa. This will be accompanied by further comments and reflections on the methodology of Agent-Based Modeling. -In the [third part](#Part-3:-A-Summary,-Open-Questions-and-Next-Steps), you will explore ways to extend the model and further enhance your expertise in building Agent-Based Models. +In the [third part](#part-3-summary-open-questions-and-next-steps), you will explore ways to extend the model and further enhance your expertise in building Agent-Based Models. ## Lesson Goals @@ -925,13 +925,13 @@ Do not hesitate to get in touch with us if you want to be part of this discussio [^1]: Hotson, Howard, and Thomas Wallnig, [Eds.] (2019), Reassembling the Republic of Letters in the Digital Age: Standards, Systems, Scholarship. Göttingen, Germany: Göttingen University Press. [https://doi.org/10.17875/gup2019-1146](https://doi.org/10.17875/gup2019-1146). 
-[^2]: Ureña-Carrion, Javier, Petri Leskinen, Jouni Tuominen, Charles van den Heuvel, Eero Hyvönen, and Mikko Kivelä (2021), Communication Now and Then: Analyzing the Republic of Letters as a Communication Network. [http://arxiv.org/abs/2112.04336](http://arxiv.org/abs/2112.04336). +[^2]: Ureña-Carrion, Javier, Petri Leskinen, Jouni Tuominen, Charles van den Heuvel, Eero Hyvönen, and Mikko Kivelä (2021), Communication Now and Then: Analyzing the Republic of Letters as a Communication Network. [https://arxiv.org/abs/2112.04336](https://arxiv.org/abs/2112.04336). [^3]: Miert, Dirk van (2014), “What was the Republic of Letters? A brief introduction to a long history.” Groniek, no. 204/5 (2014). [https://ugp.rug.nl/groniek/article/view/27601](https://perma.cc/36K9-7LUU). [^4]: Schmitz, Jascha Merijn: Simulation. In: AG Digital Humanities Theorie des Verbandes Digital Humanities im deutschsprachigen Raum e. V. (Hg.): Begriffe der Digital Humanities. Ein diskursives Glossar (= Zeitschrift für digitale Geisteswissenschaften / Working Papers, 2). Wolfenbüttel 2023. 25.05.2023. Version 2.0 vom 16.05.2024. HTML / XML / PDF. [https://doi.org/10.17175/wp_2023_011_v2](https://doi.org/10.17175/wp_2023_011_v2). -[^5]: Gavin, Michael. Agent-Based Modeling and Historical Simulation. Digital Humanities Quarterly, 008(4):195, December 2014. [http://www.digitalhumanities.org/dhq/vol/8/4/000195/000195.html](https://perma.cc/S3WG-SMXR). +[^5]: Gavin, Michael. Agent-Based Modeling and Historical Simulation. Digital Humanities Quarterly, 008(4):195, December 2014. [https://www.digitalhumanities.org/dhq/vol/8/4/000195/000195.html](https://perma.cc/S3WG-SMXR). [^6]: Romein, C. A., Max Kemman, Julie M. Birkholz, J. Baker, M. D. Gruijter, Albert Meroño-Peñuela, T. Ries, Ruben Ros, S. Scagliola (2020). State of the Field: Digital History. In: Journal of the Historical Association 105 (365), pp. 291-312. 
diff --git a/en/lessons/sonification.md b/en/lessons/sonification.md index 35a11e8916..9eeb08ab37 100755 --- a/en/lessons/sonification.md +++ b/en/lessons/sonification.md @@ -14,7 +14,7 @@ review-ticket: https://github.com/programminghistorian/ph-submissions/issues/4 activity: transforming topics: [distant-reading] abstract: "There are any number of guides that will help you visualize the past, but this lesson will help you hear the past." -redirect_from: /lessons/sonification +redirect_from: /lessons/sonification/ avatar_alt: A violin doi: 10.46430/phen0057 --- @@ -31,11 +31,11 @@ doi: 10.46430/phen0057 I am too tired of seeing the past. There are any number of guides that will help you _visualize_ that past which cannot be seen, but often we forget what a creative act visualization is. We are perhaps too tied to our screens, too much invested in ‘seeing’. Let me hear something of the past instead. -While there is a deep history and literature on archaeoacoustics and soundscapes that try to capture the sound of a place _as it was_ ([see for instance the Virtual St. Paul's](https://www.digitalstudies.org/articles/10.16995/dscn.58) or the work of [Jeff Veitch on ancient Ostia](https://jeffdveitch.wordpress.com/)), I am interested instead to ’sonify' what I have _right now_, the data themselves. I want to figure out a grammar for representing data in sound that is appropriate for history. [Drucker](#Drucker) [famously reminds us](http://web.archive.org/web/20190203083307/http://www.digitalhumanities.org/dhq/vol/5/1/000091/000091.html) that ‘data’ are not really things given, but rather things captured, things transformed: that is to say, ‘capta’. In sonifying data, I literally perform the past in the present, and so the assumptions, the transformations, I make are foregrounded. The resulting aural experience is a literal ‘deformance’ (portmanteau of ‘deform’ and ‘perform’) that makes us hear modern layers of the past in a new way. 
+While there is a deep history and literature on archaeoacoustics and soundscapes that try to capture the sound of a place _as it was_ ([see for instance the Virtual St. Paul's](https://www.digitalstudies.org/articles/10.16995/dscn.58) or the work of [Jeff Veitch on ancient Ostia](https://jeffdveitch.wordpress.com/)), I am interested instead to ‘sonify’ what I have _right now_, the data themselves. I want to figure out a grammar for representing data in sound that is appropriate for history. Drucker [famously reminds us](https://web.archive.org/web/20190203083307/https://www.digitalhumanities.org/dhq/vol/5/1/000091/000091.html) that ‘data’ are not really things given, but rather things captured, things transformed: that is to say, ‘capta’[^1]. In sonifying data, I literally perform the past in the present, and so the assumptions, the transformations, I make are foregrounded. The resulting aural experience is a literal ‘deformance’ (portmanteau of ‘deform’ and ‘perform’) that makes us hear modern layers of the past in a new way. I want to hear the meaning of the past, but I know that I can’t. Nevertheless, when I hear an instrument, I can imagine the physicality of the player playing it; in its echoes and resonances I can discern the physical space. I can feel the bass; I can move to the rhythm. The music engages my whole body, my whole imagination. Its associations with sounds, music, and tones I’ve heard before create a deep temporal experience, a system of embodied relationships between myself and the past. Visual? We have had visual representations of the past for so long, we have almost forgotten the artistic and performative aspect of those grammars of expression. -In this tutorial, you will learn to make some noise from your data about the past. The _meaning_ of that noise, well... that's up to you. Part of the point of this tutorial is to make your data unfamiliar again.
By translating it, transcoding it, [_remediating_](http://blog.taracopplestone.co.uk/making-things-photobashing-as-archaeological-remediation/) it, we begin to see elements of the data that our familiarity with visual modes of expression have blinded us to. This deformation, this deformance, is in keeping with arguments made by for instance Mark Sample on [breaking things](http://www.samplereality.com/2012/05/02/notes-towards-a-deformed-humanities/), or Bethany Nowviskie on the '[resistance in the materials](http://nowviskie.org/2013/resistance-in-the-materials/)'. Sonification moves us along the continuum from data to capta, social science to art, [glitch to aesthetic](http://nooart.org/post/73353953758/temkin-glitchhumancomputerinteraction). So let's see what this all sounds like. +In this tutorial, you will learn to make some noise from your data about the past. The _meaning_ of that noise, well... that's up to you. Part of the point of this tutorial is to make your data unfamiliar again. By translating it, transcoding it, [_remediating_](https://blog.taracopplestone.co.uk/making-things-photobashing-as-archaeological-remediation/) it, we begin to see elements of the data that our familiarity with visual modes of expression has blinded us to. This deformation, this deformance, is in keeping with arguments made by, for instance, Mark Sample on [breaking things](https://www.samplereality.com/2012/05/02/notes-towards-a-deformed-humanities/), or Bethany Nowviskie on the '[resistance in the materials](https://nowviskie.org/2013/resistance-in-the-materials/)'. Sonification moves us along the continuum from data to capta, social science to art, [glitch to aesthetic](https://nooart.org/post/73353953758/temkin-glitchhumancomputerinteraction). So let's see what this all sounds like.
## Objectives @@ -46,9 +46,9 @@ In the first, we will use a freely available and free-to-use system developed by You will see that 'sonification' moves us along the spectrum from mere 'visualization/auralization' to actual performance. ### Tools -+ Musicalgorithms [http://musicalgorithms.org/](http://musicalgorithms.org/) ++ Musicalgorithms [https://musicalgorithms.org/](https://musicalgorithms.org/) + MIDITime [https://github.com/cirlabs/miditime](https://github.com/cirlabs/miditime) (I have forked a copy [here](https://github.com/shawngraham/miditime)) -+ Sonic Pi [http://sonic-pi.net/](http://sonic-pi.net/) ++ Sonic Pi [https://sonic-pi.net/](https://sonic-pi.net/) ### Example Data @@ -58,22 +58,22 @@ You will see that 'sonification' moves us along the spectrum from mere 'visualiz # Some Background on Sonification -Sonification is the practice of mapping aspects of the data to produce sound signals. In general, a technique can be called ‘sonification’ if it meets certain conditions. These include reproducibility (the same data can be transformed the same ways by other researchers and produce the same results) and what might be called intelligibility - that the ‘objective’ elements of the original data are reflected systematically in the resulting sound (see [Hermann](#Hermann) [2008](http://www.icad.org/Proceedings/2008/Hermann2008.pdf) for a taxonomy of sonification). [Last and Usyskin](#Last) [(2015)](https://www.researchgate.net/publication/282504359_Listen_to_the_Sound_of_Data) designed a series of experiments to determine what kinds of data-analytic tasks could be performed when the data were sonified. Their experimental results (Last and Usyskin 2015) have shown that even untrained listeners (listeners with no formal training in music) can make useful distinctions in the data. They found listeners could discriminate in the sonified data common data exploration tasks such as classification and clustering. 
(Their sonified outputs mapped the underlying data to the Western musical scale.) +Sonification is the practice of mapping aspects of the data to produce sound signals. In general, a technique can be called ‘sonification’ if it meets certain conditions. These include reproducibility (the same data can be transformed the same ways by other researchers and produce the same results) and what might be called intelligibility - that the ‘objective’ elements of the original data are reflected systematically in the resulting sound (see Hermann [2008](https://www.icad.org/Proceedings/2008/Hermann2008.pdf) for a taxonomy of sonification)[^2]. Last and Usyskin [(2015)](https://www.researchgate.net/publication/282504359_Listen_to_the_Sound_of_Data) designed a series of experiments to determine what kinds of data-analytic tasks could be performed when the data were sonified. Their experimental results have shown that even untrained listeners (listeners with no formal training in music) can make useful distinctions in the data. They found listeners could discriminate in the sonified data common data exploration tasks such as classification and clustering[^3]. (Their sonified outputs mapped the underlying data to the Western musical scale.) -Last and Usyskin focused on time-series data. They argue that time-series data are particularly well suited to sonification because there are natural parallels with musical sound. Music is sequential, it has duration, and it evolves over time; so too with time-series data [(Last and Usyskin 2015: 424)](https://www.researchgate.net/publication/282504359_Listen_to_the_Sound_of_Data). It becomes a problem of matching the data to the appropriate sonic outputs. In many applications of sonification, a technique called ‘parameter mapping’ is used to marry aspects of the data along various auditory dimensions such as [pitch](#pitch), variation, brilliance, and onset. 
The problem with this approach is that where there is no temporal relationship (or rather, no non-linear relationship) between the original data points, the resulting sound can be ‘confusing’ (2015: 422). +Last and Usyskin focused on time-series data. They argue that time-series data are particularly well suited to sonification because there are natural parallels with musical sound. Music is sequential, it has duration, and it evolves over time; so too with time-series data [(Last and Usyskin 2015: 424)](https://www.researchgate.net/publication/282504359_Listen_to_the_Sound_of_Data). It becomes a problem of matching the data to the appropriate sonic outputs. In many applications of sonification, a technique called ‘parameter mapping’ is used to marry aspects of the data along various auditory dimensions such as [pitch](#terms), variation, brilliance, and onset. The problem with this approach is that where there is no temporal relationship (or rather, no non-linear relationship) between the original data points, the resulting sound can be ‘confusing’ (2015: 422). ## Hearing the Gaps -There is also the way that we fill in gaps in the sound with our expectations. Consider this video where the [mp3](#mp3) has been converted to [MIDI](#midi) back to mp3; the music has been 'flattened' so that all sonic information is being played by one instrument. (Generating this effect is rather like saving a webpage as .txt, opening it in Word, and then resaving it as .html). All sounds (including vocals) have been translated to their corresponding note values, and then turned back into an mp3. +There is also the way that we fill in gaps in the sound with our expectations. Consider this video where the [mp3](#terms) has been converted to [MIDI](#terms) back to mp3; the music has been 'flattened' so that all sonic information is being played by one instrument. (Generating this effect is rather like saving a webpage as .txt, opening it in Word, and then resaving it as .html). 
All sounds (including vocals) have been translated to their corresponding note values, and then turned back into an mp3. It is noisy; yet we perceive meaning. Consider the video below: -What's going on here? If that song was already known to you, you probably heard the actual 'words'. Yet, no words are present in the song! If the song was not already familiar to you, it sounded like garbled nonsense (see more examples on [Andy Baio's](#Baio) [website](http://waxy.org/2015/12/if_drake_was_born_a_piano/)). This effect is sometimes called an 'auditory hallucination'(cf. [Koebler, 2015](#Koebler)). This example shows how in any representation of data we can hear/see what is not, strictly speaking, there. We fill the holes with our own expectations. +What's going on here? If that song was already known to you, you probably heard the actual 'words'. Yet, no words are present in the song! If the song was not already familiar to you, it sounded like garbled nonsense (see more examples on Andy Baio's [website](https://waxy.org/2015/12/if_drake_was_born_a_piano/)[^4]). This effect is sometimes called an 'auditory hallucination' (cf. Koebler, 2015[^5]). This example shows how in any representation of data we can hear/see what is not, strictly speaking, there. We fill the holes with our own expectations. Consider the implications for history. If we sonify our data, and begin to hear patterns in the sound, or odd outliers, our cultural expectations about how music works (our memories of similar snippets of music, heard in particular contexts) are going to colour our interpretation. This I would argue is true about all representations of the past, but sonifying is just odd enough to our regular methods that this self-awareness will help us identify or communicate the critical patterns in the (data of the) past.
-We will progress through three different tools for sonifying data, noting how choices in one tool affect the output, and can be mitigated by reimagining the data via another tool. Ultimately, there is nothing any more objective in 'sonification' than there is in 'visualization', and so the investigator has to be prepared to justify her choices, and to make these choices transparent and reproducible for others. (And lest we think that sonification and algorithmically generated music is somehow a 'new' thing, I direct the interested reader to [Hedges, (1978)](#hedges).) +We will progress through three different tools for sonifying data, noting how choices in one tool affect the output, and can be mitigated by reimagining the data via another tool. Ultimately, there is nothing any more objective in 'sonification' than there is in 'visualization', and so the investigator has to be prepared to justify her choices, and to make these choices transparent and reproducible for others. (And lest we think that sonification and algorithmically generated music is somehow a 'new' thing, I direct the interested reader to Hedges, (1978)[^6].) In each section, I will give a conceptual introduction, followed by a walkthrough using sample archaeological or historical data. @@ -81,7 +81,7 @@ In each section, I will give a conceptual introduction, followed by a walkthroug There are a wide variety of tools out there to sonify data. Some for instance are packages for the widely-used [R statistical environment](https://cran.r-project.org/), such as ‘[playitbyR](https://cran.r-project.org/web/packages/playitbyr/index.html)’ and ‘[AudiolyzR](https://cran.r-project.org/web/packages/audiolyzR/index.html)’. The first of these however has not been maintained or updated to work with the current version of R (its last update was a number of years ago), and the second requires considerable configuration of extra software to make it work properly. 
-By contrast, the [Musicalgorithms](http://musicalgorithms.org/) site is quite easy to use. The Musicalgorithms site has been online for over a decade. Though it is not open source, it represents a long-term research project in computational music by its creator, Jonathan Middleton. It is currently in its third major iteration (earlier iterations remain usable online). We will begin with Musicalalgorithms because it allows us to quickly enter and tweak our data to produce a MIDI file representation. Make sure to select '[Version 3](http://musicalgorithms.org/3.0/index.html).' +By contrast, the [Musicalgorithms](https://musicalgorithms.org/) site is quite easy to use. The Musicalgorithms site has been online for over a decade. Though it is not open source, it represents a long-term research project in computational music by its creator, Jonathan Middleton. It is currently in its third major iteration (earlier iterations remain usable online). We will begin with Musicalgorithms because it allows us to quickly enter and tweak our data to produce a MIDI file representation. Make sure to select '[Version 3](https://musicalgorithms.org/3.0/index.html).' {% include figure.html filename="sonification-musicalgorithms-main-site-1.png" caption="The Musicalgorithms Website as it appeared on February 2nd, 2016" %} @@ -109,7 +109,7 @@ The key field for us is ‘areaPitch1,’ which contains the space-delimited inp {% include figure.html filename="sonification-musicalgorithms-pitch-mapping-2.png" caption="After you load your data, you can select the different operations across the top menu bar of the site. In the screenshot, the information mouseover is explaining what happens to the scaling of your data if you select the division operation to scale your data to the range of notes selected."
%} -Now, as you page across the various tabs in the interface (‘[duration](#duration) input’, ‘[pitch mapping](#pitch mapping)’, ‘duration mapping’, ‘scale options’) you can effect various transformations. In ‘pitch mapping’, there are a number of mathematical options for mapping the data against the full 88 keys/pitches of a piano keyboard (in a linear mapping, the _mean_ of one’s data would be mapped to middle C, or 40). One can also choose the kind of scale, whether it is a minor or major and so on. At this point, once you've selected your various transformations, you should save the text file. On the file tab, ‘play’, one can download a midi file. Your default audio program can play midi files (often defaulting to a piano tone). More complicated instrumentation can be assigned by opening the midi file in music mixing programs such as GarageBand (Mac) or [LMMS](https://lmms.io/) (Windows, Mac, Linux). (Using Garageband or LMMS are outside the scope of this tutorial. A video tutorial on LMMS is available [here](https://youtu.be/4dYxV3tqTUc), while Garageband tutorials proliferate online. Lynda.com has [an excellent one](http://www.lynda.com/GarageBand-tutorials/Importing-audio-tracks/156620/164050-4.html)) +Now, as you page across the various tabs in the interface (‘[duration](#terms) input’, ‘[pitch mapping](#terms)’, ‘duration mapping’, ‘scale options’) you can effect various transformations. In ‘pitch mapping’, there are a number of mathematical options for mapping the data against the full 88 keys/pitches of a piano keyboard (in a linear mapping, the _mean_ of one’s data would be mapped to middle C, or 40). One can also choose the kind of scale, whether it is a minor or major and so on. At this point, once you've selected your various transformations, you should save the text file. On the file tab, ‘play’, one can download a midi file. Your default audio program can play midi files (often defaulting to a piano tone). 
More complicated instrumentation can be assigned by opening the midi file in music mixing programs such as GarageBand (Mac) or [LMMS](https://lmms.io/) (Windows, Mac, Linux). (Using Garageband or LMMS are outside the scope of this tutorial. A video tutorial on LMMS is available [here](https://youtu.be/4dYxV3tqTUc), while Garageband tutorials proliferate online. Lynda.com has [an excellent one](https://www.lynda.com/GarageBand-tutorials/Importing-audio-tracks/156620/164050-4.html)) If you had several columns of data for the same points - say, in our example from Roman Britain, we also wanted to sonify counts of a pottery type for those same towns - you can reload your next data series, effect the transformations and mappings, and generate another MIDI file. Since Garageband and LMMS allow for overlaying of voices, you can begin to build up complicated sequences of music. @@ -133,12 +133,12 @@ The [sample dataset](/assets/sonification/sonification-roman-data.csv) provided ``` ...so that your data follows immediately after that last comma (as like [this](/assets/sonification/sonification-romancoin-data-music.csv)). Save the file with a useful name like `coinsounds1.csv`. -3. Go to the [Musicalgorithms](http://musicalgorithms.org/3.0/index.html) site (version 3), and hit the load button. In the pop-up, click the blue 'load' button and select the file saved in step 2. The site will load your materials and display a green check mark if it loaded successfully. If it did not, make sure that your values are separated by spaces, and that they follow immediately the last comma in the code block in step 2. You may also try loading up the [demo file for this tutorial](/assets/sonification/sonification-romancoin-data-music.csv) instead.{% include figure.html filename="sonification-musicalgorithms-upload-4.png" caption="Click 'load' on the main screen to get this dialogue box. Then 'load csv'. Select your file; it will appear in the box. Then click the bottom load button." %} +3. 
Go to the [Musicalgorithms](https://musicalgorithms.org/3.0/index.html) site (version 3), and hit the load button. In the pop-up, click the blue 'load' button and select the file saved in step 2. The site will load your materials and display a green check mark if it loaded successfully. If it did not, make sure that your values are separated by spaces, and that they follow immediately the last comma in the code block in step 2. You may also try loading up the [demo file for this tutorial](/assets/sonification/sonification-romancoin-data-music.csv) instead.{% include figure.html filename="sonification-musicalgorithms-upload-4.png" caption="Click 'load' on the main screen to get this dialogue box. Then 'load csv'. Select your file; it will appear in the box. Then click the bottom load button." %} 4. Click on 'Pitch Input'. You'll see the values of your data. For now, **do not select** any further options on this page (thus using the site's default values). 5. Click on 'Duration Input'. **Do not select any options here for now**. The options here will map various transformations against your data that will alter the duration for each note. Do not worry about these options for now; move on. 6. Click on 'Pitch Mapping'. This is the most crucial choice, as it will transform (that is, scale) your raw data to a mapping against the keys of the keyboard. Leave the `mapping` set to 'division'. (The other options are modulo or logarithmic). The option `Range` 1 to 88 uses the full 88 keys of the keyboard; thus your lowest value would accord to the deepest note on the piano and your highest value with the highest note. You might wish instead to constrain your music around middle C, so enter 25 to 60 as your range. 
The output should change to: `31,34,34,34,25,28,30,60,28,25,26,26,25,25,60,25,25,38,33,26,25,25,25` These are no longer your counts; they are notes on the keyboard.{% include figure.html filename="sonification-musicalgorithms-settings-for-pitch-mapping-5.png" caption="Click into the 'range' box and set it to 25. The values underneath will change automatically. Click into the 'to' box and set it to 60. Click back into the other box; the values will update." %} 7. Click on 'Duration Mapping'. Like Pitch Mapping, this takes a range of times that you specify and uses the various mathematical options to map that range of possibilities against your notes. If you mouse over the `i` you will see how the numbers correspond with whole notes, quarter notes, eigth notes, and so on. Leave the default values for now. -8. Click on 'Scale Options'. Here we can begin to select something of what might be called the 'emotional' aspect to sound. We commonly think of major scales being 'happy' while minor scales are 'sad'; for an accessible discussion see [this blog post](http://www.ethanhein.com/wp/2010/scales-and-emotions/). For now, select 'scale by: major'. Leave the 'scale' as `C`. +8. Click on 'Scale Options'. Here we can begin to select something of what might be called the 'emotional' aspect to sound. We commonly think of major scales being 'happy' while minor scales are 'sad'; for an accessible discussion see [this blog post](https://www.ethanhein.com/wp/2010/scales-and-emotions/). For now, select 'scale by: major'. Leave the 'scale' as `C`. You have now sonified one column of data! Click on the 'save' button, then 'save csv'. {% include figure.html filename="sonification-musicalgorithms-save-6.png" caption="The save data dialogue box." %}You'll have a file that looks something like this: @@ -169,15 +169,15 @@ When you have multiple voices of data, what stands out? 
Note that in this approa # A quick word about getting Python set up -The next section of this tutorial requires Python. If you haven't experimented with Python yet, you will need to spend some time [becoming familiar with the command line (PC) or terminal (OS)](/lessons/intro-to-bash). You might find this quick [guide to installing python 'modules'](/lessons/installing-python-modules-pip) handy (but come back to it after you read the rest of this section). +The next section of this tutorial requires Python. If you haven't experimented with Python yet, you will need to spend some time [becoming familiar with the command line (PC) or terminal (OS)](/en/lessons/intro-to-bash). You might find this quick [guide to installing python 'modules'](/en/lessons/installing-python-modules-pip) handy (but come back to it after you read the rest of this section). Mac users will already have Python installed on their machine. You can test this by holding down the COMMAND button and the spacebar; in the search window, type `terminal` and click on the terminal application. At the prompt, eg, the cursor blinking at `$` type `python --version` and the computer will respond with what version of python you have. _This next section of the tutorial assumes Python 2.7; it has not been tested on Python 3_. -For Windows users, Python is not installed by default on your machine so [this page](http://docs.python-guide.org/en/latest/starting/install/win/) will help you get started, though things are a bit more complicated than that page makes out. First, download the `.msi` file that that page recommends (Python 2.7). Double click the file, and it should install itself in a new directory, eg `C:\Python27\`. Then, we have to tell Windows the location of where to look for Python whenever you run a python program; that is, you put the location of that directory into your 'path', or the environment variable that windows always checks when confronted with a new command. 
There are a couple ways of doing this, but perhaps the easiest is to search your computer for the program `Powershell` (type 'powershell' into your windows computer search). Open Powershell, and at the `>` prompt, paste this entire line: +For Windows users, Python is not installed by default on your machine so [this page](https://docs.python-guide.org/en/latest/starting/install/win/) will help you get started, though things are a bit more complicated than that page makes out. First, download the `.msi` file that that page recommends (Python 2.7). Double click the file, and it should install itself in a new directory, eg `C:\Python27\`. Then, we have to tell Windows the location of where to look for Python whenever you run a python program; that is, you put the location of that directory into your 'path', or the environment variable that windows always checks when confronted with a new command. There are a couple ways of doing this, but perhaps the easiest is to search your computer for the program `Powershell` (type 'powershell' into your windows computer search). Open Powershell, and at the `>` prompt, paste this entire line: `[Environment]::SetEnvironmentVariable("Path", "$env:Path;C:\Python27\;C:\Python27\Scripts\", "User")` -You can close powershell when you're done. You'll know it worked if nothing very much happens once you've pressed 'enter'. To test that everything is okay, open a command prompt (here are [10 ways to do this](http://www.howtogeek.com/235101/10-ways-to-open-the-command-prompt-in-windows-10/)) and type at the `>` prompt `python --version`. It should tell you `Python 2.7.10` or similar. +You can close powershell when you're done. You'll know it worked if nothing very much happens once you've pressed 'enter'. To test that everything is okay, open a command prompt (here are [10 ways to do this](https://www.howtogeek.com/235101/10-ways-to-open-the-command-prompt-in-windows-10/)) and type at the `>` prompt `python --version`. 
It should tell you `Python 2.7.10` or similar. The last piece of the puzzle that all users will need is a program called `Pip`. Mac users can install it by typing at the terminal :`sudo easy_install pip`. Windows users have a bit of a harder time. First, right-click and save-as this link: [https://bootstrap.pypa.io/get-pip.py](https://bootstrap.pypa.io/get-pip.py) (If you just click on the link, it will show you the code in your browser). Save it somewhere handy. Open a command prompt in the directory where you saved `get-pip.py`. Then, type at the command prompt `python get-pip.py`. Conventionally, in tutorials, you will see `>` or `$` at points where you are required to enter something at the command prompt or the terminal. You don't ever have to type those two characters. @@ -189,10 +189,10 @@ MIDITime is a python package developed by [Reveal News (formerly, the Centre for While the Musicalgorithms tool has a more-or-less intuitive interface, the investigator sacrifices the ability to know what, exactly, is going on under the hood. In principle, one could examine the underlying code for the MIDITime package to see exactly what's going on. More importantly, the previous tool had no ability to account for data where the points are distant from one another in clock-time. MIDITime lets us take into account that our data might be clustering in time. -Let us assume that you have a historic diary to which you've fitted a [topic model](/lessons/topic-modeling-and-mallet). The resulting output might have diary entries as rows, and the percentage composition each topic contributes to that entry as the columns. In which case, _listening_ to these values might help you understand the patterns of thought in the diary in a way that visualizing as a graph might not. Outliers or recurrent musical patterns could stand out to the ear in a way the grammar of graphs obscures. 
+Let us assume that you have a historic diary to which you've fitted a [topic model](/en/lessons/topic-modeling-and-mallet). The resulting output might have diary entries as rows, and the percentage composition each topic contributes to that entry as the columns. In which case, _listening_ to these values might help you understand the patterns of thought in the diary in a way that visualizing as a graph might not. Outliers or recurrent musical patterns could stand out to the ear in a way the grammar of graphs obscures. ### Installing MIDITime -Installing miditime is straightforward using [pip](/lessons/installing-python-modules-pip): +Installing miditime is straightforward using [pip](/en/lessons/installing-python-modules-pip): `$ pip install miditime` or `$ sudo pip install miditime` for a Mac or Linux machine; `> python pip install miditime` on a Windows machine. (Windows users, if the instructions above didn't quite work for you, you might want to try [this helper program](https://pydatalog.readthedocs.io/en/latest/installation/#using-pip) instead to get Pip working properly on your machine). @@ -229,7 +229,7 @@ Save this script as `music1.py`. At your terminal or command prompt, run the scr A new file, `myfile.mid` will be written to your directory. To hear this file, you can open it with Quicktime or Windows Media Player. (You can add instrumentation to it by opening it in Garageband or [LMMS](https://lmms.io/)). -`Music1.py` imports miditime (remember, you must do `pip install miditime` before running the script). Then, it creates an output file destination and sets the tempo. The notes are all listed individually, where the first number is the time when the note should be played, the pitch of the note (ie, the actual note!), how hard or rythmically the note is hit (the [attack](#attack)), and then how long the note lasts. The notes are then written to the track, and then the track is written to `myfile.mid`. 
+`Music1.py` imports miditime (remember, you must do `pip install miditime` before running the script). Then, it creates an output file destination and sets the tempo. The notes are all listed individually, where the first number is the time when the note should be played, the pitch of the note (ie, the actual note!), how hard or rhythmically the note is hit (the [attack](#terms)), and then how long the note lasts. The notes are then written to the track, and then the track is written to `myfile.mid`. Play with this script now, and add more notes. The notes for 'Baa Baa Black Sheep' are: @@ -238,13 +238,13 @@ D, D, A, A, B, B, B, B, A Baa, Baa, black, sheep, have, you, any, wool? ``` -Can you make your computer play this song? (This [chart](https://web.archive.org/web/20171211192102/http://www.electronics.dit.ie/staff/tscarff/Music_technology/midi/midi_note_numbers_for_octaves.htm) will help). +Can you make your computer play this song? (This [chart](https://web.archive.org/web/20171211192102/https://www.electronics.dit.ie/staff/tscarff/Music_technology/midi/midi_note_numbers_for_octaves.htm) will help). -**By the way** There is a text file specification for describing music called '[ABC Notation](https://web.archive.org/web/20160617203735/http://abcnotation.com/wiki/abc:standard:v2.1)'. It is beyond us for now, but one could write a sonification script in say a spreadsheet, mapping values to note names in the ABC specification (if you've ever used an IF - THEN in Excel to convert percentage grades to letter grades, you'll have a sense of how this might be done) and then using a site like [this one](http://trillian.mit.edu/~jc/music/abc/ABCcontrib.html) to convert the ABC notation into a .mid file. +**By the way** There is a text file specification for describing music called '[ABC Notation](https://web.archive.org/web/20160617203735/http://abcnotation.com/wiki/abc:standard:v2.1)'.
It is beyond us for now, but one could write a sonification script in say a spreadsheet, mapping values to note names in the ABC specification (if you've ever used an IF - THEN in Excel to convert percentage grades to letter grades, you'll have a sense of how this might be done) and then using a site like [this one](https://trillian.mit.edu/~jc/music/abc/ABCcontrib.html) to convert the ABC notation into a .mid file. ### Getting your own data in -[This file](/assets/sonification/sonification-diary.csv) is a selection from the topic model fitted to John Adams' Diaries for[The Macroscope](http://themacroscope.org). Only the strongest signals have been preserved by rounding the values in the columns to two decimal places (remembering that .25 for instance would indicate that that topic is contributing to a quarter of that diary entry's composition). To get this data into your python script, it has to be formatted in a particular away. The tricky bit is getting the date field right. +[This file](/assets/sonification/sonification-diary.csv) is a selection from the topic model fitted to John Adams' Diaries for [The Macroscope](https://themacroscope.org). Only the strongest signals have been preserved by rounding the values in the columns to two decimal places (remembering that .25 for instance would indicate that that topic is contributing to a quarter of that diary entry's composition). To get this data into your python script, it has to be formatted in a particular way. The tricky bit is getting the date field right. _For the purposes of this tutorial, we are going to leave the names of variables and so on unchanged from the sample script.
The sample script was developed with earthquake data in mind; so where it says 'magnitude' we can think of it as equating to '% topic composition.'_ @@ -369,13 +369,13 @@ For each column of data in your original data, **have a unique script and rememb # Sonic Pi -Having unique midifiles that you arrange (in Garageband or some other music composition program) moves you from 'sonifying' towards composition and sound art. In this final section, I do not offer you a full tutorial on using [Sonic Pi](http://sonic-pi.net), but rather point you towards this environment that allows for the actual live-coding and performance of your data (see [this video](https://www.youtube.com/watch?v=oW-3HVOeUQA) for an actual live-coding performance). Sonic Pi's built-in tutorials will show you something of the potential of using your computer as an actual musical instrument (where you type Ruby code into its built-in editor while the interpreter plays what you encode). +Having unique midifiles that you arrange (in Garageband or some other music composition program) moves you from 'sonifying' towards composition and sound art. In this final section, I do not offer you a full tutorial on using [Sonic Pi](https://sonic-pi.net), but rather point you towards this environment that allows for the actual live-coding and performance of your data (see [this video](https://www.youtube.com/watch?v=oW-3HVOeUQA) for an actual live-coding performance). Sonic Pi's built-in tutorials will show you something of the potential of using your computer as an actual musical instrument (where you type Ruby code into its built-in editor while the interpreter plays what you encode). Why would you want to do this? As has progressively become clear in tutorial, when you sonify your data you begin to make choices about how the data maps into sound, and these choices reflect implicit or explicit decisions about which data matter. There is a continuum of 'objectivity', if you will. 
At one end, a sonification that supports an argument about the past; at the other, a performance about the past as riveting and personal as any well-done public lecture. Sonification moves our data off the page and into the ears of our listeners: it is a kind of public history. Performing our data... imagine that! Here, I offer simply a code snippet that will allow you to import your data, where your data is simply a list of values saved as csv. I am indebted to George Washington University librarian Laura Wrubel who posted to [gist.github.com](https://gist.github.com/lwrubel) her experiments in sonifying her library's circulation transactions. -In this [sample file](/assets/sonification/sonification-jesuittopics.csv)(a topic model generated from the [Jesuit Relations](http://puffin.creighton.edu/jesuit/relations/)), there are two topics. The first row contains the headers: topic1, topic2. +In this [sample file](/assets/sonification/sonification-jesuittopics.csv)(a topic model generated from the [Jesuit Relations](https://puffin.creighton.edu/jesuit/relations/)), there are two topics. The first row contains the headers: topic1, topic2. ### Practice @@ -412,7 +412,7 @@ data.each do |line| # end ``` -The first few lines load the columns of data in; then we say which sound sample we wish to use (piano) and then tell Sonic Pi to play topic 1 according to the following criteria (a random value less than 0.5 for the attack; a decay using a random value less than 1; and an [amplitude](#amplitude) using a random value less than 0.25). See the x 100 in the line? That takes our data value (which is a decimal, remember) and turns it into a whole number. In this piece of code (the way I've written it), that number equates directly with a note. If 88 is the lowest note and 1 is the highest, you can see that this approach is a bit problematic: we haven't actually done any pitch mapping here! 
In which case, you could use Musicalgorithms to do your pitch mapping, and then feed those values back into Sonic Pi. Alternatively, since this code is more or less Ruby, you could look up how to normalize the data and then do a linear mapping of your values against the range 1 - 88. A good place to start would be to study [this worksheet by Steve Lloyd](https://github.com/stevelloyd/Learn-sonification-with-Sonic-Pi) on sonifying weather data with the Sonic Pi. Finally, the other thing to notice here is that the 'rand' value (random) allows us to add a bit of 'humanity' into the music in terms of the dynamics. Then we do the same thing again for topic2. +The first few lines load the columns of data in; then we say which sound sample we wish to use (piano) and then tell Sonic Pi to play topic 1 according to the following criteria (a random value less than 0.5 for the attack; a decay using a random value less than 1; and an [amplitude](#terms) using a random value less than 0.25). See the x 100 in the line? That takes our data value (which is a decimal, remember) and turns it into a whole number. In this piece of code (the way I've written it), that number equates directly with a note. If 88 is the lowest note and 1 is the highest, you can see that this approach is a bit problematic: we haven't actually done any pitch mapping here! In which case, you could use Musicalgorithms to do your pitch mapping, and then feed those values back into Sonic Pi. Alternatively, since this code is more or less Ruby, you could look up how to normalize the data and then do a linear mapping of your values against the range 1 - 88. A good place to start would be to study [this worksheet by Steve Lloyd](https://github.com/stevelloyd/Learn-sonification-with-Sonic-Pi) on sonifying weather data with the Sonic Pi. Finally, the other thing to notice here is that the 'rand' value (random) allows us to add a bit of 'humanity' into the music in terms of the dynamics. 
Then we do the same thing again for topic2. You can then add beats, loops, samples, and the whole parephernalia that Sonic Pi permits. Where you put code chunks affects the playback; if you put a loop before the data block above, the loop will play first. For instance, if you insert the following after the `use_bpm 100` line, @@ -439,34 +439,55 @@ The code is pretty clear: loop the 'bd_boom' sample with the reverb sound effect By the way, 'live-coding'? What makes this a 'live-coding' environment is that you can make changes to the code _while Sonic Pi is turning it into music_. Don't like what you're hearing? Change the code up on the fly! -For more on Sonic Pi, [this workshop website](https://web.archive.org/web/20150907155822/https://www.miskatonic.org/music/access2015/) is a good place to start. See also Laura Wrubel's [report on attending that workshop, and her and her colleague's work in this area](http://library.gwu.edu/scholarly-technology-group/posts/sound-library-work). +For more on Sonic Pi, [this workshop website](https://web.archive.org/web/20150907155822/https://www.miskatonic.org/music/access2015/) is a good place to start. See also Laura Wrubel's [report on attending that workshop, and her and her colleague's work in this area](https://library.gwu.edu/scholarly-technology-group/posts/sound-library-work). # Nihil Novi Sub Sole -Again, lest we think that we are at the cutting edge in our algorithmic generation of music, a salutary reminder was published in 1978 on 'dice music games' of the eighteenth century, where rolls of the dice determined the recombination of pre-written snippets of music. [Some of these games have been explored and re-coded for the Sonic-Pi by Robin Newman](https://rbnrpi.wordpress.com/project-list/mozart-dice-generated-waltz-revisited-with-sonic-pi/). Newman also uses a tool that could be described as Markdown+Pandoc for musical notation, [Lilypond](http://www.lilypond.org/) to score these compositions. 
The antecedents for everything you will find at _The Programming Historian_ are deeper than you might suspect! +Again, lest we think that we are at the cutting edge in our algorithmic generation of music, a salutary reminder was published in 1978 on 'dice music games' of the eighteenth century, where rolls of the dice determined the recombination of pre-written snippets of music. [Some of these games have been explored and re-coded for the Sonic-Pi by Robin Newman](https://rbnrpi.wordpress.com/project-list/mozart-dice-generated-waltz-revisited-with-sonic-pi/). Newman also uses a tool that could be described as Markdown+Pandoc for musical notation, [Lilypond](https://www.lilypond.org/) to score these compositions. The antecedents for everything you will find at _The Programming Historian_ are deeper than you might suspect! # Conclusion -Sonifying our data forces us to confront the ways our data are often not so much about the past, but rather our constructed versions of it. It does so partly by virtue of its novelty and the art and artifice required to map data to sound. But it does so also by its contrast with our received notions of visualization of data. It may be that the sounds one generates never rise to the level of 'music'; but if it helps transform how we encounter the past, and how others engage with the past, then the effort will be worth it. As Trevor Owens might have put it, 'Sonfication is about [discovery, not justification'](http://www.trevorowens.org/2012/11/discovery-and-justification-are-different-notes-on-sciencing-the-humanities/). +Sonifying our data forces us to confront the ways our data are often not so much about the past, but rather our constructed versions of it. It does so partly by virtue of its novelty and the art and artifice required to map data to sound. But it does so also by its contrast with our received notions of visualization of data. 
It may be that the sounds one generates never rise to the level of 'music'; but if it helps transform how we encounter the past, and how others engage with the past, then the effort will be worth it. As Trevor Owens might have put it, 'Sonification is about [discovery, not justification'](https://www.trevorowens.org/2012/11/discovery-and-justification-are-different-notes-on-sciencing-the-humanities/). ## Terms -+ **MIDI**,musical instrument digital interface. It is a description of a note's value and timing, not of its dynamics or how one might play it (this is an important distinction). It allows computers and instruments to talk to each other; one can apply different instrumentation to a MIDI file much the same way one would change the font on a piece of text (or run a markdown file through Pandoc). -+ **MP3**, a compression format for sound that is _lossy_ in that it strips out data as part of its compression routine. -+ **Pitch**, the actual note itself (middle C, etc) -+ **Attack**, how the note is played or hit -+ **Duration**, how long the note lasts (whole notes, quarter notes, eighth notes etc) -+ **Pitch Mapping & Duration Mapping**, scaling data values against a range of notes or the length of the note -+ **Amplitude**, roughly, the loudness of the note +### MIDI {#midi} -# References -Baio, Andy. 2015. 'If Drake Was Born A Piano'. Waxy. [http://waxy.org/2015/12/if_drake_was_born_a_piano/](http://waxy.org/2015/12/if_drake_was_born_a_piano/) +Musical instrument digital interface. It is a description of a note's value and timing, not of its dynamics or how one might play it (this is an important distinction). It allows computers and instruments to talk to each other; one can apply different instrumentation to a MIDI file much the same way one would change the font on a piece of text (or run a markdown file through Pandoc). -Drucker, Johanna. 2011. Humanities Approaches to Graphical Display.
DHQ 5.1 [http://web.archive.org/web/20190203083307/http://www.digitalhumanities.org/dhq/vol/5/1/000091/000091.html](http://web.archive.org/web/20190203083307/http://www.digitalhumanities.org/dhq/vol/5/1/000091/000091.html) +### MP3 {#mp3} -Hedges, Stephen A. 1978. “Dice Music in the Eighteenth Century”. Music & Letters 59 (2). Oxford University Press: 180–87. [http://www.jstor.org/stable/734136](http://www.jstor.org/stable/734136). +A compression format for sound that is _lossy_ in that it strips out data as part of its compression routine. -Hermann, T. 2008. "Taxonomy and definitions for sonification and auditory display". In P. Susini and O. Warusfel (eds.) Proceedings of the 14th international conference on auditory display (ICAD 2008). IRCAM, Paris. [http://www.icad.org/Proceedings/2008/Hermann2008.pdf](http://www.icad.org/Proceedings/2008/Hermann2008.pdf) +### Pitch {#pitch} -Koebler, Jason. 2015. "The Strange Acoustic Phenomenon Behind These Wacked-Out Versions of Pop Songs" Motherboard, Dec 18. [https://web.archive.org/web/20161023223029/http://motherboard.vice.com/read/the-strange-acoustic-phenomenon-behind-these-wacked-out-versions-of-pop-songs](https://web.archive.org/web/20161023223029/http://motherboard.vice.com/read/the-strange-acoustic-phenomenon-behind-these-wacked-out-versions-of-pop-songs) +The actual note itself (middle C, etc) -Last and Usyskin, 2015. "Listen to the Sound of Data". In Aaron K. Baughman et al. (eds.) Multimedia Data Mining and Analytics. Springer: Heidelberg. Pp. 
419-446 [https://www.researchgate.net/publication/282504359_Listen_to_the_Sound_of_Data](https://www.researchgate.net/publication/282504359_Listen_to_the_Sound_of_Data) +### Attack {#attack} + +How the note is played or hit + +### Duration {#duration} + +How long the note lasts (whole notes, quarter notes, eighth notes etc) + +### Pitch Mapping {#pitch-mapping} + +Scaling data values against a range of notes or the length of the note + +### Amplitude {#amplitude} + +Roughly, the loudness of the note + +# Endnotes + +[^1]: Drucker, Johanna. 2011. Humanities Approaches to Graphical Display. DHQ 5.1 [https://web.archive.org/web/20190203083307/https://www.digitalhumanities.org/dhq/vol/5/1/000091/000091.html](https://web.archive.org/web/20190203083307/https://www.digitalhumanities.org/dhq/vol/5/1/000091/000091.html) + +[^2]: Hermann, T. 2008. "Taxonomy and definitions for sonification and auditory display". In P. Susini and O. Warusfel (eds.) Proceedings of the 14th international conference on auditory display (ICAD 2008). IRCAM, Paris. [https://www.icad.org/Proceedings/2008/Hermann2008.pdf](https://www.icad.org/Proceedings/2008/Hermann2008.pdf) + +[^3]: Last and Usyskin, 2015. "Listen to the Sound of Data". In Aaron K. Baughman et al. (eds.) Multimedia Data Mining and Analytics. Springer: Heidelberg. Pp. 419-446 [https://www.researchgate.net/publication/282504359_Listen_to_the_Sound_of_Data](https://www.researchgate.net/publication/282504359_Listen_to_the_Sound_of_Data) + +[^4]: Baio, Andy. 2015. 'If Drake Was Born A Piano'. Waxy. [https://waxy.org/2015/12/if_drake_was_born_a_piano/](https://waxy.org/2015/12/if_drake_was_born_a_piano/) + +[^5]: Koebler, Jason. 2015. "The Strange Acoustic Phenomenon Behind These Wacked-Out Versions of Pop Songs" Motherboard, Dec 18. 
[https://web.archive.org/web/20161023223029/https://motherboard.vice.com/read/the-strange-acoustic-phenomenon-behind-these-wacked-out-versions-of-pop-songs](https://web.archive.org/web/20161023223029/https://motherboard.vice.com/read/the-strange-acoustic-phenomenon-behind-these-wacked-out-versions-of-pop-songs) + +[^6]: Hedges, Stephen A. 1978. “Dice Music in the Eighteenth Century”. Music & Letters 59 (2). Oxford University Press: 180–87. [https://www.jstor.org/stable/734136](https://www.jstor.org/stable/734136). diff --git a/en/lessons/space-place-gazetteers.md b/en/lessons/space-place-gazetteers.md index b6a75db296..e050c72d4a 100644 --- a/en/lessons/space-place-gazetteers.md +++ b/en/lessons/space-place-gazetteers.md @@ -79,7 +79,7 @@ The first task for anybody embarking on a digital spatial history project is to A project emphasizing the conflicting, contested, and dynamic characteristics of places, as well as spatial information reflected in textual attestations, should begin with a gazetteer. An example of such a project would be the [Heritage Gazetteer of Libya](https://perma.cc/KLV5-FTRL), which aims to provide information about unique identifiers, locations, and monuments within modern Libya that were important to its history before 1950. The emphasis of this project is on compiling names and variants produced by the research of the Society for Libyan Studies. -A GIS is only the logical starting point for a spatial history project centered on geography and spatial relations *per se*. Both gazetteers and GIS are based on spatial data structured in particular formats, but the focus of a GIS is primarily on the projection of geospatial geometries, in the form of points, lines, and polygons. An example GIS project would be the [Bomb Site: Mapping the WW2 bomb census](http://bombsight.org/#17/51.50595/-0.10680) project, which prioritizes the visualization of targets of the Luftwaffe Blitz bombing raids in London from October 7, 1940 to June 6, 1941. 
While a gazetteer may also contain geographical information, its primary focus is on depicting more information about places then merely points, lines, or polygons on a map base. +A GIS is only the logical starting point for a spatial history project centered on geography and spatial relations *per se*. Both gazetteers and GIS are based on spatial data structured in particular formats, but the focus of a GIS is primarily on the projection of geospatial geometries, in the form of points, lines, and polygons. An example GIS project would be the [Bomb Site: Mapping the WW2 bomb census](https://bombsight.org/#17/51.50595/-0.10680) project, which prioritizes the visualization of targets of the Luftwaffe Blitz bombing raids in London from October 7, 1940 to June 6, 1941. While a gazetteer may also contain geographical information, its primary focus is on depicting more information about places than merely points, lines, or polygons on a map base. Indeed, although geometry is necessary for making maps, the symbols on maps only tell a small part of the story of a place. The way to model rich, multivocal data about place-making events and contestations of power, about places as settings for social events, and about the sense of place and its representations, is with a gazetteer, not a map. Gazetteers are excellent for collecting information about what a place has been called, by whom, why, and when; who has been there; what has occurred there; who has contended for authority over it; or what texts have referred to it. Gazetteers often use a controlled vocabulary to designate the supplementary feature types associated with places: whether a place is a settlement, a waypoint on a travel itinerary, or a geographical feature such as a mountain or river.
diff --git a/en/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown.md b/en/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown.md index 73b8197c99..2704c5833a 100755 --- a/en/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown.md +++ b/en/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown.md @@ -17,7 +17,7 @@ abstract: "In this tutorial, you will first learn the basics of Markdown—an ea exclude_from_check: - reviewers - review-ticket -redirect_from: /lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown +redirect_from: /lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown/ avatar_alt: A man working at a drafting table doi: 10.46430/phen0041 --- @@ -102,7 +102,7 @@ as means for writing scholarly papers but as a convention for online editing in general. Popular general purpose plain text editors include [Atom](https://atom.io/) -(all platforms) and [Notepad++](http://notepad-plus-plus.org) (Windows only). +(all platforms) and [Notepad++](https://notepad-plus-plus.org) (Windows only). It is important to understand that Markdown is merely a convention. Markdown files are stored as plain text, further adding to the @@ -370,7 +370,7 @@ nice PDF: If you'd like to get an idea of how this kind of markup will be interpreted as HTML formatting, try [this online -sandbox](http://daringfireball.net/projects/markdown/dingus) and play +sandbox](https://daringfireball.net/projects/markdown/dingus) and play around with various kinds of syntax. Remember that certain elements of *Pandoc*-flavored Markdown (such as the title block and footnotes) will not work in this web form, which only accepts the basics. 
@@ -561,7 +561,7 @@ this: @article{fyfe_digital_2011, title = {Digital Pedagogy Unplugged}, volume = {5}, - url = {http://digitalhumanities.org/dhq/vol/5/3/000106/000106.html}, + url = {https://digitalhumanities.org/dhq/vol/5/3/000106/000106.html}, number = {3}, urldate = {2013-09-28}, author = {Fyfe, Paul}, @@ -628,7 +628,7 @@ Style Language" (yet another plain-text convention, in this case for describing citation styles) and denoted by the .csl file extension. Luckily, the CSL project maintains a repository of common citation styles, some even tailored for specific journals. Visit - to find the .csl file for + to find the .csl file for Modern Language Association, download `modern-language-association.csl`, and save to your project directory as `mla.csl`. Now we need to tell Pandoc to use the MLA stylesheet instead of the default Chicago. We do @@ -693,33 +693,33 @@ for support than John MacFarlane's [Pandoc site](https://pandoc.org/) and the affiliated [mailing list](https://groups.google.com/forum/#!forum/pandoc-discuss). At least two "Question and Answer" type sites can field questions on Pandoc: -[Stack Overflow](http://stackoverflow.com/questions/tagged/pandoc) and -[Digital Humanities Q&A](http://web.archive.org/web/20190203062832/http://digitalhumanities.org/answers/). +[Stack Overflow](https://stackoverflow.com/questions/tagged/pandoc) and +[Digital Humanities Q&A](https://web.archive.org/web/20190203062832/https://digitalhumanities.org/answers/). Questions may also be asked live, on Freenode IRC, \#Pandoc channel, frequented by a friendly group of regulars. As you learn more about Pandoc, you can also explore one of its most powerful features: [filters](https://github.com/jgm/pandoc/wiki/Pandoc-Filters). 
Although we suggest starting out with a simple editor, many (70+, according to [this blog -post](http://web.archive.org/web/20140120195538/http://mashable.com/2013/06/24/markdown-tools/)) +post](https://web.archive.org/web/20140120195538/https://mashable.com/2013/06/24/markdown-tools/)) other, Markdown-specific alternatives to MS Word are available online, and often free of cost. From the standalone ones, we liked -[Mou](http://mouapp.com/), [Write Monkey](https://web.archive.org/web/20260327163157/http://writemonkey.com/), and -[Sublime Text](http://www.sublimetext.com/). Several web-based platforms +[Mou](https://mouapp.com/), [Write Monkey](https://web.archive.org/web/20260327163157/http://writemonkey.com/), and +[Sublime Text](https://www.sublimetext.com/). Several web-based platforms have recently emerged that provide slick, graphic interfaces for collaborative writing and version tracking using Markdown. These -include: [prose.io](http://prose.io), -[Authorea](http://www.authorea.com), -[Draft](http://www.draftin.com), and +include: [prose.io](https://prose.io), +[Authorea](https://www.authorea.com), +[Draft](https://www.draftin.com), and [StackEdit](https://stackedit.io). -But the ecosystem is not limited to editors. [Gitit](http://gitit.net/) +But the ecosystem is not limited to editors. [Gitit](https://gitit.net/) and [Ikiwiki](https://github.com/dubiousjim/pandoc-iki) support authoring in Markdown with Pandoc as parser. To this list we may a range of tools that generate fast, static webpages, [Yst](https://github.com/jgm/yst), -[Jekyll](http://github.com/fauno/jekyll-pandoc-multiple-formats), -[Hakyll](http://jaspervdj.be/hakyll/), and [bash shell +[Jekyll](https://github.com/fauno/jekyll-pandoc-multiple-formats), +[Hakyll](https://jaspervdj.be/hakyll/), and [bash shell script](https://github.com/wcaleb/website) by the historian Caleb McDaniel. @@ -728,14 +728,14 @@ Markdown. 
Markdown to marketplace platform [Leanpub](https://leanpub.com) could be an interesting alternative to the traditional publishing model. And we ourselves are experimenting with academic journal design based on GitHub and -[readthedocs.org](http://readthedocs.org) (tools usually used for technical +[readthedocs.org](https://readthedocs.org) (tools usually used for technical documentation). [^1]: Don't worry if you don't understand some of of this terminology yet! [^2]: The source files for this document can be [downloaded from GitHub](https://github.com/dhcolumbia/pandoc-workflow). Use the "raw" option when viewing in GitHub to see the source Markdown. The authors would like to thank Alex Gil and his colleagues from Columbia's Digital Humanities Center, and the participants of openLab at the Studio in the Butler library for testing the code in this tutorial on a variety of platforms. -[^3]: See Charlie Stross's excellent discussion of this topic in [Why Microsoft Word Must Die](http://www.antipope.org/charlie/blog-static/2013/10/why-microsoft-word-must-die.html). +[^3]: See Charlie Stross's excellent discussion of this topic in [Why Microsoft Word Must Die](https://www.antipope.org/charlie/blog-static/2013/10/why-microsoft-word-must-die.html). [^4]: Note that the .bib extension may be "registered" to Zotero in your operating system. That means when you click on a .bib file it is likely that Zotero will be called to open it, whereas we want to open it within a text editor. Eventually, you may want to associate the .bib extension with your text editor. diff --git a/en/lessons/temporal-network-analysis-with-r.md b/en/lessons/temporal-network-analysis-with-r.md index b75d6eaeba..afea5ad561 100644 --- a/en/lessons/temporal-network-analysis-with-r.md +++ b/en/lessons/temporal-network-analysis-with-r.md @@ -429,7 +429,7 @@ Let's take a step back and reflect on what we've learned. 
At this point, we have If there is one thing that I hope you will take away from this tutorial, it is the idea that adding temporal data to nodes and edges transforms a general social science tool into a powerful method for historical argument. Comparing network structures and metrics from one timeslice to another gives them historical significance that can be difficult, if not impossible, to discern in conventional static social network analysis. -This tutorial introduced only a few of the many tools and techniques made possible by temporal network analysis. One especially exciting area of this field is in dynamic simulations that model the transmission of something, for example a disease or an idea, among individuals within a given temporal network. If that sounds interesting, take a look at the [EpiModel](http://www.epimodel.org/) package or other tools created by epidemiologists to model diffusion within dynamic networks. +This tutorial introduced only a few of the many tools and techniques made possible by temporal network analysis. One especially exciting area of this field is in dynamic simulations that model the transmission of something, for example a disease or an idea, among individuals within a given temporal network. If that sounds interesting, take a look at the [EpiModel](https://www.epimodel.org/) package or other tools created by epidemiologists to model diffusion within dynamic networks. Depending on the historical data that you're working with, temporal network analysis may offer important insights into how the properties of nodes, edges, and the overall network change over time. Whether or not you decide to make the leap to temporal network analysis, it is helpful to remember that networks of all kinds are complex historical phenomena that emerge, develop, transform beyond recognition, and disappear over the course of time. 
@@ -443,7 +443,7 @@ Maybe you made it through this tutorial but you are still more comfortable with - Ken Cherven has a good overview of Dynamic Network Analysis with Gephi in his book _Mastering Gephi Network Visualization_ (2015) -If you are hungry for more temporal network analysis with R, [this tutorial](https://web.archive.org/web/20180423112846/http://statnet.csde.washington.edu/workshops/SUNBELT/current/ndtv/ndtv_workshop.html) by Skye Bender-deMoll explains additional functions and features of the packages used here. It served as my own guide to learning about temporal network analysis and formed the inspiration for the tutorial above. +If you are hungry for more temporal network analysis with R, [this tutorial](https://web.archive.org/web/20180423112846/https://statnet.csde.washington.edu/workshops/SUNBELT/current/ndtv/ndtv_workshop.html) by Skye Bender-deMoll explains additional functions and features of the packages used here. It served as my own guide to learning about temporal network analysis and formed the inspiration for the tutorial above. You can also dive deeper into the documentation to learn more about the [networkDynamic package](https://cran.r-project.org/web/packages/networkDynamic/index.html), the [TSNA package](https://cran.r-project.org/web/packages/tsna/index.html), and the [NDTV package](https://cran.r-project.org/web/packages/networkDynamic/index.html). diff --git a/en/lessons/text-mining-with-extracted-features.md b/en/lessons/text-mining-with-extracted-features.md index 3064151d9e..c681a9b30c 100755 --- a/en/lessons/text-mining-with-extracted-features.md +++ b/en/lessons/text-mining-with-extracted-features.md @@ -16,7 +16,7 @@ difficulty: 3 review-ticket: https://github.com/programminghistorian/ph-submissions/issues/29 abstract: | Explains how to use Python to summarize and visualize data on millions of texts from the HathiTrust Research Center's Extracted Features dataset. 
-redirect_from: /lessons/text-mining-with-extracted-features +redirect_from: /lessons/text-mining-with-extracted-features/ mathjax: true avatar_alt: A book inside a torn case doi: 10.46430/phen0058 @@ -60,17 +60,17 @@ Though it is relatively new, the Extracted Features dataset is already seeing us [Underwood](https://doi.org/10.6084/m9.figshare.1279201) leveraged the features for identifying genres, such as fiction, poetry, and drama (2014). Associated with this work, he has released a dataset of 178k books classified by genre alongside genre-specific word counts ([Underwood 2015](https://doi.org/10.13012/J8JW8BSJ)). -The Underwood subset of the Extracted Features dataset was used by Forster (2015) to [observe gender in literature](https://web.archive.org/web/20160105003327/http://cforster.com/2015/09/gender-in-hathitrust-dataset/), illustrating the decline of woman authors through the 19th century. +The Underwood subset of the Extracted Features dataset was used by Forster (2015) to [observe gender in literature](https://web.archive.org/web/20160105003327/https://cforster.com/2015/09/gender-in-hathitrust-dataset/), illustrating the decline of woman authors through the 19th century. -The Extracted Features dataset also underlies higher-level analytic tools. [Mimno](http://mimno.infosci.cornell.edu/wordsim/nearest.html) processed word co-occurrence tables per year, allowing others to view how correlations between topics change over time (2014). The [HT Bookworm](https://analytics.hathitrust.org/bookworm) project has developed an API and visualization tools to support exploration of trends within the HathiTrust collection across various classes, genres, and languages. Finally, we have developed an approach to [within-book topic modelling](https://github.com/organisciak/htrc-book-models) which functions as a mnemonic accompaniment to a previously-read book (Organisciak 2014). +The Extracted Features dataset also underlies higher-level analytic tools. 
[Mimno](https://mimno.infosci.cornell.edu/wordsim/nearest.html) processed word co-occurrence tables per year, allowing others to view how correlations between topics change over time (2014). The [HT Bookworm](https://analytics.hathitrust.org/bookworm) project has developed an API and visualization tools to support exploration of trends within the HathiTrust collection across various classes, genres, and languages. Finally, we have developed an approach to [within-book topic modelling](https://github.com/organisciak/htrc-book-models) which functions as a mnemonic accompaniment to a previously-read book (Organisciak 2014). ## Suggested Prior Skills This lesson provides a gentle but technical introduction to text analysis in Python with the HTRC Feature Reader. Most of the code is provided, but is most useful if you are comfortable tinkering with it and seeing how outputs change when you do. -We recommend a baseline knowledge of Python conventions, which can be learned with Turkel and Crymble's [series of Python lessons](/lessons/introduction-and-installation) on Programming Historian. +We recommend a baseline knowledge of Python conventions, which can be learned with Turkel and Crymble's [series of Python lessons](/en/lessons/introduction-and-installation) on Programming Historian. -The skills taught here are focused on flexibly accessing and working with already-computed text features. For a better understanding of the process of deriving word features, Programming Historian provides a lesson on [Counting Frequencies](/lessons/counting-frequencies), by Turkel and Crymble. +The skills taught here are focused on flexibly accessing and working with already-computed text features. For a better understanding of the process of deriving word features, Programming Historian provides a lesson on [Counting Frequencies](/en/lessons/counting-frequencies), by Turkel and Crymble. 
A more detailed look at text analysis with Python is provided in the [Art of Literary Text Analysis](https://github.com/sgsinclair/alta/blob/master/ipynb/ArtOfLiteraryTextAnalysis.ipynb) (Sinclair). The Art of Literary Text Analysis (ALTA) provides a deeper introduction to foundation Python skills, as well as introduces further text analytics concepts to accompany the skills we cover in this lesson. This includes lessons on extracting features ([tokenization](https://github.com/sgsinclair/alta/blob/master/ipynb/Nltk.ipynb), [collocations](https://github.com/sgsinclair/alta/blob/master/ipynb/RepeatingPhrases.ipynb)), and [visualizing trends](https://github.com/sgsinclair/alta/blob/master/ipynb/GettingGraphical.ipynb). @@ -111,7 +111,7 @@ This command installs the HTRC Feature Reader and its necessary dependencies. We That's it! At this point you have everything necessary to start reading HTRC Feature Reader files. -> *psst*, advanced users: You can install the HTRC Feature Reader *without* Anaconda with `pip install htrc-feature-reader`, though for this lesson you'll need to install two additional libraries `pip install matplotlib jupyter`. Also, note that not all manual installations are alike because of hard-to-configure system optimizations: this is why we recommend Anaconda. If you think your code is going slow, you should check that Numpy has access to [BLAS and LAPACK libraries](http://stackoverflow.com/a/19350234/233577) and install [Pandas recommended packages](http://pandas.pydata.org/pandas-docs/version/0.15.2/install.html#recommended-dependencies). The rest is up to you, advanced user! +> *psst*, advanced users: You can install the HTRC Feature Reader *without* Anaconda with `pip install htrc-feature-reader`, though for this lesson you'll need to install two additional libraries `pip install matplotlib jupyter`. Also, note that not all manual installations are alike because of hard-to-configure system optimizations: this is why we recommend Anaconda. 
If you think your code is going slow, you should check that Numpy has access to [BLAS and LAPACK libraries](https://stackoverflow.com/a/19350234/233577) and install [Pandas recommended packages](https://pandas.pydata.org/docs/getting_started/install.html#performance-dependencies-recommended). The rest is up to you, advanced user! ## Start a Notebook @@ -150,7 +150,7 @@ In this notebook, it's time to give the HTRC Feature Reader a try. When it is ti ## Reading your First Volume The HTRC Feature Reader library has three main objects: **FeatureReader**, **Volume**, and **Page**. -The **FeatureReader** object is the interface for loading the dataset files and making sense of them. The files are originally formatted in a notation called JSON (which _Programming Historian_ discusses [here](/lessons/json-and-jq)) and compressed, which FeatureReader makes sense of and returns as Volume objects. A **Volume** is a representation of a single book or other work. This is where you access features about a work. Many features for a volume are collected from individual pages; to access Page information, you can use the **Page** object. +The **FeatureReader** object is the interface for loading the dataset files and making sense of them. The files are originally formatted in a notation called JSON (which _Programming Historian_ discusses [here](/en/lessons/json-and-jq)) and compressed, which FeatureReader makes sense of and returns as Volume objects. A **Volume** is a representation of a single book or other work. This is where you access features about a work. Many features for a volume are collected from individual pages; to access Page information, you can use the **Page** object. Let's load two volumes to understand how the FeatureReader works. Create a cell in the already-open Jupyter notebook and run the following code. This should give you the input shown below. @@ -232,7 +232,7 @@ The volume id can be used to pull more information from other sources. 
The scann print(vol.handle_url) ``` - http://hdl.handle.net/2027/nyp.33433075749246 + https://hdl.handle.net/2027/nyp.33433075749246 {% include figure.html filename="June-cover.PNG" caption="Digital copy of sample book" %} @@ -327,7 +327,7 @@ tokens.plot() On some systems, this may take some time the first time. It is clear that pages at the start of a book have fewer words per page, after which the count is fairly steady except for occasional valleys. -You may have some guesses for what these patterns mean. A look at the [scans](http://hdl.handle.net/2027/nyp.33433074811310) confirms that the large valleys are often illustration pages or blank pages, small valleys are chapter headings, and the upward pattern at the start is from front matter. +You may have some guesses for what these patterns mean. A look at the [scans](https://hdl.handle.net/2027/nyp.33433074811310) confirms that the large valleys are often illustration pages or blank pages, small valleys are chapter headings, and the upward pattern at the start is from front matter. Not all books will have the same patterns so we can't just codify these correlations for millions of books. However, looking at this plot makes clear an inportant assumption in text and data mining: that there are patterns underlying even the basic statistics derived from a text. The trick is to identify the consistent and interesting patterns and teach them to a computer. @@ -441,7 +441,7 @@ Look at the following list of commands: can you guess what the output will look - `vol.tokenlist(section='header')` - `vol.tokenlist(section='group')` -Details for these arguments are available in the code [documentation](http://htrc.github.io/htrc-feature-reader/htrc_features/feature_reader.m.html#htrc_features.feature_reader.Volume.tokenlist) for the Feature Reader. 
+Details for these arguments are available in the code [documentation](https://htrc.github.io/htrc-feature-reader/htrc_features/feature_reader.m.html#htrc_features.feature_reader.Volume.tokenlist) for the Feature Reader. Jupyter provides another convenience here. Documentation can be accessed within the notebook by adding a '?' to the start of a piece of code. Try it with `?vol.tokenlist`, or with other objects or variables. @@ -885,7 +885,7 @@ for page_number in page_numbers: token_idx = tl.index.get_level_values("token") tl[token_idx.str.isalpha()] ``` -Readers familiar with regular expressions (see [Understanding Regular Expressions](/lessons/understanding-regular-expressions) by Doug Knox) can adapt this example for even more robust selection using the `contains()` string method. +Readers familiar with regular expressions (see [Understanding Regular Expressions](/en/lessons/understanding-regular-expressions) by Doug Knox) can adapt this example for even more robust selection using the `contains()` string method. ## Sorting DataFrames @@ -1022,7 +1022,7 @@ The output is a count of how often each part-of-speech tag ("pos") occurs in the - *Apply* with `sum()`: These groups were sent to an apply function, `sum()`. Sum is an aggregation function, so it sums all the information in the 'count' column for each group. For example, all the rows of data in the adverb group are summed up into a single count of all adverbs. - *Combine*: The combine step is implicit: the DataFrame knows from the `groupby` pattern to take everything that the apply function gives back (in the case of 'sum', just one row for every group) and stick it together. -`sum()` is one of many convenient functions [built-in](http://pandas.pydata.org/pandas-docs/stable/groupby.html) to Pandas. Other useful functions are `mean()`, `count()`, `max()`. It is also possible to send your groups to any function that you write with `apply()`. 
+`sum()` is one of many convenient functions [built-in](https://pandas.pydata.org/pandas-docs/stable/groupby.html) to Pandas. Other useful functions are `mean()`, `count()`, `max()`. It is also possible to send your groups to any function that you write with `apply()`. > groupby can be used on data columns or an index. To run against an index, use `level=[index_level_name]` as above. To group against columns, use `by=[column_name]`. @@ -1143,7 +1143,7 @@ Like iterating over `FeatureReader.volumes()` to get Volume objects, it is possi # Next Steps -Now that you know the basics of the HTRC Feature Reader, you can learn more about the [Extracted Features dataset](https://analytics.hathitrust.org/features). The [Feature Reader home page](https://github.com/htrc/htrc-feature-reader/blob/master/README.ipynb) contains a lesson similar to this one but for more advanced users (that's you now!), and the [code documentation](http://htrc.github.io/htrc-feature-reader/htrc_features/feature_reader.m.html) gives exact information about what types of information can be called. +Now that you know the basics of the HTRC Feature Reader, you can learn more about the [Extracted Features dataset](https://analytics.hathitrust.org/features). The [Feature Reader home page](https://github.com/htrc/htrc-feature-reader/blob/master/README.ipynb) contains a lesson similar to this one but for more advanced users (that's you now!), and the [code documentation](https://htrc.github.io/htrc-feature-reader/htrc_features/feature_reader.m.html) gives exact information about what types of information can be called. Underwood (2015) has released [genre classifications of public-domain texts in the HTRC EF Dataset](https://analytics.hathitrust.org/genre), comprised of fiction, poetry, and drama. Though many historians will be interested in other corners of the dataset, fiction is a good place to tinker with text mining ideas because of its expressiveness and relative format consistency. 
@@ -1156,15 +1156,15 @@ Finally, the repository for the HTRC Feature Reader has [advanced tutorial noteb Boris Capitanu, Ted Underwood, Peter Organisciak, Timothy Cole, Maria Janina Sarol, J. Stephen Downie (2016). The HathiTrust Research Center Extracted Feature Dataset (1.0) [Dataset]. *HathiTrust Research Center*. [https://doi.org/10.13012/J8X63JT3](https://doi.org/10.13012/J8X63JT3) -Chris Forster. "A Walk Through the Metadata: Gender in the HathiTrust Dataset." Blog. [http://cforster.com/2015/09/gender-in-hathitrust-dataset/](https://web.archive.org/web/20160105003327/http://cforster.com/2015/09/gender-in-hathitrust-dataset/). +Chris Forster. "A Walk Through the Metadata: Gender in the HathiTrust Dataset." Blog. [https://cforster.com/2015/09/gender-in-hathitrust-dataset/](https://web.archive.org/web/20160105003327/https://cforster.com/2015/09/gender-in-hathitrust-dataset/). -Matthew L. Jockers (Feb 2015). "Revealing Sentiment and Plot Arcs with the Syuzhet Package". *Matthew L. Jockers*. Blog. http://www.matthewjockers.net/2015/02/02/syuzhet/. +Matthew L. Jockers (Feb 2015). "Revealing Sentiment and Plot Arcs with the Syuzhet Package". *Matthew L. Jockers*. Blog. https://www.matthewjockers.net/2015/02/02/syuzhet/. Peter Organisciak, Loretta Auvil, J. Stephen Downie (2015). “Remembering books: A within-book topic mapping technique.” Digital Humanities 2015. Sydney, Australia. Stéfan Sinclair & Geoffrey Rockwell (2016). "The Art of Literary Text Analysis." Github.com. Commit b04bc18. https://github.com/sgsinclair/alta. -William J. Turkel and Adam Crymble (2012). "Counting Word Frequencies with Python". The Programming Historian. /lessons/counting-frequencies. +[William J. Turkel and Adam Crymble (2012). "Counting Word Frequencies with Python". The Programming Historian.](/en/lessons/counting-frequencies). Ted Underwood (2014): Understanding Genre in a Collection of a Million Volumes, Interim Report. figshare. 
[https://doi.org/10.6084/m9.figshare.1281251.v1](https://doi.org/10.6084/m9.figshare.1281251.v1) @@ -1192,7 +1192,7 @@ rsync -azv data.analytics.hathitrust.org::features/listing/htrc-ef-all-files.txt ``` -Finally, it is possible to download many files from a list. To try, we've put together lists for public-domain [fiction](http://data.analytics.hathitrust.org/genre/fiction_paths.txt), [drama](http://data.analytics.hathitrust.org/genre/drama_paths.txt), and [poetry](http://data.analytics.hathitrust.org/genre/poetry_paths.txt) (Underwood 2014). For example: +Finally, it is possible to download many files from a list. To try, we've put together lists for public-domain [fiction](https://data.analytics.hathitrust.org/genre/fiction_paths.txt), [drama](https://data.analytics.hathitrust.org/genre/drama_paths.txt), and [poetry](https://data.analytics.hathitrust.org/genre/poetry_paths.txt) (Underwood 2014). For example: ```bash rsync -azv --files-from=fiction_paths.txt data.analytics.hathitrust.org::features/ . diff --git a/en/lessons/text-mining-youtube-comments.md b/en/lessons/text-mining-youtube-comments.md index 4b22ef2198..0e3b4982e1 100644 --- a/en/lessons/text-mining-youtube-comments.md +++ b/en/lessons/text-mining-youtube-comments.md @@ -334,7 +334,7 @@ Now that the comment data is reduced to the essentials, you can transform the da ## Modeling -An increasingly wide range of text mining and machine learning algorithms are available for scholars looking to create models and visualizations of big data. Many of these algorithms are described in other _Programming Historian_ lessons, for example, [word frequency analysis](/en/lessons/counting-frequencies) and [topic modeling](/en/lessons/topic-modeling-and-mallet). As noted above, the text mining algorithm central to this lesson is called Wordfish. 
For information on the machine learning algorithm itself and to explore Wordfish's base code, visit [the Wordfish website](http://www.Wordfish.org/software.html) and [the Wordfish Github repository](http://www.wordfish.org/). +An increasingly wide range of text mining and machine learning algorithms are available for scholars looking to create models and visualizations of big data. Many of these algorithms are described in other _Programming Historian_ lessons, for example, [word frequency analysis](/en/lessons/counting-frequencies) and [topic modeling](/en/lessons/topic-modeling-and-mallet). As noted above, the text mining algorithm central to this lesson is called Wordfish. For information on the machine learning algorithm itself and to explore Wordfish's base code, visit [the Wordfish website](https://www.Wordfish.org/software.html) and [the Wordfish Github repository](https://www.wordfish.org/). Developed by and for political scientists, Wordfish was originally created as a method for extracting the ideological leaning of documents expected to contain latent political perspectives (such as party manifestos or politician speeches). For example, Wordfish can be a useful tool for identifying whether United States representatives' speeches were made by [Democrats](https://perma.cc/G7U3-X2FB) or [Republicans](https://perma.cc/5WKD-YKY9), as well as for measuring the extremity of the ideological leaning conveyed in those speeches. @@ -364,7 +364,7 @@ The key difference between Wordfish scaling and topic modeling, however, are the ### Creating a Corpus in R -The [Wordfish](http://www.wordfish.org/) algorithm was initially distributed as a stand-alone R package (still available on the [Wordfish website](http://www.Wordfish.org/software.html)), but it is now also available in the [`quanteda` package](https://perma.cc/WYV4-Y884). 
The `quanteda` Wordfish package has certain advantages, including that it enables seamless wrangling of YouTube comment data into a useful format [to build the Wordfish model](https://perma.cc/7736-5QHV). Visit the [docs and tutorials](https://quanteda.org/quanteda/) on the `quanteda` website for more background. +The [Wordfish](https://www.wordfish.org/) algorithm was initially distributed as a stand-alone R package (still available on the [Wordfish website](https://www.Wordfish.org/software.html)), but it is now also available in the [`quanteda` package](https://perma.cc/WYV4-Y884). The `quanteda` Wordfish package has certain advantages, including that it enables seamless wrangling of YouTube comment data into a useful format [to build the Wordfish model](https://perma.cc/7736-5QHV). Visit the [docs and tutorials](https://quanteda.org/quanteda/) on the `quanteda` website for more background. To run the Wordfish model in `quanteda`, you must create three types of text data objects: a corpus, tokens, and a DFM. For more detail on how these objects work together, refer to `quanteda`'s [quick start page](https://perma.cc/QR2C-RCUH). diff --git a/en/lessons/topic-modeling-and-mallet.md b/en/lessons/topic-modeling-and-mallet.md index e8abc87298..30aa2ef3ba 100755 --- a/en/lessons/topic-modeling-and-mallet.md +++ b/en/lessons/topic-modeling-and-mallet.md @@ -17,7 +17,7 @@ exclude_from_check: activity: analyzing topics: [distant-reading] abstract: "In this lesson you will first learn what topic modeling is and why you might want to employ it in your research. You will then learn how to install and work with the MALLET natural language processing toolkit to do so." -redirect_from: /lessons/topic-modeling-and-mallet +redirect_from: /lessons/topic-modeling-and-mallet/ avatar_alt: A man striking an anvil with a large hammer doi: 10.46430/phen0017 --- @@ -161,7 +161,7 @@ the instructions appropriate for you below: ### Windows Instructions -1. 
Go to the [MALLET][] project page. You can [download MALLET here](http://mallet.cs.umass.edu/download.php). +1. Go to the [MALLET][] project page. You can [download MALLET here](https://mallet.cs.umass.edu/download.php). 2. You will also need the [Java developer's kit][] – that is, not the regular Java that's on every computer, but the one that lets you program things. Install this on your computer. @@ -236,7 +236,7 @@ You are now ready to skip ahead to the next section. Many of the instructions for OS X installation are similar to Windows, with a few differences. In fact, it is a bit easier. -1. Download and [install MALLET](http://mallet.cs.umass.edu/download.php). +1. Download and [install MALLET](https://mallet.cs.umass.edu/download.php). 2. Download the [Java Development Kit][Java developer's kit]. Unzip MALLET into a directory on your system (for ease of following @@ -373,7 +373,7 @@ bin\mallet import-dir --input sample-data\web\de --output tutorial.mallet --keep And then finally, you could use your own data. Change `sample-data\web\de` to a directory that contains your own research files. Good luck! -If you are unsure how directories work, we suggest the *Programming Historian* lesson ["Introduction to the Bash Command Line"](/lessons/intro-to-bash). +If you are unsure how directories work, we suggest the *Programming Historian* lesson ["Introduction to the Bash Command Line"](/en/lessons/intro-to-bash). ### For Mac @@ -599,30 +599,30 @@ report. 'Methods' section is especially important, in that it discusses preparing text for this sort of analysis. 
- [Bash Command Line]: /lessons/intro-to-bash - [discussion list]: http://mallet.cs.umass.edu/mailinglist.php - [Distant Reading]: http://www.cs.umbc.edu/~hillol/NGDM07/abstracts/talks/MKirschenbaum.pdf - [Reading Machines]: http://www.worldcat.org/title/reading-machines-toward-an-algorithmic-criticism/oclc/708761605&referer=brief_results - [Voyant Tools]: http://voyant-tools.org - [dangers]: https://web.archive.org/web/20240602215348/https://www.scottbot.net/HIAL/index.html@p=16713.html - [zombies using Google Trends]: http://arxiv.org/abs/1003.6087/ - [David Blei and friends]: http://en.wikipedia.org/wiki/Latent_Dirichlet_allocation - [Mining the Dispatch]: http://dsl.richmond.edu/dispatch/ - [Topic Modeling Martha Ballard's Diary]: https://perma.cc/39CG-MNLH - [MALLET]: http://mallet.cs.umass.edu/index.php - [*Gibbs sampling*]: http://en.wikipedia.org/wiki/Gibbs_sampling - [`download MALLET`]: http://mallet.cs.umass.edu/download.php - [Java developer's kit]: http://www.oracle.com/technetwork/java/javase/downloads/index.html - [automate this process]: http://electricarchaeology.ca/2012/07/09/mining-a-day-of-archaeology/ - [Mining the Open Web with Looted Heritage Draft]: http://electricarchaeology.ca/2012/06/08/mining-the-open-web-with-looted-heritage-draft/ - [Figshare.com]: https://ndownloader.figshare.com/files/90972 - [Guided Tour to Topic Modeling]: https://web.archive.org/web/20240520155820/https://www.scottbot.net/HIAL/index.html@p=19113.html - [Topic modeling made just simple enough]: http://tedunderwood.wordpress.com/2012/04/07/topic-modeling-made-just-simple-enough/ - [Some Assembly Required]: http://web.archive.org/web/20160704150726/http://www.lisarhody.com:80/some-assembly-required/ - [Topic Modeling in the Humanities: An Overview | Maryland Institute for Technology in the Humanities]: https://web.archive.org/web/20130116223500/http://mith.umd.edu/topic-modeling-in-the-humanities-an-overview/ - [Latent dirichlet allocation]: 
http://dl.acm.org/citation.cfm?id=944937 - [bibliography of topic modeling articles]: http://mimno.infosci.cornell.edu/topics.html - [Computational Historiography]: http://www.perseus.tufts.edu/publications/02-jocch-mimno.pdf - [Windows]: /lessons/windows-installation - [Mac]: /lessons/mac-installation - [Linux]: /lessons/linux-installation + [Bash Command Line]: /en/lessons/intro-to-bash + [discussion list]: https://mallet.cs.umass.edu/mailinglist.php + [Distant Reading]: https://www.cs.umbc.edu/~hillol/NGDM07/abstracts/talks/MKirschenbaum.pdf + [Reading Machines]: https://www.worldcat.org/title/reading-machines-toward-an-algorithmic-criticism/oclc/708761605&referer=brief_results + [Voyant Tools]: https://voyant-tools.org + [dangers]: https://web.archive.org/web/20240602215348/https://www.scottbot.net/HIAL/index.html@p=16713.html + [zombies using Google Trends]: https://arxiv.org/abs/1003.6087/ + [David Blei and friends]: https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation + [Mining the Dispatch]: https://dsl.richmond.edu/dispatch/ + [Topic Modeling Martha Ballard's Diary]: https://perma.cc/39CG-MNLH + [MALLET]: https://mallet.cs.umass.edu/index.php + [*Gibbs sampling*]: https://en.wikipedia.org/wiki/Gibbs_sampling + [`download MALLET`]: https://mallet.cs.umass.edu/download.php + [Java developer's kit]: https://www.oracle.com/technetwork/java/javase/downloads/index.html + [automate this process]: https://electricarchaeology.ca/2012/07/09/mining-a-day-of-archaeology/ + [Mining the Open Web with Looted Heritage Draft]: https://electricarchaeology.ca/2012/06/08/mining-the-open-web-with-looted-heritage-draft/ + [Figshare.com]: https://ndownloader.figshare.com/files/90972 + [Guided Tour to Topic Modeling]: https://web.archive.org/web/20240520155820/https://www.scottbot.net/HIAL/index.html@p=19113.html + [Topic modeling made just simple enough]: https://tedunderwood.wordpress.com/2012/04/07/topic-modeling-made-just-simple-enough/ + [Some
Assembly Required]: https://web.archive.org/web/20160704150726/http://www.lisarhody.com:80/some-assembly-required/ + [Topic Modeling in the Humanities: An Overview | Maryland Institute for Technology in the Humanities]: https://web.archive.org/web/20130116223500/https://mith.umd.edu/topic-modeling-in-the-humanities-an-overview/ + [Latent dirichlet allocation]: https://dl.acm.org/citation.cfm?id=944937 + [bibliography of topic modeling articles]: https://mimno.infosci.cornell.edu/topics.html + [Computational Historiography]: https://www.perseus.tufts.edu/publications/02-jocch-mimno.pdf + [Windows]: /en/lessons/windows-installation + [Mac]: /en/lessons/mac-installation + [Linux]: /en/lessons/linux-installation diff --git a/en/lessons/transcribing-handwritten-text-with-python-and-azure.md b/en/lessons/transcribing-handwritten-text-with-python-and-azure.md index b21a394826..33e83b2d25 100644 --- a/en/lessons/transcribing-handwritten-text-with-python-and-azure.md +++ b/en/lessons/transcribing-handwritten-text-with-python-and-azure.md @@ -541,12 +541,12 @@ As capabilities grow, so the potential uses of this type of transcription for Di ## Bibliography -Cahill, Barry. "White, William Andrew," in Dictionary of Canadian Biography, vol. 16, University of Toronto/Université Laval, 2003–, [http://www.biographi.ca/en/bio/white_william_andrew_16E.html](https://perma.cc/AU2P-GBCA). Accessed August 18, 2023. +Cahill, Barry. "White, William Andrew," in Dictionary of Canadian Biography, vol. 16, University of Toronto/Université Laval, 2003–, [https://www.biographi.ca/en/bio/white_william_andrew_16E.html](https://perma.cc/AU2P-GBCA). Accessed August 18, 2023. Dombrowski, Quinn, Tassie Gniady, and David Kloster, "Introduction to Jupyter Notebooks," _Programming Historian_ 8 (2019), [https://doi.org/10.46430/phen0087](https://doi.org/10.46430/phen0087). Graham, Shawn. Detecting and Extracting Hand-written text. Jan 28, 2020.
[https://shawngraham.github.io/dhmuse/detecting-handwriting/](https://perma.cc/J7BV-V6ME). Accessed 25 December, 2021. -White, William. 1917. William Andrew White fonds, R15535-0-8-E, "1917 Diary", Item ID number 4818067. Library and Archives Canada. [http://central.bac-lac.gc.ca/.redirect?app=fonandcol&id=4818067&lang=eng](https://perma.cc/9LQJ-XBEW). Accessed August 18, 2023. +White, William. 1917. William Andrew White fonds, R15535-0-8-E, "1917 Diary", Item ID number 4818067. Library and Archives Canada. [https://central.bac-lac.gc.ca/.redirect?app=fonandcol&id=4818067&lang=eng](https://perma.cc/9LQJ-XBEW). Accessed August 18, 2023. Cognitive-services-quickstart-code, June 22, 2021, [https://docs.microsoft.com/en-us/azure/cognitive-services/computer-vision/quickstarts-sdk/python-sdk](https://perma.cc/FQ4Z-J9JU). Accessed 25 December, 2021. diff --git a/en/lessons/transforming-xml-with-xsl.md b/en/lessons/transforming-xml-with-xsl.md index 21ba61884d..1112482526 100644 --- a/en/lessons/transforming-xml-with-xsl.md +++ b/en/lessons/transforming-xml-with-xsl.md @@ -14,7 +14,7 @@ review-ticket: https://github.com/programminghistorian/ph-submissions/issues/11 activity: transforming topics: [data-manipulation, data-visualization] abstract: "This tutorial will provide you with the ability to convert or transform historical data from an XML database (whether a single file or several linked documents) into a variety of different presentations—condensed tables, exhaustive lists or paragraphed narratives—and file formats." -redirect_from: /lessons/transforming-xml-with-xsl +redirect_from: /lessons/transforming-xml-with-xsl/ avatar_alt: A peacock with a woman's head doi: 10.46430/phen0097 --- @@ -239,7 +239,7 @@ The command line code examples we will show here will assume that this is the ca # Choosing and Preparing XML Data -In order to begin transforming XML, you will need to obtain a well-formed dataset. 
Many online historical databases are built upon XML and provide their data freely. This tutorial will make use of the [Scissors and Paste Database](http://scissors-and-paste.net). +In order to begin transforming XML, you will need to obtain a well-formed dataset. Many online historical databases are built upon XML and provide their data freely. This tutorial will make use of the [Scissors and Paste Database](https://scissors-and-paste.net). The *Scissors and Paste Database* is a collaborative and growing collection of articles from British and imperial newspapers in the 18th and 19th centuries. Its original purpose was to allow for careful comparisons of reprints (copies) that appeared in multiple newspapers as well as to detect similarly themed articles across different English-language publications. Like many XML databases, *Scissors and Paste* contains both data (the article's text), formatting information (such as italics and justification), and metadata. This metadata includes documentation about the particular article, such as its pagination and printing date, information about the newspaper in which it was published, and the themes, individuals or locations mentioned in the text. @@ -318,7 +318,7 @@ The first three lines of your XSL file should be the following: ``` The first line documents that this is an XML document encoded as UTF-8. -The second line states that the document is an XSL document version 1.0 and the standards (or [namespace](https://en.wikipedia.org/wiki/Namespace)) established by the [World Wide Web Consortium](http://www.w3.org/), whose web address you have listed. +The second line states that the document is an XSL document version 1.0 and the standards (or [namespace](https://en.wikipedia.org/wiki/Namespace)) established by the [World Wide Web Consortium](https://www.w3.org/), whose web address you have listed. (Note that an XSL document is ultimately an XML document!) 
Finally, the third line tells your transformer what sort of output you would like to create. In this case, you are indicating that you will be creating a plain-text file. (You could also have written `xml` or `html`, instead of `text`, in order to produce an XML or and HTML document, respectively.) diff --git a/en/lessons/transliterating.md b/en/lessons/transliterating.md index f7263ab570..c23553feab 100644 --- a/en/lessons/transliterating.md +++ b/en/lessons/transliterating.md @@ -19,7 +19,7 @@ abstract: "This lesson shows how to use Python to transliterate automatically a list of words from a language with a non-Latin alphabet to a standardized format using the American Standard Code for Information Interchange (ASCII) characters." -redirect_from: /lessons/transliterating +redirect_from: /lessons/transliterating/ avatar_alt: A set of Cyrillic characters doi: 10.46430/phen0032 --- @@ -468,30 +468,30 @@ dealing with lots of names or for people who prefer or need to use ASCII characters. It is a simple tool but one that can be an enormous time saver. 
- [ASCII]: http://en.wikipedia.org/wiki/Ascii - [Viewing HTML Files]: /lessons/viewing-html-files - [Working with Web Pages]: /lessons/working-with-web-pages - [From HTML to List of Words (part 1)]: /lessons/from-html-to-list-of-words-1 - [Intro to Beautiful Soup]: /lessons/intro-to-beautiful-soup - [Memorial]: http://lists.memo.ru - [Cyrillic]: http://en.wikipedia.org/wiki/Cyrillic_script - [Latin characters]: http://en.wikipedia.org/wiki/Latin_script - [Unicode]: http://en.wikipedia.org/wiki/Unicode - [Terminal]: http://en.wikipedia.org/wiki/Terminal_%28OS_X%29 - [IDLE]: http://en.wikipedia.org/wiki/IDLE_%28Python%29 - [Komodo Edit]: http://www.activestate.com/komodo-edit - [ALA-LC]: http://en.wikipedia.org/wiki/ALA-LC_romanization_for_Russian - [Beautiful Soup in Python.]: http://www.crummy.com/software/BeautifulSoup/ - [Glasnost]: http://en.wikipedia.org/wiki/Glasnost - [here]: http://lists.memo.ru/d1/f1.htm - [Automated Downloading with Wget]: /lessons/automated-downloading-with-wget - [What is Unicode]: http://www.unicode.org/standard/WhatIsUnicode.html - [comma separated value]: http://en.wikipedia.org/wiki/Comma-separated_values - [Counting Frequencies]: /lessons/counting-frequencies - [Library of Congress]: http://web.archive.org/web/20170312041508/http://www.lcweb.loc.gov/catdir/cpso/romanization/russian.pdf - [Wikipedia has a table]: http://en.wikipedia.org/wiki/Cyrillic_script_in_Unicode - [Unicode website]: http://www.unicode.org/charts/ - [Manipulating Strings in Python]: /lessons/manipulating-strings-in-python - [Installing Python Modules with pip]: /lessons/installing-python-modules-pip - [Cascading Style Sheets]: http://www.w3schools.com/css/ - [Code Academy’s]: https://www.codecademy.com/catalog/subject/web-development + [ASCII]: https://en.wikipedia.org/wiki/Ascii + [Viewing HTML Files]: /en/lessons/viewing-html-files + [Working with Web Pages]: /en/lessons/working-with-web-pages + [From HTML to List of Words (part
1)]: /en/lessons/from-html-to-list-of-words-1 + [Intro to Beautiful Soup]: /en/lessons/intro-to-beautiful-soup + [Memorial]: https://lists.memo.ru + [Cyrillic]: https://en.wikipedia.org/wiki/Cyrillic_script + [Latin characters]: https://en.wikipedia.org/wiki/Latin_script + [Unicode]: https://en.wikipedia.org/wiki/Unicode + [Terminal]: https://en.wikipedia.org/wiki/Terminal_%28OS_X%29 + [IDLE]: https://en.wikipedia.org/wiki/IDLE_%28Python%29 + [Komodo Edit]: https://github.com/ActiveState/OpenKomodoIDE + [ALA-LC]: https://en.wikipedia.org/wiki/ALA-LC_romanization_for_Russian + [Beautiful Soup in Python.]: https://www.crummy.com/software/BeautifulSoup/ + [Glasnost]: https://en.wikipedia.org/wiki/Glasnost + [here]: https://lists.memo.ru/d1/f1.htm + [Automated Downloading with Wget]: /en/lessons/automated-downloading-with-wget + [What is Unicode]: https://www.unicode.org/standard/WhatIsUnicode.html + [comma separated value]: https://en.wikipedia.org/wiki/Comma-separated_values + [Counting Frequencies]: /en/lessons/counting-frequencies + [Library of Congress]: https://web.archive.org/web/20170312041508/https://www.lcweb.loc.gov/catdir/cpso/romanization/russian.pdf + [Wikipedia has a table]: https://en.wikipedia.org/wiki/Cyrillic_script_in_Unicode + [Unicode website]: https://www.unicode.org/charts/ + [Manipulating Strings in Python]: /en/lessons/manipulating-strings-in-python + [Installing Python Modules with pip]: /en/lessons/installing-python-modules-pip + [Cascading Style Sheets]: https://www.w3schools.com/css/ + [Code Academy’s]: https://www.codecademy.com/catalog/subject/web-development diff --git a/en/lessons/understanding-creating-word-embeddings.md b/en/lessons/understanding-creating-word-embeddings.md index 4079889863..3849f4604b 100644 --- a/en/lessons/understanding-creating-word-embeddings.md +++ b/en/lessons/understanding-creating-word-embeddings.md @@ -497,4 +497,4 @@ We would like to thank Mark Algee-Hewitt and Julia Flanders for
their contributi [^3]: Many research questions in the humanities address bigger-picture concepts like gender, identity, or justice. A corpus the size of the one we are using here would be poorly suited to these kinds of research questions, because relevant terms are used in a diffuse set of contexts. As a general guideline, a million words is a minimum starting point for these kinds of queries. In our example, we are looking at a set of terms that appear with some frequency in a very consistent set of contexts, which makes it possible to produce reasonable results with a smaller corpus. Weavers and Koolen lay out a set of considerations around corpus size in greater detail, and the piece is worth consulting as you consider your own corpus. See Wevers, Melvin and Koolwen, Marijn. "Digital begriffsgeschichte: Tracing semantic change using word embeddings." _Historical Methods: A Journal of Quantitative and Interdisciplinary History_ 53, no. 4 (2020): 226-243. [https://doi.org/10.1080/01615440.2020.1760157](https://doi.org/10.1080/01615440.2020.1760157). -[^4]: For example, see Cordell, Ryan. "‘Q i-Jtb the Raven’: Taking Dirty OCR Seriously." _Book History_ 20, no. 1 (2017): 188–225. [https://doi.org/10.1353/bh.2017.0006](https://doi.org/10.1353/bh.2017.0006) for a discussion of how OCR errors can provide useful information in research. See also Rawson, Katie, and Muñoz, Trevor. "Against Cleaning." _Curating Menus_, July 2016.[http://www.curatingmenus.org/articles/against-cleaning/](https://perma.cc/QPW7-ZJ7U) for a discussion on the many and significant complexities that are often obscured under the concept of 'cleaning' data. +[^4]: For example, see Cordell, Ryan. "‘Q i-Jtb the Raven’: Taking Dirty OCR Seriously." _Book History_ 20, no. 1 (2017): 188–225. [https://doi.org/10.1353/bh.2017.0006](https://doi.org/10.1353/bh.2017.0006) for a discussion of how OCR errors can provide useful information in research. See also Rawson, Katie, and Muñoz, Trevor. 
"Against Cleaning." _Curating Menus_, July 2016. [https://www.curatingmenus.org/articles/against-cleaning/](https://perma.cc/QPW7-ZJ7U) for a discussion on the many and significant complexities that are often obscured under the concept of 'cleaning' data. diff --git a/en/lessons/understanding-regular-expressions.md b/en/lessons/understanding-regular-expressions.md index 7f9a89db72..9e62036082 100755 --- a/en/lessons/understanding-regular-expressions.md +++ b/en/lessons/understanding-regular-expressions.md @@ -17,7 +17,7 @@ topics: [data-manipulation] abstract: "In this lesson, we will use advanced find-and-replace capabilities in a word processing application in order to make use of structure in a brief historical document that is essentially a table in the form of prose." -redirect_from: /lessons/understanding-regular-expressions +redirect_from: /lessons/understanding-regular-expressions/ avatar_alt: Person studying a book at a desk doi: 10.46430/phen0033 --- @@ -81,7 +81,7 @@ any year from 1850 to 1899. In this exercise we will use LibreOffice Writer and LibreOffice Calc, which are free software desktop applications for word processing and spreadsheets, respectively. Installation packages for Linux, Mac, or -Windows can be downloaded from . +Windows can be downloaded from . Other word processing software and programming languages have similar pattern-matching capabilities. This exercise uses LibreOffice because it is freely available, and its regular expression syntax is closer to what @@ -115,7 +115,7 @@ textual resources that are useful in many kinds of historical research. For our exercise, we will use a five-page report of monthly morbidity and mortality statistics for states and cities in the United States, published in February 1908, available at -. +. Take a moment to scan the pages through the [Read Online][] link to become familiar with it.
This document is organized as paragraphs rather @@ -733,9 +733,9 @@ the University of Pittsburg, has some good materials on how to work with [regular expressions and XML tools][] to help mark up plain-text files in TEI XML. - [Read Online]: http://archive.org/stream/jstor-4560629/4560629#page/n0/mode/2up - [Full Text]: http://archive.org/stream/jstor-4560629/4560629_djvu.txt + [Read Online]: https://archive.org/stream/jstor-4560629/4560629#page/n0/mode/2up + [Full Text]: https://archive.org/stream/jstor-4560629/4560629_djvu.txt [List of Regular Expressions]: https://help.libreoffice.org/Common/List_of_Regular_Expressions - [regular expressions]: http://en.wikipedia.org/wiki/Regular_expressions - [Rubular]: http://rubular.com/ - [regular expressions and XML tools]: http://dh.obdurodon.org/regex.html + [regular expressions]: https://en.wikipedia.org/wiki/Regular_expressions + [Rubular]: https://rubular.com/ + [regular expressions and XML tools]: https://dh.obdurodon.org/regex.html diff --git a/en/lessons/up-and-running-with-omeka.md b/en/lessons/up-and-running-with-omeka.md index 830343dc76..cc2aa4fe54 100755 --- a/en/lessons/up-and-running-with-omeka.md +++ b/en/lessons/up-and-running-with-omeka.md @@ -16,7 +16,7 @@ exclude_from_check: activity: presenting topics: [website] abstract: "Omeka.net makes it easy to create websites that show off collections of items." -redirect_from: /lessons/up-and-running-with-omeka +redirect_from: /lessons/up-and-running-with-omeka/ avatar_alt: Dinosaur skeleton in a museum doi: 10.46430/phen0060 --- @@ -27,14 +27,14 @@ doi: 10.46430/phen0060 -[Omeka.net](http://www.omeka.net) makes it easy to create websites that show off collections of items. +[Omeka.net](https://www.omeka.net) makes it easy to create websites that show off collections of items. 
Sign up for an Omeka account ---------------------------- {% include figure.html filename="up-and-running-01.png" caption="Sign up for a trial account" %} -Go to [www.omeka.net](http://www.omeka.net) and click on **Sign Up**. Choose the Trial plan. Fill in the sign-up form. Check your email for the link to activate your account. +Go to [www.omeka.net](https://www.omeka.net) and click on **Sign Up**. Choose the Trial plan. Fill in the sign-up form. Check your email for the link to activate your account. Create your new Omeka site -------------------------- @@ -56,7 +56,7 @@ An empty Omeka site ------------------- {% include figure.html filename="up-and-running-04.png" caption="Public view" %} -This is your empty Omeka site, waiting to be filled in. To get back to your dashboard, click the **Back** button or enter **http://www.omeka.net/dashboard**. This time, click on **Manage Site**. +This is your empty Omeka site, waiting to be filled in. To get back to your dashboard, click the **Back** button or enter **https://www.omeka.net/dashboard**. This time, click on **Manage Site**. Switch themes ------------- @@ -180,4 +180,4 @@ Now that you've added some items and grouped them into a collection, take some t Further Resources ----------------------------- -The Omeka team has put together great resources on the software's [help pages](http://info.omeka.net) +The Omeka team has put together great resources on the software's [help pages](https://info.omeka.net) diff --git a/en/lessons/urban-demographic-data-r-ggplot2.md b/en/lessons/urban-demographic-data-r-ggplot2.md index 993f1c6be5..badd6070e1 100644 --- a/en/lessons/urban-demographic-data-r-ggplot2.md +++ b/en/lessons/urban-demographic-data-r-ggplot2.md @@ -29,7 +29,7 @@ After [World War II](https://perma.cc/89BN-3NCG), European cities faced a monume Sister-city relationships present historians with both an opportunity and a challenge. 
The opportunity lies in their potential to reveal patterns of post-war reconciliation and diplomacy. The challenge comes from their scale and complexity: there are many hundreds of European cities to analyze, and each one might have formed dozens of partnerships across multiple decades. By converting these complex networks of sister-city relationships into visual patterns, we can explore questions that are difficult to answer through traditional methods alone. For example, did cities of [West Germany](https://perma.cc/ALL6-TWXA) preferentially form partnerships with French cities immediately after the war? Did the [Iron Curtain](https://perma.cc/XH8M-XCJ9) create distinct patterns of sister-city relationships between Eastern and Western Europe? How did city size and geographic distance influence diplomatic connections? This case is a good example of how useful data visualization can be for historical research. -The R package [ggplot2](http://ggplot2.tidyverse.org) provides powerful tools for investigating such questions through data visualization. While spreadsheets and basic charts can obscure patterns, ggplot2's sophisticated visualization capabilities allow historians to uncover hidden relationships in data. For example, [scatter plots](https://perma.cc/47QY-KL2V) can reveal correlations between numerical variables like population sizes and geographic distances, [bar charts](https://perma.cc/H58M-6UDU) can show the distribution of partnerships across different categories of cities, and [histograms](https://perma.cc/W7TW-9V52) can expose patterns in demographic data that might otherwise remain invisible. +The R package [ggplot2](https://ggplot2.tidyverse.org) provides powerful tools for investigating such questions through data visualization. While spreadsheets and basic charts can obscure patterns, ggplot2's sophisticated visualization capabilities allow historians to uncover hidden relationships in data. 
For example, [scatter plots](https://perma.cc/47QY-KL2V) can reveal correlations between numerical variables like population sizes and geographic distances, [bar charts](https://perma.cc/H58M-6UDU) can show the distribution of partnerships across different categories of cities, and [histograms](https://perma.cc/W7TW-9V52) can expose patterns in demographic data that might otherwise remain invisible. This lesson differs from standard ggplot2 guides by focusing specifically on the needs of urban historians. Rather than using generic datasets, we'll work with historical data about sister-city relationships to demonstrate how visualization techniques can illuminate historical patterns and processes. Through this approach, you'll learn to create visualizations that reveal complex partnerships and make historical findings more accessible to a broader audience. @@ -68,7 +68,7 @@ We have many reasons for chosing to use ggplot2 for this analysis. The package h - It relies on a theoretical framework (detailed below) that ensures your graphs meaningfully convey information, which is particularly important when working with complex urban and demographic datasets. - It is relatively simple to use while remaining powerful. - It creates publication-ready graphs. -- It comes with community-developed [extensions](http://www.ggplot2-exts.org/) which further enhance its capabilities, such as additional functions, graphs, and themes. +- It comes with community-developed [extensions](https://www.ggplot2-exts.org/) which further enhance its capabilities, such as additional functions, graphs, and themes. - It is versatile, as it can handle various data structures, including: * Numerical data (continuous and discrete) * Categorical data (factors and character strings) @@ -154,7 +154,7 @@ ggplot(data = eudata.perc, aes(x = typecountry, y = perc)) + There is an important difference between the first plot (Figure 1) and this one. 
In the previous plot, ggplot2 counted the number of cities in every group (domestic, EU, non-EU). In our new plot, the tibble already contained each bar's numerical value, stored in the **perc** column. This is why we specify `y = perc` as a parameter of `aes()`. The tricky part is that by default, `geom_bar()` will use the parameter `stat = "count"`. This means it will count how many times a value appears. In other words, it aggregates data for you. However, you can inform ggplot2 that you have already calculated your values by using the parameter `stat = "identity"`. -Figure 2 shows that most sister cities are from a different country than the origin city, yet still within the EU (around 68%). This could be due to geographical proximity, cultural similarities, or economic ties within the European Union. you can get a more detailed look by adding in the name of each origin country to the visualization. You could decide to visualize this either by breaking down each bar into percentages by origin country (Figure 3), or by creating separate graphs for each origin country (this is called 'facetting' in ggplot2 lingo, which we [cover below](#Facetting-a-Graph)). Let's try the first approach, aggregating the data per country and per type of country while adding a new column with percentages: +Figure 2 shows that most sister cities are from a different country than the origin city, yet still within the EU (around 68%). This could be due to geographical proximity, cultural similarities, or economic ties within the European Union. You can get a more detailed look by adding in the name of each origin country to the visualization. You could decide to visualize this either by breaking down each bar into percentages by origin country (Figure 3), or by creating separate graphs for each origin country (this is called 'facetting' in ggplot2 lingo, which we [cover below](#facetting-a-graph)).
Let's try the first approach, aggregating the data per country and per type of country while adding a new column with percentages: ``` `eudata.perc.country` <- eudata %>% @@ -339,7 +339,7 @@ p1 + {% include figure.html filename="en-or-urban-demographic-data-r-ggplot2-11.png" alt="Scatter plot that uses scale_colour_manual() to change the colors of the scatterplot points." caption="Figure 11. Using scale_colour_manual() to specify the colors of the scatter plot's points." %} -However, you can also simply rely on predefined color scales, such as the [color brewer palettes](http://colorbrewer2.org). It's better to use these whenever possible, because choosing the right colors for visualizations is a very complicated issue (for instance, avoiding colors that are not distinguishable by people with impaired vision). Fortunately, ggplot2 comes with `scale_colour_brewer()` already [integrated](https://perma.cc/BST9-7GMG): +However, you can also simply rely on predefined color scales, such as the [color brewer palettes](https://colorbrewer2.org). It's better to use these whenever possible, because choosing the right colors for visualizations is a very complicated issue (for instance, avoiding colors that are not distinguishable by people with impaired vision). Fortunately, ggplot2 comes with `scale_colour_brewer()` already [integrated](https://perma.cc/BST9-7GMG): ``` p1 + @@ -447,7 +447,7 @@ p3 + ### Extending ggplot2 with Other Packages -One of ggplot2's strengths is its extensive collection of [extensions](http://www.ggplot2-exts.org/) that can help enhance your analysis with specialized visualizations like network graphs (useful for showing relationships between cities, for example), time series graphs (for tracking demographic changes over time), and ridgeline plots (for comparing population distributions across different urban areas). 
+One of ggplot2's strengths is its extensive collection of [extensions](https://www.ggplot2-exts.org/) that can help enhance your analysis with specialized visualizations like network graphs (useful for showing relationships between cities, for example), time series graphs (for tracking demographic changes over time), and ridgeline plots (for comparing population distributions across different urban areas). Let's explore an example showcasing a ggplot2 extension that creates more advanced and visually striking plots. In this case, we will create a [ridgeline plot](https://perma.cc/D9Z2-XHAV) – also known as a 'joyplot' – designed to visualize changes in distributions over time, across different categories. Ridgeline plots are particularly effective for comparing multiple distributions in a compact and aesthetically pleasing manner. @@ -482,7 +482,7 @@ To gain a more thorough understanding of ggplot2, we recommend you explore some * The [official ggplot2 site](https://ggplot2.tidyverse.org/). -* Hadley Wickham's books [`ggplot2`: _Elegant Graphics for Data Analysis_](https://ggplot2-book.org/) and [_R for Data Science_](http://r4ds.hadley.nz/). +* Hadley Wickham's books [`ggplot2`: _Elegant Graphics for Data Analysis_](https://ggplot2-book.org/) and [_R for Data Science_](https://r4ds.hadley.nz/). * Hadley Wickham's [original paper](https://doi.org/10.1198/jcgs.2009.07098) on the grammar of graphics. @@ -500,7 +500,7 @@ To gain a more thorough understanding of ggplot2, we recommend you explore some * The [general documentation](https://ggplot2.tidyverse.org/reference/). -* The [Cookbook for R](http://www.cookbook-r.com/Graphs/) book (based on Winston Chang's [_R Graphics Cookbook. Practical Recipes for Visualizing Data_](http://shop.oreilly.com/product/0636920023135.do)). +* The [Cookbook for R](https://www.cookbook-r.com/Graphs/) book (based on Winston Chang's [_R Graphics Cookbook. 
Practical Recipes for Visualizing Data_](https://shop.oreilly.com/product/0636920023135.do)). * This official [R cheatsheet](https://www.rstudio.com/resources/cheatsheets/). diff --git a/en/lessons/using-javascript-to-create-maps.md b/en/lessons/using-javascript-to-create-maps.md index 00537a57b0..65db8983dd 100755 --- a/en/lessons/using-javascript-to-create-maps.md +++ b/en/lessons/using-javascript-to-create-maps.md @@ -17,7 +17,7 @@ activity: presenting topics: [mapping] abstract: | Demonstrates how to use the JavaScript library "Leaflet" to produce an interactive map that can be hosted online or viewed locally, and demonstrates how to customize many of its features. -redirect_from: /lessons/using-javascript-to-create-maps +redirect_from: /lessons/using-javascript-to-create-maps/ avatar_alt: A woman throwing letters near a mailbox doi: 10.46430/phen0071 --- @@ -32,13 +32,13 @@ doi: 10.46430/phen0071 The mapping software this lesson demonstrates grew out of a need to create a program that was easy to use and designed for the smaller and less uniform geospatial datasets used by historians. While working on a book manuscript on female abolitionists and early feminism in Britain and the United States, the question arose of how to determine the extent of transnational connections in women's antislavery work. We were interested not only in the number of letters that crossed the Atlantic, but also the specific locations the letters were sent from and to and how those international connections changed over time. -To solve this problem, we decided to plot the correspondence of Boston-area abolitionist women on a map and do network analysis of women's correspondence, starting with a single woman's letters as a test project. 
When we set out to map nineteenth-century abolitionist Maria Weston Chapman's correspondence, there was already an easy way to do [network analysis](http://wcm1.web.rice.edu/mining-bpl-antislavery.html), but we struggled to find software to do the mapping portion of the project.[^1] To remedy this, we wrote a simple JavaScript mapping tool based on [Leaflet](http://leafletjs.com/) which allowed us to display correspondence networks on a browser-based map. This lesson explains not only how to write your own script (or adapt the one we wrote) for your own project, but also explores why creating your own tool is sometimes more effective than using commercially available software to analyze historical data. +To solve this problem, we decided to plot the correspondence of Boston-area abolitionist women on a map and do network analysis of women's correspondence, starting with a single woman's letters as a test project. When we set out to map nineteenth-century abolitionist Maria Weston Chapman's correspondence, there was already an easy way to do [network analysis](https://wcm1.web.rice.edu/mining-bpl-antislavery.html), but we struggled to find software to do the mapping portion of the project.[^1] To remedy this, we wrote a simple JavaScript mapping tool based on [Leaflet](https://leafletjs.com/) which allowed us to display correspondence networks on a browser-based map. This lesson explains not only how to write your own script (or adapt the one we wrote) for your own project, but also explores why creating your own tool is sometimes more effective than using commercially available software to analyze historical data. -Originally, when we set out to study Chapman's correspondence with digital tools, we intended to use [PostGIS](http://postgis.net/) and [Gephi](https://gephi.org/) to examine the geographic connections and to analyze the network itself. 
While cleaning the data, it quickly became clear that [PostGIS](http://postgis.net/) was not going to be the ideal tool for the geospatial analysis as it required re-loading all the data into the software every time a change was made. Chapman's correspondence data, obtained from the [Boston Public Library's Antislavery Collection available from the Internet Archive](https://archive.org/details/bplscas) and the [Digital Public Library of America (DPLA)](http://dp.la), required extensive cleaning to standardize and complete dates, names and locations. +Originally, when we set out to study Chapman's correspondence with digital tools, we intended to use [PostGIS](https://postgis.net/) and [Gephi](https://gephi.org/) to examine the geographic connections and to analyze the network itself. While cleaning the data, it quickly became clear that [PostGIS](https://postgis.net/) was not going to be the ideal tool for the geospatial analysis as it required re-loading all the data into the software every time a change was made. Chapman's correspondence data, obtained from the [Boston Public Library's Antislavery Collection available from the Internet Archive](https://archive.org/details/bplscas) and the [Digital Public Library of America (DPLA)](https://dp.la), required extensive cleaning to standardize and complete dates, names and locations. Many of the mistakes, misspellings, and incorrect data points only became noticeable after the data was run through the software and a map generated, but having to reload all of the data from scratch was not a sustainable option. So we began drafting the code for the JavaScript map, which allowed us to easily run the visualization on our local system using a local web server to catch problems and errors as we worked. The script we ended up writing also allows the map to be interactive, making it a more useful tool for research and discovery than a static visualization. 
Being able to easily update the map was also important as the Boston Public Library was not finished digitizing their antislavery collection at the time of writing, and we hoped to expand the dataset to include the correspondence of other abolitionists if our test was successful. Creating our own tool allowed us the flexibility to adapt our project to the constraints of the data. -After we began working on this project, several other options for building online interactive maps became available, most notably [Carto](https://carto.com/platform) and [Palladio](http://hdlab.stanford.edu/palladio/) (a Stanford University project). Neither of these products were available when we began and they both have some limitations that may be problematic for some scholars. Carto only allows you to keep your data private if you pay for a subscription. Palladio, which creates a map very similar to the one we built, only allows you to share your maps via screenshots, so other people cannot easily use your interactive map. Running your own script via a local or web server is the easiest and most straightforward way to control your data and the look of your map without purchasing a subscription to an online service. But if you do decide to use Carto, Palladio, or another online mapping service, this lesson can still be useful to you as you collect and clean your data and analyze the resulting map. +After we began working on this project, several other options for building online interactive maps became available, most notably [Carto](https://carto.com/platform) and [Palladio](https://hdlab.stanford.edu/palladio/) (a Stanford University project). Neither of these products were available when we began and they both have some limitations that may be problematic for some scholars. Carto only allows you to keep your data private if you pay for a subscription. 
Palladio, which creates a map very similar to the one we built, only allows you to share your maps via screenshots, so other people cannot easily use your interactive map. Running your own script via a local or web server is the easiest and most straightforward way to control your data and the look of your map without purchasing a subscription to an online service. But if you do decide to use Carto, Palladio, or another online mapping service, this lesson can still be useful to you as you collect and clean your data and analyze the resulting map. ### Lesson Goals @@ -46,13 +46,13 @@ After we began working on this project, several other options for building onlin * Explain the process of parsing, cleaning and formatting data for maps. * Explore the analytical possibilities of mapping correspondence. -Note: This lesson requires using the command line (or Command Prompt). If you have never used a command line interface before, you might want to read other Programming Historian lessons on [the command line]({{site.baseurl}}/lessons/intro-to-bash) or [PowerShell]({{site.baseurl}}/lessons/intro-to-powershell), depending on your operating system. You can use the built-in command line tool in your operating system for all of the tasks in this lesson. +Note: This lesson requires using the command line (or Command Prompt). If you have never used a command line interface before, you might want to read other Programming Historian lessons on [the command line]({{site.baseurl}}/en/lessons/intro-to-bash) or [PowerShell]({{site.baseurl}}/en/lessons/intro-to-powershell), depending on your operating system. You can use the built-in command line tool in your operating system for all of the tasks in this lesson. ## Set Up Before you begin, [download the ZIP file]({{site.baseurl}}/assets/using-javascript-to-create-maps/using-javascript-to-create-maps.zip) for this lesson and double click on it to "unzip." 
Inside you will find all the folders and files you will need to get a correspondence map working. -The `css` folder contains the code that explains how parts of the map look. The `js` folder contains the actual code that drives the map and its interactive functions. The basic map script contains a timeline function which allows you to "play" the data, generating the map one data point at a time so you can watch the correspondence network grow. The same feature allows you to filter your data by date, and display only a certain range of dates on the map. The `jquery` and `leaflet` folders in each of these locations are third party tools that add functionality to the map. [Leaflet](http://leafletjs.com/) helps create the map and [jQuery](http://jqueryui.com/) makes it easy to add interactive elements like the time line. The other files are as follows: +The `css` folder contains the code that explains how parts of the map look. The `js` folder contains the actual code that drives the map and its interactive functions. The basic map script contains a timeline function which allows you to "play" the data, generating the map one data point at a time so you can watch the correspondence network grow. The same feature allows you to filter your data by date, and display only a certain range of dates on the map. The `jquery` and `leaflet` folders in each of these locations are third party tools that add functionality to the map. [Leaflet](https://leafletjs.com/) helps create the map and [jQuery](https://jqueryui.com/) makes it easy to add interactive elements like the time line. The other files are as follows: * `controls.js` contains functions that handle the time line slider and filters. * `data.js` contains functions that load and handle the initial formatting of the CSV file. 
@@ -66,7 +66,7 @@ The Customizing the Map section explain how each of these scripts work together In order to do geospatial analysis on correspondence, you need several pieces of data about each letter. At the bare minimum, you need the sender, the recipient, the date, the sender's address, and the recipient's address. However, historians often have a lot more information about each piece of correspondence, including summaries of the content, keywords, and links to the letter in an online repository. Writing your own script allows you to display or access the other information about the letter within the interactive visualization as well as be able to display subsets of the data to help with your analysis. -There are several ways to collect or compile data about correspondence. Many historians often have large databases listing correspondence details from their research or have entered research data into Endnote or [Zotero](http://zotero.org), and archival finding aids and digitized archival collections often contain much of the information needed for doing a geospatial analysis. To obtain the basic information about Maria Weston Chapman's correspondence, we parsed the data from an API and then hand entered the missing information.[^2] +There are several ways to collect or compile data about correspondence. Many historians often have large databases listing correspondence details from their research or have entered research data into Endnote or [Zotero](https://zotero.org), and archival finding aids and digitized archival collections often contain much of the information needed for doing a geospatial analysis. 
To obtain the basic information about Maria Weston Chapman's correspondence, we parsed the data from an API and then hand entered the missing information.[^2] Several APIs contain the metadata for the Boston Public Library's antislavery collection, including the Internet Archive (where the metadata closely mirrors the data on the original index cards created in the 1880s when the collection was compiled and indexed) and Digital Public Library of America.[^3] A separate lesson on *The Programming Historian* shows [how to mine data from the antislavery collection on the Internet Archive]({{site.baseurl}}/lessons/data-mining-the-internet-archive). We chose to use the DPLA's API instead. @@ -90,7 +90,7 @@ If you want to use the timeline function of the script, you will also need to ch Once you have identified all of the locations for the letters in your collection, you will need to convert the addresses to coordinates. The first step to this process is to create a master list of locations included in your dataset. To do this, copy the sent and received locations into single column in a new spreadsheet. Sort them alphabetically and then remove the duplicates (this is also a good way to find spelling or data entry errors if you had to hand-compile the locations). Use this new list to do your coordinate search. -There are many websites that allow you to search for coordinates, but most of them limit the number of searches you can request. If you need to search for several thousand locations, you may want to get an API key for the GPS search engine you decide to use, but for most correspondence sets, you will only end up looking up a few hundred locations even if you are mapping thousands of letters. We used [GPS Visualizer](http://www.gpsvisualizer.com/geocoder/), which allows you to search Google Maps, Bing, and Map Quest. 
Simply paste the list of addresses you made into the input section of GPS Visualizer (99 at a time unless you get an API key), select the type of data (raw list, 1 address per line), select your search engine, and set the field separator output to comma. Click run and wait for your results. +There are many websites that allow you to search for coordinates, but most of them limit the number of searches you can request. If you need to search for several thousand locations, you may want to get an API key for the GPS search engine you decide to use, but for most correspondence sets, you will only end up looking up a few hundred locations even if you are mapping thousands of letters. We used [GPS Visualizer](https://www.gpsvisualizer.com/geocoder/), which allows you to search Google Maps, Bing, and Map Quest. Simply paste the list of addresses you made into the input section of GPS Visualizer (99 at a time unless you get an API key), select the type of data (raw list, 1 address per line), select your search engine, and set the field separator output to comma. Click run and wait for your results. When the results appear in the second box on the screen, copy and paste them into the spreadsheet containing the list of addresses. Make sure you capture both pieces of each coordinate (latitude and longitude). Depending on the addresses in your dataset, you may find one of the search engines gives better results than the others. In our case, we found Bing to be the most accurate. You will need to double-check each location you find coordinates for to make sure they are correct by running them through the mapping script (we had several notable mistakes when a search returned coordinates for a street in Paris, France, in the middle of Africa, and an estate in the suburbs of London, England, in the American Midwest). @@ -150,7 +150,7 @@ The map data is loaded in `data.js`. 
If you want to change the available columns Mapping software uses several layers of information to create a map. The first layer is a simple grid of latitude and longitude. The second layer contains the information that displays the map itself. These are called vector tiles. Vector tiles are the information on roads or other geographical features you want to appear on your map plus the actual images used to render the map. These can be modern features or historical ones, depending on the tile set you use to display your information. -For our project, we began with a basic set of map tiles from [MapBox](http:///www.mapbox.com). MapBox provides a number of different tile sets so that you can customize your map's appearance. You can use existing tiles or even design your own (what we ended up doing). The script is currently set up to use our custom map tiles, but you can edit the script to use other map tiles by changing the following section of `map.js` in the `js` folder to use your tiles. You are not limited to MapBox either; any tile server will work: +For our project, we began with a basic set of map tiles from [MapBox](https://www.mapbox.com). MapBox provides a number of different tile sets so that you can customize your map's appearance. You can use existing tiles or even design your own (what we ended up doing). The script is currently set up to use our custom map tiles, but you can edit the script to use other map tiles by changing the following section of `map.js` in the `js` folder to use your tiles. You are not limited to MapBox either; any tile server will work: ``` var tileURL = 'http://{s}.tiles.mapbox.com/v3/ttavenner.e7ef536d/{z}/{x}/{y}.png' @@ -231,8 +231,8 @@ Now that you have an idea about what can be done with JavaScript as a programmin [^3]: See Lee V. Chambers, *The Weston Sisters: An American Abolitionist Family*, (Chapel Hill, University of North Carolina Press, 2015), 175.
The BPL began a transcription project at Digital Commonwealth in 2017, images of the collection are also available there and the images are searchable by place. -[^4]: If you are having permissions errors installing `npm`, check the solutions [on Stack Overflow](http://stackoverflow.com/questions/16151018/npm-throws-error-without-sudo/24404451#24404451). +[^4]: If you are having permissions errors installing `npm`, check the solutions [on Stack Overflow](https://stackoverflow.com/questions/16151018/npm-throws-error-without-sudo/24404451#24404451). -[^5]: Stephen Robertson, "The Differences between Digital Humanities and Digital History," *Debates in the Digital Humanities, 2016*. Matthew K. Gold and Lauren F. Klein, eds. (Minneapolis: University of Minnesota Press, 2016). Available Online: http://dhdebates.gc.cuny.edu/debates/text/76 +[^5]: Stephen Robertson, "The Differences between Digital Humanities and Digital History," *Debates in the Digital Humanities, 2016*. Matthew K. Gold and Lauren F. Klein, eds. (Minneapolis: University of Minnesota Press, 2016). Available Online: https://dhdebates.gc.cuny.edu/debates/text/76 [^6]: Chambers, *Weston Sisters*, Chapter 6. diff --git a/en/lessons/vector-layers-qgis.md b/en/lessons/vector-layers-qgis.md index 1a1b2650de..9ab25ee951 100755 --- a/en/lessons/vector-layers-qgis.md +++ b/en/lessons/vector-layers-qgis.md @@ -19,11 +19,11 @@ activity: presenting topics: [mapping, data-visualization] abstract: "In this lesson you will learn how to create vector layers based on scanned historical maps." 
-next: georeferencing-qgis -previous: qgis-layers +next: /en/lessons/georeferencing-qgis +previous: /en/lessons/qgis-layers series_total: 5 lessons sequence: 3 -redirect_from: /lessons/vector-layers-qgis +redirect_from: /lessons/vector-layers-qgis/ avatar_alt: Map of city streets doi: 10.46430/phen0034 --- @@ -417,9 +417,9 @@ work!** *This lesson is part of the [Geospatial Historian][].* - [Intro to Google Maps and Google Earth]: /lessons/googlemaps-googleearth - [Installing QGIS 2.0 and Adding Layers]: /lessons/qgis-layers - [PEI_Holland map]: /assets/vector-layers-qgis/PEI_HollandMap1798_compLZW.tif - [Georeferencing in QGIS 2.0]: /lessons/georeferencing-qgis - [Wikipedia entry]: http://en.wikipedia.org/wiki/Prince_Royalty,_Prince_Edward_Island - [Geospatial Historian]: http://geospatialhistorian.wordpress.com/ + [Intro to Google Maps and Google Earth]: /en/lessons/googlemaps-googleearth + [Installing QGIS 2.0 and Adding Layers]: /en/lessons/qgis-layers + [PEI_Holland map]: /assets/vector-layers-qgis/PEI_HollandMap1798_compLZW.tif + [Georeferencing in QGIS 2.0]: /en/lessons/georeferencing-qgis + [Wikipedia entry]: https://en.wikipedia.org/wiki/Prince_Royalty,_Prince_Edward_Island + [Geospatial Historian]: https://geospatialhistorian.wordpress.com/ \ No newline at end of file diff --git a/en/lessons/viewing-html-files.md b/en/lessons/viewing-html-files.md index bb2f94dfb2..eb35648b57 100755 --- a/en/lessons/viewing-html-files.md +++ b/en/lessons/viewing-html-files.md @@ -16,11 +16,11 @@ exclude_from_check: activity: presenting topics: [python] abstract: "This lesson introduces you to HTML and the web pages it structures."
-next: working-with-text-files -previous: introduction-and-installation +next: /en/lessons/working-with-text-files +previous: /en/lessons/introduction-and-installation series_total: 15 lessons sequence: 2 -redirect_from: /lessons/viewing-html-files +redirect_from: /lessons/viewing-html-files/ avatar_alt: A woman listening to a man through an ear trumpet doi: 10.46430/phen0018 --- @@ -142,5 +142,5 @@ text editor (which does not). - [W3 Schools HTML Tutorial][W3 Schools HTML tutorial] - [W3 Schools HTML5 Tutorial][] - [W3 Schools HTML tutorial]: http://www.w3schools.com/html/default.asp - [W3 Schools HTML5 Tutorial]: http://www.w3schools.com/html/html5_intro.asp + [W3 Schools HTML tutorial]: https://www.w3schools.com/html/default.asp + [W3 Schools HTML5 Tutorial]: https://www.w3schools.com/html/html5_intro.asp diff --git a/en/lessons/visualizing-with-bokeh.md b/en/lessons/visualizing-with-bokeh.md index e72c1a29d0..afe79c680d 100644 --- a/en/lessons/visualizing-with-bokeh.md +++ b/en/lessons/visualizing-with-bokeh.md @@ -149,7 +149,7 @@ Within the virtual environment, you can run your code by typing: python filename.py ``` -A Jupyter Notebook containing the code used in this tutorial is also [available](https://github.com/programminghistorian/ph-submissions/tree/gh-pages/assets/visualizing-with-bokeh/visualizing-with-bokeh.ipynb) in case you prefer to work through the tutorial without installing a virtual environment. You can learn more about Jupyter Notebook [here](http://jupyter.org). If you have created a virtual environment using Miniconda, as discussed above, you can install Jupyter Notebook in the environment by typing `conda install jupyter` +A Jupyter Notebook containing the code used in this tutorial is also [available](https://github.com/programminghistorian/ph-submissions/tree/gh-pages/assets/visualizing-with-bokeh/visualizing-with-bokeh.ipynb) in case you prefer to work through the tutorial without installing a virtual environment. 
You can learn more about Jupyter Notebook [here](https://jupyter.org). If you have created a virtual environment using Miniconda, as discussed above, you can install Jupyter Notebook in the environment by typing `conda install jupyter` # The Basics of Bokeh @@ -239,7 +239,7 @@ In the previous example, we manually created two short Python lists for our x an ## Pandas Overview -For the purposes of this tutorial, I will only touch on the basic functions of Pandas that are necessary to produce our visualizations. [10 Minutes to Pandas](https://pandas.pydata.org/pandas-docs/stable/10min.html) and [Lessons for New Pandas Users](https://pandas.pydata.org/pandas-docs/stable/tutorials.html#lessons-for-new-pandas-users) are excellent introductions that I would recommend for expanding your knowledge beyond the very basics touched on here. +For the purposes of this tutorial, I will only touch on the basic functions of Pandas that are necessary to produce our visualizations. [10 Minutes to Pandas](https://pandas.pydata.org/pandas-docs/stable/10min.html) and [Lessons for New Pandas Users](https://pandas.pydata.org/pandas-docs/stable/getting_started/tutorials.html#exercises-for-new-users) are excellent introductions that I would recommend for expanding your knowledge beyond the very basics touched on here. Pandas has quickly become the *de facto* Python library for data and data science workflows; integration with other major data science and machine learning libraries has only fueled a rise in popularity.[^1] Pandas provides functionality to quickly and efficiently read, write, and modify datasets for analysis. To accomplish this, Pandas provides data structures that hold different dimensionalities of data. The `DataFrame` holds 2-dimensional data in the manner of a spreadsheet with rows and columns. It's through this object that we'll interact with our WWII THOR dataset. Let's first examine the Pandas `DataFrame` by loading our csv data into one. 
@@ -256,7 +256,7 @@ print(df) ``` We start by importing the Pandas library and then calling `read_csv()` and passing a filename to it. Note that the Pandas library is aliased as *pd*. This alias is a convention followed in the [Pandas official documentation](https://pandas.pydata.org/pandas-docs/stable/) and is widely used by the Pandas community. For this reason, I'll use the *pd* alias throughout the tutorial. -In this code, `read_csv` creates a `DataFrame` that holds the rows/columns of our csv data. By convention, the variable name *df* is used to represent the loaded dataframe in tutorials and basic code examples. [Many other methods](https://pandas.pydata.org/pandas-docs/stable/api.html#input-output) exist for reading data formats other than csv in Pandas, such as JSON, SQL tables, Excel files, and HTML. +In this code, `read_csv` creates a `DataFrame` that holds the rows/columns of our csv data. By convention, the variable name *df* is used to represent the loaded dataframe in tutorials and basic code examples. [Many other methods](https://pandas.pydata.org/pandas-docs/stable/reference/io.html#input-output) exist for reading data formats other than csv in Pandas, such as JSON, SQL tables, Excel files, and HTML. When running this code, `print(df)` will output an abridged representation of the loaded data. @@ -583,7 +583,7 @@ Thankfully, Pandas offers a quick and easy way to do this. By modifying a single Resampling time-series data can involve either upsampling (creating more records) or downsampling (creating fewer records). For example, a list of daily temperatures could be upsampled to a list of hourly temperatures or downsampled to a list of weekly temperatures. We'll only be downsampling in this tutorial, but upsampling is very useful when you're trying to match a sporadically-measured dataset with one that's more periodically measured. 
-To resample our data, we use a Pandas `Grouper` object, to which we pass the column name holding our datetimes and a code representing the desired resampling frequency. In the case of our data, the statement `pd.Grouper(key='MSNDATE', freq='M') ` will be used to resample our MSNDATE column by *M*onth. We could equally resample by *W*eek, *Y*ear, *H*our, and [so forth](http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases). These frequency designations can also be prefaced with numbers so that, for example, `freq='2W'` resamples at two week intervals! +To resample our data, we use a Pandas `Grouper` object, to which we pass the column name holding our datetimes and a code representing the desired resampling frequency. In the case of our data, the statement `pd.Grouper(key='MSNDATE', freq='M') ` will be used to resample our MSNDATE column by *M*onth. We could equally resample by *W*eek, *Y*ear, *H*our, and [so forth](https://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases). These frequency designations can also be prefaced with numbers so that, for example, `freq='2W'` resamples at two week intervals! To complete the process of resampling and plotting our data, we pass the above `Grouper` object to our `groupby` function in place of the raw column name. The `groupby` statement from the previous code example should now look like this: @@ -642,7 +642,7 @@ show(p) A few patterns emerge in the ETO data. First we see a very clear escalation of overall bombings leading up to June 6, 1944 and a notable dip during the winter of 1944/1945. Incendiary munitions show three spikes and confirm that the fourth spike seen in the preceding example was directed at the bombing of Japan after Germany's surrender. The pattern of fragmentation bombs is harder to read, but it's now clear that they were only seriously used in the European Theater after D-Day. 
-{% include alert.html text="Try your hand at resampling this data using any of [Pandas' time frequencies ](http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases) to see what other trends might emerge. Remember, you can preface these frequencies with numbers as well (e.g. if you were working with historical stock market data, 2Q would give you bi-quarterly data!)" %} +{% include alert.html text="Try your hand at resampling this data using any of [Pandas' time frequencies ](https://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases) to see what other trends might emerge. Remember, you can preface these frequencies with numbers as well (e.g. if you were working with historical stock market data, 2Q would give you bi-quarterly data!)" %} Since we have established that 6 June 1944 and the winter of 1944/1945 mark changes to the bombing patterns in the ETO, let's highlight these trends using Bokeh's annotation features. @@ -689,7 +689,7 @@ The function get_provider was deprecated as of Bokeh 3.0.0. The not We'll also be using functions imported from the `pyproj` library. Since our coordinates are stored as latitude/longitude, we'll define a custom function to convert them before mapping. Note that although Bokeh is coordinate-system neutral, it uses the Web Mercator projection for mapping, a standard found across web tile providers. The subject of coordinate systems and projections are outside the scope of this tutorial, but the interested reader will find many useful web resources on these topics. -{% include alert.html text="If your own dataset has place names, but not latitude and longitude, don't worry! You can find ways to easily get coordinates from place names in Programming Historian's [Geocoding Historical Data using QGIS](/lessons/geocoding-qgis) or [Web Mapping with Python and Leaflet](/lessons/mapping-with-python-leaflet#geocoding-with-python)." 
%} +{% include alert.html text="If your own dataset has place names, but not latitude and longitude, don't worry! You can find ways to easily get coordinates from place names in Programming Historian's [Geocoding Historical Data using QGIS](/en/lessons/geocoding-qgis) or [Web Mapping with Python and Leaflet](/en/lessons/mapping-with-python-leaflet#geocoding-with-python)." %} ```python # target_locations.py diff --git a/en/lessons/windows-installation.md b/en/lessons/windows-installation.md index 5c6a55c9f5..96da9161b8 100755 --- a/en/lessons/windows-installation.md +++ b/en/lessons/windows-installation.md @@ -16,7 +16,7 @@ exclude_from_check: activity: transforming topics: [get-ready, python] abstract: "This lesson will help you set up an integrated development environment for Python on a computer running the Windows operating system." -redirect_from: /lessons/windows-installation +redirect_from: /lessons/windows-installation/ avatar_alt: A band of three musicians doi: 10.46430/phen0019 --- @@ -181,7 +181,7 @@ Now that you and your computer are up and running, we can move onto some more interesting tasks. If you are working through the Python lessons in order, we suggest you next try ‘[Understanding Web Pages and HTML][]‘ - [Python website]: http://www.python.org/ + [Python website]: https://www.python.org/ [other text editing options]: https://wiki.python.org/python/PythonEditors - [UTF-8]: http://en.wikipedia.org/wiki/UTF-8 + [UTF-8]: https://en.wikipedia.org/wiki/UTF-8 [Understanding Web Pages and HTML]: /lessons/viewing-html-files diff --git a/en/lessons/working-with-text-files.md b/en/lessons/working-with-text-files.md index ce419c3639..7e2fa84328 100755 --- a/en/lessons/working-with-text-files.md +++ b/en/lessons/working-with-text-files.md @@ -15,12 +15,12 @@ exclude_from_check: activity: transforming topics: [python] abstract: "In this lesson you will learn how to manipulate text files using Python." 
-next: code-reuse-and-modularity -previous: viewing-html-files +next: /en/lessons/code-reuse-and-modularity +previous: /en/lessons/viewing-html-files series_total: 15 lessons sequence: 3 python_warning: false -redirect_from: /lessons/working-with-text-files +redirect_from: /lessons/working-with-text-files/ avatar_alt: Bespectacled man reading an alphabet book doi: 10.46430/phen0020 --- @@ -274,10 +274,10 @@ Suggested Readings - [Non-Programmer’s Tutorial for Python 2.6/Hello, World][] - [Mac Installation]: /lessons/mac-installation - [Windows Installation]: /lessons/windows-installation - [Linux Installation]: /lessons/linux-installation - [print]: https://docs.python.org/2/reference/simple_stmts.html#the-print-statement - [reserved word]: http://docs.python.org/release/2.5.4/ref/keywords.html - [File Objects]: https://docs.python.org/2/library/stdtypes.html#bltin-file-objects - [Non-Programmer’s Tutorial for Python 2.6/Hello, World]: http://en.wikibooks.org/wiki/Non-Programmer%27s_Tutorial_for_Python_2.6/Hello,_World + [Mac Installation]: /en/lessons/mac-installation + [Windows Installation]: /en/lessons/windows-installation + [Linux Installation]: /en/lessons/linux-installation + [print]: https://docs.python.org/2/reference/simple_stmts.html#the-print-statement + [reserved word]: https://docs.python.org/release/2.5.4/ref/keywords.html + [File Objects]: https://docs.python.org/2/library/stdtypes.html#bltin-file-objects + [Non-Programmer’s Tutorial for Python 2.6/Hello, World]: https://en.wikibooks.org/wiki/Non-Programmer%27s_Tutorial_for_Python_2.6/Hello,_World diff --git a/en/lessons/working-with-web-pages.md b/en/lessons/working-with-web-pages.md index 2419ff324a..5da424d758 100755 --- a/en/lessons/working-with-web-pages.md +++ b/en/lessons/working-with-web-pages.md @@ -16,13 +16,13 @@ exclude_from_check: activity: acquiring topics: [python] abstract: "This lesson introduces Uniform Resource Locators (URLs) and explains how to use Python to download and
save the contents of a web page to your local hard drive." -next: manipulating-strings-in-python -previous: code-reuse-and-modularity +next: /en/lessons/manipulating-strings-in-python +previous: /en/lessons/code-reuse-and-modularity series_total: 15 lessons sequence: 5 categories: [lessons, python, original-ph] python_warning: false -redirect_from: /lessons/working-with-web-pages +redirect_from: /lessons/working-with-web-pages/ avatar_alt: A tall man next to a short woman doi: 10.46430/phen0021 --- @@ -143,10 +143,10 @@ Unfortunately, not all websites have such readable and reliable URLs. Spend a few minutes looking at Benjamin Bowsey’s trial page. Here we are not so much interested in what the transcript says, but what features -the page has. Notice the [View as XML](http://www.oldbaileyonline.org/browse.jsp?foo=bar&path=sessionsPapers/17800628.xml&div=t17800628-33&xml=yes) link at the bottom that takes +the page has. Notice the [View as XML](https://www.oldbaileyonline.org/browse.jsp?foo=bar&path=sessionsPapers/17800628.xml&div=t17800628-33&xml=yes) link at the bottom that takes you to a heavily marked up version of the text which may be useful to certain types of research. You can also look at a [scan of the original -document](http://www.oldbaileyonline.org/images.jsp?doc=178006280084), which was transcribed to make this resource. +document](https://www.oldbaileyonline.org/images.jsp?doc=178006280084), which was transcribed to make this resource. Now let's try opening the page using Python. Copy the following program into Komodo Edit and save it as `open-webpage.py`. When you execute the @@ -240,7 +240,7 @@ f.close So, if you can save a single file this easily, could you write a program to download a bunch of files? Could you step through trial IDs, for example, and make your own copies of a whole bunch of them? Yep. 
You can learn -how to do that in [Downloading Multiple Files using Query Strings](/lessons/downloading-multiple-records-using-query-strings), +how to do that in [Downloading Multiple Files using Query Strings](/en/lessons/downloading-multiple-records-using-query-strings), which we recommend after you have completed the introductory lessons in this series. Suggested Readings @@ -258,8 +258,8 @@ file to make sure you have the correct code. - programming-historian-1 ([zip][]) - [Old Bailey Online]: http://www.oldbaileyonline.org/ + [Old Bailey Online]: https://www.oldbaileyonline.org/ [Downloading Multiple Records Using Query Strings]: /lessons/downloading-multiple-records-using-query-strings [Old]: /images/old-bailey.png "Old" - [Gordon Riots]: http://en.wikipedia.org/wiki/Gordon_Riots + [Gordon Riots]: https://en.wikipedia.org/wiki/Gordon_Riots [zip]: /assets/python-lessons1.zip diff --git a/en/project-team.md b/en/project-team.md index bba68f2cca..360d4e99b9 100755 --- a/en/project-team.md +++ b/en/project-team.md @@ -1,7 +1,7 @@ --- title: Project Team layout: blank -redirect_from: /project-team +redirect_from: /project-team/ --- # Project Team diff --git a/en/research.md b/en/research.md index 90af62aaba..73c9de4573 100755 --- a/en/research.md +++ b/en/research.md @@ -1,7 +1,7 @@ --- title: Project Research layout: blank -redirect_from: /research +redirect_from: /research/ --- # Project Scholarship @@ -10,17 +10,17 @@ The project team and members of the wider community are involved in a number of ## Original Programming Historian -* William J. Turkel and Alan MacEachern, [_The Programming Historian_](http://niche-canada.org/wp-content/uploads/2013/09/programming-historian-1.pdf) 1st edition (Network in Canadian History & Environment: 2007-2008). +* William J. Turkel and Alan MacEachern, [_The Programming Historian_](https://niche-canada.org/wp-content/uploads/2013/09/programming-historian-1.pdf) 1st edition (Network in Canadian History & Environment: 2007-2008). 
* Japanese translation of William J. Turkel and Alan MacEachern, [_The Programming Historian_](https://www.dh.ku-orcas.kansai-u.ac.jp/?cat=2), 1st edition (Network in Canadian History & Environment: 2007-2008). ## Reviews -* Björn Ekström, Elisa Tattersall Wallin and Hana Marčetić, '[_Programming Historian_: Novice-friendly tutorials on digital methods](http://www.diva-portal.org/smash/record.jsf?pid=diva2%3A1508542&dswid=7551)', _Tidskrift för ABM_, Vol. 5, no 1 (2020), pp. 71-75. +* Björn Ekström, Elisa Tattersall Wallin and Hana Marčetić, '[_Programming Historian_: Novice-friendly tutorials on digital methods](https://www.diva-portal.org/smash/record.jsf?pid=diva2%3A1508542&dswid=7551)', _Tidskrift för ABM_, Vol. 5, no 1 (2020), pp. 71-75. * Dries Daems, '[A Review and Roadmap of Online Learning Platforms and Tutorials in Digital Archaeology](https://doi.org/10.1017/aap.2019.47)', _Advances in Archaeological Practice_, vol. 8, issue 1 (2020), pp. 87-92. * Martin Dröge, '[Review of: The Programming Historian](https://www.hsozkult.de/webreview/id/rezwww-184)', _H-Soz-Kult_ (2019). * Priscila Pilatowsky Goñi, '[Reseña a The programming historian](https://revistas.uned.es/index.php/RHD/article/view/22420)', _Revista de Humanidades Digitales_, vol. 2 (2018). * Lincoln Mullen, '[Review of the Programming Historian](https://academic.oup.com/jah/article-abstract/103/1/299/1751315)', _The Journal of American History_, vol. 103, no. 1 (2016), pp. 299-301. -* Cameron Blevins, '[Review of the Programming Historian](http://jitp.commons.gc.cuny.edu/review-of-the-programming-historian/)', _The Journal of Interactive Technology & Pedagogy_, vol. 8 (2015). +* Cameron Blevins, '[Review of the Programming Historian](https://jitp.commons.gc.cuny.edu/review-of-the-programming-historian/)', _The Journal of Interactive Technology & Pedagogy_, vol. 8 (2015). 
## Published Research @@ -31,22 +31,22 @@ The project team and members of the wider community are involved in a number of * Jennifer Isasi, Riva Quiroga, Nabeel Siddiqui, Joana Vieira Paulino, Alex Wermer-Colan, [“A Model for Multilingual and Multicultural Digital Scholarship Methods Publishing"](https://www.taylorfrancis.com/chapters/edit/10.4324/9781003393696-3/model-multilingual-multicultural-digital-scholarship-methods-publishing-jennifer-isasi-riva-quiroga-nabeel-siddiqui-joana-vieira-paulino-alex-wermer-colan), in _Multilingual Digital Humanities_, edited by Viola, L., & Spence, P., Routledge, 2023. * Adam Crymble & Charlotte M. H. Im, ['Measuring digital humanities learning requirements in Spanish & English-speaking practitioner communities'](https://doi.org/10.1007/s42803-023-00066-x), International Journal of Digital Humanities, (2023). * Eric Brasil, '[_pyHDB - Ferramenta Heurística para a Hemeroteca Digital Brasileira: utilizando técnicas de web scraping para a pesquisa em História_'](https://doi.org/10.15848/hh.v15i40.1904), _História Da Historiografia: International Journal of Theory and History of Historiography_, 15(40) (2022), 186–217. -* Matthew Lincoln, Sarah Melton, Jennifer Isasi, François Dominic Laramée, '[Relocating Complexity: The Programming Historian and Multilingual Static Site Generation](http://www.digitalhumanities.org/dhq/vol/16/2/000585/000585.html)', _Digital Humanities Quarterly_ 16, 2 (2022). +* Matthew Lincoln, Sarah Melton, Jennifer Isasi, François Dominic Laramée, '[Relocating Complexity: The Programming Historian and Multilingual Static Site Generation](https://www.digitalhumanities.org/dhq/vol/16/2/000585/000585.html)', _Digital Humanities Quarterly_ 16, 2 (2022). * Jennifer Isasi and Antonio Rojas Castro, ‘[¿Sin equivalencia? Una reflexión sobre la traducción al español de recursos educativos abiertos](https://muse.jhu.edu/article/842253)’, _Hispania_, 104, no. 4 (2021), 613-624.
* Adam Crymble and Maria José Afanador Llach, ‘The Globally Unequal Promise of Digital Tools for History: UK and Colombia Case Study’ in _Teaching History for the Contemporary World_, edited by Adele Nye, 85-98, Springer, 2021. * Daniel Alves, '[Ensinar Humanidades Digitais sem as Humanidades Digitais: um olhar a partir das licenciaturas em História](https://novaresearch.unl.pt/files/32228034/Ensinar_Humanidades_Digitais.pdf)', _Revista EducaOnline_, v. 15, n. 2 (2021). * Adam Crymble, [_Technology & the Historian: Transformations in the Digital Age_](https://www.press.uillinois.edu/books/catalog/57hxp7wr9780252043710.html), (University of Illinois Press, 2021). * Anna-Maria Sichani, James Baker, Maria José Afanador Llach, and Brandon Walsh, [‘Diversity and Inclusion in Digital Scholarship and Pedagogy: The Case of The Programming Historian’](https://doi.org/10.1629/uksg.465), _Insights_, (2019). -* Katrina Navickas and Adam Crymble, ['From Chartist Newspaper to Digital Map of Grass-roots Meetings, 1841-44: Documenting Workflows'](http://www.tandfonline.com/doi/full/10.1080/13555502.2017.1301179), _Journal of Victorian Culture_, (2017). +* Katrina Navickas and Adam Crymble, ['From Chartist Newspaper to Digital Map of Grass-roots Meetings, 1841-44: Documenting Workflows'](https://www.tandfonline.com/doi/full/10.1080/13555502.2017.1301179), _Journal of Victorian Culture_, (2017). * Adam Crymble, ['Identifying and Removing Gender Barriers in Open Learning Communities: The Programming Historian'](https://www.herts.ac.uk/__data/assets/pdf_file/0016/138013/Blip-2016-Autumn-2016-Final-Autumn-2016.pdf), _Blended Learning in Practice_, (2016), 49-60. [[pre-print pdf](/researchpapers/openLearningCommunities2016.pdf)] -* Fred Gibbs, ['Editorial Sustainability and Open Peer Review at Programming Historian',](http://web.archive.org/web/20180713014622/http://dhcommons.org/journal/issue-1/editorial-sustainability-and-open-peer-review-programming-historian) _DH Commons_, Vol. 
1 (2015). -* Shawn Graham, Ian Milligan, and Scott Weingart, [_Exploring Big Historical Data: The Historian's Macroscope_](http://www.themacroscope.org/2.0/), (Imperial College Press, 2015). +* Fred Gibbs, ['Editorial Sustainability and Open Peer Review at Programming Historian',](https://web.archive.org/web/20180713014622/https://dhcommons.org/journal/issue-1/editorial-sustainability-and-open-peer-review-programming-historian) _DH Commons_, Vol. 1 (2015). +* Shawn Graham, Ian Milligan, and Scott Weingart, [_Exploring Big Historical Data: The Historian's Macroscope_](https://www.themacroscope.org/2.0/), (Imperial College Press, 2015). ## Reports * Maria José Afanador-Llach & Andrés Rivera, '[Segundo ciclo de talleres: Herramientas y procesos digitales para la investigación y creación en artes y humanidades](/researchpapers/Informe_final_Talleres%20EHCN_2023-ENG_PH.pdf)', (2023). * Incllewsion and the Programming Historian, 'Initial Accessibility Testing: Summary of Findings', (2021). -* Penny Andrews and the Programming Historian, ['The Programming Historian: developing and sustaining impact in the Global South'](http://doi.org/10.5281/zenodo.3813763) (2020). +* Penny Andrews and the Programming Historian, ['The Programming Historian: developing and sustaining impact in the Global South'](https://doi.org/10.5281/zenodo.3813763) (2020). * Amy Kavanagh and the Programming Historian, 'Programming Historian – Access for visually impaired researchers', (n.d.). ## Workshops & Events @@ -65,7 +65,7 @@ The project team and members of the wider community are involved in a number of * Alex Wermer-Colan, ['Learning Digital Methods with the _Programming Historian_'](https://charlesstudy.temple.edu/event/11953011), Temple University [Online], (22 February 2024). 
* Carlo Blum, Adam Crymble, Vicky Garnett, Timothée Giraud, Alíz Horváth, Stefan Krebs, Ralph Marschall, Sofia Papastamkou, & Lorella Viola, 'Invisible College of Digital History: Workshop on Multilingual Educational Resources', C²DH [Online], (8 November 2023). * Nabeel Siddiqui, 'Convolutional Neural Networks for Image Classification', University of Edinburgh [Online], (7 November 2023). -* Eric Brasil, '[História Digital e História Digital da Educação: Caminhos Cruzados](http://www.iea.usp.br/eventos/historia-digital-educacao-caminhos-cruzados)', Instituto de Estudos Avançados, USP, São Paulo, Brazil, (17 October 2023). +* Eric Brasil, '[História Digital e História Digital da Educação: Caminhos Cruzados](https://www.iea.usp.br/eventos/historia-digital-educacao-caminhos-cruzados)', Instituto de Estudos Avançados, USP, São Paulo, Brazil, (17 October 2023). * Scott Kleinman, Alex Wermer-Colan, Joana Vieira Paulino, Nabeel Siddiqui, Zoe LeBlanc, 'Developing a Digital Humanities Tutorial', [DH 2023](https://dh2023.adho.org/), Graz, Austria (10 July 2023). * Daphné Mathelier, 'Atelier Markdown', [11e journées du réseau Medici](https://web.archive.org/web/20230629084307/https://medici2023.sciencesconf.org/resource/page/id/2), Université de Liège, Belgium, (29 June 2023). * María José Afanador Llach, Jennifer Isasi, Riva Quiroga, 'Sobre _Programming Historian en español_ y cómo contribuir a la publicación', Semana de Humanidades Digitales 2023 [Online], (10 May 2023). @@ -153,10 +153,10 @@ The project team and members of the wider community are involved in a number of * Adam Crymble, 'Facilitating Making in Digital Humanities', The Archaeology of Making, University of London, 5 May 2021. 
* Daniel Alves, Jennifer Isasi, Sarah Melton, Sofia Papastamkou, Jessica Parr, Riva Quiroga, Nabeel Siddiqui, Brandon Walsh, '[The Programming Historian: A Global Case Study in Multilingual Open Access and DH Tutelage/Instruction](https://msuglobaldh.org/abstracts/#programming-historian)' (panel), _Global Digital Humanities Symposium_, Michigan State University, East Lansing, USA, 12 April, 2021. * Jessica Parr, '[Cambridge Cultural Heritage Data School: Final plenary](https://www.cdh.cam.ac.uk/events/cambridge-cultural-heritage-data-school-final-plenary)', University of Cambridge, United Kingdom, 30 March 2021. -* Jennifer Isasi & Riva Quiroga, ['_Programming Historian_: Un proyecto colaborativo para poner la programación al alcance de los humanistas'](http://ixa2.si.ehu.eus/intele/?q=webinars), _INTELE : INfraestructura de TEcnologías del LEnguaje_, Spain, 25 March, 2021. +* Jennifer Isasi & Riva Quiroga, ['_Programming Historian_: Un proyecto colaborativo para poner la programación al alcance de los humanistas'](https://ixa2.si.ehu.eus/intele/?q=webinars), _INTELE : INfraestructura de TEcnologías del LEnguaje_, Spain, 25 March, 2021. * Sofia Papastamkou, Jessica Parr & Riva Quiroga, 'Challenges for Digital Literacy in the Humanities: The Open, Community-Based and Multilinguistic Approach of _The Programming Historian_', NewsEye’s International Conference, Europe, 17 March, 2021. * Riva Quiroga, ['Multilingual Digital Humanites'](https://mediacentral.ucl.ac.uk/Play/59506), Digital Humanities Long View Seminar, UCLDH, UK & CESTA, USA, 10 March, 2021. -* Brandon Walsh, '[The Programming Historian and Editorial Process in Digital Publishing](http://walshbr.com/blog/the-programming-historian-and-editorial-process-in-digital-publishing/)', Modern Languages Association Conference 2021, 7-10 January, 2021. 
+* Brandon Walsh, '[The Programming Historian and Editorial Process in Digital Publishing](https://walshbr.com/blog/the-programming-historian-and-editorial-process-in-digital-publishing/)', Modern Languages Association Conference 2021, 7-10 January, 2021. * Sofia Papastamkou, François Dominic Laramée, Martin Grandjean, '[Le Programming Historian en français: quelles ressources éducatives libres pour les méthodes numériques ?](https://zenodo.org/record/3819954)', *Humanistica 2020 Conference*, Bordeaux, France, 12-14 May 2020. * Sofia Papastamkou, 'A Beating Heart of Digital History: The Programming Historian', [Teaching Digital History Workshop](https://cas.au.dk/en/cedhar/events/show/artikel/teaching-digital-history-workshop), Center for Digital History Aarhus, University of Aarhus, Denmark, 23 October 2019. * Jennifer Isasi, Maria José Afanador y Antonio Rojas Castro, 'Retos en la producción de tutoriales de HD en contexto hispanohablantes', Conferencia ACH 2019, The Association for Computers and the Humanities, Pittsburgh, USA, 23-26 July, 2019. @@ -169,7 +169,7 @@ The project team and members of the wider community are involved in a number of * Victor Gayol, 'La investigación del pasado y la historia digital: análisis de datos y cómo aprender (The Programming Historian en español)', _Humanidades Digitales_, IV Feria Internacional de Ciencias Sociales y Humanidades, Centro Universitario de Los Lagos - Universidad de Guadalajara, Lagos de Moreno, Jalisco (9 March, 2017). * Victor Gayol, 'The Programming Historian: un modelo colaborativo para la investigación y la enseñanza en ciencias sociales y humanidades digitales', _Mesa de Trabajo sobre Ciencias Sociales y Humanidades Digitales_, El Colegio De Michoacán, Mexico (21 February 2017). * Adam Crymble, 'Bringing Digital Humanities into the University for Free', University of Cape Town, South Africa (27-28 June 2016).
-* Fred Gibbs, 'The Programming Historian' (Poster), _American Historical Association_, New York (January 2015). +* Fred Gibbs, 'The Programming Historian' (Poster), _American Historical Association_, New York (January 2015). * Adam Crymble, 'The Programming Historian 2', _Digital History Seminar_, Institute of Historical Research, London (13 October 2013). * Adam Crymble, 'The Programming Historian 2', _Digital Humanities 2012_, Hamburg (July 2012). @@ -181,11 +181,11 @@ The project team and members of the wider community are involved in a number of * Martin Dröge, 'Rezension zu The Programming Historian', _H-Soz-Kult_, 31.08.2019, . * Sue Levine, 'The Early-Stage Ph.D.'s Guide to Summer', _Inside Higher Education_, 10 June 2019, . * 'Championing open access with online digital history journal', _University of Sussex Press Office_, 9 October, 2018, . -* Adam Crymble, 'A Decade of Programming Historians', _Network in Canadian History & Environment_, 23 March, 2018, . +* Adam Crymble, 'A Decade of Programming Historians', _Network in Canadian History & Environment_, 23 March, 2018, . * Fred Gibbs, "Sustainable Publishing: Reflections of a Former Programming Historian Editor", FredGibbs.net, 2017, . -* Anaclet Pons, "The Programming Historian en español", _Clionauta: Blog de historia_, June 14, 2017, . +* Anaclet Pons, "The Programming Historian en español", _Clionauta: Blog de historia_, June 14, 2017, . * Seth Denbo, “Historian, Program! Self-Help for Digital Neophytes,” _Perspectives on History: The Newsmagazine of the American Historical Association_, May 2017, . -* Víctor Gayol, '*The Programming Historian* en español', *Blog de Humanidades Digitales*, March 17, 2017, . +* Víctor Gayol, '*The Programming Historian* en español', *Blog de Humanidades Digitales*, March 17, 2017, . 
## Projects Using the Programming Historian diff --git a/en/reviewer-guidelines.md b/en/reviewer-guidelines.md index c127a48f0d..9fcd60eb60 100755 --- a/en/reviewer-guidelines.md +++ b/en/reviewer-guidelines.md @@ -2,7 +2,7 @@ title: Reviewer Guidelines date: 03-14-2015 layout: blank -redirect_from: /reviewer-guidelines +redirect_from: /reviewer-guidelines/ --- # Reviewer Guidelines @@ -33,7 +33,7 @@ We highly value transparency in our lesson production and review process. Our re Therefore, your work as a reviewer--and your identity--will be fully visible to the author. Comments should engage with the author and lesson directly, rather than the review editor. If at any point you are unsure of your role or what to do next, feel free to post a question to clarify and an editor will respond as soon as they can. -In keeping with the ideas of public scholarship and open peer review, we generally encourage discussions to stay on GitHub as outlined in our editorial workflow. However, we also want everyone to feel comfortable. In some cases a private word may be more appropriate. If you feel the need to discuss a matter related to a tutorial or a matter related to the review, please feel free to [email the assigned editor directly](/project-team), or to contact one of our dedicated ombudsperson (Dr Ian Milligan - i2milligan@uwaterloo.ca). +In keeping with the ideas of public scholarship and open peer review, we generally encourage discussions to stay on GitHub as outlined in our editorial workflow. However, we also want everyone to feel comfortable. In some cases a private word may be more appropriate. If you feel the need to discuss a matter related to a tutorial or a matter related to the review, please feel free to [email the assigned editor directly](/en/project-team), or to contact our dedicated ombudsperson (Dr Ian Milligan - i2milligan@uwaterloo.ca).
Unless you instruct us otherwise, your name will be indicated as a reviewer on the lesson's page when it is officially published; you'll also be listed on our contributors page. diff --git a/en/supporters.md b/en/supporters.md index 2bb36710e7..d2f86fa8f4 100644 --- a/en/supporters.md +++ b/en/supporters.md @@ -1,7 +1,7 @@ --- layout: blank title: Our Supporters -redirect_from: /supporters +redirect_from: /supporters/ --- # Our Supporters @@ -17,7 +17,7 @@ _Programming Historian_ is grateful to our past and current supporters for enabl - [Jisc](https://www.jisc.ac.uk/), United Kingdom ## Institutional Partners -Contributors to our [Institutional Partner Programme](support-us#institutional-partner-programme): +Contributors to our [Institutional Partner Programme](/en/ipp): - [KU Leuven Bibliotheken](https://bib.kuleuven.be/), Belgium - [Western University Library](https://www.lib.uwo.ca/), Canada @@ -28,7 +28,7 @@ Contributors to our [Institutional Partner Programme](support-us#institutional-p - [Cambridge Digital Humanities](https://www.cdh.cam.ac.uk/), United Kingdom - [Georg-August-Universität Göttingen](https://www.uni-goettingen.de/), Germany - [MIT Libraries](https://libraries.mit.edu/), United States -- [Center for Digital Research in the Humanities, University of Nebraska-Lincoln](http://cdrh.unl.edu/), United States +- [Center for Digital Research in the Humanities, University of Nebraska-Lincoln](https://cdrh.unl.edu/), United States - [The National Archives](https://www.nationalarchives.gov.uk/), United Kingdom - [College of the Liberal Arts, Penn State University](https://la.psu.edu/), United States - [University of Bristol Library](https://www.bristol.ac.uk/library/), United Kingdom @@ -47,10 +47,10 @@ Contributors to our [Institutional Partner Programme](support-us#institutional-p - [University of Edinburgh Library](https://library.ed.ac.uk/), United Kingdom - [University of Cambridge](https://www.cam.ac.uk/), United Kingdom -We welcome enquiries from 
prospective [Institutional Partner Programme](ipp) contributors. +We welcome enquiries from prospective [Institutional Partner Programme](/en/ipp) contributors. ### Alumni Partners -Former contributors to our [Institutional Partner Programme](support-us#institutional-partner-programme): +Former contributors to our [Institutional Partner Programme](/en/ipp): - [Roy Rosenzweig Center for History and New Media, George Mason University](https://rrchnm.org/), United States [2021-2022] - [Centre for Digital Humanities, University College London](https://www.ucl.ac.uk/digital-humanities/), United Kingdom [2021-2022] diff --git a/en/translator-guidelines.md b/en/translator-guidelines.md index b57f07e75e..676b06613b 100644 --- a/en/translator-guidelines.md +++ b/en/translator-guidelines.md @@ -9,9 +9,9 @@ skip_validation: true # Translator Guidelines {{ site.data.snippets.write-a-lesson-image-alt[page.lang] }} -

    Step 1: Proposing the Translation of a Lesson

    -

    Step 2: Writing and Formatting a Translation

    -

    Step 3: Submitting a Translated Lesson

    +

    Step 1: Proposing the Translation of a Lesson

    +

    Step 2: Writing and Formatting a Translation

    +

    Step 3: Submitting a Translated Lesson

    ## Proposing the Translation of a Lesson If you want to translate a lesson published in *Programming Historian*, please see the list of [pending translations](https://github.com/orgs/programminghistorian/projects/5) and contact {% include managing-editor.html lang=page.lang %} to discuss your language skills and translation experience. We look for translations that are rigorous, readable, and consider the needs of an English-reading audience. @@ -34,7 +34,7 @@ All of our lessons must also be written in Markdown and follow our technical for ## Submitting a Translated Lesson Once your translation file has been prepared to the above specifications, you are ready to submit it for peer review. -We have a [Programming Historian project page at GitHub](https://github.com/programminghistorian), where we maintain two repositories (a repository is a place to store related files and folders–you can think of it as a kind of folder). One of these, called [jekyll](https://github.com/programminghistorian/jekyll), hosts the code for the live version of the site you see at http://programminghistorian.org. The other repository is called [ph-submissions](https://github.com/programminghistorian/ph-submissions). +We have a [Programming Historian project page at GitHub](https://github.com/programminghistorian), where we maintain two repositories (a repository is a place to store related files and folders–you can think of it as a kind of folder). One of these, called [jekyll](https://github.com/programminghistorian/jekyll), hosts the code for the live version of the site you see at https://programminghistorian.org. The other repository is called [ph-submissions](https://github.com/programminghistorian/ph-submissions). Our preferred way for translators to submit a lesson is to add them directly to the [ph-submissions](https://github.com/programminghistorian/ph-submissions) repository (or repo, for short). 
Thanks to GitHub's features, you can do this using drag-and-drop uploading actions with which you are probably already familiar. As a new translator, here are the steps: @@ -50,14 +50,14 @@ Our preferred way for translators to submit a lesson is to add them directly to
    ### Translation Submitted! Now What? -To see what happens after you submit a translation, feel free to browse our [editor guidelines](/editor-guidelines), which detail our editorial process. Highlights are below: +To see what happens after you submit a translation, feel free to browse our [editor guidelines](/en/editor-guidelines), which detail our editorial process. Highlights are below: The most immediately important step is that your editor will create an [issue](https://github.com/programminghistorian/ph-submissions/issues) for the new translation on the [ph-submissions](https://github.com/programminghistorian/ph-submissions) repository, with a link to your lesson (that you previewed in step 5). The editor and at least two reviewers invited by the editor will post their comments to this issue. ### Wait for Reviewer Feedback We aim to complete the review process within four weeks, but sometimes delays occur or people get busy and the process can take longer than we hoped. -In keeping with the ideas of public scholarship and open peer review, we encourage discussions to stay on GitHub. However, we also want everyone to feel comfortable with the process. If you need to discuss something privately, please feel free to [email your editor directly](/project-team), or to contact our dedicated ombudsperson (Dr Ian Milligan - i2milligan@uwaterloo.ca). +In keeping with the ideas of public scholarship and open peer review, we encourage discussions to stay on GitHub. However, we also want everyone to feel comfortable with the process. If you need to discuss something privately, please feel free to [email your editor directly](/en/project-team), or to contact our dedicated ombudsperson (Dr Ian Milligan - i2milligan@uwaterloo.ca). ### Respond to Feedback Your editor and reviewers will most likely make some suggestions for improvements on the "issue" for your translation. 
The editor should clarify which suggestions are essential to address, which are optional, and which can be set aside. diff --git a/en/vacancies.md b/en/vacancies.md index 64e77cb832..6a0c379bac 100644 --- a/en/vacancies.md +++ b/en/vacancies.md @@ -1,7 +1,7 @@ --- title: Vacancies layout: blank -redirect_from: /vacancies +redirect_from: /vacancies/ --- Thank you for your interest in working with _Programming Historian_. There are currently no vacancies. diff --git a/es/acerca-de.md b/es/acerca-de.md index 194b5ebb0b..c1e20ff94f 100644 --- a/es/acerca-de.md +++ b/es/acerca-de.md @@ -14,7 +14,7 @@ Todos los tutoriales publicados en _The Programming Historian en español_ han s Nuestro proceso de revisión es un poco distinto al tradicional: no solicitamos una valoración sobre la calidad del texto, es decir, si merece ser publicado o no, sino que pedimos a nuestros revisores que participen de manera activa para mejorar el tutorial, de tal modo que todas las partes implicadas aprendan. Al hacernos cargo de un tutorial nuevo o bien de una traducción, seguimos un [flujo de trabajo]({{site.baseurl}}/es/guia-para-autores) específico con el objetivo de que la lección se publique en un período de tiempo razonable. Puedes consultar nuestra [guía para revisores]({{site.baseurl}}/es/guia-para-revisores) si deseas más información. ## Código abierto -En _The Programming Historian en español_ estamos comprometidos con el uso de herramientas y lenguajes accesibles para todo el mundo. Todas nuestras lecciones utilizan lenguajes y programas gratuitos. Creemos que cualquier persona debería poder hacer uso de nuestros tutoriales con independencia de la financiación disponible para llevar a cabo su proyecto de investigación. Desde 2016, se ha depositado una versión citable del proyecto _Programming Historian_ en [Zenodo](https://zenodo.org/). El depósito de 2022 está disponible en [doi.org/10.5281/zenodo.7313045](https://doi.org/10.5281/zenodo.7313045). 
Desde 2018, el [Archivo Web del Reino Unido](https://www.webarchive.org.uk/) rastrea e indexa a _The Programming Historian_ de manera regular. Estos datos son archivados y están disponibles de manera púbica [a través de su sitio web](https://www.webarchive.org.uk/wayback/en/archive/*/http://programminghistorian.org/). +En _The Programming Historian en español_ estamos comprometidos con el uso de herramientas y lenguajes accesibles para todo el mundo. Todas nuestras lecciones utilizan lenguajes y programas gratuitos. Creemos que cualquier persona debería poder hacer uso de nuestros tutoriales con independencia de la financiación disponible para llevar a cabo su proyecto de investigación. Desde 2016, se ha depositado una versión citable del proyecto _Programming Historian_ en [Zenodo](https://zenodo.org/). El depósito de 2022 está disponible en [doi.org/10.5281/zenodo.7313045](https://doi.org/10.5281/zenodo.7313045). Desde 2018, el [Archivo Web del Reino Unido](https://www.webarchive.org.uk/) rastrea e indexa a _The Programming Historian_ de manera regular. Estos datos son archivados y están disponibles de manera pública [a través de su sitio web](https://www.webarchive.org.uk/wayback/en/archive/*/https://programminghistorian.org/). ## Acceso abierto *Diamond* @@ -26,7 +26,7 @@ _The Programming Historian en español_ (ISSN {{ site.data.snippets.issn[page.la ## Premios -The _Programming Historian_ ha ganado múltiples premios que reconocen y celebran nuestros logros en las esferas de la publicación en acceso abierto y de las humanidades digitales. En 2016, la revista en inglés fue la ganadora del [Digital Humanities Awards](http://dhawards.org/dhawards2016/results/) en la categoría de Mejor Serie de Posts y, al año siguiente, 2017, _Programming Historian en español_ [recibió el mismo galardón](http://dhawards.org/dhawards2017/results/).
En 2018, la [Asociación de Humanidades Digitales Hispánicas](http://humanidadesdigitaleshispanicas.es/) otorgó el premio de 'Mejor iniciativa formativa desarrollada durante el año 2018' a _Programming Historian en español_. Recibimos el [Canadian Social Knowledge Institute's Open Scholarship Award](https://etcl.uvic.ca/events-activities/open-scholarship-awards/) en 2020 y en 2021 nuestro trabajo fue reconocido con un [Coko Foundation's Open Publishing Award](https://web.archive.org/web/20220408041024/https://openpublishingawards.org/results/2021/index.html) en la categoría de Contenido Abierto. En 2022, ganamos la categoría de Mejor material de formación en DH de los [Digital Humanities Awards](http://dhawards.org/dhawards2022/results/). +The _Programming Historian_ ha ganado múltiples premios que reconocen y celebran nuestros logros en las esferas de la publicación en acceso abierto y de las humanidades digitales. En 2016, la revista en inglés fue la ganadora del [Digital Humanities Awards](https://dhawards.org/dhawards2016/results/) en la categoría de Mejor Serie de Posts y, al año siguiente, 2017, _Programming Historian en español_ [recibió el mismo galardón](https://dhawards.org/dhawards2017/results/). En 2018, la [Asociación de Humanidades Digitales Hispánicas](https://humanidadesdigitaleshispanicas.es/) otorgó el premio de 'Mejor iniciativa formativa desarrollada durante el año 2018' a _Programming Historian en español_. Recibimos el [Canadian Social Knowledge Institute's Open Scholarship Award](https://etcl.uvic.ca/events-activities/open-scholarship-awards/) en 2020 y en 2021 nuestro trabajo fue reconocido con un [Coko Foundation's Open Publishing Award](https://web.archive.org/web/20220408041024/https://openpublishingawards.org/results/2021/index.html) en la categoría de Contenido Abierto. En 2022, ganamos la categoría de Mejor material de formación en DH de los [Digital Humanities Awards](https://dhawards.org/dhawards2022/results/). 
## Política de diversidad @@ -36,8 +36,8 @@ En The _Programming Historian_ estamos comprometidos con la diversidad y defende ## Financiamiento y propiedad _Programming Historian_ es un proyecto internacional impulsado por un equipo voluntario. Sus actividades financieras son administradas por ProgHist Ltd, una organización benéfica registrada en Inglaterra y Gales ([1195875](https://register-of-charities.charitycommission.gov.uk/charity-search/-/charity-details/5181272/charity-overview)) y constituida en esos mismos países como una compañía limitada por garantía ([12192946](https://beta.companieshouse.gov.uk/company/12192946)). El proyecto es publicado por el consejo editorial de *Programming Historian*. -Para ver un listado de patrocinadores y ayudas, visita nuestra página '[Apóyanos](/es/apoyanos)'. +Para ver un listado de patrocinadores y ayudas, visita nuestra página '[Apóyanos](/es/donaciones/)'. ## Historia del proyecto -*The Programming Historian* fue fundado en 2008 por William J. Turkel y Alan MacEachern. En aquel entonces, Turkel publicó [un blog post](http://digitalhistoryhacks.blogspot.com/2008/01/programming-historian.html) en el que explicó sus ideas para el proyecto. Se centró principalmente en Python y se publicó en acceso abierto como un proyecto de "Infraestructura digital" en la Red de *Historia y Medio Ambiente de Canada* (Network in Canadian History & Environment (NiCHE)). En 2012, *The Programming Historian* expandió su equipo editorial y se presentó como una revista académica de metodología para historiadores digitales, de revisión por pares y de acceso abierto. En 2016 añadimos una publicación en español a la publicación inicial en inglés y en 2017 empezamos a publicar lecciones traducidas bajo el título *The Programming Historian en español*. En 2018 [organizamos nuestro primer taller de escritura en español](/posts/bogota-workshop-report) y [abrimos una convocatoria para lecciones en español](/posts/convocatoria-de-tutoriales). 
En ese mismo año añadimos una publicación en francés que lanzó *Programming Historian en français* en 2019. Un año después, un equipo de habla portuguesa se nos unió e inauguramos *[Programming Historian em português]({{site.baseurl}}/pt)* a principios de 2021. +*The Programming Historian* fue fundado en 2008 por William J. Turkel y Alan MacEachern. En aquel entonces, Turkel publicó [un blog post](https://digitalhistoryhacks.blogspot.com/2008/01/programming-historian.html) en el que explicó sus ideas para el proyecto. Se centró principalmente en Python y se publicó en acceso abierto como un proyecto de "Infraestructura digital" en la Red de *Historia y Medio Ambiente de Canada* (Network in Canadian History & Environment (NiCHE)). En 2012, *The Programming Historian* expandió su equipo editorial y se presentó como una revista académica de metodología para historiadores digitales, de revisión por pares y de acceso abierto. En 2016 añadimos una publicación en español a la publicación inicial en inglés y en 2017 empezamos a publicar lecciones traducidas bajo el título *The Programming Historian en español*. En 2018 [organizamos nuestro primer taller de escritura en español](/posts/bogota-workshop-report) y [abrimos una convocatoria para lecciones en español](/posts/convocatoria-de-tutoriales). En ese mismo año añadimos una publicación en francés que lanzó *Programming Historian en français* en 2019. Un año después, un equipo de habla portuguesa se nos unió e inauguramos *[Programming Historian em português]({{site.baseurl}}/pt)* a principios de 2021. 
diff --git a/es/colaboradores.md b/es/colaboradores.md index d65dbd3b78..4891626f5e 100644 --- a/es/colaboradores.md +++ b/es/colaboradores.md @@ -18,7 +18,7 @@ original: supporters - [Jisc](https://www.jisc.ac.uk/), Reino Unido ## Instituciones asociadas -Contribuidores de nuestro [Programa de Instituciones Asociadas](pia): +Contribuidores de nuestro [Programa de Instituciones Asociadas](/es/pia): - [KU Leuven Bibliotheken](https://bib.kuleuven.be/), Bélgica - [Western University Library](https://www.lib.uwo.ca/), Canadá @@ -29,7 +29,7 @@ Contribuidores de nuestro [Programa de Instituciones Asociadas](pia): - [Cambridge Digital Humanities](https://www.cdh.cam.ac.uk/), Reino Unido - [Georg-August-Universität Göttingen](https://www.uni-goettingen.de/), Alemania - [MIT Libraries](https://libraries.mit.edu/), Estados Unidos -- [Center for Digital Research in the Humanities, University of Nebraska-Lincoln](http://cdrh.unl.edu/), Estados Unidos +- [Center for Digital Research in the Humanities, University of Nebraska-Lincoln](https://cdrh.unl.edu/), Estados Unidos - [The National Archives](https://www.nationalarchives.gov.uk/), Reino Unido - [College of the Liberal Arts, Penn State University](https://la.psu.edu/), Estados Unidos - [University of Bristol Library](https://www.bristol.ac.uk/library/), Reino Unido @@ -48,10 +48,10 @@ Contribuidores de nuestro [Programa de Instituciones Asociadas](pia): - [University of Edinburgh Library](https://library.ed.ac.uk/), Reino Unido - [University of Cambridge](https://www.cam.ac.uk/), Reino Unido -Son bienvenidas las consultas de potenciales contribuidores del [Programa de Instituciones Asociadas](pia). +Son bienvenidas las consultas de potenciales contribuidores del [Programa de Instituciones Asociadas](/es/pia). 
## Alumni -Antiguos contribuidores de nuestro [Programa de Instituciones Asociadas](pia): +Antiguos contribuidores de nuestro [Programa de Instituciones Asociadas](/es/pia): - [Roy Rosenzweig Center for History and New Media, George Mason University](https://rrchnm.org/), Estados Unidos [2021-2022] - [UCL Centre for Digital Humanities](https://www.ucl.ac.uk/digital-humanities/), Reino Unido [2021-2022] diff --git a/es/contribuciones.md b/es/contribuciones.md index e35c9e08bf..1349b8324b 100644 --- a/es/contribuciones.md +++ b/es/contribuciones.md @@ -14,8 +14,8 @@ _Programming Historian en español_ es posible gracias al esfuerzo de voluntario Si dominas de más de uno de nuestros idiomas (francés, español, inglés, portugués), puedes ponerte en contacto con nosotros para traducir una lección ya publicada en _Programming Historian_ de un idioma a otro. De esta manera nos ayudarás en nuestra contribucion en las comunidades de humanidades digitales en español y francés, y profundizarás en un lenguaje, método o tecnología. -Buscamos traducciones rigurosas y de lectura amena que tengan en cuenta los contextos de investigación hispánico, lusófono y francés así como los recursos disponibles en nuestras respectivas comunidades. Puesto que muchos de lo tecnicismos son nuevos y/o todavía no están recogidos en los diccionarios, también recomendamos el uso de la [Taxonomía sobre Actividades de investigación digital en humanidades](http://vocabularios.caicyt.gov.ar/portalthes/index.php?v=42) de TaDiRAH y el [Glosario de Preservación Archivística Digital (Versión 4.0)](http://www.mecd.gob.es/planes-nacionales/dam/jcr:f20a4ba1-0ed2-445d-9be9-b8b0382562ea/mex-glosario-interpares-total0112.pdf) de Voutssas-M y Barnard Amozorrutia (UNAM). -Si te interesa colaborar, consulta nuestras [instrucciones para autores y traductores](/es/guia-para-autores.html). 
+Buscamos traducciones rigurosas y de lectura amena que tengan en cuenta los contextos de investigación hispánico, lusófono y francés así como los recursos disponibles en nuestras respectivas comunidades. Puesto que muchos de los tecnicismos son nuevos y/o todavía no están recogidos en los diccionarios, también recomendamos el uso de la [Taxonomía sobre Actividades de investigación digital en humanidades](https://vocabularios.caicyt.gov.ar/portalthes/index.php?v=42) de TaDiRAH y el [Glosario de Preservación Archivística Digital (Versión 4.0)](https://www.mecd.gob.es/planes-nacionales/dam/jcr:f20a4ba1-0ed2-445d-9be9-b8b0382562ea/mex-glosario-interpares-total0112.pdf) de Voutssas-M y Barnard Amozorrutia (UNAM). +Si te interesa colaborar, consulta nuestras [instrucciones para autores y traductores](/es/guia-para-autores). ## Revisa una lección @@ -63,7 +63,7 @@ Agradecemos de manera especial alertas sobre lecciones que no funcionan. A medid {{ site.data.snippets.library-catalogue-image-alt[page.lang] }} -_Programming Historian_ está registrado en WorldCat en [español](https://www.worldcat.org/title/programming-historian-en-espanol/oclc/1061292935&referer=brief_results), en [inglés](http://www.worldcat.org/title/programming-historian/oclc/951537099), en [francés](https://uva.worldcat.org/title/programming-historian-en-franais/oclc/1104391842), y en [portugués](https://search.worldcat.org/title/1332987197). +_Programming Historian_ está registrado en WorldCat en [español](https://www.worldcat.org/title/programming-historian-en-espanol/oclc/1061292935&referer=brief_results), en [inglés](https://www.worldcat.org/title/programming-historian/oclc/951537099), en [francés](https://uva.worldcat.org/title/programming-historian-en-franais/oclc/1104391842), y en [portugués](https://search.worldcat.org/title/1332987197). Gracias a [University of Purdue library] y a Amanda Visconti, y University of Virginia library. Y ha sido indexado por el [Directory of Open Access Journals].
Este proyecto se propone demostrar cómo deben ser las publicaciones académicas en abierto. Por favor, ayúdanos a difundir nuestro mensaje pidiendo a tu bibliotecario o bibliotecaria que añade este recurso al catálogo de tu biblioteca. @@ -83,6 +83,6 @@ Si se te ocurren más formas de participación, siempre puedes [escribirnos un e [revisores]: /es/guia-para-revisores [Guía para editores]: /es/guia-para-revisores [comentarios]: /es/retroalimentacion -[WorldCat]: http://www.worldcat.org/title/programming-historian/oclc/951537099 -[University of Purdue library]: http://purdue-primo-prod.hosted.exlibrisgroup.com/primo_library/libweb/action/dlDisplay.do?vid=PURDUE&search_scope=everything&docId=PURDUE_ALMA51671812890001081&fn=permalink +[WorldCat]: https://www.worldcat.org/title/programming-historian/oclc/951537099 +[University of Purdue library]: https://purdue-primo-prod.hosted.exlibrisgroup.com/primo_library/libweb/action/dlDisplay.do?vid=PURDUE&search_scope=everything&docId=PURDUE_ALMA51671812890001081&fn=permalink [Directory of Open Access Journals]: https://doaj.org/toc/2397-2068 diff --git a/es/donaciones.md b/es/donaciones.md index a82e6816d5..b07a88f378 100644 --- a/es/donaciones.md +++ b/es/donaciones.md @@ -19,7 +19,7 @@ Tu colaboración apoya directamente la infrastructura que de nuestras publicacio
    - + @@ -36,4 +36,4 @@ Puedes hacer donaciones puntuales a *Programming Historian* por [Paypal](https:/ # Apoyo institucional -Si trabajas para una organización interesada en contribuir al éxito de *Programming Historian*, visita nuestra página del [Programa de Instituciones Asociadas](pia), que provee un apoyo crucial a nuestro trabajo. +Si trabajas para una organización interesada en contribuir al éxito de *Programming Historian*, visita nuestra página del [Programa de Instituciones Asociadas](/es/pia), que provee un apoyo crucial a nuestro trabajo. diff --git a/es/guia-editor.md b/es/guia-editor.md index 9a03470bb9..8d8621269a 100644 --- a/es/guia-editor.md +++ b/es/guia-editor.md @@ -45,7 +45,7 @@ A continuación, el editor creará un *issue* en el [repositorio de GitHub](http Si la lección no es entregada en la [fecha acordada], el editor intentará contactar con el autor o autores de la lección. Si no recibe noticias, el ticket se cerrará. Éste podrá abrirse en el futuro a petición del autor o autores. - El principal contacto para esta lección es [nombre del editor]. Si se produce algún problema, el autor puede contactar con nuestros ’ombudsperson' (Silvia Gutiérrez De la Torre - http://programminghistorian.org/es/equipo-de-proyecto). + El principal contacto para esta lección es [nombre del editor]. Si se produce algún problema, el autor puede contactar con nuestros ’ombudsperson' (Silvia Gutiérrez De la Torre - https://programminghistorian.org/es/equipo-de-proyecto). Este texto, sin embargo, puede editarse y adaptarse a las necesidades para reflejar más objetivos o lo que se ha negociado entre el editor y el autor. @@ -61,7 +61,7 @@ Después de subir todo, el/la editor/a debe comprobar que la carga de archivos r *The Programming Historian en español* se sirve de un modelo de revisión por pares en abierto; creemos que esto incentiva el respeto y la generación de ideas. 
Sin embargo, los autores y traductores también tienen derecho a tener un proceso de revisión tradicional, es decir, mediante mensajes privados. Existen muchas razones por las que alguien podría dudar a la hora de iniciar un proceso de revisión por pares en abierto; por eso, animamos a los autores y traductores a que elijan la opción con la que se sientan más cómodos. -Antes de solicitar revisiones externas, el editor debe leer y probar el tutorial y utilizar su experiencia editorial previa para ayudar al autor a realizar algunas mejoras iniciales (si es necesario). El editor debe escribir un resumen sobre la sostenibilidad de la propuesta para asegurarse de que la versión y las especificaciones del programa son claras, que las capturas de pantalla son realmente necesarias para completar la lección y que la lección hace uso de la documentación existente siempre que esté disponible y sea apropiado. Los editores también deben asegurarse de que las lecciones intentan, en la medida de lo posible, evitar las instrucciones específicas del programa, como "Haga clic con el botón derecho del ratón en el icono _x_ para acceder al menú _x_", en lugar de favorecer las descripciones metodológicas generales. La lista de comprobación editorial [contiene más detalles sobre las prácticas de sostenibilidad](#c-revisar-la-sostenibilidad). +Antes de solicitar revisiones externas, el editor debe leer y probar el tutorial y utilizar su experiencia editorial previa para ayudar al autor a realizar algunas mejoras iniciales (si es necesario). El editor debe escribir un resumen sobre la sostenibilidad de la propuesta para asegurarse de que la versión y las especificaciones del programa son claras, que las capturas de pantalla son realmente necesarias para completar la lección y que la lección hace uso de la documentación existente siempre que esté disponible y sea apropiado. 
Los editores también deben asegurarse de que las lecciones intentan, en la medida de lo posible, evitar las instrucciones específicas del programa, como "Haga clic con el botón derecho del ratón en el icono _x_ para acceder al menú _x_", en lugar de favorecer las descripciones metodológicas generales. La lista de comprobación editorial [contiene más detalles sobre las prácticas de sostenibilidad](#c-revisar-la-sostenibilidad-e-internacionalización). A menudo, los editores necesitan ayuda para aclarar la audiencia a la que se quiere llegar con una lección, o para identificar la jerga que necesita más explicación. Esta revisión inicial ayuda a los revisores externos a centrarse en mejorar la pieza. Esto normalmente se hace abiertamente en nuestro sistema de presentación (ver más abajo), pero puede ser una revisión cerrada a petición de cualquiera de las partes. @@ -93,7 +93,7 @@ Asimismo, me comprometo a mantener la conversación abierta a todo el mundo en G Política contra el acoso _ -El objetivo de 'The Programming Historian en español' es ofrecer un entorno abierto en el que la comunidad de participantes sean libres para analizar ideas, realizar preguntas, sugerir cambios, y pedir aclaraciones; también queremos que sea un espacio libre de acoso y hostigamento para todo el mundo con independencia de su género, identidad, orientación sexual, minusvalía, apariencia física, tamaño corporal, raza, edad, religión o conocimientos informáticos. No se tolerará ningún tipo de acoso o ataque *ad hominem*. Los participantes que violen esta regla podrán ser expulsados del proceso editorial a discreción del equipo editorial. Si presencias o sientes que has sido víctima de algún tipo de acoso, por favor, contacta con nuestros 'ombudsperson' (Silvia Gutiérrez De la Torre - http://programminghistorian.org/es/equipo-de-proyecto). 
+El objetivo de 'The Programming Historian en español' es ofrecer un entorno abierto en el que la comunidad de participantes sean libres para analizar ideas, realizar preguntas, sugerir cambios, y pedir aclaraciones; también queremos que sea un espacio libre de acoso y hostigamiento para todo el mundo con independencia de su género, identidad, orientación sexual, minusvalía, apariencia física, tamaño corporal, raza, edad, religión o conocimientos informáticos. No se tolerará ningún tipo de acoso o ataque *ad hominem*. Los participantes que violen esta regla podrán ser expulsados del proceso editorial a discreción del equipo editorial. Si presencias o sientes que has sido víctima de algún tipo de acoso, por favor, contacta con nuestros 'ombudsperson' ([Silvia Gutiérrez De la Torre](https://programminghistorian.org/es/equipo-de-proyecto)). ``` Para las lecciones nuevas, utiliza la siguiente [plantilla](https://github.com/programminghistorian/ph-submissions/blob/gh-pages/es/PLANTILLA-LECCION.md): @@ -115,7 +115,7 @@ Asimismo, me comprometo a mantener la conversación abierta a todo el mundo en G Política contra el acoso _ -El objetivo de 'The Programming Historian en español' es ofrecer un entorno abierto en el que la comunidad de participantes sean libres para analizar ideas, realizar preguntas, sugerir cambios, y pedir aclaraciones; también queremos que sea un espacio libre de acoso y hostigamento para todo el mundo con independencia de su género, identidad, orientación sexual, minusvalía, apariencia física, tamaño corporal, raza, edad, religión o conocimientos informáticos. No se tolerará ningún tipo de acoso o ataque *ad hominem*. Los participantes que violen esta regla podrán ser expulsados del proceso editorial a discreción del equipo editorial. Si presencias o sientes que has sido víctima de algún tipo de acoso, por favor, contacta con nuestros 'ombudsperson' (Silvia Gutiérrez De la Torre - http://programminghistorian.org/es/equipo-de-proyecto).
+El objetivo de 'The Programming Historian en español' es ofrecer un entorno abierto en el que la comunidad de participantes sean libres para analizar ideas, realizar preguntas, sugerir cambios, y pedir aclaraciones; también queremos que sea un espacio libre de acoso y hostigamiento para todo el mundo con independencia de su género, identidad, orientación sexual, minusvalía, apariencia física, tamaño corporal, raza, edad, religión o conocimientos informáticos. No se tolerará ningún tipo de acoso o ataque *ad hominem*. Los participantes que violen esta regla podrán ser expulsados del proceso editorial a discreción del equipo editorial. Si presencias o sientes que has sido víctima de algún tipo de acoso, por favor, contacta con nuestros 'ombudsperson' ([Silvia Gutiérrez De la Torre](https://programminghistorian.org/es/equipo-de-proyecto)). ``` @@ -151,7 +151,7 @@ Desde un punto de vista técnico, estas son las áreas en las que tendrás que i El **editor** debe sugerir un nombre para el archivo de la traducción o lección nueva conforme a las siguientes pautas: - El nombre debe ser corto pero descriptivo pues se convertirá en el *slug* de la lección cuando se publique (es decir, la terminación de la URL). -- Una buena URL debería encajar en una diapositiva, debería ser fácil de recordar y debería describir el contenido de la lección. Nuestras URLS tienen el siguiente formato: http://programminghistorian.org/es/lecciones/NOMBRE-DEL-ARCHIVO-AQUI +- Una buena URL debería encajar en una diapositiva, debería ser fácil de recordar y debería describir el contenido de la lección. Nuestras URLS tienen el siguiente formato: https://programminghistorian.org/es/lecciones/NOMBRE-DEL-ARCHIVO-AQUI - No introduzcas espacios en el nombre del archivo; en su lugar utiliza guiones. - La extensión del arhivo debe ser `.md` con el objetivo de que GitHub genere una visualización provisional de la lección.
@@ -323,7 +323,7 @@ Puedes buscar imágenes en los recursos siguientes: - [British Library](https://www.flickr.com/photos/britishlibrary) - [Internet Archive Book Images](https://archive.org/details/bookimages) - [Virtual Manuscript Library of Switzerland](https://www.e-codices.unifr.ch/en) - - [Library of Congress Maps](http://www.loc.gov/maps/collections) + - [Library of Congress Maps](https://www.loc.gov/maps/collections) Si como editor estás buscando una imagen para una lección nueva, asegúrate de que la imagen sigue el mismo estilo que las imágenes anteriores; debería ser una ilustración, no una fotografía, tener al menos 200 píxeles de anchura y altura, y estar libre de derechos. Asegúrate de que la magen no es ofensiva y ten en cuenta nuestro [compromiso con la diversidad](/posts/PH-commitment-to-diversity); en otras palabras, intenta encontrar una imagen que no perpetúe estereotipos o envíe mensajes sutiles sobre la masculinidad y la raza blanca. @@ -382,7 +382,7 @@ Las opciones son: A) Sigue nuestra guía para ["hacer contribuciones técnicas"](https://github.com/programminghistorian/jekyll/wiki/Making-Technical-Contributions), que utiliza el sitio web de GitHub. -B) La manera más fácil de publicar el texto es utilizar git en tu terminal de línea de comandos. Las siguientes instrucciones presuponen que ya has clonado en tu ordenador los repositorios jekyll y ph-submissions/es (si no es así, nuestra [introducción a GitHub](/lessons/getting-started-with-github-desktop) puedes ser útil). Si tienes alguna duda puedes contactar al equipo técnico. +B) La manera más fácil de publicar el texto es utilizar git en tu terminal de línea de comandos. Las siguientes instrucciones presuponen que ya has clonado en tu ordenador los repositorios jekyll y ph-submissions/es (si no es así, nuestra [introducción a GitHub](/en/lessons/getting-started-with-github-desktop) puedes ser útil). Si tienes alguna duda puedes contactar al equipo técnico. 1. 
Sitúate en el director local de tu repositorio `ph-submissions/es`. 2. Introduce `git pull` para descargar los últimos cambios en tu ordenador (o `sync` si utilizas GitHub Desktop). diff --git a/es/guia-para-revisores.md b/es/guia-para-revisores.md index 01397f7a91..b8d4b08cbd 100644 --- a/es/guia-para-revisores.md +++ b/es/guia-para-revisores.md @@ -22,7 +22,7 @@ Para el equipo de _The Programming Historian en español_ revisar una traducció ## Política antiacoso Queremos exponer los principios que rigen _The Programming Historian en español_, así como toda la correspondencia accesible en nuestros fórums entre los revisores, los autores, los editores y otros colaboradores. -_The Programming Historian en español_ tiene como objetivo ofrecer, por un lado, un entorno abierto a la comunidad en el que reine la libertad para analizar ideas, hacer preguntas, sugerir cambios o pedir aclaraciones. Por el otro, quiere ser un espacio libre de acoso y violencia hacia cualquiera de los participantes con independencia de su género, identidad, orientación sexual, capacidades, físico, raza, edad, religión o nivel de conocimiento. No toleraremos ningún tipo de acoso, humillación o ataque personal por parte de los colaboradores. Quien, a discreción del equipo editorial, viole este principio será expulsado del proyecto. Si alguna de las partes es testigo o considera que ha sido víctima de un abuso, se recomienda contactar a uno de nuestros mediadores ([Antonio Rojas, Víctor Gayol o Maria José Afanador-Llach](/project-team)). +_The Programming Historian en español_ tiene como objetivo ofrecer, por un lado, un entorno abierto a la comunidad en el que reine la libertad para analizar ideas, hacer preguntas, sugerir cambios o pedir aclaraciones. Por el otro, quiere ser un espacio libre de acoso y violencia hacia cualquiera de los participantes con independencia de su género, identidad, orientación sexual, capacidades, físico, raza, edad, religión o nivel de conocimiento. 
No toleraremos ningún tipo de acoso, humillación o ataque personal por parte de los colaboradores. Quien, a discreción del equipo editorial, viole este principio será expulsado del proyecto. Si alguna de las partes es testigo o considera que ha sido víctima de un abuso, se recomienda contactar a uno de nuestros mediadores ([Antonio Rojas, Víctor Gayol o Maria José Afanador-Llach](/es/equipo-de-proyecto)). ¡Gracias por ayudarnos a crear un espacio seguro para todos y todas! @@ -40,7 +40,7 @@ Queremos que el proceso de producción y revisión sea transparente. Por eso tie Tu trabajo como revisor --así como tu identidad-- será totalmente visible para el autor o traductor. En consecuencia, los comentarios deben dirigirse directamente al autor o traductor (y no al editor). Si en algún momento del proceso tienes dudas sobre cuál es tu rol o qué se espera de ti, por favor, publica tus dudas o preguntas en nuestro repositorio para que alguno de nuestros editores pueda ayudarte. En ocasiones podemos demorarnos un poco pero estamos convencidos de que, una vez se publique la traducción o lección, habrá valido la pena esperar unos días. -A fin de mantener nuestro compromiso con una investigación pública, abierta y transparente, te animamos a mantener las conversaciones en Github, tal y como se expone en el apartado dedicado a nuestro flujo de trabajo. Sin embargo, también queremos que todo el mundo se sienta cómodo y por eso, pero de manera ocasional, también aceptamos tratar un asunto en privado. En tal caso puedes contactar a tu [editor asignado](/project-team) o a alguno de nuestros *ombudsperson* [Maria José Afanador-Llach](/project-team). +A fin de mantener nuestro compromiso con una investigación pública, abierta y transparente, te animamos a mantener las conversaciones en Github, tal y como se expone en el apartado dedicado a nuestro flujo de trabajo. 
Sin embargo, también queremos que todo el mundo se sienta cómodo y por eso, pero de manera ocasional, también aceptamos tratar un asunto en privado. En tal caso puedes contactar a tu [editor asignado](/es/equipo-de-proyecto) o a alguno de nuestros *ombudsperson* [Maria José Afanador-Llach](/es/equipo-de-proyecto). A menos que nos indiques lo contario, tu nombre aparecerá como revisor en la página de la lección en _The Programming Historian en español_ una vez se publique de manera oficial. Esto es solo un pequeño reconocimiento pero nos gustaría enfatizar que _The Programming Historian en español_ se lleva a cabo gracias al trabajo de los voluntarios y por eso es obligado darles crédito. @@ -68,7 +68,7 @@ De manera más específica, en cuanto a las traducciones, apreciamos el rigor pe - ¿Las capturas de pantalla y trozos de código han sido adaptados? - ¿Se han añadido referencias bibliográficas en español? -Puesto que muchos de lo tecnicismos son nuevos y/o todavía no están recogidos en los diccionarios, recomendamos el uso de la [Taxonomía sobre Actividades de investigación digital en humanidades](http://vocabularios.caicyt.gov.ar/portalthes/index.php?v=42) de TaDiRAH y el [Glosario de Preservación Archivística Digital (Versión 4.0)](http://www.mecd.gob.es/planes-nacionales/dam/jcr:f20a4ba1-0ed2-445d-9be9-b8b0382562ea/mex-glosario-interpares-total0112.pdf) de Voutssas-M y Barnard Amozorrutia (UNAM). +Puesto que muchos de los tecnicismos son nuevos y/o todavía no están recogidos en los diccionarios, recomendamos el uso de la [Taxonomía sobre Actividades de investigación digital en humanidades](https://vocabularios.caicyt.gov.ar/portalthes/index.php?v=42) de TaDiRAH y el [Glosario de Preservación Archivística Digital (Versión 4.0)](https://www.mecd.gob.es/planes-nacionales/dam/jcr:f20a4ba1-0ed2-445d-9be9-b8b0382562ea/mex-glosario-interpares-total0112.pdf) de Voutssas-M y Barnard Amozorrutia (UNAM).
### Lecciones nuevas Por lo que respecta a las lecciones nuevas, queremos que las explicaciones técnicas (y el nivel de dificultad) sea constante a lo largo del tutorial. En tanto que revisor, queremos que, en un tutorial dirigido a un usuario experimentado, seas capaz de detectar pasajes que explican en detalle un concepto demasiado simple. Y a la inversa: queremos evitar tutoriales dirigidos a principiantees que no explican de manera adecuada un concepto fundamental para entender la lección. Aspectos a tener en cuenta: diff --git a/es/guia-para-traductores.md b/es/guia-para-traductores.md index f96302fb6d..b30cb98564 100644 --- a/es/guia-para-traductores.md +++ b/es/guia-para-traductores.md @@ -8,9 +8,9 @@ skip_validation: true # Guía para traductores {{ site.data.snippets.write-a-lesson-image-alt[page.lang] }} -

    Paso 1: Proponer una nueva traducción

    -

    Paso 2: Escribir y dar formato a una nueva traducción

    -

    Paso 3: Enviar una nueva traducción

    +

    Paso 1: Proponer una nueva traducción

    +

    Paso 2: Escribir y dar formato a una nueva traducción

    +

    Paso 3: Enviar una nueva traducción

    Estas directrices han sido desarrolladas para ayudarte a entender el proceso de traducción de un tutorial para *Programming Historian* en Español. Incluyen detalles prácticos sobre el proceso de traducción de un tutorial, así como indicaciones sobre el flujo de trabajo y el proceso de revisión entre pares. Si en algún momento hay algo que no te queda claro, por favor envía un correo electrónico a {% include managing-editor.html lang=page.lang %}. diff --git a/es/index.md b/es/index.md index d8895bc762..055d5f10ce 100644 --- a/es/index.md +++ b/es/index.md @@ -3,6 +3,7 @@ layout: base original: index title: | The Programming Historian en español +permalink: /es/ ---
    @@ -25,20 +26,20 @@ title: |

    Enseña

    -

    Utiliza The Programming Historian en español en tus clases o talleres. También puedes enviarnos tus comentarios con el objetivo de mejorar los tutoriales para que se ajustan a tus necesidades, o bien para advertirnos de algún error o problema.

    +

    Utiliza The Programming Historian en español en tus clases o talleres. También puedes enviarnos tus comentarios con el objetivo de mejorar los tutoriales para que se ajusten a tus necesidades, o bien para advertirnos de algún error o problema.

    Contribuye

    -

    Escribe una lección nueva, únete a nuestro equipo de revisores, o bien envíanos tus comentarios. ¡Nos gusta estar acompañados!

    +

    Escribe una lección nueva, únete a nuestro equipo de revisores, o bien envíanos tus comentarios. ¡Nos gusta estar acompañados!

    Equipo

    -

    Somos una comunidad de voluntarios; por eso, nos gusta ser transparentes y dar crédito a los colaboradores que han invertido tiempo y energía en hacer posible The Programming Historian en español.

    +

    Somos una comunidad de voluntarios; por eso, nos gusta ser transparentes y dar crédito a los colaboradores que han invertido tiempo y energía en hacer posible The Programming Historian en español.

    diff --git a/es/investigacion.md b/es/investigacion.md index 9a54f78e02..b74883e221 100644 --- a/es/investigacion.md +++ b/es/investigacion.md @@ -9,11 +9,11 @@ original: research El equipo del proyecto y los miembros de la comunidad en general están involucrados en una serie de iniciativas académicas relacionadas con nuestro trabajo aquí en *The Programming Historian*. Estas iniciativas incluyen eventos, artículos en revistas académicas, reseñas (de nosotros por la comunidad) y carteles. Si tú estás desarrollando una investigación académica usando los materiales de este proyecto, por favor contacta con nuestra asistente de publicación Anisa Hawes. ## *Programming Historian* original -* William J. Turkel y Alan MacEachern, [_The Programming Historian_](http://niche-canada.org/wp-content/uploads/2013/09/programming-historian-1.pdf) 1a edición (Network in Canadian History & Environment: 2007-2008). +* William J. Turkel y Alan MacEachern, [_The Programming Historian_](https://niche-canada.org/wp-content/uploads/2013/09/programming-historian-1.pdf) 1a edición (Network in Canadian History & Environment: 2007-2008). * Traducción al japonés de William J. Turkel y Alan MacEachern, [_The Programming Historian_](https://www.dh.ku-orcas.kansai-u.ac.jp/?cat=2), 1a edición (Network in Canadian History & Environment: 2007-2008). ### Reseñas -* Björn Ekström, Elisa Tattersall Wallin and Hana Marčetić, '[_Programming Historian_: Novice-friendly tutorials on digital methods](http://www.diva-portal.org/smash/record.jsf?pid=diva2%3A1508542&dswid=7551)', _Tidskrift för ABM_, Vol. 5, no 1 (2020), pp. 71-75. +* Björn Ekström, Elisa Tattersall Wallin and Hana Marčetić, '[_Programming Historian_: Novice-friendly tutorials on digital methods](https://www.diva-portal.org/smash/record.jsf?pid=diva2%3A1508542&dswid=7551)', _Tidskrift för ABM_, Vol. 5, no 1 (2020), pp. 71-75. 
* Dries Daems, '[A Review and Roadmap of Online Learning Platforms and Tutorials in Digital Archaeology](https://doi.org/10.1017/aap.2019.47)', _Advances in Archaeological Practice_, vol. 8, issue 1 (2020), pp. 87-92. * Martin Dröge, '[Review of: The Programming Historian](https://www.hsozkult.de/webreview/id/rezwww-184)', _H-Soz-Kult_ (2019). * Priscila Pilatowsky Goñi, '[Reseña a The programming historian](https://revistas.uned.es/index.php/RHD/article/view/22420)', _Revista de Humanidades Digitales_, vol. 2 (2018). @@ -29,22 +29,22 @@ El equipo del proyecto y los miembros de la comunidad en general están involucr * Jennifer Isasi, Riva Quiroga, Nabeel Sidiqqui, Joana Vieira Paulino, Alex Wermer-Colan, [“A Model for Multilingual and Multicultural Digital Scholarship Methods Publishing"](https://www.taylorfrancis.com/chapters/edit/10.4324/9781003393696-3/model-multilingual-multicultural-digital-scholarship-methods-publishing-jennifer-isasi-riva-quiroga-nabeel-siddiqui-joana-vieira-paulino-alex-wermer-colan), en _Multilingual Digital Humanities_, editado por Viola, L., & Spence, P., Routledge, 2023. * Adam Crymble & Charlotte M. H. Im, ['Measuring digital humanities learning requirements in Spanish & English-speaking practitioner communities'](https://doi.org/10.1007/s42803-023-00066-x), International Journal of Digital Humanities, (2023). * Eric Brasil, '[_pyHDB - Ferramenta Heurística para a Hemeroteca Digital Brasileira: utilizando técnicas de web scraping para a pesquisa em História_'](https://doi.org/10.15848/hh.v15i40.1904), _História Da Historiografia: International Journal of Theory and History of Historiography_, 15(40) (2022), 186–217. -* Matthew Lincoln, Sarah Melton, Jennifer Isasi, François Dominic Laramée, '[Relocating Complexity: _The Programming Historian_ and Multilingual Static Site Generation](http://www.digitalhumanities.org/dhq/vol/16/2/000585/000585.html)', _Digital Humanities Quarterly_ 16, 2 (2022). 
+* Matthew Lincoln, Sarah Melton, Jennifer Isasi, François Dominic Laramée, '[Relocating Complexity: _The Programming Historian_ and Multilingual Static Site Generation](https://www.digitalhumanities.org/dhq/vol/16/2/000585/000585.html)', _Digital Humanities Quarterly_ 16, 2 (2022). * Jennifer Isasi y Antonio Rojas Castro, ‘[¿Sin equivalencia? Una reflexión sobre la traducción al español de recursos educativos abiertos](https://muse.jhu.edu/article/842253)’, _Hispania_, 104, no. 4 (2021), 613-624. * Adam Crymble y Maria José Afanador Llach, ‘The Globally Unequal Promise of Digital Tools for History: UK and Colombia Case Study’ en _Teaching History for the Contemporary World_, editado por Adele Nye, 85-98, Springer, 2021. * Daniel Alves, ['Ensinar Humanidades Digitais sem as Humanidades Digitais: um olhar a partir das licenciaturas em História'](https://novaresearch.unl.pt/files/32228034/Ensinar_Humanidades_Digitais.pdf), _Revista EducaOnline_, v. 15, n. 2 (2021). * Adam Crymble, [_Technology & the Historian: Transformations in the Digital Age_](https://www.press.uillinois.edu/books/catalog/57hxp7wr9780252043710.html), (University of Illinois Press, 2021). * Anna-Maria Sichani, James Baker, Maria José Afanador Llach, y Brandon Walsh, [‘Diversity and Inclusion in Digital Scholarship and Pedagogy: The Case of The Programming Historian’](https://doi.org/10.1629/uksg.465), _Insights_, (2019). -* Katrina Navickas y Adam Crymble, ['From Chartist Newspaper to Digital Map of Grass-roots Meetings, 1841-44: Documenting Workflows'](http://www.tandfonline.com/doi/full/10.1080/13555502.2017.1301179), _Journal of Victorian Culture_, (2017). +* Katrina Navickas y Adam Crymble, ['From Chartist Newspaper to Digital Map of Grass-roots Meetings, 1841-44: Documenting Workflows'](https://www.tandfonline.com/doi/full/10.1080/13555502.2017.1301179), _Journal of Victorian Culture_, (2017). 
* Adam Crymble, ['Identifying and Removing Gender Barriers in Open Learning Communities: The Programming Historian'], _Blended Learning in Practice_, (2016), 49-60. [[pre-print pdf](/researchpapers/openLearningCommunities2016.pdf)] * Fred Gibbs, ‘[Editorial Sustainability and Open Peer Review at Programming Historian]’, *DH Commons*, Vol. 1 (2015). -* Shawn Graham, Ian Milligan, y Scott Weingart, [_Exploring Big Historical Data: The Historian's Macroscope_](http://www.themacroscope.org/2.0/), (Imperial College Press, 2015). +* Shawn Graham, Ian Milligan, y Scott Weingart, [_Exploring Big Historical Data: The Historian's Macroscope_](https://www.themacroscope.org/2.0/), (Imperial College Press, 2015). ### Reportes * Maria José Afanador-Llach & Andrés Rivera, '[Segundo ciclo de talleres: Herramientas y procesos digitales para la investigación y creación en artes y humanidades](/researchpapers/Informe_final_Talleres%20EHCN_2023-ENG_PH.pdf)', (2023). * Incllewsion and the Programming Historian, 'Initial Accessibility Testing: Summary of Findings', (2021). -* Penny Andrews and the Programming Historian, ['The Programming Historian: developing and sustaining impact in the Global South'](http://doi.org/10.5281/zenodo.3813763) (2020). +* Penny Andrews and the Programming Historian, ['The Programming Historian: developing and sustaining impact in the Global South'](https://doi.org/10.5281/zenodo.3813763) (2020). * Amy Kavanagh and the Programming Historian, 'Programming Historian – Access for visually impaired researchers', (n.d.). ### Talleres y eventos @@ -63,7 +63,7 @@ El equipo del proyecto y los miembros de la comunidad en general están involucr * Alex Wermer-Colan, ['Learning Digital Methods with the _Programming Historian_'](https://charlesstudy.temple.edu/event/11953011), Temple University [En línea], (22 de febrero de 2024). 
* Carlo Blum, Adam Crymble, Vicky Garnett, Timothée Giraud, Alíz Horváth, Stefan Krebs, Ralph Marschall, Sofia Papastamkou, & Lorella Viola, 'Invisible College of Digital History: Workshop on Multilingual Educational Resources', C²DH [En línea], (8 de noviembre de 2023). * Nabeel Siddiqui, 'Convolutional Neural Networks for Image Classification', University of Edinburgh [En línea], (7 de noviembre de 2023). -* Eric Brasil, '[História Digital e História Digital da Educação: Caminhos Cruzados](http://www.iea.usp.br/eventos/historia-digital-educacao-caminhos-cruzados)', Instituto de Estudos Avançados, USP, São Paulo, Brasil, (17 de octubre 2023). +* Eric Brasil, '[História Digital e História Digital da Educação: Caminhos Cruzados](https://www.iea.usp.br/eventos/historia-digital-educacao-caminhos-cruzados)', Instituto de Estudos Avançados, USP, São Paulo, Brasil, (17 de octubre 2023). * Scott Kleinman, Alex Wermer-Colan, Joana Vieira Paulino, Nabeel Siddiqui, Zoe LeBlanc, 'Developing a Digital Humanities Tutorial', [DH 2023](https://dh2023.adho.org/), Graz, Austria, (10 de julio de 2023). * Daphné Mathelier, 'Atelier Markdown', [11e journées du réseau Medici](https://web.archive.org/web/20230629084307/https://medici2023.sciencesconf.org/resource/page/id/2), Université de Liège, Bélgica, (29 de junio de 2023). * María José Afanador Llach, Jennifer Isasi, Riva Quiroga, 'Sobre _Programming Historian en español_ y cómo contribuir a la publicación', Semana de Humanidades Digitales 2023 [En línea], (10 de mayo de 2023). @@ -150,10 +150,10 @@ El equipo del proyecto y los miembros de la comunidad en general están involucr * Adam Crymble, 'Facilitating Making in Digital Humanities', The Archaeology of Making, University of London, Reino Unido, 5 de mayo 2021. 
* Daniel Alves, Jennifer Isasi, Sarah Melton, Sofia Papastamkou, Jessica Parr, Riva Quiroga, Nabeel Siddiqui, Brandon Walsh, '[The Programming Historian: A Global Case Study in Multilingual Open Access and DH Tutelage/Instruction](https://msuglobaldh.org/abstracts/#programming-historian)' (panel), _Global Digital Humanities Symposium_, Michigan State University, East Lansing, USA, 12 de abril, 2021. * Jessica Parr, '[Cambridge Cultural Heritage Data School: Final plenary](https://www.cdh.cam.ac.uk/events/cambridge-cultural-heritage-data-school-final-plenary)', University of Cambridge, Reino Unido, 30 de marzo 2021. -* Jennifer Isasi & Riva Quiroga, ['_Programming Historian_: Un proyecto colaborativo para poner la programación al alcance de los humanistas'](http://ixa2.si.ehu.eus/intele/?q=webinars), _INTELE : INfraestructura de TEcnologías del LEnguaje_, España, 25 de marzo, 2021. +* Jennifer Isasi & Riva Quiroga, ['_Programming Historian_: Un proyecto colaborativo para poner la programación al alcance de los humanistas'](https://ixa2.si.ehu.eus/intele/?q=webinars), _INTELE : INfraestructura de TEcnologías del LEnguaje_, España, 25 de marzo, 2021. * Sofia Papastamkou, Jessica Parr & Riva Quiroga, 'Challenges for Digital Literacy in the Humanities: The Open, Community-Based and Multilinguistic Approach of _The Programming Historian_', NewsEye’s International Conference, Europa, 17 de marzo, 2021. * Riva Quiroga, ['Multilingual Digital Humanites'](https://mediacentral.ucl.ac.uk/Play/59506), Digital Humanities Long View Seminar, UCLDH, UK & CESTA, USA, 10 de marzo, 2021. -* Brandon Walsh, '[The Programming Historian and Editorial Process in Digital Publishing](http://walshbr.com/blog/the-programming-historian-and-editorial-process-in-digital-publishing/)', Modern Languages Association Conference 2021, USA, 7-10 de enero, 2021. 
+* Brandon Walsh, '[The Programming Historian and Editorial Process in Digital Publishing](https://walshbr.com/blog/the-programming-historian-and-editorial-process-in-digital-publishing/)', Modern Languages Association Conference 2021, USA, 7-10 de enero, 2021. * Sofia Papastamkou, François Dominic Laramée, Martin Grandjean, '[Le Programming Historian en français: quelles ressources éducatives libres pour les méthodes numériques ?](https://zenodo.org/record/3819954)', *Humanistica 2020*, Bordeaux, France, 12-14 de mayo 2020. * Sofia Papastamkou, 'A Beating Heart of Digital History: The Programming Historian', [Teaching Digital History Workshop](https://cas.au.dk/en/cedhar/events/show/artikel/teaching-digital-history-workshop), Center for Digital History Aarhus, University of Aarhus, Dinamarca, 23 de octubre 2019. * Jennifer Isasi, Maria José Afanador y Antonio Rojas Castro, 'Retos en la producción de tutoriales de HD en contexto hispanohablantes', Conferencia ACH 2019, The Association for Computers and the Humanities, Pittsburgh, 23 al 26 de julio, 2019, Pittsburgh. @@ -166,12 +166,12 @@ El equipo del proyecto y los miembros de la comunidad en general están involucr * Victor Gayol, 'La investigación del pasado y la historia digital: análisis de datos y cómo aprender (The Programming Historian en español)', _Humanidades Digitales_, IV Feria Internacional de Ciencias Sociales y Humanidades, Centro Universitario de Los Lagos - Universidad de Guadalajara, Lagos de Moreno, Jalisco (9 de marzo, 2017). * Victor Gayol, 'The Programming Historian: 'un modelo colaborativo para la investigación y la enseñanza en ciencias sociales y humanidades digitales', _Mesa de Trabajo sobre Ciencias Sociales y Humanidades Digitales_, El Colegio De Michoacán, México (21 de febrero, 2017). * Adam Crymble, 'Bringing Digital Humanities into the University for Free', University of Cape Town, South Africa (27-28 junio 2016). 
-* Fred Gibbs, ‘The Programming Historian’ (Cartel), *American Historical Association*, New York (enero 2015). +* Fred Gibbs, ‘The Programming Historian’ (Cartel), *American Historical Association*, New York (enero 2015). * Adam Crymble, ‘The Programming Historian 2’, *Digital History Seminar*, Institute of Historical Research, London (13 octubre 2013). * Adam Crymble, ‘The Programming Historian 2’, *Digital Humanities 2012*, Hamburg (julio 2012). -* Anaclet Pons, “The Programming Historian en español”, Clionauta: Blog de historia, junio 14, 2017, http://clionauta.hypotheses.org/16979 +* Anaclet Pons, “The Programming Historian en español”, Clionauta: Blog de historia, junio 14, 2017, https://clionauta.hypotheses.org/16979 * Seth Denbo, “Historian, Program! Self-Help for Digital Neophytes,” Perspectives on History: The Newsmagazine of the American Historical Association, mayo 2017, https://www.historians.org/publications-and-directories/perspectives-on-history/may-2017/historian-program-self-help-digital-neophytes. -* Víctor Gayol, ‘The Programming Historian en español’, Blog de Humanidades Digitales, marzo 17, 2017, http://humanidadesdigitales.net/blog/2017/03/17/the-programming-historian-en-espanol/. +* Víctor Gayol, ‘The Programming Historian en español’, Blog de Humanidades Digitales, marzo 17, 2017, https://humanidadesdigitales.net/blog/2017/03/17/the-programming-historian-en-espanol/. ### Editoriales @@ -180,11 +180,11 @@ El equipo del proyecto y los miembros de la comunidad en general están involucr * Matthew Lincoln, 'Multilingual Jekyll: How The Programming Historian Does That', *matthewlincoln.net*, 1 de marzo 2020, . * Sue Levine, 'The Early-Stage Ph.D.'s Guide to Summer', _Inside Higher Education_, 10 en junio 2019, . * 'Championing open access with online digital history journal', _University of Sussex Press Office_, 9 de octubre, 2018, . 
-* Adam Crymble, 'A Decade of Programming Historians', _Network in Canadian History & Environment_, 23 de marzo, 2018, . +* Adam Crymble, 'A Decade of Programming Historians', _Network in Canadian History & Environment_, 23 de marzo, 2018, . * Fred Gibbs, "Sustainable Publishing: Reflections of a Former Programming Historian Editor", FredGibbs.net, 2017, . -* Anaclet Pons, "The Programming Historian en español", *Clionauta: blog de historia*, 14 de junio, 2017 . +* Anaclet Pons, "The Programming Historian en español", *Clionauta: blog de historia*, 14 de junio, 2017 . * Seth Denbo, “Historian, Program! Self-Help for Digital Neophytes,” _Perspectives on History: The Newsmagazine of the American Historical Association_, May 2017, . -* Víctor Gayol, '*The Programming Historian* en español', *Blog de Humanidades Digitales*, 17 de marzo, 2017, +* Víctor Gayol, '*The Programming Historian* en español', *Blog de Humanidades Digitales*, 17 de marzo, 2017, ### Proyectos que utilizan *The Programming Historian* @@ -195,9 +195,9 @@ El equipo del proyecto y los miembros de la comunidad en general están involucr [Review of the Programming Historian]: https://academic.oup.com/jah/article-abstract/103/1/299/1751315 -[Review of the Programming Historian]: http://jitp.commons.gc.cuny.edu/review-of-the-programming-historian +[Review of the Programming Historian]: https://jitp.commons.gc.cuny.edu/review-of-the-programming-historian ['Identifying and Removing Gender Barriers in Open Learning Communities: The Programming Historian']: https://www.herts.ac.uk/__data/assets/pdf_file/0016/138013/Blip-2016-Autumn-2016-Final-Autumn-2016.pdf ['pre-print pdf']: /researchpapers/openLearningCommunities2016.pdf -[Editorial Sustainability and Open Peer Review at Programming Historian]: http://web.archive.org/web/20180713014622/http://dhcommons.org/journal/issue-1/editorial-sustainability-and-open-peer-review-programming-historian +[Editorial Sustainability and Open Peer Review at Programming 
Historian]: https://web.archive.org/web/20180713014622/https://dhcommons.org/journal/issue-1/editorial-sustainability-and-open-peer-review-programming-historian ['Digital Project Consultations']: https://dhatasa2015.wordpress.com/ [Library Carpentry: software skills training for library professionals]: https://liberquarterly.eu/article/view/10847 diff --git a/es/lecciones/administracion-de-datos-en-r.md b/es/lecciones/administracion-de-datos-en-r.md index 89162204ee..401024cc59 100644 --- a/es/lecciones/administracion-de-datos-en-r.md +++ b/es/lecciones/administracion-de-datos-en-r.md @@ -44,7 +44,7 @@ Al final de la lección, ## Introducción Los datos que puedes encontrar disponibles en red raramente están en el formato necesario para su análisis y necesitarás manipularlos antes de explorar las preguntas que te interesan. ¡Esto puede llevar más tiempo que el análisis! En este tutorial vamos a aprender algunas técnicas básicas de manipulación, manejo y administración de tus datos en R. Más específicamente, vamos a seguir la filosofía de "datos limpios" o [*"tidy data"*](https://www.jstatsoft.org/article/view/v059i10) articulada por Hadley Wickham. -Según [Wickham](http://hadley.nz), los datos están "limpios" cuando cumplen tres criterios: +Según [Wickham](https://hadley.nz), los datos están "limpios" cuando cumplen tres criterios: 1. Cada observación está en una fila. 2. Cada variable está en una columna. 3. Cada valor tiene su propia celda. @@ -61,7 +61,7 @@ Tal vez lo más importante sea que tener nuestros datos en este formato nos perm En este tutorial nos enfocamos en el paquete [dplyr](https://cran.r-project.org/web/packages/dplyr/index.html) de tidyverse pero merece la pena mencionar otros que nos encontraremos por el camino: [**magittr**](https://magrittr.tidyverse.org): Este paquete nos da acceso al el operador `%>%` y hace nuestro código más fácilmente de leer. 
-[**ggplot2**](https://ggplot2.tidyverse.org): Este paquete utiliza ["la gramática de gráficos"](http://academica-e.unavarra.es/bitstream/handle/2454/15785/Gramática.pdf?sequence=1)[^1] para ofrecer una manera fácil de visualizar nuestros datos. +[**ggplot2**](https://ggplot2.tidyverse.org): Este paquete utiliza ["la gramática de gráficos"](https://academica-e.unavarra.es/bitstream/handle/2454/15785/Gramática.pdf?sequence=1)[^1] para ofrecer una manera fácil de visualizar nuestros datos. [**readr**](https://readr.tidyverse.org): Este paquete da acceso a un método más rápido y racionalizado para importar datos rectangulares (una tabla), como son los archivos CSV (valores separados por comas). [**tibble**](https://tibble.tidyverse.org): Este paquete nos permite reconceptualizar el formato _data frame_ (marco o tabla de datos) para que sea más fácil trabajar con ellos e imprimirlos. @@ -128,7 +128,7 @@ ggplot(data=poblacion_mississipi_y_virginia, aes(x=año, y=poblacion, color=esta Hacer cambios rápidos en el código y reanalizar nuestros datos es una parte fundamental del análisis exploratorio de datos (AED, o EDA por sus siglas en inglés). En vez de tratar de "probar" una hipótesis, el análisis exploratorio de datos nos ayuda a entender nuestros datos mejor y a hacernos preguntas sobre ellos. Para los historiadores el AED ofrece una forma de saber cuándo indagar más en un tema y cuando dejarlo a un lado, y esto es en el área en el que R sobresale. ## Línea de operaciones -Antes de ver `dplyr`, tenemos que entender lo que es la línea de operaciones ```%>%``` en R porque la vamos a utilizar mucho en nuestros ejemplos. Como decíamos, la línea de operaciones es parte del paquete [magittr](https://cran.r-project.org/web/packages/magrittr/vignettes/magrittr.html) creado por [Stefan Milton Bache](http://stefanbache.dk) y [Hadley Wickham](http://hadley.nz/) y está incluida en tidyverse. 
Su nombre es un homenaje al pintor surrealista Rene Magritte y su famosa obra "[La traición de las imágenes](https://historia-arte.com/obras/la-traicion-de-las-imagenes)", que muestra una pipa con las palabras "esto no es una pipa" debajo, en francés. +Antes de ver `dplyr`, tenemos que entender lo que es la línea de operaciones ```%>%``` en R porque la vamos a utilizar mucho en nuestros ejemplos. Como decíamos, la línea de operaciones es parte del paquete [magittr](https://cran.r-project.org/web/packages/magrittr/vignettes/magrittr.html) creado por [Stefan Milton Bache](https://stefanbache.dk) y [Hadley Wickham](https://hadley.nz/) y está incluida en tidyverse. Su nombre es un homenaje al pintor surrealista Rene Magritte y su famosa obra "[La traición de las imágenes](https://historia-arte.com/obras/la-traicion-de-las-imagenes)", que muestra una pipa con las palabras "esto no es una pipa" debajo, en francés. La línea de operaciones te permite pasar lo que está a su izquierda como la primera variable en una función especificada a la derecha. Aunque pueda parecer extraño al principio, una vez que lo aprendas verás que hace tu código más fácil de leer al evitar declaraciones anidadas. No te preocupes si esto te resulta un poco complicado ahora. Será más fácil una vez que trabajemos con ejemplos. @@ -482,7 +482,7 @@ Este tutorial debería darte una idea de cómo organizar y manipular tus datos e * Para aprender más sobre el paquete 'ggplot2' puedes consultar la sección "[Visualización de datos](https://cienciadedatos.github.io/r4ds/03-visualize.html)" en el libro _R para Ciencia de Datos_ de Hadley Wickham y Garrett Grolemund. 
-* Tanto la *[Guía para la Presentación de Gráficos Estadísticos](https://www.inei.gob.pe/media/MenuRecursivo/metodologias/libro.pdf),* del Instituto Nacional de Estadística e Informática (2009) así como la [*Gramática de las gráficas: Pistas para mejorar las representaciones de datos*](http://academica-e.unavarra.es/bitstream/handle/2454/15785/Gramática.pdf?sequence=1) de Joaquín Sevilla Moróder ofrecen explicaciones de cómo presentar tus datos y errores a evitar. +* Tanto la *[Guía para la Presentación de Gráficos Estadísticos](https://www.inei.gob.pe/media/MenuRecursivo/metodologias/libro.pdf),* del Instituto Nacional de Estadística e Informática (2009) así como la [*Gramática de las gráficas: Pistas para mejorar las representaciones de datos*](https://academica-e.unavarra.es/bitstream/handle/2454/15785/Gramática.pdf?sequence=1) de Joaquín Sevilla Moróder ofrecen explicaciones de cómo presentar tus datos y errores a evitar. [^1]: En el tutorial original se hace referencia al libro "[The Grammar of Graphics](https://www.springer.com/us/book/9780387245447)" (2005) de Wilkinson. 
diff --git a/es/lecciones/analisis-de-corpus-con-antconc.md b/es/lecciones/analisis-de-corpus-con-antconc.md index 268ab82bca..5fc3b09823 100644 --- a/es/lecciones/analisis-de-corpus-con-antconc.md +++ b/es/lecciones/analisis-de-corpus-con-antconc.md @@ -1,315 +1,315 @@ ---- -title: Análisis de corpus con AntConc -authors: -- Heather Froehlich -date: 2015-11-24 -translation_date: 2018-05-04 -editors: -- Fred Gibbs -reviewers: -- Nabeel Siddiqui -- Rob Sieczkiewicz -translator: -- Carlos Manuel Varón Castañeda -translation-editor: -- Antonio Rojas Castro -translation-reviewer: -- Jennifer Isasi -- Antonio Rojas Castro -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/170 -layout: lesson -original: corpus-analysis-with-antconc -difficulty: 1 -activity: analyzing -topics: [distant-reading] -abstract: "El análisis de corpus permite hacer comparaciones a gran escala entre objetos presentes en los textos; es decir, lo que se conoce como lectura distante." -avatar_alt: Grabado de una estantería con libros -doi: 10.46430/phes0032 ---- - -
    -En 2022, Lauren Anthony sacó AntConc 4.0, una actualización importante del software AntConc. Aunque casi todas las funciones siguen siendo las mismas, algunas de las prácticas que se describen en esta lección han cambiado un poco. Si estás utilizando una versión más reciente de AntConc que la que se muestra a continuación, puedes consultar la guía de ayuda proporcionada por Laurence Anthony (solo disponible en inglés). Este aviso es especialmente relevante para abrir un corpus utilizando la nueva herramienta Corpus Manager (enero de 2022). -
    - -{% include toc.html %} - -## Introducción - -El análisis de corpus es un tipo de análisis de textos que permite hacer comparaciones a gran escala entre objetos presentes en los mismos —esto es, aquello que se conoce como lectura distante—. Lo anterior hace posible apreciar fenómenos que no necesariamente se hacen visibles cuando leemos. Si, por ejemplo, dispones de una colección de documentos, es posible que desearas encontrar patrones de uso gramatical o frases de aparición recurrente en la misma. También puede ser que quisieras hallar frases cuya probabilidad de aparición fuese más alta o más baja en la obra de un autor, o bien en un tipo determinado de textos; clases particulares de estructuras gramaticales; o muchos ejemplos de un concepto particular en una gran cantidad de documentos que se encuentran enmarcados en cierto contexto. En este sentido, el análisis de corpus resulta muy útil para demostrar hipótesis sobre textos, o para triangular resultados obtenidos a través de otras metodologías de análisis textual basadas en herramientas digitales. - -Al finalizar este tutorial, tendrás la capacidad de: - -- Crear o descargar un corpus de textos. -- Realizar una búsqueda de palabras clave en contexto. -- Identificar patrones respecto de una palabra determinada. -- Utilizar criterios de búsqueda más específicos. -- Revisar diferencias estadísticamente significativas entre corpus. -- Efectuar comparaciones multimodales a través de metodologías de análisis propias de la lingüística de corpus. - -Es posible que te hayas acercado a la ejecución de análisis como el que se describe aquí si has realizado alguna de las siguientes tareas: - -- Búsqueda de todas las apariciones de un término específico en un archivo PDF o un documento de Microsoft Word®. -- Uso de [Voyant Tools](http://voyant-tools.org/) para revisar patrones en un texto. -- Lectura y desarrollo de los tutoriales de introducción a Python disponibles en *[The Programming Historian](/es/lecciones/)*. 
- -En muchos sentidos, [Voyant](http://voyant-tools.org/) es una puerta de entrada a la realización de análisis más sofisticados y replicables, ya que la naturaleza de tipo “házlo tú mismo” de los *scripts* en Python o R puede no ser atractiva para todos. [AntConc](http://www.laurenceanthony.net/software/antconc/) llena este vacío en tanto se propone como una aplicación informática independiente para el análisis lingüístico de textos, la cual se encuentra disponible de forma gratuita para los sistemas operativos Windows, Mac OS X y Linux (funciona, por tanto, en múltiples plataformas), y es objeto de actualizaciones permanentes por parte de su creador, [Laurence Anthony](http://www.laurenceanthony.net/)[^1]; si bien existen otras aplicaciones para efectuar análisis de concordancias lingüísticas, se resaltan de AntConc las dos cualidades señaladas (para acceder a recursos adicionales sobre esta temática, véase *[An Introductory Bibliography to Corpus Linguistics](https://hfroehli.ch/2014/05/11/intro-bibliography-corpus-linguistics/)*). - -En este tutorial se presentan varias maneras diferentes de acercarse a un corpus de textos. Es importante tener en cuenta que las metodologías de lingüística de corpus no funcionan en todas las situaciones. Con esto, conforme sigas los pasos propuestos, es conveniente que reflexiones sobre la tarea que estés realizando y cómo puede ser de utilidad para responder una pregunta específica en relación con los datos de los que dispongas. En este sentido, si bien la presente lección está construida bajo la metodología "haz esto y luego esto para lograr *X*", no siempre es necesario seguir en orden estricto los pasos que se muestran aquí: se brinda en este espacio una síntesis general de algunos de los métodos disponibles para realizar análisis de esta naturaleza, en lugar de una receta única para el éxito. - -### Descargas necesarias para el desarrollo de este tutorial - -1. 
Programa: [AntConc](http://www.laurenceanthony.net/software/antconc/)[^2]. - - Descomprime el archivo del programa (si fuere necesario) e inícialo. Las capturas de pantalla presentadas aquí pueden diferir ligeramente de la versión de AntConc que utilices (y del sistema operativo, desde luego), pero los procedimientos son más o menos los mismos en todas las plataformas y versiones recientes de la aplicación. Este tutorial fue escrito teniendo como referente una versión específica (bastante antigua) de AntConc, en tanto consideramos que resulta más fácil de usar para fines introductorios. Puedes emplear la versión más reciente para desarrollar el tutorial si lo tienes a bien; pero, si deseas seguir los pasos con la misma información que presentamos en las capturas de pantalla de esta lección, es necesario que descargues la versión específica que empleamos aquí ([3.2.4](https://www.laurenceanthony.net/software/antconc/releases/AntConc324/)). - -2. Corpus de prueba: descarga este [archivo zip de reseñas cinematográficas](/assets/corpus-analysis-with-antconc/antconc_corpus_files.zip) (escritas en inglés). - -### Presentación sintética de las temáticas abordadas en la lección - -- Trabajar con archivos de texto plano -- Interfaz de usuario y carga de corpus en AntConc -- Búsqueda de palabras clave en contexto -- Búsqueda avanzada de palabras clave en contexto -- Colocaciones y listas de palabras -- Comparación de corpus -- Discusión: hacer comparaciones significativas -- Recursos adicionales - -### Trabajar con archivos de texto plano - -- AntConc solo funciona con archivos de texto plano de extensión .txt (por ejemplo, "Hamlet.txt"); **no puede leer** archivos de extensiones .doc, .docx o .pdf. Por lo tanto, si dispones de documentos de este tipo, deberás convertirlos en archivos .txt. -- La aplicación tiene la capacidad de trabajar con archivos XML (no te preocupes si los desconoces) guardados con la extensión .txt. 
- -Visita tu portal de noticias favorito y accede a un artículo (su naturaleza no importa, siempre que se componga mayoritariamente de texto). Luego, selecciona todo el texto (encabezado, pie de página, cuerpo, etc.), haz clic derecho y selecciona “copiar”. Después, abre un editor de texto como Bloc de notas (Windows) o TextEdit (Mac OS X) y pega allí el texto que copiaste. - -Existen otros editores de texto de uso gratuito, tales como [Notepad++](http://notepad-plus-plus.org/) (Windows) o [TextWrangler](http://www.barebones.com/products/textwrangler/) (Mac OS X), que ostentan funciones más avanzadas y son particularmente útiles para hacer una gran cantidad de tareas de limpieza de texto. Con esto último hacemos referencia a eliminar datos paratextuales tales como el texto *boilerplate* (información que incluye elementos como el título de la página, los datos del editor, etc.), el cual aparece de forma reiterada en muchos artículos. Si, por el contrario, conservas esta información, los datos se verán comprometidos, por cuanto el programa de análisis de texto tomará en cuenta estos términos en recuentos de palabras, análisis estadísticos y relaciones léxicas. A este respecto podrías considerar, por ejemplo, la posibilidad de eliminar los encabezados y pies de página estándar que aparecen en cada página (véase el tutorial [Limpieza de datos con OpenRefine](/es/lecciones/limpieza-de-datos-con-OpenRefine) para más información sobre cómo automatizar esta tarea). Ahora bien, en corpus de menor tamaño podría ser más conveniente que tú mismo hicieras dicha labor; de esa manera, adquirirás una mejor percepción de tu corpus. - -- Guarda el artículo como un archivo .txt en el escritorio. Cabría la posibilidad de que hicieras labores adicionales de limpieza del texto, tales como la remoción de los datos del autor (elimínalos y guarda el archivo nuevamente). 
Recuerda en este sentido que toda la información que permanezca en el archivo puede y será tomada en cuenta por el programa de análisis de texto. -- Ve al escritorio y verifica que puedas encontrar el archivo de texto que guardaste. - -Mediante la ejecución repetida de las tareas anteriores se construye un corpus de archivos de texto plano; esta labor suele implicar el abordaje de asuntos relacionados con muestreo, representatividad y organización. Recuerda: es **necesario** que cada archivo de tu corpus sea de texto plano para que AntConc pueda interpretarlo. A este respecto, se acostumbra nombrar los archivos con la extensión .txt para reconocer fácilmente su naturaleza. - -Como lo supondrás, crear un corpus significativo puede resultar bastante tedioso si este se compone archivo por archivo, en especial si pretendes analizar un conjunto extenso de documentos. Por lo tanto, es muy común hacer *web scraping* (esto es, usar un programa sencillo para tomar archivos de la web de forma automatizada) para construir el corpus; si deseas obtener más información acerca de los conceptos y técnicas asociados a dicha labor, consulta las lecciones [Scraping with Beautiful Soup](/lessons/intro-to-beautiful-soup) y [Automatic Downloading with wget](/lessons/automated-downloading-with-wget), disponibles en *The Programming Historian*. Para efectos de este tutorial, en lugar de componer el corpus documento por documento, vamos a utilizar uno ya existente, compuesto por reseñas cinematográficas y tomado del [Natural Language Processing Toolkit](http://www.nltk.org/) (NLTK). Este corpus se compone de 2000 reseñas, organizadas por su carácter —positivo o negativo—; abordaremos aquí un pequeño subconjunto de ellas (200 de cada categoría). - -La construcción de corpus es un campo de estudio en sí mismo. 
Para más información sobre este tópico, sugerimos consultar "[Representativeness in Corpus Design](https://academic.oup.com/dsh/article-abstract/8/4/243/928942)", Literary and Linguistic Computing, 8 (4): 243-257; y *[Developing Linguistic Corpora: a Guide to Good Practice](http://www.amazon.com/Developing-Linguistic-Corpora-Practice-Guides/dp/1842172050/ref=sr_1_1)*[^3]. - -### Primeros pasos con AntConc: interfaz de usuario y carga de corpus en la aplicación - -Al iniciarse, AntConc se verá como en la siguiente imagen: - -{% include figure.html filename="antconc1.png" caption="Ventana principal de AntConc" %} - -En el costado izquierdo de la pantalla principal hay un cuadro que enlista todos los archivos cargados del corpus, el cual usaremos más adelante. - -La parte superior de la aplicación consta de 7 pestañas: - -- **_Concordance_ (concordancia):** muestra lo que se conoce como *keyword in context view* (vista de palabras clave en contexto [KWIC, por sus iniciales en inglés]), cuyos resultados se obtienen mediante la barra de búsqueda. -- **_Concordance Plot_ (mapa de concordancia):** presenta una visualización muy sencilla de los resultados de la búsqueda de palabras clave en contexto. Las apariciones del término buscado se representarán como pequeñas líneas negras dentro de un rectángulo que representa la extensión total de cada archivo analizado. -- **_File View_ (vista de archivo):** brinda una vista del archivo completo en la que se resaltan las apariciones del término buscado, con lo cual se obtiene una visión más amplia del contexto en el que este aparece. -- **_Clusters_ (clústeres):** muestra palabras que aparecen juntas muy frecuentemente. -- **_Collocates_ (colocaciones)**: mientras que la pestaña anterior muestra palabras que *definitivamente* aparecen juntas en el corpus, esta presenta aquellas que tienen una alta probabilidad de estarlo. -- **_Word List_ (lista de palabras):** muestra todas las palabras del corpus. 
-- **_Keyword List_ (lista de palabras clave):** presenta los resultados de comparaciones entre dos corpus. - -Dado su carácter introductorio, este tutorial solo brinda una mirada superficial a lo que se puede hacer con AntConc. En consecuencia, solo nos concentraremos aquí en las funciones de las pestañas *Concordance*, *Collocates*, *Keywords* y *Word List*. - -#### Carga de corpus - -Tal como sucede con cualquier otro programa informático, comenzaremos por ir a “File” – “Open” (“Archivo” – Abrir); pero en lugar de abrir solo **un** archivo, haremos lo propio con la carpeta que contiene todos los documentos que constituyen el corpus. AntConc permite abrir directorios completos; en consecuencia, si ya tienes conocimiento y te sientes cómodo trabajando de esta manera, puedes abrir la carpeta “All reviews” ("Todas las reseñas") y pasar directamente a la sección de análisis de este tutorial [^4]. - -{% include figure.html filename="open-file-21.png" caption="Apertura de una carpeta." %} - -- Recuerda que guardamos los archivos en el escritorio; dirígete entonces a esa ubicación en el menú desplegable. - -{% include figure.html filename="files-on-desktop-open.png" caption="Apertura de una carpeta localizada en el escritorio." %} - -- Una vez en el escritorio, elige la carpeta “movie reviews from ntlk” ("reseñas cienmatográficas del ntlk"): - -{% include figure.html filename="browse-for-directory-inside-folder.png" caption="Localización de la carpeta *movie reviews from nltk*" %} - -- Ahora, selecciona la carpeta “Negative reviews” ("Reseñas negativas") y haz clic en “OK”. Hecho esto, deberían cargarse 200 archivos de texto en la columna izquierda del programa —confírmalo mediante la casilla “Total No.”—. - -{% include figure.html filename="open-negative-reviews.png" caption="Carga de la carpeta *Negative Reviews*." %} - -- Repite el mismo proceso para cargar la carpeta "Positive Reviews" ("Reseñas positivas"). 
Con esto, deberías tener 400 textos en la columna "Corpus Files". - -{% include figure.html filename="positive-reviews.png" caption="Carga de la carpeta *Positive Reviews*." %} - -{% include figure.html filename="all-reviews-loaded.png" caption="Conjunto completo de reseñas cargadas en el programa." %} - -## Búsqueda de palabras clave en contexto - -### Comenzar con una búsqueda básica - -Una de las labores en las cuales se destacan las herramientas de análisis de corpus como AntConc radica en encontrar patrones en el uso de la lengua que nos resulta difícil identificar como lectores. Nos es complicado rastrear palabras pequeñas y en apariencia poco importantes, tales como 'yo', 'él', 'ella', 'un' y 'es' porque son muy comunes, pero los computadores son muy buenos para realizar esta labor. Estos términos, que en lingüística reciben el nombre de palabras funcionales —se conocen como palabras vacías (*stopwords*) en el ámbito de las humanidades digitales—, suelen constituir indicadores estilísticos muy claros en materias de autoría y género en los textos. En consecuencia, tales palabras pueden ser términos de búsqueda bastante potentes por sí solos, o bien combinados con términos que se relacionen en mayor medida con el contenido (*content-driven terms*), lo cual ayuda al investigador a identificar patrones que tal vez no haya detectado previamente. - -En la pestaña *Concordance*, escribe la palabra 'the' en el cuadro de búsqueda ubicado en la parte inferior y haz clic en “Start”. Acto seguido, el programa mostrará cada una de las apariciones de dicho término en el corpus de reseñas cinematográficas, así como el contexto en el que estas se presentan. Esto recibe el nombre de "visor de palabras clave en contexto" (*keywords in context viewer*). - -{% include figure.html filename="the-thinking.png" caption="*The* es una palabra común en la lengua inglesa." 
%} - -La palabra buscada aparece 14.618 veces en el corpus según la casilla *Concordance Hits*, que se encuentra en la parte inferior de la pestaña. - -Como se indicó anteriormente, la lista KWIC resulta una buena forma de comenzar a buscar patrones. Aunque la cantidad de información suministrada con la búsqueda es aún muy grande, ¿qué tipo de palabras aparecen cerca de 'the'? - -Ahora, prueba a hacer una búsqueda del término 'a'; tanto este último como 'the' son artículos en la lengua inglesa, pero el primero es definido y el segundo indefinido; y los resultados arrojados por la búsqueda ilustrarán esa diferencia. - -Llegados a este punto, ya debes estar familiarizado con las líneas de texto que componen la vista KWIC. Ahora, realiza una nueva búsqueda, esta vez de la palabra 'shot': los resultados mostrarán las apariciones del término tanto en la función sintáctica de sustantivo (por ejemplo, “line up the shot”) como en la de verbo conjugado (por ejemplo, "this scene was shot carefully"). - -¿Qué ves? Entendemos que esta puede ser una forma de identificar patrones difícil de intepretar. Intenta presionar el botón amarillo “Sort” (clasificar): ¿qué sucede al hacerlo? - -{% include figure.html filename="sorting-shot-1l1r.png" caption="Palabras que aparecen junto a *shot*." %} - -Puedes ajustar la forma en que AntConc ordena la información encontrada si cambias los parámetros que en la imagen anterior aparecen encerrados en el círculo de color rojo: L corresponde a izquierda (*left*) y R a derecha (*right*); lo anterior puede extenderse hasta 5 posiciones en cualquier dirección. Los valores por defecto de la aplicación son 1 izquierda (1L), 2 derecha (2R), 3 derecha (3R); pero puedes alterarlos, por ejemplo, a 3 izquierda (3L), 2 izquierda (2L), 1 derecha (1R) (en aras de obtener frases o trigramas que finalicen con el término buscado) si haces clic en las flechas hacia arriba y abajo que se encuentran junto a los parámetros. 
Si no deseas realizar este tipo de clasificación, puedes omitirla (dejar los valores predeterminados 1L, 2R y 3R) o dejar todos los parámetros con el valor 0. Cabe la posibilidad de generar clasificaciones menos lineales, como 4L, 3R, 5R, que arrojarían como resultado mucha más información del contexto. El programa puede tardar un poco en mostrar este tipo de clasificaciones, por lo que sugerimos tener paciencia al efectuarlas. Si no estás seguro de cuáles serán los resultados arrojados por la búsqueda, haz clic en "Sort" para ver qué ocurre y efectúa los ajustes a los que haya lugar según tus necesidades. - -### Operadores de búsqueda - -#### Operador * (comodín) - -El operador * (que sirve para buscar 0 o más caracteres) puede ayudar a encontrar las formas de sustantivos en singular y plural, por ejemplo. - -**Tarea:** busca _qualit*_ y ordena los resultados. ¿Qué tiende a preceder y seguir a las palabras 'quality' y 'qualities'? Una pista: son vocablos diferentes con contextos de uso distintos; identifica patrones de uso mediante la búsqueda KWIC. - -Para obtener una lista completa de los operadores comodín disponibles y su función, revisa "Global Settings" – "Wildcard Settings". - -{% include figure.html filename="wildcard-settings.png" caption="Configuración de operadores de búsqueda." %} - -Para conocer la diferencia entre los operadores * y ?, busca _th*n_ y luego _th?n_. Estas dos búsquedas, que a simple vista parecieran muy similares, arrojan resultados distintos. - -El operador *?* es más específico que *, así: - -*wom?n* – 'women' y 'woman'. - -*m?n* – 'man', 'men' y 'min'. - -Una búsqueda de _m*n_, en cambio, no es útil porque se obtendrán resultados que incluirán 'mean', 'melon', etc. - -**Tarea:** compara los resultados de las búsquedas de *wom?n* y *m?n*. 
- -- Ordena los resultados de cada búsqueda de manera que arrojen datos significativos (por ejemplo, configurar los parámetros de la búsqueda en 0, 1L y 2L) - -- Haz clic en "File" – "Save Output to Text File" y guarda el archivo (no olvides agregar la extensión .txt al nombre del mismo). - -> Sugerencia: durante la exploración en tu investigación, generarás muchos documentos como este para efectos de consulta. Es conveniente, por tanto, nombrar los archivos de tal manera que se describa lo que estos contienen (por ejemplo, “wom?n-results.txt” en lugar de “antconc-results.txt”). - -{% include figure.html filename="save-output-as-text-file.png" caption="Opción *Save output as text file*." %} - -{% include figure.html filename="save-as.png" caption="Cuadro de diálogo *Save As*." %} - -Con lo anterior, puedes abrir el archivo de texto plano generado por el programa en un editor de texto; es posible que debas ampliar la ventana de la aplicación para que este sea legible. - -{% include figure.html filename="results.png" caption="Archivo de resultados de búsqueda KWIC exportado por Antconc, tal como se muestra en un editor de texto." %} - -Realiza el proceso anterior con los resultados de las dos búsquedas y compara los archivos de texto generados. ¿Qué fenómenos puedes ver? - -#### Operador | ("o") - -**Tarea:** busca _she\|he_. - -Ahora, busca las dos palabras anteriores por separado: ¿cuántas veces aparece 'she' en comparación con 'he'? - -La palabra 'she' (ella) aparece en mucha menor cantidad que 'he' (él). ¿Por qué? ¡Esa es una pregunta de investigación! Una buena manera de ampliar este cuestionamiento podría radicar en ordenar la búsqueda anterior para identificar patrones de uso de las palabras en cuestión, y revisar si las mismas están seguidas de algún verbo en particular. 
- -**Tarea:** a modo de práctica, busca una palabra que te interese, ordena los resultados de formas diferentes, usa los operadores comodín y exporta los datos obtenidos como archivos de texto plano. He aquí un interrogante orientador: ¿qué tipo de patrones puedes observar? ¿Puedes explicarlos? - -### Colocaciones y listas de palabras -Después de haber analizado las líneas de resultados de la vista KWIC en busca de patrones, ¿no te gustaría que hubiera una forma de que el computador te brindara una lista de palabras que aparecen más frecuentemente con la palabra clave buscada? - -Buenas noticias: existe una manera de obtener esta información en AntConc; está disponible en la pestaña *Collocates* (colocaciones). Al hacer clic en la misma, aparecerá un mensaje por medio del cual la aplicación dirá que necesita crear una lista de palabras. Haz clic en "OK" y el programa lo hará automáticamente. - -> Nota: solo recibirás este aviso cuando no hayas creado una lista de palabras. - -{% include figure.html filename="wordlistwarning.png" caption="Mensaje de advertencia para indicar la necesidad de generar una lista de palabras." %} - -Ahora, intenta generar la lista de colocaciones para el término 'she'. - -Los resultados sin clasificar parecerán comenzar con palabras funcionales (palabras con las que se construyen frases) y luego pasarán a palabras de contenido (términos que dan sentido al texto): las primeras son [las más frecuentes en inglés](http://www.wordfrequency.info/free.asp), en tanto funcionan mayormente como elementos para construir frases. Versiones más recientes de AntConc suelen incluir el término buscado como primer resultado, posiblemente porque está presente en el texto y se quiere hallar palabras que puedan aparecer junto a él. - -Algunas personas podrían tener la intención de prescindir de esta clase de palabras mediante el uso de una lista de palabras funcionales (esta es una labor común cuando se hace modelado de tópicos). 
Desde nuestra óptica, no promovemos esta práctica porque los computadores se destacan, justamente, en la identificación de palabras con alta frecuencia de aparición; tal como se expresó anteriormente, tendemos a pasarlas por alto. Los computadores —y en especial las aplicaciones como AntConc—, pueden mostrar dónde aparecen o no estas palabras, y esa información puede ser de interés, especialmente en colecciones de texto de gran envergadura (como se vio con las búsquedas de 'a', 'she' y 'he'). - -No obstante, en el caso de la lengua inglesa, la frecuencia de aparición de la letra 's' en el corpus también puede ser bastante alta, en tanto representa el posesivo *ʼs* (la aplicación no toma en cuenta el apóstrofo), pero AntConc la toma como otra palabra. Asimismo, la forma *ʼt* puede aparecer junto al verbo 'do' por cuanto conforman la contracción *donʼt*; la alta frecuencia de su aparición conjunta los convierte en colocaciones altamente probables. - -**Tarea:** genera la lista de colocaciones para las búsquedas de *m?n* y *wom?n*. Ahora, ordénalas de acuerdo con su frecuencia de aparición respecto del parámetro 1L. -Los resultados muestran lo que, en teoría, hace que un hombre (*man*) o una mujer (*woman*) sea “digno de mostrarse en el cine”: -- las mujeres deben ser "bellas" (beautiful), "sofisticadas" (*sophisticated*) o estar "embarazadas" (*pregnant*). -- Los hombres tienen que estar, en cierto modo, fuera de lo común: deben ser "santos" (*holy*), "negros" (*black*) o "viejos" (*old*). - -Lo anterior no alude directamente a las películas, sino a la forma como se escribe sobre ellas en las reseñas, y puede llevar a cuestionamientos más sutiles, tales como "¿de qué manera se describen los roles de las mujeres en las comedias románticas en las reseñas escritas por hombres frente a las escritas por mujeres?" - -### Comparación de corpus - -Uno de los tipos de análisis más potentes radica en comparar el corpus propio con uno de referencia más extenso. 
- -Para este ejercicio, hemos tomado reseñas de filmes en los que Steven Spielberg ha estado involucrado (como director o productor). Podemos compararlos con un corpus de referencia de películas de toda una gama de directores. - -Asegúrate de pensar cuidadosamente sobre las características que podría tener un corpus de referencia para tu propia investigación (por ejemplo, un estudio del lenguaje de Agatha Christie en sus últimos años funcionaría muy bien como un corpus de análisis para compararlo con un corpus de referencia de todas sus novelas). Recuerda que, como lo expresamos anteriormente, la construcción del corpus es un subcampo en sí mismo. - -- Dirígete a "Settings" – "Tool preferences" – "Keyword List". -- Asegúrate de que la casilla de verificación "Use raw files" esté seleccionada en el menú "Reference Corpus". -- Haz clic en el botón "Add Directory" y selecciona la carpeta que contiene los archivos del corpus de referencia. -- Verifica que dispongas de la lista completa de archivos en el listado que se mostrará. - -{% include figure.html filename="adding-a-reference-corpus.png" caption="Carga de un corpus de referencia." %} - -- Haz clic en el botón "Load" y espera que el programa cargue los archivos; una vez la casilla de verificación "Loaded" esté marcada, haz clic en "Apply". - -Existe la posibilidad de intercambiar los roles del corpus de referencia y los archivos principales (es decir, dar al primero la función de los segundos y viceversa) por medio del botón "Swap Ref/Main Files"; en este punto vale la pena experimentar con esta opción y comparar los resultados obtenidos. - -> Si estás utilizando una versión más reciente del programa, el botón anterior puede llamarse "Swap with Target Files". Adicionalmente, cualesquiera sean los datos que vayas a utilizar como corpus de referencia, asegúrate de que estos se carguen correctamente en AntConc (esto es, haz clic en el botón "Load" cada vez que cargues o intercambies un corpus). 
- -- Dirígete a la pestaña "Keyword list" y una vez allí, presiona el botón "Start" (sin escribir nada en la casilla de búsqueda). Si intercambiaste el corpus de referencia con los archivos objeto del análisis, el programa anunciará la necesidad de crear una nueva lista de palabras antes de generar la lista de palabras clave. Esta se compondrá de aquellos términos que resulten mucho más "inusuales" —de aparición menos probable en terminos estadísticos— en el corpus que se está viendo *vs.* el de referencia. - -> *Keyness* (calidad de la palabra clave): corresponde a la frecuencia de aparición de una palabra en el texto cuando se la compara con su frecuencia en un corpus de referencia, "de tal suerte que la probabilidad estadística, calculada mediante un procedimiento determinado, es menor o igual que el valor *p* especificado por el usuario" (información tomada de [este sitio](http://www.lexically.net/downloads/version6/HTML/index.html?keyness_definition.htm)). Para profundizar sobre los detalles estadísticos de este tópico, sugerimos revisar la sección sobre el mismo en la página 7 del [archivo *Readme* de AntConc](https://www.laurenceanthony.net/software/antconc/releases/AntConc335/help.pdf). - -¿Cuáles son nuestras palabras clave? - -{% include figure.html filename="spielberg-vs-movie-reviews.png" caption="Spielberg *vs.* reseñas cinematográficas." %} - -## Discusión: hacer comparaciones significativas - -Es importante tener en cuenta que la forma en que se organicen los archivos de texto para la investigación tendrá efectos en el tipo de interrogantes que puedan surgir de los mismos, así como en los resultados que se obtengan del análisis. A este respecto, recuerda que la comparación realizada aquí entre reseñas negativas y positivas es extremadamente simple; si se quisiere, podrían efectuarse comparaciones adicionales con otros subconjuntos de reseñas, lo cual daría pie a la formulación de interrogantes muy distintos. 
- -Así entonces, los archivos que se dispongan en el corpus determinarán los resultados obtenidos. Reiteramos que los temas de representatividad y muestreo son muy relevantes en este sentido: no siempre es necesario o ideal utilizar todo un conjunto de datos, incluso si se dispone de él. En este punto, realmente cabe preguntarse por la manera como estos métodos de análisis textual ayudan a generar preguntas de investigación. - -Si se piensa, por ejemplo, en el funcionamiento de las reseñas cinematográficas en tanto género discursivo, puede dirigirse la atención hacia oposiciones como las siguientes: - -- Reseñas cinematográficas *vs.* reseñas musicales -- Reseñas cinematográficas *vs.* reseñas de libros -- Reseñas cinematográficas *vs.* noticias deportivas -- Reseñas cinematográficas *vs.* noticias en general - -Cada una de estas comparaciones aportará información distinta y puede derivar en preguntas de investigación diferentes, tales como: - -- ¿En qué difieren las reseñas cinematográficas de otros tipos de reseñas de productos mediáticos? - -- ¿En qué se diferencian las reseñas cinematográficas de otros tipos de escritos susceptibles de publicarse? -- ¿Cómo se comparan las reseñas de películas con otros géneros de escritura, tales como la crónica deportiva? -- ¿Qué tienen en común las reseñas cinematográficas y las musicales? - -Desde luego, puede darse la vuelta a estos cuestionamientos para generar nuevas preguntas: - -- ¿En qué se diferencian las reseñas bibliográficas de las cinematográficas? - -- ¿En qué difieren las reseñas musicales de las cinematográficas? -- ¿Qué tienen en común los artículos que se publican en la prensa escrita? -- ¿En qué se asemejan las reseñas cinematográficas a otros tipos de escritos susceptibles de publicarse? - -En síntesis, vale la pena pensar en: - -- Por qué se quiere comparar dos corpus. -- Qué tipo de consultas da lugar a preguntas de investigación significativas. 
-- Principios de construcción de corpus: muestreo y capacidad de asegurar que se obtengan datos representativos. - -### Recursos adicionales - -*[A Short Bibliography on Corpus Linguistics](http://hfroehlich.wordpress.com/2014/05/11/intro-bibliography-corpus-linguistics/)* - -[Una versión más sencilla de este tutorial, concebida para usuarios con pocos conocimientos de computación](http://hfroehli.ch/workshops/getting-started-with-antconc/) (en inglés). - -*[Guía rápida de análisis de corpus con AntConc](https://rua.ua.es/dspace/bitstream/10045/43959/4/grac.pdf)*, publicada por la Universidad de Alicante (2015). - -## Notas de traducción - -[^1]: Investigador y docente de la Universidad de Waseda (Japón). -[^2]: La interfaz del programa solo está disponible en inglés. -[^3]: Dos materiales en español pueden ser de utilidad si se desea profundizar en esta témática: de un lado, la conferencia *[Aproximación al concepto de representatividad de corpus](https://www.youtube.com/watch?v=bvTigjPhZco)*; y de otro, la obra *[Explotación de los córpora textuales informatizados para la creación de bases de datos terminológicas basadas en el conocimiento](https://web.archive.org/web/20150926235725/https://elies.rediris.es/elies18/)*. -[^4]: Si se requiere trabajar con corpus en cuyos textos se emplean caracteres especiales (como es el caso de los documentos escritos en lengua española), es imperativo prestar atención a la codificación con la cual se guardaron los archivos que los componen. Por defecto, AntConc está configurado para operar con documentos de texto plano con codificación Unicode (UTF-8). Así entonces, es preciso verificar en el editor de texto que estos se hayan guardado atendiendo a lo anterior, o bien cambiar los parámetros de importación de archivos en el programa según las necesidades (por ejemplo, trabajar con archivos codificados en ANSI). 
+--- +title: Análisis de corpus con AntConc +authors: +- Heather Froehlich +date: 2015-11-24 +translation_date: 2018-05-04 +editors: +- Fred Gibbs +reviewers: +- Nabeel Siddiqui +- Rob Sieczkiewicz +translator: +- Carlos Manuel Varón Castañeda +translation-editor: +- Antonio Rojas Castro +translation-reviewer: +- Jennifer Isasi +- Antonio Rojas Castro +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/170 +layout: lesson +original: corpus-analysis-with-antconc +difficulty: 1 +activity: analyzing +topics: [distant-reading] +abstract: "El análisis de corpus permite hacer comparaciones a gran escala entre objetos presentes en los textos; es decir, lo que se conoce como lectura distante." +avatar_alt: Grabado de una estantería con libros +doi: 10.46430/phes0032 +--- + +
    +En 2022, Laurence Anthony sacó AntConc 4.0, una actualización importante del software AntConc. Aunque casi todas las funciones siguen siendo las mismas, algunas de las prácticas que se describen en esta lección han cambiado un poco. Si estás utilizando una versión más reciente de AntConc que la que se muestra a continuación, puedes consultar la guía de ayuda proporcionada por Laurence Anthony (solo disponible en inglés). Este aviso es especialmente relevante para abrir un corpus utilizando la nueva herramienta Corpus Manager (enero de 2022). +
    + +{% include toc.html %} + +## Introducción + +El análisis de corpus es un tipo de análisis de textos que permite hacer comparaciones a gran escala entre objetos presentes en los mismos —esto es, aquello que se conoce como lectura distante—. Lo anterior hace posible apreciar fenómenos que no necesariamente se hacen visibles cuando leemos. Si, por ejemplo, dispones de una colección de documentos, es posible que desearas encontrar patrones de uso gramatical o frases de aparición recurrente en la misma. También puede ser que quisieras hallar frases cuya probabilidad de aparición fuese más alta o más baja en la obra de un autor, o bien en un tipo determinado de textos; clases particulares de estructuras gramaticales; o muchos ejemplos de un concepto particular en una gran cantidad de documentos que se encuentran enmarcados en cierto contexto. En este sentido, el análisis de corpus resulta muy útil para demostrar hipótesis sobre textos, o para triangular resultados obtenidos a través de otras metodologías de análisis textual basadas en herramientas digitales. + +Al finalizar este tutorial, tendrás la capacidad de: + +- Crear o descargar un corpus de textos. +- Realizar una búsqueda de palabras clave en contexto. +- Identificar patrones respecto de una palabra determinada. +- Utilizar criterios de búsqueda más específicos. +- Revisar diferencias estadísticamente significativas entre corpus. +- Efectuar comparaciones multimodales a través de metodologías de análisis propias de la lingüística de corpus. + +Es posible que te hayas acercado a la ejecución de análisis como el que se describe aquí si has realizado alguna de las siguientes tareas: + +- Búsqueda de todas las apariciones de un término específico en un archivo PDF o un documento de Microsoft Word®. +- Uso de [Voyant Tools](https://voyant-tools.org/) para revisar patrones en un texto. 
+- Lectura y desarrollo de los tutoriales de introducción a Python disponibles en *[The Programming Historian](/es/lecciones/)*. + +En muchos sentidos, [Voyant](https://voyant-tools.org/) es una puerta de entrada a la realización de análisis más sofisticados y replicables, ya que la naturaleza de tipo “hazlo tú mismo” de los *scripts* en Python o R puede no ser atractiva para todos. [AntConc](https://www.laurenceanthony.net/software/antconc/) llena este vacío en tanto se propone como una aplicación informática independiente para el análisis lingüístico de textos, la cual se encuentra disponible de forma gratuita para los sistemas operativos Windows, Mac OS X y Linux (funciona, por tanto, en múltiples plataformas), y es objeto de actualizaciones permanentes por parte de su creador, [Laurence Anthony](https://www.laurenceanthony.net/)[^1]; si bien existen otras aplicaciones para efectuar análisis de concordancias lingüísticas, se resaltan de AntConc las dos cualidades señaladas (para acceder a recursos adicionales sobre esta temática, véase *[An Introductory Bibliography to Corpus Linguistics](https://hfroehli.ch/2014/05/11/intro-bibliography-corpus-linguistics/)*). + +En este tutorial se presentan varias maneras diferentes de acercarse a un corpus de textos. Es importante tener en cuenta que las metodologías de lingüística de corpus no funcionan en todas las situaciones. Con esto, conforme sigas los pasos propuestos, es conveniente que reflexiones sobre la tarea que estés realizando y cómo puede ser de utilidad para responder una pregunta específica en relación con los datos de los que dispongas. En este sentido, si bien la presente lección está construida bajo la metodología "haz esto y luego esto para lograr *X*", no siempre es necesario seguir en orden estricto los pasos que se muestran aquí: se brinda en este espacio una síntesis general de algunos de los métodos disponibles para realizar análisis de esta naturaleza, en lugar de una receta única para el éxito. 
+ +### Descargas necesarias para el desarrollo de este tutorial + +1. Programa: [AntConc](https://www.laurenceanthony.net/software/antconc/)[^2]. + + Descomprime el archivo del programa (si fuere necesario) e inícialo. Las capturas de pantalla presentadas aquí pueden diferir ligeramente de la versión de AntConc que utilices (y del sistema operativo, desde luego), pero los procedimientos son más o menos los mismos en todas las plataformas y versiones recientes de la aplicación. Este tutorial fue escrito teniendo como referente una versión específica (bastante antigua) de AntConc, en tanto consideramos que resulta más fácil de usar para fines introductorios. Puedes emplear la versión más reciente para desarrollar el tutorial si lo tienes a bien; pero, si deseas seguir los pasos con la misma información que presentamos en las capturas de pantalla de esta lección, es necesario que descargues la versión específica que empleamos aquí ([3.2.4](https://www.laurenceanthony.net/software/antconc/releases/AntConc324/)). + +2. Corpus de prueba: descarga este [archivo zip de reseñas cinematográficas](/assets/corpus-analysis-with-antconc/antconc_corpus_files.zip) (escritas en inglés). + +### Presentación sintética de las temáticas abordadas en la lección + +- Trabajar con archivos de texto plano +- Interfaz de usuario y carga de corpus en AntConc +- Búsqueda de palabras clave en contexto +- Búsqueda avanzada de palabras clave en contexto +- Colocaciones y listas de palabras +- Comparación de corpus +- Discusión: hacer comparaciones significativas +- Recursos adicionales + +### Trabajar con archivos de texto plano + +- AntConc solo funciona con archivos de texto plano de extensión .txt (por ejemplo, "Hamlet.txt"); **no puede leer** archivos de extensiones .doc, .docx o .pdf. Por lo tanto, si dispones de documentos de este tipo, deberás convertirlos en archivos .txt. 
+- La aplicación tiene la capacidad de trabajar con archivos XML (no te preocupes si los desconoces) guardados con la extensión .txt. + +Visita tu portal de noticias favorito y accede a un artículo (su naturaleza no importa, siempre que se componga mayoritariamente de texto). Luego, selecciona todo el texto (encabezado, pie de página, cuerpo, etc.), haz clic derecho y selecciona “copiar”. Después, abre un editor de texto como Bloc de notas (Windows) o TextEdit (Mac OS X) y pega allí el texto que copiaste. + +Existen otros editores de texto de uso gratuito, tales como [Notepad++](https://notepad-plus-plus.org/) (Windows) o [TextWrangler](https://www.barebones.com/products/textwrangler/) (Mac OS X), que ostentan funciones más avanzadas y son particularmente útiles para hacer una gran cantidad de tareas de limpieza de texto. Con esto último hacemos referencia a eliminar datos paratextuales tales como el texto *boilerplate* (información que incluye elementos como el título de la página, los datos del editor, etc.), el cual aparece de forma reiterada en muchos artículos. Si, por el contrario, conservas esta información, los datos se verán comprometidos, por cuanto el programa de análisis de texto tomará en cuenta estos términos en recuentos de palabras, análisis estadísticos y relaciones léxicas. A este respecto podrías considerar, por ejemplo, la posibilidad de eliminar los encabezados y pies de página estándar que aparecen en cada página (véase el tutorial [Limpieza de datos con OpenRefine](/es/lecciones/limpieza-de-datos-con-OpenRefine) para más información sobre cómo automatizar esta tarea). Ahora bien, en corpus de menor tamaño podría ser más conveniente que tú mismo hicieras dicha labor; de esa manera, adquirirás una mejor percepción de tu corpus. + +- Guarda el artículo como un archivo .txt en el escritorio. 
Cabría la posibilidad de que hicieras labores adicionales de limpieza del texto, tales como la remoción de los datos del autor (elimínalos y guarda el archivo nuevamente). Recuerda en este sentido que toda la información que permanezca en el archivo puede y será tomada en cuenta por el programa de análisis de texto. +- Ve al escritorio y verifica que puedas encontrar el archivo de texto que guardaste. + +Mediante la ejecución repetida de las tareas anteriores se construye un corpus de archivos de texto plano; esta labor suele implicar el abordaje de asuntos relacionados con muestreo, representatividad y organización. Recuerda: es **necesario** que cada archivo de tu corpus sea de texto plano para que AntConc pueda interpretarlo. A este respecto, se acostumbra nombrar los archivos con la extensión .txt para reconocer fácilmente su naturaleza. + +Como lo supondrás, crear un corpus significativo puede resultar bastante tedioso si este se compone archivo por archivo, en especial si pretendes analizar un conjunto extenso de documentos. Por lo tanto, es muy común hacer *web scraping* (esto es, usar un programa sencillo para tomar archivos de la web de forma automatizada) para construir el corpus; si deseas obtener más información acerca de los conceptos y técnicas asociados a dicha labor, consulta las lecciones [Scraping with Beautiful Soup](/en/lessons/intro-to-beautiful-soup) y [Automatic Downloading with wget](/en/lessons/automated-downloading-with-wget), disponibles en *The Programming Historian*. Para efectos de este tutorial, en lugar de componer el corpus documento por documento, vamos a utilizar uno ya existente, compuesto por reseñas cinematográficas y tomado del [Natural Language Processing Toolkit](https://www.nltk.org/) (NLTK). Este corpus se compone de 2000 reseñas, organizadas por su carácter —positivo o negativo—; abordaremos aquí un pequeño subconjunto de ellas (200 de cada categoría). + +La construcción de corpus es un campo de estudio en sí mismo. 
Para más información sobre este tópico, sugerimos consultar "[Representativeness in Corpus Design](https://academic.oup.com/dsh/article-abstract/8/4/243/928942)", Literary and Linguistic Computing, 8 (4): 243-257; y *[Developing Linguistic Corpora: a Guide to Good Practice](https://www.amazon.com/Developing-Linguistic-Corpora-Practice-Guides/dp/1842172050/ref=sr_1_1)*[^3]. + +### Primeros pasos con AntConc: interfaz de usuario y carga de corpus en la aplicación + +Al iniciarse, AntConc se verá como en la siguiente imagen: + +{% include figure.html filename="antconc1.png" caption="Ventana principal de AntConc" %} + +En el costado izquierdo de la pantalla principal hay un cuadro que enlista todos los archivos cargados del corpus, el cual usaremos más adelante. + +La parte superior de la aplicación consta de 7 pestañas: + +- **_Concordance_ (concordancia):** muestra lo que se conoce como *keyword in context view* (vista de palabras clave en contexto [KWIC, por sus iniciales en inglés]), cuyos resultados se obtienen mediante la barra de búsqueda. +- **_Concordance Plot_ (mapa de concordancia):** presenta una visualización muy sencilla de los resultados de la búsqueda de palabras clave en contexto. Las apariciones del término buscado se representarán como pequeñas líneas negras dentro de un rectángulo que representa la extensión total de cada archivo analizado. +- **_File View_ (vista de archivo):** brinda una vista del archivo completo en la que se resaltan las apariciones del término buscado, con lo cual se obtiene una visión más amplia del contexto en el que este aparece. +- **_Clusters_ (clústeres):** muestra palabras que aparecen juntas muy frecuentemente. +- **_Collocates_ (colocaciones)**: mientras que la pestaña anterior muestra palabras que *definitivamente* aparecen juntas en el corpus, esta presenta aquellas que tienen una alta probabilidad de estarlo. +- **_Word List_ (lista de palabras):** muestra todas las palabras del corpus. 
+- **_Keyword List_ (lista de palabras clave):** presenta los resultados de comparaciones entre dos corpus. + +Dado su carácter introductorio, este tutorial solo brinda una mirada superficial a lo que se puede hacer con AntConc. En consecuencia, solo nos concentraremos aquí en las funciones de las pestañas *Concordance*, *Collocates*, *Keyword List* y *Word List*. + +#### Carga de corpus + +Tal como sucede con cualquier otro programa informático, comenzaremos por ir a “File” – “Open” (“Archivo” – “Abrir”); pero en lugar de abrir solo **un** archivo, haremos lo propio con la carpeta que contiene todos los documentos que constituyen el corpus. AntConc permite abrir directorios completos; en consecuencia, si ya tienes conocimiento y te sientes cómodo trabajando de esta manera, puedes abrir la carpeta “All reviews” ("Todas las reseñas") y pasar directamente a la sección de análisis de este tutorial[^4]. + +{% include figure.html filename="open-file-21.png" caption="Apertura de una carpeta." %} + +- Recuerda que guardamos los archivos en el escritorio; dirígete entonces a esa ubicación en el menú desplegable. + +{% include figure.html filename="files-on-desktop-open.png" caption="Apertura de una carpeta localizada en el escritorio." %} + +- Una vez en el escritorio, elige la carpeta “movie reviews from ntlk” ("reseñas cinematográficas del ntlk"): + +{% include figure.html filename="browse-for-directory-inside-folder.png" caption="Localización de la carpeta *movie reviews from nltk*" %} + +- Ahora, selecciona la carpeta “Negative reviews” ("Reseñas negativas") y haz clic en “OK”. Hecho esto, deberían cargarse 200 archivos de texto en la columna izquierda del programa —confírmalo mediante la casilla “Total No.”—. + +{% include figure.html filename="open-negative-reviews.png" caption="Carga de la carpeta *Negative Reviews*." %} + +- Repite el mismo proceso para cargar la carpeta "Positive Reviews" ("Reseñas positivas"). 
Con esto, deberías tener 400 textos en la columna "Corpus Files". + +{% include figure.html filename="positive-reviews.png" caption="Carga de la carpeta *Positive Reviews*." %} + +{% include figure.html filename="all-reviews-loaded.png" caption="Conjunto completo de reseñas cargadas en el programa." %} + +## Búsqueda de palabras clave en contexto + +### Comenzar con una búsqueda básica + +Una de las labores en las cuales se destacan las herramientas de análisis de corpus como AntConc radica en encontrar patrones en el uso de la lengua que nos resulta difícil identificar como lectores. Nos es complicado rastrear palabras pequeñas y en apariencia poco importantes, tales como 'yo', 'él', 'ella', 'un' y 'es' porque son muy comunes, pero los computadores son muy buenos para realizar esta labor. Estos términos, que en lingüística reciben el nombre de palabras funcionales —se conocen como palabras vacías (*stopwords*) en el ámbito de las humanidades digitales—, suelen constituir indicadores estilísticos muy claros en materias de autoría y género en los textos. En consecuencia, tales palabras pueden ser términos de búsqueda bastante potentes por sí solos, o bien combinados con términos que se relacionen en mayor medida con el contenido (*content-driven terms*), lo cual ayuda al investigador a identificar patrones que tal vez no haya detectado previamente. + +En la pestaña *Concordance*, escribe la palabra 'the' en el cuadro de búsqueda ubicado en la parte inferior y haz clic en “Start”. Acto seguido, el programa mostrará cada una de las apariciones de dicho término en el corpus de reseñas cinematográficas, así como el contexto en el que estas se presentan. Esto recibe el nombre de "visor de palabras clave en contexto" (*keywords in context viewer*). + +{% include figure.html filename="the-thinking.png" caption="*The* es una palabra común en la lengua inglesa." 
%} + +La palabra buscada aparece 14.618 veces en el corpus según la casilla *Concordance Hits*, que se encuentra en la parte inferior de la pestaña. + +Como se indicó anteriormente, la lista KWIC resulta una buena forma de comenzar a buscar patrones. Aunque la cantidad de información suministrada con la búsqueda es aún muy grande, ¿qué tipo de palabras aparecen cerca de 'the'? + +Ahora, prueba a hacer una búsqueda del término 'a'; tanto este último como 'the' son artículos en la lengua inglesa, pero 'the' es definido y 'a' es indefinido; los resultados arrojados por la búsqueda ilustrarán esa diferencia. + +Llegados a este punto, ya debes estar familiarizado con las líneas de texto que componen la vista KWIC. Ahora, realiza una nueva búsqueda, esta vez de la palabra 'shot': los resultados mostrarán las apariciones del término tanto en la función sintáctica de sustantivo (por ejemplo, “line up the shot”) como en la de verbo conjugado (por ejemplo, "this scene was shot carefully"). + +¿Qué ves? Entendemos que esta puede ser una forma de identificar patrones difícil de interpretar. Intenta presionar el botón amarillo “Sort” (clasificar): ¿qué sucede al hacerlo? + +{% include figure.html filename="sorting-shot-1l1r.png" caption="Palabras que aparecen junto a *shot*." %} + +Puedes ajustar la forma en que AntConc ordena la información encontrada si cambias los parámetros que en la imagen anterior aparecen encerrados en el círculo de color rojo: L corresponde a izquierda (*left*) y R a derecha (*right*); lo anterior puede extenderse hasta 5 posiciones en cualquier dirección. Los valores por defecto de la aplicación son 1 izquierda (1L), 2 derecha (2R), 3 derecha (3R); pero puedes alterarlos, por ejemplo, a 3 izquierda (3L), 2 izquierda (2L), 1 derecha (1R) (en aras de obtener frases o trigramas que finalicen con el término buscado) si haces clic en las flechas hacia arriba y abajo que se encuentran junto a los parámetros. 
Si no deseas realizar este tipo de clasificación, puedes omitirla (dejar los valores predeterminados 1L, 2R y 3R) o dejar todos los parámetros con el valor 0. Cabe la posibilidad de generar clasificaciones menos lineales, como 4L, 3R, 5R, que arrojarían como resultado mucha más información del contexto. El programa puede tardar un poco en mostrar este tipo de clasificaciones, por lo que sugerimos tener paciencia al efectuarlas. Si no estás seguro de cuáles serán los resultados arrojados por la búsqueda, haz clic en "Sort" para ver qué ocurre y efectúa los ajustes a los que haya lugar según tus necesidades. + +### Operadores de búsqueda + +#### Operador * (comodín) + +El operador * (que sirve para buscar 0 o más caracteres) puede ayudar a encontrar las formas de sustantivos en singular y plural, por ejemplo. + +**Tarea:** busca _qualit*_ y ordena los resultados. ¿Qué tiende a preceder y seguir a las palabras 'quality' y 'qualities'? Una pista: son vocablos diferentes con contextos de uso distintos; identifica patrones de uso mediante la búsqueda KWIC. + +Para obtener una lista completa de los operadores comodín disponibles y su función, revisa "Global Settings" – "Wildcard Settings". + +{% include figure.html filename="wildcard-settings.png" caption="Configuración de operadores de búsqueda." %} + +Para conocer la diferencia entre los operadores * y ?, busca _th*n_ y luego _th?n_. Estas dos búsquedas, que a simple vista parecieran muy similares, arrojan resultados distintos. + +El operador *?* es más específico que *, así: + +*wom?n* – 'women' y 'woman'. + +*m?n* – 'man', 'men' y 'min'. + +Una búsqueda de _m*n_, en cambio, no es útil porque se obtendrán resultados que incluirán 'mean', 'melon', etc. + +**Tarea:** compara los resultados de las búsquedas de *wom?n* y *m?n*. 
+ +- Ordena los resultados de cada búsqueda de manera que arrojen datos significativos (por ejemplo, configurar los parámetros de la búsqueda en 0, 1L y 2L) + +- Haz clic en "File" – "Save Output to Text File" y guarda el archivo (no olvides agregar la extensión .txt al nombre del mismo). + +> Sugerencia: durante la exploración en tu investigación, generarás muchos documentos como este para efectos de consulta. Es conveniente, por tanto, nombrar los archivos de tal manera que se describa lo que estos contienen (por ejemplo, “wom?n-results.txt” en lugar de “antconc-results.txt”). + +{% include figure.html filename="save-output-as-text-file.png" caption="Opción *Save output as text file*." %} + +{% include figure.html filename="save-as.png" caption="Cuadro de diálogo *Save As*." %} + +Con lo anterior, puedes abrir el archivo de texto plano generado por el programa en un editor de texto; es posible que debas ampliar la ventana de la aplicación para que este sea legible. + +{% include figure.html filename="results.png" caption="Archivo de resultados de búsqueda KWIC exportado por Antconc, tal como se muestra en un editor de texto." %} + +Realiza el proceso anterior con los resultados de las dos búsquedas y compara los archivos de texto generados. ¿Qué fenómenos puedes ver? + +#### Operador | ("o") + +**Tarea:** busca _she\|he_. + +Ahora, busca las dos palabras anteriores por separado: ¿cuántas veces aparece 'she' en comparación con 'he'? + +La palabra 'she' (ella) aparece en mucha menor cantidad que 'he' (él). ¿Por qué? ¡Esa es una pregunta de investigación! Una buena manera de ampliar este cuestionamiento podría radicar en ordenar la búsqueda anterior para identificar patrones de uso de las palabras en cuestión, y revisar si las mismas están seguidas de algún verbo en particular. 
+ +**Tarea:** a modo de práctica, busca una palabra que te interese, ordena los resultados de formas diferentes, usa los operadores comodín y exporta los datos obtenidos como archivos de texto plano. He aquí un interrogante orientador: ¿qué tipo de patrones puedes observar? ¿Puedes explicarlos? + +### Colocaciones y listas de palabras +Después de haber analizado las líneas de resultados de la vista KWIC en busca de patrones, ¿no te gustaría que hubiera una forma de que el computador te brindara una lista de palabras que aparecen más frecuentemente con la palabra clave buscada? + +Buenas noticias: existe una manera de obtener esta información en AntConc; está disponible en la pestaña *Collocates* (colocaciones). Al hacer clic en la misma, aparecerá un mensaje por medio del cual la aplicación dirá que necesita crear una lista de palabras. Haz clic en "OK" y el programa lo hará automáticamente. + +> Nota: solo recibirás este aviso cuando no hayas creado una lista de palabras. + +{% include figure.html filename="wordlistwarning.png" caption="Mensaje de advertencia para indicar la necesidad de generar una lista de palabras." %} + +Ahora, intenta generar la lista de colocaciones para el término 'she'. + +Los resultados sin clasificar parecerán comenzar con palabras funcionales (palabras con las que se construyen frases) y luego pasarán a palabras de contenido (términos que dan sentido al texto): las primeras son [las más frecuentes en inglés](https://www.wordfrequency.info/free.asp), en tanto funcionan mayormente como elementos para construir frases. Versiones más recientes de AntConc suelen incluir el término buscado como primer resultado, posiblemente porque está presente en el texto y se quiere hallar palabras que puedan aparecer junto a él. + +Algunas personas podrían tener la intención de prescindir de esta clase de palabras mediante el uso de una lista de palabras funcionales (esta es una labor común cuando se hace modelado de tópicos). 
Desde nuestra óptica, no promovemos esta práctica porque los computadores se destacan, justamente, en la identificación de palabras con alta frecuencia de aparición; tal como se expresó anteriormente, tendemos a pasarlas por alto. Los computadores —y en especial las aplicaciones como AntConc— pueden mostrar dónde aparecen o no estas palabras, y esa información puede ser de interés, especialmente en colecciones de texto de gran envergadura (como se vio con las búsquedas de 'a', 'she' y 'he'). + +No obstante, en el caso de la lengua inglesa, la frecuencia de aparición de la letra 's' en el corpus también puede ser bastante alta, en tanto representa el posesivo *ʼs* (la aplicación no toma en cuenta el apóstrofo), pero AntConc la toma como otra palabra. Asimismo, la forma *ʼt* puede aparecer junto al verbo 'do' por cuanto conforman la contracción *donʼt*; la alta frecuencia de su aparición conjunta los convierte en colocaciones altamente probables. + +**Tarea:** genera la lista de colocaciones para las búsquedas de *m?n* y *wom?n*. Ahora, ordénalas de acuerdo con su frecuencia de aparición respecto del parámetro 1L. +Los resultados muestran lo que, en teoría, hace que un hombre (*man*) o una mujer (*woman*) sea “digno de mostrarse en el cine”: + +- Las mujeres deben ser "bellas" (*beautiful*), "sofisticadas" (*sophisticated*) o estar "embarazadas" (*pregnant*). +- Los hombres tienen que estar, en cierto modo, fuera de lo común: deben ser "santos" (*holy*), "negros" (*black*) o "viejos" (*old*). + +Lo anterior no alude directamente a las películas, sino a la forma como se escribe sobre ellas en las reseñas, y puede llevar a cuestionamientos más sutiles, tales como "¿de qué manera se describen los roles de las mujeres en las comedias románticas en las reseñas escritas por hombres frente a las escritas por mujeres?" + +### Comparación de corpus + +Uno de los tipos de análisis más potentes radica en comparar el corpus propio con uno de referencia más extenso. 
+ +Para este ejercicio, hemos tomado reseñas de filmes en los que Steven Spielberg ha estado involucrado (como director o productor). Podemos compararlos con un corpus de referencia de películas de toda una gama de directores. + +Asegúrate de pensar cuidadosamente sobre las características que podría tener un corpus de referencia para tu propia investigación (por ejemplo, un estudio del lenguaje de Agatha Christie en sus últimos años funcionaría muy bien como un corpus de análisis para compararlo con un corpus de referencia de todas sus novelas). Recuerda que, como lo expresamos anteriormente, la construcción del corpus es un subcampo en sí mismo. + +- Dirígete a "Settings" – "Tool preferences" – "Keyword List". +- Asegúrate de que la casilla de verificación "Use raw files" esté seleccionada en el menú "Reference Corpus". +- Haz clic en el botón "Add Directory" y selecciona la carpeta que contiene los archivos del corpus de referencia. +- Verifica que dispongas de la lista completa de archivos en el listado que se mostrará. + +{% include figure.html filename="adding-a-reference-corpus.png" caption="Carga de un corpus de referencia." %} + +- Haz clic en el botón "Load" y espera que el programa cargue los archivos; una vez la casilla de verificación "Loaded" esté marcada, haz clic en "Apply". + +Existe la posibilidad de intercambiar los roles del corpus de referencia y los archivos principales (es decir, dar al primero la función de los segundos y viceversa) por medio del botón "Swap Ref/Main Files"; en este punto vale la pena experimentar con esta opción y comparar los resultados obtenidos. + +> Si estás utilizando una versión más reciente del programa, el botón anterior puede llamarse "Swap with Target Files". Adicionalmente, cualesquiera sean los datos que vayas a utilizar como corpus de referencia, asegúrate de que estos se carguen correctamente en AntConc (esto es, haz clic en el botón "Load" cada vez que cargues o intercambies un corpus). 
+ +- Dirígete a la pestaña "Keyword List" y, una vez allí, presiona el botón "Start" (sin escribir nada en la casilla de búsqueda). Si intercambiaste el corpus de referencia con los archivos objeto del análisis, el programa anunciará la necesidad de crear una nueva lista de palabras antes de generar la lista de palabras clave. Esta se compondrá de aquellos términos que resulten mucho más "inusuales" —de aparición menos probable en términos estadísticos— en el corpus que se está viendo *vs.* el de referencia. + +> *Keyness* (calidad de la palabra clave): corresponde a la frecuencia de aparición de una palabra en el texto cuando se la compara con su frecuencia en un corpus de referencia, "de tal suerte que la probabilidad estadística, calculada mediante un procedimiento determinado, es menor o igual que el valor *p* especificado por el usuario" (información tomada de [este sitio](https://www.lexically.net/downloads/version6/HTML/index.html?keyness_definition.htm)). Para profundizar sobre los detalles estadísticos de este tópico, sugerimos revisar la sección sobre el mismo en la página 7 del [archivo *Readme* de AntConc](https://www.laurenceanthony.net/software/antconc/releases/AntConc335/help.pdf). + +¿Cuáles son nuestras palabras clave? + +{% include figure.html filename="spielberg-vs-movie-reviews.png" caption="Spielberg *vs.* reseñas cinematográficas." %} + +## Discusión: hacer comparaciones significativas + +Es importante tener en cuenta que la forma en que se organicen los archivos de texto para la investigación tendrá efectos en el tipo de interrogantes que puedan surgir de los mismos, así como en los resultados que se obtengan del análisis. A este respecto, recuerda que la comparación realizada aquí entre reseñas negativas y positivas es extremadamente simple; si se quisiere, podrían efectuarse comparaciones adicionales con otros subconjuntos de reseñas, lo cual daría pie a la formulación de interrogantes muy distintos. 
+ +Así entonces, los archivos que se dispongan en el corpus determinarán los resultados obtenidos. Reiteramos que los temas de representatividad y muestreo son muy relevantes en este sentido: no siempre es necesario o ideal utilizar todo un conjunto de datos, incluso si se dispone de él. En este punto, realmente cabe preguntarse por la manera como estos métodos de análisis textual ayudan a generar preguntas de investigación. + +Si se piensa, por ejemplo, en el funcionamiento de las reseñas cinematográficas en tanto género discursivo, puede dirigirse la atención hacia oposiciones como las siguientes: + +- Reseñas cinematográficas *vs.* reseñas musicales +- Reseñas cinematográficas *vs.* reseñas de libros +- Reseñas cinematográficas *vs.* noticias deportivas +- Reseñas cinematográficas *vs.* noticias en general + +Cada una de estas comparaciones aportará información distinta y puede derivar en preguntas de investigación diferentes, tales como: + +- ¿En qué difieren las reseñas cinematográficas de otros tipos de reseñas de productos mediáticos? + +- ¿En qué se diferencian las reseñas cinematográficas de otros tipos de escritos susceptibles de publicarse? +- ¿Cómo se comparan las reseñas de películas con otros géneros de escritura, tales como la crónica deportiva? +- ¿Qué tienen en común las reseñas cinematográficas y las musicales? + +Desde luego, puede darse la vuelta a estos cuestionamientos para generar nuevas preguntas: + +- ¿En qué se diferencian las reseñas bibliográficas de las cinematográficas? + +- ¿En qué difieren las reseñas musicales de las cinematográficas? +- ¿Qué tienen en común los artículos que se publican en la prensa escrita? +- ¿En qué se asemejan las reseñas cinematográficas a otros tipos de escritos susceptibles de publicarse? + +En síntesis, vale la pena pensar en: + +- Por qué se quiere comparar dos corpus. +- Qué tipo de consultas da lugar a preguntas de investigación significativas. 
+- Principios de construcción de corpus: muestreo y capacidad de asegurar que se obtengan datos representativos. + +### Recursos adicionales + +*[A Short Bibliography on Corpus Linguistics](https://hfroehlich.wordpress.com/2014/05/11/intro-bibliography-corpus-linguistics/)* + +[Una versión más sencilla de este tutorial, concebida para usuarios con pocos conocimientos de computación](https://hfroehli.ch/workshops/getting-started-with-antconc/) (en inglés). + +*[Guía rápida de análisis de corpus con AntConc](https://rua.ua.es/dspace/bitstream/10045/43959/4/grac.pdf)*, publicada por la Universidad de Alicante (2015). + +## Notas de traducción + +[^1]: Investigador y docente de la Universidad de Waseda (Japón). +[^2]: La interfaz del programa solo está disponible en inglés. +[^3]: Dos materiales en español pueden ser de utilidad si se desea profundizar en esta temática: de un lado, la conferencia *[Aproximación al concepto de representatividad de corpus](https://www.youtube.com/watch?v=bvTigjPhZco)*; y de otro, la obra *[Explotación de los córpora textuales informatizados para la creación de bases de datos terminológicas basadas en el conocimiento](https://web.archive.org/web/20150926235725/https://elies.rediris.es/elies18/)*. +[^4]: Si se requiere trabajar con corpus en cuyos textos se emplean caracteres especiales (como es el caso de los documentos escritos en lengua española), es imperativo prestar atención a la codificación con la cual se guardaron los archivos que los componen. Por defecto, AntConc está configurado para operar con documentos de texto plano con codificación Unicode (UTF-8). Así entonces, es preciso verificar en el editor de texto que estos se hayan guardado atendiendo a lo anterior, o bien cambiar los parámetros de importación de archivos en el programa según las necesidades (por ejemplo, trabajar con archivos codificados en ANSI). 
diff --git a/es/lecciones/analisis-de-correspondencia-en-r.md b/es/lecciones/analisis-de-correspondencia-en-r.md index a288c91498..789525dc0f 100644 --- a/es/lecciones/analisis-de-correspondencia-en-r.md +++ b/es/lecciones/analisis-de-correspondencia-en-r.md @@ -108,7 +108,7 @@ Como historiador, mi sospecha es que los MP se organizan según los temas de cad ## Preparando R para el análisis de correspondencia -Para realizar un análisis de correspondencia necesitaremos un paquete que pueda realizar álgebra lineal. Para quienes tengan más inclinación por las matemáticas, en esta lección se incluye un [apéndice](#Apéndice) con algunos detalles sobre cómo se realiza esto. En R, existe una serie de opciones para el AC, pero nosotros utilizaremos el paquete [FactoMineR](https://perma.cc/YFT7-P5Y7),[^factominer] que está enfocado en el "análisis exploratorio de datos multivariados". FactoMineR puede ser usado para realizar todo tipo de análisis multivariados, incluyendo conglomerados jerárquicos, análisis factorial, etcétera. +Para realizar un análisis de correspondencia necesitaremos un paquete que pueda realizar álgebra lineal. Para quienes tengan más inclinación por las matemáticas, en esta lección se incluye un [apéndice](#apéndice-las-matemáticas-detrás-del-análisis-de-correspondencia) con algunos detalles sobre cómo se realiza esto. En R, existe una serie de opciones para el AC, pero nosotros utilizaremos el paquete [FactoMineR](https://perma.cc/YFT7-P5Y7),[^factominer] que está enfocado en el "análisis exploratorio de datos multivariados". FactoMineR puede ser usado para realizar todo tipo de análisis multivariados, incluyendo conglomerados jerárquicos, análisis factorial, etcétera. Pero primero, así es como se instalan y llaman los paquetes de R, y cómo luego los datos se asignan a un objeto de R para trabajar sobre ellos. 
@@ -133,7 +133,7 @@ harper_df <- read.csv("https://raw.githubusercontent.com/programminghistorian/je ## Los datos -Los datos originales de la versión en inglés de este tutorial se encuentran archivados en [Zenodo](http://doi.org/10.5281/zenodo.889846), en caso de que quieras ver los datos brutos. Se han incluido en formato tabular también. En esta traducción al español trabajaremos sobre una versión traducida de los datos (no es necesario que descargues estos archivos de forma manual; los descargaremos directamente usando R): +Los datos originales de la versión en inglés de este tutorial se encuentran archivados en [Zenodo](https://doi.org/10.5281/zenodo.889846), en caso de que quieras ver los datos brutos. Se han incluido en formato tabular también. En esta traducción al español trabajaremos sobre una versión traducida de los datos (no es necesario que descargues estos archivos de forma manual; los descargaremos directamente usando R): 1) [CPCs de Harper]({{ site.baseurl }}/assets/correspondence-analysis-in-R/es-translation/HarperCP-es.csv) 2) [CPCs de Trudeau]({{ site.baseurl }}/assets/correspondence-analysis-in-R/es-translation/TrudeauCP-es.csv) @@ -182,7 +182,7 @@ Desafortunadamente, tenemos un problema más. Un gran número de MP son miembros ``` La función `colSums` suma los valores de cada columna de la tabla. `rowSums` puede utilizarse para sumar las filas si fuese necesario. -La función `CA` (análisis de correspondencia, por las siglas en inglés de Correspondence Analysis) grafica los resultados para las dos dimensiones principales y guarda el resumen de los datos en una variable a la que llamamos `AC_harper`. En gran medida, la función `CA` hace casi todo el trabajo por nosotros. Agregamos el argumento `title = "Mapa de factores AC - Harper"` para que el título aparezca en español. Si no incluyes esa línea de código obtendrás el mismo gráfico, pero con el título por defecto en inglés ("CA factor map"). 
Como se señaló antes, en el [apéndice](#Apéndice) se ofrecen más detalles sobre las matemáticas detrás del AC. +La función `CA` (análisis de correspondencia, por las siglas en inglés de Correspondence Analysis) grafica los resultados para las dos dimensiones principales y guarda el resumen de los datos en una variable a la que llamamos `AC_harper`. En gran medida, la función `CA` hace casi todo el trabajo por nosotros. Agregamos el argumento `title = "Mapa de factores AC - Harper"` para que el título aparezca en español. Si no incluyes esa línea de código obtendrás el mismo gráfico, pero con el título por defecto en inglés ("CA factor map"). Como se señaló antes, en el [apéndice](#apéndice-las-matemáticas-detrás-del-análisis-de-correspondencia) se ofrecen más detalles sobre las matemáticas detrás del AC. Al ejecutar el código, deberías obtener un gráfico parecido a este: @@ -215,7 +215,7 @@ En vez de superponerse, ahora las etiquetas utilizan flechas para mostrar su ubi Los gráficos con los datos se ven mejor, pero ¿qué tanto podemos confiar en la validez de los resultados? Nuestra primera pista es mirar las dimensiones (ver Dim 1 y Dim 2 en los ejes horizontal y vertical del gráfico). En el AC de los datos de Harper, solo un once y diez por ciento de valor explicativo aparece en los ejes horizontal y vertical,[^explanatory] lo que da un total ¡de 21 por ciento! Eso no suena muy prometedor para nuestro análisis. Si recordamos que el total del número de dimensiones es igual al número de filas o columnas (la que sea más pequeña), esto puede ser preocupante. Cuando ocurren valores tan bajos, suele significar que que los puntos de datos están distribuidos equitativamente. Que los MP estén distribuidos de manera equitativa en los CP es una convención bastante establecida en el parlamento canadiense. 
-Otra manera de mirar los datos es a través de los valores de inercia.[^inertia] Se pueden encontrar más detalles sobre ella en el [apéndice](#Apéndice), pero mirando este gráfico, se puede decir que los puntos distantes del origen tienen mayor inercia. Estos puntos sugieren valores fuera de rango, es decir, actores o eventos que tienen que tienen menos conexiones que los que se encuentran en el centro. Los valores bajos, por su parte, sugieren la existencia de puntos de datos que tienen más en común con el grupo como un todo. Como herramienta de análisis, puede ser útil para encontrar actores renegados o subgrupos dentro del conjunto de datos. Si todos los puntos tienen una inercia alta, puede ser indicador de una alta diversidad o fragmentación para las redes. Si es baja puede ser indicador de una mayor cohesión o convergencia general. Lo que esto signifique dependerá del conjunto de datos. En el caso de nuestros gráficos, ningún punto se aventura más allá de dos pasos desde la media. Nuevamente, esto es un indicador de que las relaciones están distribuidas de una forma relativamente equitativa. +Otra manera de mirar los datos es a través de los valores de inercia.[^inertia] Se pueden encontrar más detalles sobre ella en el [apéndice](#apéndice-las-matemáticas-detrás-del-análisis-de-correspondencia), pero mirando este gráfico, se puede decir que los puntos distantes del origen tienen mayor inercia. Estos puntos sugieren valores fuera de rango, es decir, actores o eventos que tienen menos conexiones que los que se encuentran en el centro. Los valores bajos, por su parte, sugieren la existencia de puntos de datos que tienen más en común con el grupo como un todo. Como herramienta de análisis, puede ser útil para encontrar actores renegados o subgrupos dentro del conjunto de datos. Si todos los puntos tienen una inercia alta, puede ser indicador de una alta diversidad o fragmentación para las redes. 
Si es baja puede ser indicador de una mayor cohesión o convergencia general. Lo que esto signifique dependerá del conjunto de datos. En el caso de nuestros gráficos, ningún punto se aventura más allá de dos pasos desde la media. Nuevamente, esto es un indicador de que las relaciones están distribuidas de una forma relativamente equitativa. Miremos los datos más de cerca: @@ -473,7 +473,7 @@ Otro puntaje importante es visible en el gráfico CA: el porcentaje de valor exp [^inertia]: En general, en estadística el término _inercia_ hace referencia a la variación o "extensión" de un conjunto de datos. Es análoga a la desviación estándar en la distribución de datos. -[^pickton]: Ver Laura Kane (April 3, 2017), "Missing and murdered women's inquiry not reaching out to families, say advocates." *CBC News Indigenous*. [http://www.cbc.ca/news/indigenous/mmiw-inquiry-not-reaching-out-to-families-says-advocates-1.4053694](https://perma.cc/MH3Y-9HW2) +[^pickton]: Ver Laura Kane (April 3, 2017), "Missing and murdered women's inquiry not reaching out to families, say advocates." *CBC News Indigenous*. [https://www.cbc.ca/news/indigenous/mmiw-inquiry-not-reaching-out-to-families-says-advocates-1.4053694](https://perma.cc/MH3Y-9HW2) [^pvalue]: En estadística, un valor p, una abreviación para _valor de probabilidad_, es un indicador de qué tan probable es que un determinado resultado haya ocurrido por azar. Un valor p bajo sugiere una baja probabilidad de que el resultado sea producto del azar y, por lo tanto, entrega evidencia de que la hipótesos nula, (en este caso, que los MP y los CP son categorías independientes) es poco probable. 
diff --git a/es/lecciones/analisis-de-sentimientos-r.md b/es/lecciones/analisis-de-sentimientos-r.md index ea74d54d62..ceeb6967c2 100644 --- a/es/lecciones/analisis-de-sentimientos-r.md +++ b/es/lecciones/analisis-de-sentimientos-r.md @@ -49,11 +49,11 @@ El paquete `syuzhet` trabaja con cuatro diccionarios de sentimientos: Bing, Afin Sus términos de uso indican que el vocabulario puede ser utilizado de forma gratuita con propósitos de investigación, por lo que todos los datos están disponible para su descarga. -Si sabes inglés, puedes interactuar con las diferentes categorías en su página web [NRC Word-Emotion Association Lexicon](http://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm). En ella también puedes encontrar trabajos publicados sobre la obtención de los valores para el vocabulario, su organización, ampliación, etc. +Si sabes inglés, puedes interactuar con las diferentes categorías en su página web [NRC Word-Emotion Association Lexicon](https://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm). En ella también puedes encontrar trabajos publicados sobre la obtención de los valores para el vocabulario, su organización, ampliación, etc. ## Paquete `syuzhet` -El [paquete de R `syuzhet`](https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html) fue desarrollado en 2015 por Matthew Jockers; continuamente introduce cambios y se encarga de mantenerlo (al momento de preparar esta lección se usó la versión de diciembre 2017). Una serie de entradas de blog acompañan el desarrollo del paquete, y pueden consultarse (en inglés) en el blog del profesor desde el [5 de junio de 2014.](http://www.matthewjockers.net/page/2/) +El [paquete de R `syuzhet`](https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html) fue desarrollado en 2015 por Matthew Jockers; continuamente introduce cambios y se encarga de mantenerlo (al momento de preparar esta lección se usó la versión de diciembre 2017). 
Una serie de entradas de blog acompañan el desarrollo del paquete, y pueden consultarse (en inglés) en el blog del profesor desde el [5 de junio de 2014.](https://www.matthewjockers.net/page/2/) Por descontado, el paquete ha sido desarrollado con pruebas en textos escritos o traducidos al inglés y no sin debate sobre su utilidad, por asignar valores a textos literarios que suelen ser, por naturaleza, bastante subjetivos. diff --git a/es/lecciones/analisis-redes-sociales-teatro-1.md b/es/lecciones/analisis-redes-sociales-teatro-1.md index 08e3edd32f..49d5820804 100644 --- a/es/lecciones/analisis-redes-sociales-teatro-1.md +++ b/es/lecciones/analisis-redes-sociales-teatro-1.md @@ -12,7 +12,7 @@ reviewers: editors: - Jennifer Isasi review-ticket: https://github.com/programminghistorian/ph-submissions/issues/517 -next: analisis-redes-sociales-teatro-2 +next: /es/lecciones/analisis-redes-sociales-teatro-2 series_total: 2 lessons sequence: 1 difficulty: 1 @@ -40,7 +40,7 @@ En esta lección trabajaremos las relaciones entre los personajes de los textos Para poder estudiar las relaciones entre personajes nos serviremos del [Análisis de Redes Sociales](https://perma.cc/UW6A-33KQ) (ARS), un campo de estudio interdisciplinario que toma elementos de la sociología, la psicología, la estadística, las matemáticas y las ciencias computacionales[^3]. Gracias al análisis de redes podemos abstraer y representar cualquier sistema formado por elementos relacionados y estudiarlo aplicando conceptos y medidas de la [teoría de grafos](https://perma.cc/P963-APQC). La informática, la física, la biología o la sociología, son disciplinas que tradicionalmente han identificado en sus campos de investigación sistemas susceptibles de estudiarse a través de redes, y recientemente también lo han hecho las humanidades, especialmente la historia[^4] y los estudios literarios. 
Del interés de la historia por el análisis de redes dan cuenta las lecciones de _Programming Historian_ [Análisis de redes temporal en R](/es/lecciones/analisis-temporal-red) o [De la hermenéutica a las redes de datos: Extracción de datos y visualización de redes en fuentes históricas](/es/lecciones/creando-diagramas-de-redes-desde-fuentes-historicas). Por otro lado, los estudios literarios han utilizado el análisis de redes para el estudio de los sistemas de personajes, de las redes de producción literaria, para representar los resultados del análisis estilométricos de autoría, etc[^5]. Por ejemplo, sobre el estudio de personajes tetrales a través del análisis de redes sociales, podemos destacar los trabajos del grupo de investigación HDAUNIR a partir del corpus [BETTE](https://perma.cc/2NR3-V5UU)[^6]; y sobre el estudio de la novela y sus personajes los trabajos de Isasi[^7]. -El análisis de redes sociales es para la crítica literaria una metodología de tipo "distant reading" ([lectura distante](https://web.archive.org/web/20210622210039/http://dictionaryworldliterature.org/index.php/Lectura_distante)) en términos de Moretti[^8], o "macroanlysis" si preferimos el concepto de Matthew L. Jockers[^9]. Es decir, nos permite estudiar grandes cantidades de textos a través de sus formas, relaciones, estructuras y modelos[^10], al cambiar el foco de atención de las características individuales a las tendencias o patrones de repetidas en un corpus [^11]. Más recientemente, Escobar Varela ha investigado las posibilidades de estudiar el teatro a través de datos como parte de lo que denomina "computational theater research"[^12]. Este concepto refiere a los estudios teatrales computacionales en su sentido más amplio; incluye los enfoques escénicos además de los literarios. 
Desde un enfoque puramente textual, dentro de los "Computational Literary Studies"(CLS), está en proceso de conformación un área especializada en teatro, denominada "Computational Drama Analysis", que integra el análisis de redes sociales, junto a otras metodologías cuantitativas y computacionales, tal como la estilometría, el análisis de sentimientos o el modelado de tópicos[^13]. +El análisis de redes sociales es para la crítica literaria una metodología de tipo "distant reading" ([lectura distante](https://web.archive.org/web/20210622210039/https://dictionaryworldliterature.org/index.php/Lectura_distante)) en términos de Moretti[^8], o "macroanalysis" si preferimos el concepto de Matthew L. Jockers[^9]. Es decir, nos permite estudiar grandes cantidades de textos a través de sus formas, relaciones, estructuras y modelos[^10], al cambiar el foco de atención de las características individuales a las tendencias o patrones de repetidas en un corpus [^11]. Más recientemente, Escobar Varela ha investigado las posibilidades de estudiar el teatro a través de datos como parte de lo que denomina "computational theater research"[^12]. Este concepto refiere a los estudios teatrales computacionales en su sentido más amplio; incluye los enfoques escénicos además de los literarios. Desde un enfoque puramente textual, dentro de los "Computational Literary Studies"(CLS), está en proceso de conformación un área especializada en teatro, denominada "Computational Drama Analysis", que integra el análisis de redes sociales, junto a otras metodologías cuantitativas y computacionales, tal como la estilometría, el análisis de sentimientos o el modelado de tópicos[^13]. Para llevar a cabo un análisis de redes sociales de personajes teatrales debemos seguir una serie de pasos consecutivos: * Paso 1. Creación del corpus de análisis @@ -471,7 +471,7 @@ Jiménez Fernández, C. M., y Calvo Tello, J. "Grafos de Escenas y Estudios Lite Jockers, M. L. 
_Macroanalysis: Digital Methods and Literary History_. University of Illinois Press, 2013. -Merino Recalde, D. "El sistema de personajes de las comedias urbanas de Lope de Vega. Propuesta metodológica y posibilidades del análisis de redes sociales para el estudio del teatro del Siglo de Oro" (Trabajo de Fin de Máster, Universidad Nacional de Educación a Distancia, 2022). [http://e-spacio.uned.es/fez/view/bibliuned:master-Filologia-FILTCE-Dmerino](https://perma.cc/4C7R-39V3). +Merino Recalde, D. "El sistema de personajes de las comedias urbanas de Lope de Vega. Propuesta metodológica y posibilidades del análisis de redes sociales para el estudio del teatro del Siglo de Oro" (Trabajo de Fin de Máster, Universidad Nacional de Educación a Distancia, 2022). [https://e-spacio.uned.es/fez/view/bibliuned:master-Filologia-FILTCE-Dmerino](https://perma.cc/4C7R-39V3). Martínez Carro, E. "Una interpretación digital de dos tragedias lorquianas: Yerma y Doña Rosita la soltera." _Caracteres: estudios culturales y críticos de la esfera digital_ 7, no. 2 (2018): 240-267. 
diff --git a/es/lecciones/analisis-redes-sociales-teatro-2.md b/es/lecciones/analisis-redes-sociales-teatro-2.md index 01fdde10fc..031d941494 100644 --- a/es/lecciones/analisis-redes-sociales-teatro-2.md +++ b/es/lecciones/analisis-redes-sociales-teatro-2.md @@ -1,288 +1,288 @@ ---- -title: "Análisis de redes sociales de personajes teatrales (parte 2)" -slug: analisis-redes-sociales-teatro-2 -layout: lesson -collection: lessons -date: 2023-11-30 -authors: -- David Merino Recalde -reviewers: -- Sara Arribas Colmenar -- Andrés Lombana -editors: -- Jennifer Isasi -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/547 -previous: analisis-redes-sociales-teatro-1 -series_total: 2 lessons -sequence: 2 -difficulty: 2 -activity: analyzing -topics: [network-analysis, distant-reading, data-visualization] -abstract: En esta lección aprenderás a realizar un Análisis de Redes Sociales con los personajes de un texto teatral. Aprenderás sobre la importación de datos a Gephi, la creación de visualizaciones, la implementación de medidas y algoritmos, y el análisis e interpretación de los resultados. -avatar_alt: Recorte de dibujo a pluma de la escenografía usada en la representación de la comedia 'La fiera, el rayo y la piedra' de Pedro Calderón de la Barca en 1690, en el que se puede ver a varios personajes interactuando en escena. -doi: 10.46430/phes0065 ---- - -{% include toc.html %} - -## Introducción a la segunda parte - -Esta es la segunda parte de la lección _Análisis de redes sociales de personajes teatrales_. En la [primera parte](/es/lecciones/analisis-redes-sociales-teatro-1) conocimos algunas de las aplicaciones del análisis de redes sociales (ARS) a los estudios literarios y aprendimos los conceptos y nociones necesarias para enfrentarnos a esta metodología computacional-cuantitativa. Además, establecimos que para llevar a cabo un análisis de redes sociales de personajes teatrales debemos seguir una serie de pasos consecutivos: - - * Paso 1. 
Creación del corpus de análisis - * Paso 2. Conseguir los datos - * Toma de decisiones para la extracción de datos - * Extracción y estructuración de datos - * El proceso de vaciado - * Paso 4. Visualización y análisis de grafos con Gephi - * Paso 5. Interpretación de los resultados - -Ya hemos visto los pasos 1 y 2, y en esta segunda parte trataremos los dos últimos pasos. Si has seguido la primera parte de la lección cuentas con todos los archivos necesarios para continuar. Si has saltado directamente a la segunda parte porque lo que te interesa es aprender visualización y análisis de grafos con [Gephi](https://gephi.org/), debes descargar ahora los archivos que utilizaremos aquí. En cualquier caso, recomendamos leer la primera parte, pues es importante comprender el proceso de extracción y recogida de datos para poder analizar correctamente los resultados del análisis. ¡Vamos a ello! - -## Paso 3. Visualización y análisis de grafos con Gephi - -Tenemos tres archivos CSV: por un lado, una [lista de nodos](/assets/analisis-redes-sociales-teatro-1/nodos_bizarrias.csv) (`nodos_bizarrias.csv`); por el otro, la [lista de aristas](/assets/analisis-redes-sociales-teatro-1/aristas-coaparicion_bizarrias.csv) de un grafo no dirigido (`aristas-coaparicion_bizarrias.csv`) y la [matriz de adyacencia](/assets/analisis-redes-sociales-teatro-1/aristas-interaccion_bizarrias.csv) de uno dirigido (`aristas-interaccion_bizarrias.csv`), según el criterio de la coaparición de personajes en escena y el de interacciones lingüísticas directas entre personajes, respectivamente. El siguiente paso es generar visualizaciones, los grafos propiamente dichos, y analizarlos aplicando lo que se conoce como 'medidas' o 'métricas' de ARS. - -### Instalación de Gephi y primeros pasos - -El programa que vamos a utilizar para llevar a cabo todo esto se llama [Gephi](https://gephi.org/), pero existen muchos otros para los que también te servirán los archivos CSV que hemos preparado[^1]. 
Gephi es un software libre de código abierto especializado en análisis de redes, muy conocido y utilizado en Humanidades Digitales, bastante intuitivo, y que es sostenido y actualizado por sus desarrolladores[^2]. Además, disponemos de numerosos [plugins](https://gephi.org/plugins/#/) (complementos de software que añaden funcionalidades al programa), [guías de uso](https://perma.cc/4RFA-TZB9), videotutoriales en español[^3] y una comunidad activa en Twitter/X y Github a la que consultar nuestras dudas. - -Lo primero que debemos hacer es instalar el programa. En su sitio web, [https://gephi.org/](https://gephi.org/), haz clic en _Download FREE_. Está disponible para Windows, Mac OS y Linux. Es posible que la web reconozca tu sistema operativo y te ofrezca lo que necesitas, si no, selecciona en el apartado **All Downloads** de tu sistema operativo. Si necesitas ayuda con la instalación, puedes visitar [https://gephi.org/users/install/](https://perma.cc/YF6E-994N) (está solo disponible en inglés, pero puedes consultar los primeros minutos de este [videotutorial en español](https://www.youtube.com/watch?v=sX5XYec4tWo)). - -Una vez que finalices la instalación, ejecuta Gephi. Se abrirá una ventana de bienvenida con distintas opciones: crear un nuevo proyento, abrir un archivo de grafo ya existente, una columna con proyectos y archivos recientes (si los hubiese) y varios proyectos de ejemplo. Haz clic en _Nuevo proyecto_: - -{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-01.png" alt="Captura de pantalla de la ventana de bienvenida al programa Gephi, con las opciones de crear un nuevo proyecto, abrir recientes o proyectos de ejemplo" caption=" Figura 1. Ventana de bienvenida de Gephi" %} - -Ahora estás en la pantalla principal del programa. Gephi funciona mediante proyectos (fíjate que te indicará en la barra superior que estás en el **Proyecto 1**), y dentro de cada proyecto puedes crear distintos espacios de trabajo. 
Ahora estás en el **Espacio de trabajo 1**. Cada espacio de trabajo funciona como la pestaña de un navegador web y contiene a su vez los tres apartados de Gephi: **Vista general**, **Laboratorio de datos** y **Previsualización**. - -{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-02.png" alt="Captura de pantalla de la pantalla principal del programa Gephi, la llamada vista general" caption="Figura 2. Pantalla principal de Gephi, la Vista general" %} - -
    -Si te aparece el programa en inglés te recomiendo cambiar el idioma, pues esta lección se ha preparado con Gephi en español. Puedes hacerlo fácilmente en Tools > Language > Español. Te indicará que el programa debe cerrarse y que deberás reiniciarlo manualmente, es decir, volver a abrirlo. No es necesario que guardes nada si aún no has importando ningún dato. -
    - -En la pestaña **Vista general**, se crean las visualizaciones y se aplican los filtros y medidas para analizar los grafos. En **Laboratorio de datos** se trabaja con los datos que generan los grafos, pudiéndose importar o introducir directamente, modificar y exportar. En el apartado de **Previsualización** se realizan los últimos ajustes para generar y exportar las visualizaciones (grafos) en formato de imagen `.svg`, `.pdf` o `.png`. - -Comencemos a trabajar: -1. En la barra de opciones superior, haz clic en **Espacio de trabajo** > **Nuevo** para crear un nuevo espacio de trabajo. -2. Renombra los dos espacios creados. Dentro de cada espacio, has clic en **Espacio de trabajo** > _Renombrar_. Denomina al primero 'Coaparición en escena', y al segundo, 'Interacción lingüística'. -3. Guarda el proyecto en **Archivo** > _Guardar como_, y denomínalo `bizarrias.gephi`. - -### El laboratorio de datos: importación de aristas y nodos - -Ahora vamos a importar nuestros datos. Lo haremos en paralelo con los dos grafos, pues te ayudará a no perderte. Primero las aristas del grafo de coaparición de personajes en escena: -1\. En el espacio de trabajo 'Coaparición en escena', dirígete al **Laboratorio de datos** y haz clic en _Importar hoja de cálculo_. -2\. Busca y selecciona el archivo `aristas-coaparicion_bizarrias.csv` y haz clic en _Abrir_. -3\. Se abrirá una primera ventana de **Opciones generales de CSV**. Seguramente Gephi ha detectado que se trata de una tabla de aristas, que el separador es la coma y que el formato de codificación de caracterse es UTF-8. Si no, selecciona estas opciones en los desplegables y haz clic en _Siguiente_. - -{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-03.png" alt="Captura de pantalla de la ventana de importación de hojas de cálculo con las opciones generales de importación desde archivos CSV para la lista de aristas" caption="Figura 3. 
Ventana de importación de hojas de cálculo con las opciones generales para la lista de aristas" %} - -4\. En la siguiente ventana, **Parámetros de importación**, deja seleccionadas todas las casillas, pues queremos importar nuestras cinco columnas. Gephi reconoce el tipo de datos: `double` (números) para el peso y `string` (cadena de caracteres) para las etiquetas. Haz clic en _Terminar_. -5\. Ahora te aparecerá la última ventana del proceso: el **Informe de importación**. Verás que Gephi ha detectado que se trata de un grafo 'no dirigido' con 11 nodos y 42 aristas, y que no encuentra ningún problema en el archivo. Muy importante: cambia la selección de **Nuevo espacio de trabajo** a **Añadir al espacio de trabajo existente**. Queremos que nos importe los datos en el espacio en el que estamos trabajando, **Coaparición en escena**. Cuando lo hagas, haz clic en _Aceptar_. - -{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-04.png" alt="Captura de pantalla del informe de importación de una lista de aristas, con opciones finales como seleccionar el tipo de grafo o en qué espacio de trabajo se quiere realizar la importación" caption="Figura 4. Ventana con el informe de importación de la lista de aristas" %} - -Verás que ha aparecido una tabla con los `id` de los personajes en la pestaña **Nodos** y una tabla con las relaciones en la pestaña **Aristas**. Gephi ha extraido esta información de nuestra lista de aristas, asignando además un `id` a cada arista. - -Ahora vamos a importar las aristas del grafo de interacciones lingüísticas directas, siguiendo los mismos pasos: -1. Dentro del espacio de trabajo **Interacción lingüística** dirígete al **Laboratorio de datos** y haz clic en _Importar hoja de cálculo_. -2. Busca y selecciona el archivo `aristas-interaccion_bizarrias.csv` y haz clic en _Abrir_. -3. Se abrirá una primera ventana de **Opciones generales de CSV**. 
Seguramente Gephi ha detectado que se trata de una matriz, que el separador es la coma y que el formato de codificación de caracteres es UTF-8. Si no, selecciona estas opciones en los desplegables y haz clic en _Siguiente_. -4. En la siguiente ventana, **Parámetros de importación**, simplemente haz clic en _Terminar_. Ahora no hay columnas entre las que poder elegir. -5. Por último te aparecerá la ventana **Informe de importación**. Verás que Gephi ha detectado que se trata de un grafo 'dirigido' con 11 nodos y 51 aristas, y que no encuentra ningún problema en el archivo. Muy importante: cambia la selección de **Nuevo espacio de trabajo** a **Añadir al espacio de trabajo existente**. Como antes, queremos que nos importe los datos en el espacio en el que estamos trabajando, **Interacción lingüística**. Cuando lo hagas, haz clic en _Aceptar_. - -Gephi ha importado nuestra matriz y la ha transformado en una lista de aristas con un nodo de origen, otro de destino, un tipo de relación, un peso y un `id`. Además, ha creado 11 nodos utilizando como etiqueta el `id` numérico que les asignamos. - -En la nueva lista de aristas importada, que puedes ver en la pestaña **Aristas** del **Laboratorio de datos**, verás que nos faltan los atributos (‘Label’, etiqueta) que sí pudimos importar en el grafo de coaparición en escena, pues venían ya en nuestro archivo CSV. Nos faltan las relaciones entre los personajes: amor correspondido, amistad, servidumbre, etc. Para poder visualizarlas en este grafo tendremos que introducirlas manualmente en la columna correspondiente (‘Label’, etiqueta). Puedes coger esta información de la lista de aristas del grafo no dirigido, teniendo en cuenta que ahora las relaciones están duplicadas y también tendrás, por tanto, que duplicar las etiquetas. Es decir, etiqueta como `amor correspondido` la relación de Belisa (nodo 1) a Don Juan (nodo 6) y también un `amor correspondido` de Don Juan (nodo 6) a Belisa (nodo 1). 
Y una relación de `amistad` de Belisa (nodo 1) a Celia (nodo 3) y otra relación de `amistad` de Celia (nodo 3) a Belisa (nodo 1). Cuando termines, tu lista de aristas dirigidas debería verse así: - -{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-05.png" alt="Captura de pantalla del laboratorio de datos en la pestaña de aristas, ya con todos los datos introducidos" caption="Figura 5. Pestaña de aristas después de introducir manualmente las etiquetas de las relaciones" %} - -Con las aristas preparadas, ahora vamos a importar los datos referentes a los nodos de los dos grafos. Los pasos ahora son exactamente los mismos para los dos grafos, así que hazlo primero en un espacio de trabajo y luego en el otro: - -1. Dentro del **Laboratorio de datos** de cada espacio de trabajo vuelve a hacer clic en _Importar hoja de cálculo_. -2. Ahora busca y selecciona el archivo [`nodos_bizarrias.csv`](/assets/analisis-redes-sociales-teatro-1/nodos_bizarrias.csv) y haz clic en _Abrir_. -3. En esta ocasión Gephi habrá detectado que se trata de una 'tabla de nodos', que nuevamente el separador es la coma y que la codificación de caracteres es UTF-8. Si no, selecciona estas opciones en los desplegables y haz clic en _Siguiente_. -4. En la ventana **Parámetros de importación**, mantén seleccionadas todas las casillas; queremos que importe las cuatro columnas. Ahora ha detectado que tanto la columna `género` como `función` son cadenas de caracteres. Haz clic en _Terminar_. -5. En la última ventana, **Informe de importación**, cerciórate de que ha identificado 11 nodos y que no hay problemas en la importación. En el desplegable referente al tipo de grafo, selecciona **No dirigido** o **Dirigido** en función del grafo al que estés importando los nodos. Importante: cambia una vez más la opción de **Nuevo espacio de trabajo** a **Añadir al espacio de trabajo existente**. Después, haz clic en _Aceptar_. 
- -{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-06.png" alt="Captura de pantalla de la ventana con el informe de importación de la lista de nodos" caption="Figura 6. Ventana con el informe de importación de la lista de nodos del grafo de coaparición de personajes en escena" %} - -Gephi ha importado la lista de nodos y ha combinado la nueva información con los nodos que creó antes a partir de la lista de aristas o la matriz de adyacencia. Este es el motivo por el que era importante sustituir los nombres de los personaje por su `id` antes de exportar las hojas de cálculo a CSV. Así, Gephi ha podido identificar quién es quién y fusionar los datos de ambos archivos. - -¡Enhorabuena! Hemos terminado la importación de los datos de los dos grafos, ahora podemos pasar a trabajar en la pestaña **Vista general**. - -### La vista general -La **Vista general** es donde modificaremos la visualización de nuestros grafos (que se ve en el centro del programa) y donde aplicaremos las medidas y métricas de análisis. A la izquierda tienes las opciones de visualización (los paneles **Apariencia** y **Distribución**), y a la derecha están el panel con información sobre el grafo (**Contexto**) y los paneles **Filtros** y **Estadísticas** para consultar y analizar el grafo: - -{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-07.png" alt="Captura de pantalla de la vista general del espacio de trabajo con una primera visualización del grafo, aún sin cambiar parámetros de visualización" caption="Figura 7. Vista general de nuestro espacio de trabajo" %} - -Las opciones de visualización y análisis son muy numerosas y no las cubriremos todas en esta lección, así que para explorar e introducirnos en Gephi vamos a crear una visualización sencilla y aplicar solo algunas medidas básicas. A partir de ahora todos los pasos que des en un espacio de trabajo puedes replicarlos en el otro. 
Así, repetir los mismos pasos dos veces te servirá además para aprender a usar el programa. Después, te animo a continuar probando todas las demás opciones y configuraciones por tu cuenta. - -#### Modificar la apariencia y distribución del grafo - -En el centro de la **Vista general**, en el panel llamado **Grafo**, nos ha tenido que aparecer una red con nodos y aristas en negro. Seguramente, el grafo de la captura de arriba (es el de coaparición en escena) no es exactamente igual al que te ha aparecido a ti. Es normal, se ha generado con una distribución de nodos aleatoria. Comencemos a dar forma y color a nuestra red de personajes: - -1. Para desenmarañar la red empezaremos por aplicar un 'algoritmo de distribución'. En el panel de abajo a la izquierda, **Distribución** elige el algoritmo `ForceAtlas 2` y modifica estos parámetros: escalado 2500 y activar _Evitar el solapamiento_. Lo demás puedes dejarlo como está por defecto. Haz clic en _Ejecutar_ y cuando el grafo se estabilice y deje de moverse, haz clic en _Parar_. ¿Qué ha ocurrido? Los nodos han comenzado a repelerse (alejarse) entre ellos a la vez que las aristas que los conectan los han intentado atraer. Así, se ha generado un movimiento que ha terminado convergiendo en una posición balanceada para cada nodo en la que aquellos personajes más conectados entre sí han quedado más cerca y los menos conectados más alejados. El objetivo de este algoritmo de distribución no es otro que colocar los nodos de forma que nos ayude a entender e interpretar mejor el grafo [^4]. Además de `ForceAtlas 2` existen otros algoritmos, como puedes comprobar en el desplegable, pero este nos ofrece buenos resultados y es uno de los más extendidos. -2. Ahora haz clic en el icono 'T' negro que se encuentra en la cinta de opciones inferior, a la derecha de la cámara fotográfica, en la parte inferior del panel del Grafo. Has activado las etiquetas (label) de los nodos, es decir, los nombres de los personajes. 
Puedes modificar el tamaño, tipografía y color en el resto de opciones de la cinta. -3. Vamos a modificar ahora el color y el tamaño de los nodos y aristas. Para ello, ve al panel **Apariencia** (arriba a la izquierda) y sigue estas indicaciones: -a. En **Nodos-Color** (icono de la paleta de pintura), selecciona **Partición** y escoge el atributo `Función`. Gephi asigna un color distinto a cada valor del atributo, puedes modificar la paleta de colores o dejar los colores por defecto y hacer clic en _Aplicar_. Los nodos del grafo se han coloreado y también lo han hecho las aristas. Ve a la cinta de opciones inferior y deselecciona la opción **Las aristas tienen el color del nodo de origen**, su icono es una línea con un arcoiris. Ahora las aristas serán todas de un mismo color gris. -b. En **Nodos-Tamaño** (icono de los círculos), selecciona **Ranking** y escoge el atributo `Grado` (Gephi calcula automáticamente el grado de los nodos). Cambia el tamaño mínimo a 10 y el máximo a 40 y haz clic en _Aplicar_. Ahora los nodos tienen un tamaño relativo a su grado, es decir, a la cantidad de nodos con los que están relacionados. A mayor número de personajes con los que comparte escena un personaje -> mayor grado del nodo que representa el personaje -> mayor diámetro del nodo en la visualización. -c. En **Aristas-Color** (icono de la paleta de pintura), selecciona **Ranking** y escoge el atributo `Peso`. Te aparecerá un gradiente de color. Puedes cambiar la paleta de colores o dejarlo en verde y hacer clic en _Aplicar_. Ahora el color de las aristas está más o menos intenso en función de su peso, es decir, del número de escenas que comparten dos los personajes o de sus interacciones lingüísticas. Si las ves muy finas, puedes cambiar el tamaño de las aristas en la cinta de opciones inferior, están por defecto más o menos gruesas también según el peso. 
- -Seguramente te ha quedado algo muy similar esto en el caso del grafo de coaparición de personajes en escena: - -{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-08.png" alt="Captura de pantalla de la vista general del espacio de trabajo con la visualización del grafo una vez aplicados los parámetros de visualización escogidos" caption="Figura 8. Visualización del grafo de coaparición de personajes en escena, resultado de aplicar los parámetros indicados" %} - -¡Enhorabuena! Ahora puedes ver cuáles son los personajes más relacionados (`grado`) por el tamaño de los nodos, la `función` de estos personajes por el color de los nodos y la cantidad de veces que dos personajes coinciden en escena o interactúan entre ellos (`peso`) por el grosor y la intensidad de color de sus aristas. Si comparas la captura con tu vista del grafo de coaparición en escena puede que tu grafo tenga otra disposición. En realidad tus nodos y los míos están colocados en el mismo sitio y a la misma distancia, solo que están rotados en otro sentido. En el panel de **Distribución** puedes utilizar la opción **Rotar** (en el desplegable) y buscar una disposición que te guste más. No cambiará la distribución que creó el algoritmo `ForceAtlas 2`. Otras opciones que puedes explorar son **Contracción** y **Expansión**, o **Ajuste de etiquetas** si alguna está superpuesta. - -Una vez repitas los pasos también en el espacio de trabajo del grafo de interacciones lingüísticas y hayas modificado su apariencia verás que en este caso las aristas tienen flechas que nos indican la dirección de las relaciones, se trata de un grafo dirigido: - -{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-09.png" alt="Captura de pantalla de la vista general del espacio de trabajo con la visualización del grafo una vez aplicados los parámetros de visualización escogidos" caption="Figura 9. 
Visualización del grafo de interacciones lingüísticas entre personajes, resultado de aplicar los parámetros indicados" %} - -También puedes activar las etiquetas de las aristas, haciendo clic en la 'T' blanca en la cinta de opciones de debajo del grafo. El color de las etiquetas y su tamaño deberás modificarlo en **Apariencia**, en la pestaña **Aristas-A subrayada** (color) y en la pestaña **Aristas-tT** (tamaño): - -{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-10.png" alt="Captura de pantalla de la vista general del espacio de trabajo con la visualización del grafo según los parámetros escogidos y con las etiquetas de las aristas visibles" caption="Figura 10. Visualización del grafo de coaparición de personajes en escena con las etiqutas de las aristas activadas" %} - -#### El contexto y los filtros - -Nos quedan por explorar los paneles de configuración de la derecha. El de **Contexto** nos da información sobre grafo en pantalla. Por ejemplo, en el de interacciones lingüísticas nos dice que se trata de un 'grafo dirigido' con 11 nodos y 51 aristas. - -Vamos a probar los filtros, por ejemplo, filtrando cualquiera de los grafos según el género de los personajes: -1. En el panel **Filtros**, despliega las carpetas **Atributos** y **Partición** (dentro de la primera). -2. Selecciona el atributo `género (Nodo)` y arrástralo al panel de **Consultas**. -3. Haz clic en _Mujer (45,45 %)_ y en _Filtrar_. - -Verás algo similar a esto, un grafo solo con los personajes clasificados por ti como **Mujer**: - -{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-11.png" alt="Captura de pantalla de la vista general del espacio de trabajo con el resultado de filtrar el grafo según el atributo 'mujer'" caption="Figura 11. Grafo resultante de filtrar por el atributo 'Mujer'" %} - -Puedes hacer lo mismo con los personajes **Hombre** o utilizar otro atributo para el filtrado, como la función de los personajes. 
Con cada filtro que apliques verás que la información del **Contexto** cambia. Para volver atrás, elimina el filtro con el botón derecho _Suprimir_ sobre el filtro o haciendo clic en _Restaurar_. - -#### Medidas, métricas y algoritmos de análisis - -Ahora vamos a aplicar algunas medidas en el panel **Estadísticas**. Te dejaré explicaciones de cada una. Gephi ha simplificado al máximo el análisis de los grafos, pues es tan fácil como hacer clic en _Ejecutar_ en la medida o algoritmo que queramos implementar. Algunas de estas medidas abrirán una ventana emergente al ejecutarlas, un pequeño informe que podemos descargar u opciones de configuración. Otras, simplemente añadirán columnas en nuestra tabla de nodos del **Laboratorio de datos**. Estos nuevos datos, generados gracias a la aplicación de medidas, nos dan más información sobre nuestro grafo, nos permiten modificar la visualización en base a ellos (son como nuevos atributos) y exportándolos podremos procesarlos en otra herramienta o programa. En esta lección no nos adentraremos ahí, pero quiero que sepas que a partir de aquí las posibilidades se multiplican. - -En el apartado **Visión general de la red** lo primero que encontramos es el ['grado medio'](https://perma.cc/M8B7-34LD), es decir, la media de los grados de todos los nodos del grafo. Recordemos que el grado es el número de nodos con los que un nodo está conectado. En el caso de los grafos dirigidos, obtendremos además el 'grado medio de entrada' y el 'grado medio de salida'. Después, el 'grado medio con pesos', que tiene en cuenta el peso de las aristas conectadas a un nodo y no simplemente la cantidad de nodos con los que se conecta. De nuevo, habrá un 'grado medio con pesos de entrada' y un 'grado medio con pesos de salida'. 
Al ejecutar estas dos estadísticas, se añadirán dos columnas nuevas en la tabla de nodos del **Laboratorio de datos** con los valores de grado y grado con peso de cada nodo: - -{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-12.png" alt="Captura de pantalla del laboratorio de datos del grafo de interacciones lingüísticas con columnas resultantantes de aplicar las medidas de grado" caption="Figura 12. Laboratorio de datos del grafo de interacciones lingüísticas con las nuevas columnas de grado" %} - -El 'diámetro de la red' es una de las medidas de tamaño o distancia. Para entenderlo, primero has de saber que en análisis de redes se entiende por 'camino' una secuencia de nodos conectados por aristas. Esta noción de camino nos permite calcular las métricas de distancia y tamaño de la red. Por otro lado, se entiende por ['distancia'](https://perma.cc/YYA3-ZLG9) o 'longitud' de un camino el número de aristas (no de nodos) que deben cruzarse para ir de un nodo a otro (siempre por el camino más corto). El ['diámetro'](https://perma.cc/2EU8-J4ZR) es, entonces, la distancia entre los nodos más alejados de una red: - -{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-13.png" alt="Grafo explicativo del concepto 'diámetro', con las aristas que sirven para medir el diámetro coloreadas" caption="Figura 13. Ejemplo del diámetro de una red" %} - -Haz clic en _Ejecutar_ el diámetro: -1. En la ventana que se ha abierto encontrarás definiciones de las métricas de distancia: distancia media, diámetro y las medidas de centralidad de intermediación, cercanía y excentricidad. Al ejecutar esta función, no solo se calcula el diámetro sino todas esas métricas relacionadas con la distancia. -2. Gephi te permite normalizar las centralidades (ahora veremos lo que son) en un rango [0,1], lo que facilita después la comparación de grafos de obras distintas. Marca esta opción y haz clic en _Aceptar_. 
- -{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-14.png" alt="Captura de pantalla de la ventana de parámetros que se abre para ejecutar las medidas de distancia de un grafo" caption="Figura 14. Ventana de parámetros de distancia del grafo de coaparición de personajes en escena" %} - -Si comparas el diámetro de los dos grafos verás que hay diferencias: en uno es 2 y en el otro 4. Es normal la diferencia, nos habla de que hay personajes que comparten escena pero que no interactúan entre ellos. - -Si te diriges al **Laboratorio de datos**, verás que se han añadido varias columnas más en la tabla de nodos, ahora con los resultados de las 'medidas de centralidad'. La 'centralidad' en ARS tiene que ver con el lugar que ocupan los nodos en el conjunto de una red y nos ayuda a entender la 'importancia' de los nodos dentro del sistema que analizamos[^5]. Estas son algunas de las medidas de centralidad, pero hay unas cuantas más: -- El 'grado' o el 'grado con pesos' pueden ser medidas de centralidad, pues valores más altos indican mayor conectividad. En ese caso, nos referimos a ellas como ['centralidad de grado'](https://perma.cc/2SW2-LZT4) (degree centrality) y 'centralidad de grado con pesos' (weighted degree centrality). -- La ['centralidad de cercanía'](https://perma.cc/7E9Y-CH68) (closeness centrality) de un nodo se obtiene midiendo la distancia media que guarda dicho nodo con todos los demás del grafo. Dicho de otra forma, nos ayuda a encontrar el nodo más cercano a todos los demás, que no tiene por qué ser el de mayor grado (el más conectado). -- La ['centralidad de intermediación'](https://perma.cc/5YSB-9KVX) (betweenness centrality) de un nodo se halla calculando la cantidad de veces que dicho nodo se encuentra en el camino más corto entre todos los otros nodos. La importancia de los nodos depende, en este caso, de su labor de intermediación, de puente conector entre nodos separados. 
Si faltan estos nodos, la estructura de un grafo suele verse muy afectada. - -Por ejemplo, en la comedia con la que estamos trabajando, *Las bizarrías de Belisa*, ningún personaje tiene una centralidad de intermediación normalizada demasiado alta. No hay ningún nodo que eliminándolo provoque un 'grafo disconexo' en el que ciertos nodos queden desconectados del núcleo principal. - -Siguiendo en el panel de **Estadísticas** nos encontramos la **Densidad**. La ['densidad'](https://perma.cc/E5C7-XVX8) mide el nivel de conectividad entre todos los nodos de un grafo. Por ejemplo, un grafo tendría una densidad del 100% cuando todos los nodos están conectados entre sí. Matemáticamente la densidad se calcula a través de la proporción de aristas que tiene una red frente al total de aristas posibles, expresado el resultado en un rango [0,1]: cerca de 1 se dice que es un grafo 'denso'; cuanto más cerca de 0 se habla de un grafo 'disperso'. Haz clic en _Ejecutar_: -1. Se abrirá una ventana que nos permite seleccionar si nuestro grafo es dirigido o no dirigido. -2. Selecciona tu opción y haz clic en _Aceptar_. - -Nuevamente, hay diferencia entre la densidad del grafo de coaparición en escena y la del grafo de interacciones lingüísticas por el mismo motivo: hay personajes que comparten escena pero que no intercambian palabra. - -Vamos a saltar ahora al apartado **Community Detection**. En ARS se entiende por ['comunidad'](https://perma.cc/CJ23-HB7M) un grupo de nodos que están densamente interconectados entre sí y que a su vez están poco conectados con los nodos de otra comunidad: - -{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-15.png" alt="Grafo explicativo del concepto 'comunidad' con los nodos coloreados según la comunidad a la que pertenecen" caption="Figura 15. 
Ejemplo de grafo con comunidades coloreadas en dos colores distintos" %} - -Las distintas comunidades de un grafo se hallan implementando un 'algoritmo de [modularidad](https://perma.cc/PY99-MBVB)' que Gephi incorpora, que podemos utilizar simplemente haciendo clic en _Ejecutar_. -1. Se abrirá una ventana de **Parámetro de Modularidad**. No es necesario que modifiques nada: utiliza la opción de aleatoriedad y de incorporar los pesos de las aristas, y deja la resolución en 1 (modularidad estándar). -2. El algoritmo va a numerar las comunidades a partir del 0, pero si quieres que comience a contar en 1, simplemente cambia la opción **Classes start at: 1** y dale a _Aceptar_. - -Si implementas el algoritmo de modularidad en el grafo de interacciones lingüísticas directas comprobarás que se detectan tres comunidades de nodos. Puedes ver qué comunidad ha sido asignada a cada nodo en la nueva columna del **Laboratorio de datos**. Para visualizar las comunidades en el grafo, ve al panel **Apariencia** de la **Vista general** y cambia el color de los nodos eligiendo la partición **Modularity Class**, haciendo clic en _Aplicar_ con los colores por defecto o modificándolos. Debería quedarte un grafo similar a este: - -{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-16.png" alt="Captura de pantalla de la vista general del espacio de trabajo con la visualización del grafo de interacciones lingüísticas con los nodos coloreados según la comunidad a la que pertenecen: morado, verde o naranja" caption="Figura 16. Grafo de interacciones lingüísticas con los nodos coloreados según la comunidad a la que pertenecen, detectadas gracias al algoritmo de modularidad" %} - -Cuando has desplegado el menú de **Partición** en el color de los nodos habrás visto que han aparecido muchas más opciones de las que teníamos al principio, y es que puedes utilizar los resultados de las medidas que has ido implementando para colorear y dar tamaño a los nodos y aristas. 
Por ejemplo, utilizando la opción **Ranking** puedes poner el diámetro de los nodos en función de su centralidad de intermediación y el color graduado en intensidad según su grado. Esto te permitiría a golpe de vista comparar la diferencia entre ambas medidas para cada nodo. ¿Ves cómo las opciones se multiplican? - -### La previsualización: últimos ajustes y exportación de visualizaciones - -Para finalizar con el trabajo en Gephi, vamos a exportar alguna visualización en la pestaña de **Previsualización**. Al entrar, verás un panel grande en gris vacío: es donde aparecerá el grafo una vez introduzcas los parámetros en el panel de configuración de la izquierda. Haz una prueba: entra a la previsualización del espacio de trabajo **Coaparición en escena**, haz clic en _Refrescar_ y mira cómo se ve tu grafo con los parámetros que vienen por defecto. Estarás viendo el mismo grafo de la **Vista general** pero con algunos ajustes de visualización. Ahora modifica estos parámetros y deja el resto como están por defecto: -- Nodos: - - Ancho de borde: 0.0 -- Etiquetas de nodos: - - Mostrar etiqueta: activado - - Fuente: Arial 24 Sin Formato - - Tamaño proporcional: desactivado -- Aristas: - - Grosor: 20 - - Reescalar pesos: activado - - Color: original (es decir, el gradiente que pusimos en la vista general) -- Etiquetas de aristas - - Mostrar etiquetas: activado - - Fuente: Arial 14 Sin Formato - - Color: específico: #000000 - -Haz clic en _Refrescar_ de nuevo y debería aparecerte un grafo similar a este, quizá con otra rotación: - -{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-17.png" alt="Captura de pantalla de la pestaña de previsualización, con la columna de opciones finales de visualización a la izquierda y el grafo resultante a la derecha" caption="Figura 17. 
Visualización final del grafo de coaparición de personajes en escena" %} - -Ahora puedes exportar la visualización haciendo clic en _Exportar SVG/PDF/PNG_ en la parte inferior del panel de la izquierda. Como bien deduces, esos son los tres formatos que permite exportar Gephi. [PNG](https://perma.cc/3CAF-NZTD) es un buen formato de imagen, y podrás insertarlo en un documento de texto, utilizarlo para crear un póster o una presentación de diapositivas. Si seleccionas en el desplegable `Files of type` la opción `Archivos PNG (*.png)` y accedes al menú de **Opciones**, Gephi te permitirá configurar la resolución de la imagen, el margen alrededor del grafo y si quieres fondo transparente o no. - -{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-18.png" alt="Captura de pantalla de las ventanas del menú de exportación de visualizaciones" caption="Figura 18. Menú de exportación de visualizaciones" %} - -Otra buena opción es exportar en [SVG](https://perma.cc/EBJ4-C2KZ), el formato de gráficos vectoriales escalables que se suele utilizar en diseño gráfico, ya que son manipulables por ejemplo con [CSS](https://perma.cc/6M8D-Q4MS) y [JavaScript](https://perma.cc/2M3K-JRT8). Si quieres utilizar tus visualizaciones en un sitio web, puede que este formato sea el que más te convenga. Además, este formato lo puedes abrir y editar con programas de código abierto como [Inkscape](https://inkscape.org/es/) o [LibreOffice Draw](https://documentation.libreoffice.org/assets/Uploads/Documentation/es/DG76/PDF/DG76-Guia-de-Draw.pdf) o privativos como [Adobe Illustrator](https://www.adobe.com/es/products/illustrator.html). - -Si repites lo mismo con el grafo de interacción lingüística directa ahora podrás seleccionar si quieres aristas curvas (que marcan la dirección en el sentido de las agujas de un reloj) o rectas con flechas. 
Por ejemplo, reutiliza los parámetros anteriores y modifica estos: -- Aristas: - - Curvas: desactivado -- Flechas de aristas: - - Tamaño: 3.0 -- Etiquetas de aristas: - - Mostrar etiquetas: desactivado - -Haz clic en _Refrescar_ y verás algo así (con los nodos coloreados según su comunidad porque antes aplicamos este cambio en la vista general): - -{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-19.png" alt="Captura de pantalla de la pestaña de previsualización, con la columna de opciones finales de visualización a la izquierda y el grafo resultante a la derecha" caption="Figura 19. Visualización final del grafo de interacciones lingüísticas entre personjes" %} - -## Paso 4. Interpretación de los resultados - -Hemos generado visualizaciones y aplicado medidas a los grafos construidos gracias a los datos que primero extrajimos de *Las bizarrías de Belisa*. Las visualizaciones ya nos pueden ayudar en el análisis de una obra, por ejemplo, ilustrando un análisis de los personajes más 'tradicional'. Pero si has llegado hasta aquí seguramente lo que te interesa es tener en consideración los datos obtenidos de la aplicación de medidas, métricas y algoritmos. - -Primero creo que es necesario incidir en que los datos obtenidos de un análisis de redes sociales como el que hemos llevado a cabo deben analizarse cuidadosamente y no utilizarse para confirmar hipótesis sin una valoración crítica. En realidad, todo el proceso que has llevado a cabo, desde la elección del corpus hasta la creación de visualizaciones, debe considerarse parte del proceso crítico de investigación. Piensa, por ejemplo, en la tediosa extracción de datos y todas las decisiones interpretativas que has tomado. ¡Cualquier otra decisión variaría los resultados! Por eso debes insistir en ser consistente con el procedimiento y criterios de análisis que elijas, y comunicarlos con detalle para contextualizar tus resultados. 
- -Vamos entonces a explorar los datos y grafos obtenidos de nuestro análisis de redes sociales de *Las bizarrías de Belisa*. Mi primera recomendación es que, después de aplicar las medidas y algoritmos que te interesen, vayas al **Laboratorio de datos** y hagas clic en _Exportar tabla_ para exportar la tabla de nodos pero ahora con las nuevas columnas agregadas con más datos sobre los personajes. Gracias a este CSV podrás procesar los resultados cómodamente con lenguajes de programación como [R](https://perma.cc/7ESJ-S5K4) (enfocado al análisis estadístico) o [Python](https://perma.cc/BT4G-U7FE), o incluso con el mismo programa de hojas de cálculo que utilizaste para recoger tus datos. - -Hagamos esto último. Abre un nuevo archivo de hojas de cálculo e importa la tabla de nodos CSV del grafo de interacción lingüística que acabas de exportar de Gephi. Puedes llamar a este nuevo archivo `analisis-datos_Bizarrias`. ¿Qué podemos hacer ahora? Primero analicemos el grado de los personajes que, recordemos, cuantifica lo conectado que está un nodo con el resto de nodos de la red social. Los nodos además de 'grado' (a secas) también tienen 'grado con peso'. El primero tiene que ver con el número de personajes con los que habla un nodo (en un sentido y otro) y el segundo tiene en cuenta además la cantidad de interacciones. Fijémonos en las diferencias entre una y otra medida, observando estos gráficos generados en la hoja de cálculo mediante las opciones que ofrece Google Sheets: - -{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-20.png" alt="Gráficos de barras verticales con los gafos y grados con pesos de los personajes de la comedia analizada, ordenados de mayor a menor grado" caption="Figura 20. 
Grados y grados con pesos de los personajes de 'Las bizarrías de Belisa' según sus interacciones lingüísticas directas" %} - -Don Juan ha resultado ser el personaje que más interactúa, logrando el grado más alto de toda la red social (15) y superando a Belisa por un punto, la indiscutible protagonista femenina que incluso da nombre a la comedia. ¿Por qué? Si vamos a nuestro grafo podremos ver cómo Don Juan interactúa con Octavio y Julio, mientras que Belisa, aunque se enfrenta a ellos vestida de hombre y con espada, no cruza palabra durante dicho enfrentamiento. Sin embargo, si vemos los datos del grafo de coaparición en escena, son Belisa y su criada Finea quienes logran el grado más alto, convirtiéndose en los dos únicos personajes de la comedia que comparten escena al menos una vez con todos los demás personajes (por eso su grado es 10). Pero recordemos, compartir escena no significa necesariamente compartir diálogo, como nos demuestra el grafo dirigido. ¿Y en cuanto al grado con peso? Si volvemos al gráfico de barras, ahora sí Belisa logra la primera posición, y supera con creces a Don Juan. Su grado con peso es 318, es decir, se dirige 157 veces a otros personajes y es receptora de 161 intervenciones. Como vemos, en función de qué nos interese estudiar de un texto teatral, puede interesarnos más un criterio de análisis u otro. - -Veamos por último un dato global de los grafos: su densidad. El grafo de coaparición en escena (no dirigido) tiene una densidad de 0,764, mientras que el de interacción lingüística alcanza tan solo 0,464. ¿Qué nos aporta esta información? *Las bizarrías de Belisa* se trata de una comedia bastante densa en cuanto a la coaparición de personajes en escena (cuanto más cerca de 1, mayor densidad). Son pocos personajes, tan solo diez, y la configuración de la acción genera que compartan muchas escenas. 
Lope escribió una comedia urbana del gusto de la época, alejado ya de sus primeras incursiones al género en las que el reparto superaba los 20 personajes y las acciones estaban más dispersas. Sin embargo, la densidad del grafo dirigido no llega al medio punto, lo que nos demuestra que aunque los personajes coinciden en escena, no significa que necesariamente dialoguen. La diferencia entre la densidad de los dos tipos de grafo en esta comedia podemos explicarla principalmente por la situación particular de Octavio, galán rival de don Juan (por ser pretendiente también de Lucinda, la segunda dama). Aunque sabemos que Octavio visita a Lucinda (le vemos salir de su casa), esta pareja nunca interactúa en el escenario. Es una situación quizá algo atípica pero que entendemos por el desdoblamiento de galanes rivales: don Juan y el Conde pretenden a Belisa, y don Juan y Octavio pretenden a Lucinda. Dado que la acción amorosa principal es la de Belisa, Lope no dedica demasiados versos al desarrollo de la relación entre Octavio y Lucinda. - -No podemos explorar todos los resultados del análisis practicado sobre *Las bizarrías de Belisa*, así que sirva lo dicho para comprender el tipo de conclusiones a las que nos llevan los datos y grafos generados. Por último, apuntar las posibilidades del análisis comparado de redes sociales, es decir, a partir de un corpus de dos o más obras. Por ejemplo, este es un gráfico en el que se compara el grado con pesos normalizado (sobre 1) de los primeros galanes y primeras damas de ocho comedias urbanas de Lope de Vega (en orden cronológico), entre las que se incluye la que hemos utilizado en esta lección: - -{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-21.png" alt="Diagrama de dispersión de puntos con líneas de tendencia comparando el grado con pesos normalizado de los primeros galanes y primeras damas de ocho comedias urbanas de Lope de Vega" caption="Figura 21. 
Gráfico comparativo del grado con pesos normalizado de los primeros galanes y primeras damas de ocho comedias urbanas de Lope de Vega (elaboración propia, Merino Recalde (2022)" %} - -## Recapitulación final - -Terminemos esta lección anotando las cuestiones elementales que deberás tener en cuenta cuando realices un análisis de redes sociales de textos teatrales: -1. Divide el proceso en cuatro partes diferenciadas: - a. Creación del corpus - b. Extracción y estructuración de datos - c. Visualizaciones y análisis - d. Interpretación de los resultados (datos y grafos) -2. Documenta el proceso y la toma de decisiones. Sé consistente en ello. Procura basarte siempre en criterios preestablecidos, ya sean provenientes de otras investigaciones que trabajen con el mismo tipo de obras o diseñados por ti en función de tus objetivos y del corpus de análisis. -3. Procura guardar tus datos finales en [formatos abiertos](https://perma.cc/M2XM-DYUZ) que garanticen el acceso a los datos a largo plazo, como el CSV (`.csv`). Si únicamente guardas tus datos en formato excel (`.xlxs`) o en la extensión del propio Gephi (`.gephi`) puede que tu archivo termine corrompiéndose o fallando. Un CSV tiene una vida más larga, es más fácil de preservar y rápidamente puedes importarlo, transformarlo y volver sobre tus datos para reconstruir tus grafos y análisis. -4. Cuando generes visualizaciones anota los parámetros que utilizaste (tamaño de los nodos, colores, algoritmo de distribución, etc.). Es importante que acompañes tus resultados de esta información, pues ayuda a entender y contextualizar las representaciones. - -Y sobre todo, no tengas miedo de probar y explorar todas las posibilidades que nos ofrece el análisis de redes para estudiar la literatura teatral. - -## Notas - -[^1]: Existen otros programas y herramientas de análisis de redes que podemos mencionar. 
Por ejemplo, [Cytoscape](https://cytoscape.org/) es otro programa de código abierto y libre descarga, muy utilizado en bioinformática. También hay aplicaciones web: [Palladio](http://hdlab.stanford.edu/palladio/), desarrollada por el Humanities+Design Research Lab de la Standford University y pensada para la investigación histórica; o [ONODO](https://onodo.org/), una aplicación muy sencilla que permite crear redes e implementar medidas fácilmente. -[^2]: Esta lección se ha preparado con la versión 0.9.7 de Gephi. En 2022, y tras cinco años sin actualizaciones, se han publicado 5 versiones nuevas corrigiendo errores (bug fixes) y añadiendo mejoras. Por ejemplo, desde la versión 0.9.3 ya no es necesario instalar Java para que Gephi funcione en Windows y Linux, lo que causaba numerosos problemas en Windows. Durante las revisiones de está lección se han publicado las versiones 0.10 y 0.10.1, pero sus actualizaciones no impiden el correcto seguimiento de esta lección. Puedes leer más acerca de las actualizaciones de Gephi en [https://gephi.wordpress.com/2022/05/11/transition-to-semantic-versioning/](https://perma.cc/XPF2-ZKJY) y en [https://github.com/gephi/gephi/releases](https://perma.cc/NQL4-77P2). -[^3]: Por ejemplo, este estupendo videotutorial en 5 partes de Salvador Sánchez, disponible en YouTube: [https://www.youtube.com/playlist?list=PLIvIcfwy1T6IDiW3K10TplK3rvdwMLOb2](https://www.youtube.com/playlist?list=PLIvIcfwy1T6IDiW3K10TplK3rvdwMLOb2). O la *introducción rápida a Gephi* de José Manuel Galán, también en Youtube: [https://www.youtube.com/watch?v=sX5XYec4tWo](https://www.youtube.com/watch?v=sX5XYec4tWo). -[^4]: Si te interesa conocer más sobre cómo funciona `ForceAtlas 2` y sabes inglés, te recomiendo este artículo de sus desarrolladores: Jacomy, Mathieu, Tommaso Venturini, Sebastien Heymann, y Mathieu Bastian. «ForceAtlas2, a Continuous Graph Layout Algorithm for Handy Network Visualization Designed for the Gephi Software». 
PLoS ONE 9, n.º 6 (2014): e98679. [https://doi.org/10.1371/journal.pone.0098679](https://doi.org/10.1371/journal.pone.0098679). -[^5]: 'Importancia' es un concepto algo complejo. Debemos diferenciar la importancia de los nodos según su centralidad (una importancia cuantitativa derivada del ARS) y la importancia que le otorgamos a los personajes (una importancia cualitativa, por ejemplo: protagonista, secundario, terciario, etc.). La correlación entre estos dos tipos de importancia no siempre se da, como demuestran Santa María Fernández et al. en un estudio de 2020. Te recomiendo este artículo para explorar en profundidad las implicaciones de las medidas de centralidad: Santa María Fernández, Teresa, José Calvo Tello, y Concepción María Jiménez Fernández. «¿Existe correlación entre importancia y centralidad? Evaluación de personajes con redes sociales en obras teatrales de la Edad de Plata». Digital Scholarship in the Humanities 36, n.º June (2020): i81-i88. [https://doi.org/10.1093/llc/fqaa015](https://doi.org/10.1093/llc/fqaa015). +--- +title: "Análisis de redes sociales de personajes teatrales (parte 2)" +slug: analisis-redes-sociales-teatro-2 +layout: lesson +collection: lessons +date: 2023-11-30 +authors: +- David Merino Recalde +reviewers: +- Sara Arribas Colmenar +- Andrés Lombana +editors: +- Jennifer Isasi +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/547 +previous: /es/lecciones/analisis-redes-sociales-teatro-1 +series_total: 2 lessons +sequence: 2 +difficulty: 2 +activity: analyzing +topics: [network-analysis, distant-reading, data-visualization] +abstract: En esta lección aprenderás a realizar un Análisis de Redes Sociales con los personajes de un texto teatral. Aprenderás sobre la importación de datos a Gephi, la creación de visualizaciones, la implementación de medidas y algoritmos, y el análisis e interpretación de los resultados. 
+avatar_alt: Recorte de dibujo a pluma de la escenografía usada en la representación de la comedia 'La fiera, el rayo y la piedra' de Pedro Calderón de la Barca en 1690, en el que se puede ver a varios personajes interactuando en escena. +doi: 10.46430/phes0065 +--- + +{% include toc.html %} + +## Introducción a la segunda parte + +Esta es la segunda parte de la lección _Análisis de redes sociales de personajes teatrales_. En la [primera parte](/es/lecciones/analisis-redes-sociales-teatro-1) conocimos algunas de las aplicaciones del análisis de redes sociales (ARS) a los estudios literarios y aprendimos los conceptos y nociones necesarias para enfrentarnos a esta metodología computacional-cuantitativa. Además, establecimos que para llevar a cabo un análisis de redes sociales de personajes teatrales debemos seguir una serie de pasos consecutivos: + + * Paso 1. Creación del corpus de análisis + * Paso 2. Conseguir los datos + * Toma de decisiones para la extracción de datos + * Extracción y estructuración de datos + * El proceso de vaciado + * Paso 3. Visualización y análisis de grafos con Gephi + * Paso 4. Interpretación de los resultados + +Ya hemos visto los pasos 1 y 2, y en esta segunda parte trataremos los dos últimos pasos. Si has seguido la primera parte de la lección cuentas con todos los archivos necesarios para continuar. Si has saltado directamente a la segunda parte porque lo que te interesa es aprender visualización y análisis de grafos con [Gephi](https://gephi.org/), debes descargar ahora los archivos que utilizaremos aquí. En cualquier caso, recomendamos leer la primera parte, pues es importante comprender el proceso de extracción y recogida de datos para poder analizar correctamente los resultados del análisis. ¡Vamos a ello! + +## Paso 3. 
Visualización y análisis de grafos con Gephi + +Tenemos tres archivos CSV: por un lado, una [lista de nodos](/assets/analisis-redes-sociales-teatro-1/nodos_bizarrias.csv) (`nodos_bizarrias.csv`); por el otro, la [lista de aristas](/assets/analisis-redes-sociales-teatro-1/aristas-coaparicion_bizarrias.csv) de un grafo no dirigido (`aristas-coaparicion_bizarrias.csv`) y la [matriz de adyacencia](/assets/analisis-redes-sociales-teatro-1/aristas-interaccion_bizarrias.csv) de uno dirigido (`aristas-interaccion_bizarrias.csv`), según el criterio de la coaparición de personajes en escena y el de interacciones lingüísticas directas entre personajes, respectivamente. El siguiente paso es generar visualizaciones, los grafos propiamente dichos, y analizarlos aplicando lo que se conoce como 'medidas' o 'métricas' de ARS. + +### Instalación de Gephi y primeros pasos + +El programa que vamos a utilizar para llevar a cabo todo esto se llama [Gephi](https://gephi.org/), pero existen muchos otros para los que también te servirán los archivos CSV que hemos preparado[^1]. Gephi es un software libre de código abierto especializado en análisis de redes, muy conocido y utilizado en Humanidades Digitales, bastante intuitivo, y que es sostenido y actualizado por sus desarrolladores[^2]. Además, disponemos de numerosos [plugins](https://gephi.org/plugins/#/) (complementos de software que añaden funcionalidades al programa), [guías de uso](https://perma.cc/4RFA-TZB9), videotutoriales en español[^3] y una comunidad activa en Twitter/X y Github a la que consultar nuestras dudas. + +Lo primero que debemos hacer es instalar el programa. En su sitio web, [https://gephi.org/](https://gephi.org/), haz clic en _Download FREE_. Está disponible para Windows, Mac OS y Linux. Es posible que la web reconozca tu sistema operativo y te ofrezca lo que necesitas, si no, selecciona en el apartado **All Downloads** de tu sistema operativo. 
Si necesitas ayuda con la instalación, puedes visitar [https://gephi.org/users/install/](https://perma.cc/YF6E-994N) (está solo disponible en inglés, pero puedes consultar los primeros minutos de este [videotutorial en español](https://www.youtube.com/watch?v=sX5XYec4tWo)). + +Una vez que finalices la instalación, ejecuta Gephi. Se abrirá una ventana de bienvenida con distintas opciones: crear un nuevo proyecto, abrir un archivo de grafo ya existente, una columna con proyectos y archivos recientes (si los hubiese) y varios proyectos de ejemplo. Haz clic en _Nuevo proyecto_: + +{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-01.png" alt="Captura de pantalla de la ventana de bienvenida al programa Gephi, con las opciones de crear un nuevo proyecto, abrir recientes o proyectos de ejemplo" caption="Figura 1. Ventana de bienvenida de Gephi" %} + +Ahora estás en la pantalla principal del programa. Gephi funciona mediante proyectos (fíjate que te indicará en la barra superior que estás en el **Proyecto 1**), y dentro de cada proyecto puedes crear distintos espacios de trabajo. Ahora estás en el **Espacio de trabajo 1**. Cada espacio de trabajo funciona como la pestaña de un navegador web y contiene a su vez los tres apartados de Gephi: **Vista general**, **Laboratorio de datos** y **Previsualización**. + +{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-02.png" alt="Captura de pantalla de la pantalla principal del programa Gephi, la llamada vista general" caption="Figura 2. Pantalla principal de Gephi, la Vista general" %} + +
    +Si te aparece el programa en inglés te recomiendo cambiar el idioma, pues esta lección se ha preparado con Gephi en español. Puedes hacerlo fácilmente en Tools > Language > Español. Te indicará que el programa debe cerrarse y que deberás reiniciarlo manualmente, es decir, volver a abrirlo. No es necesario que guardes nada si aún no has importado ningún dato. +
    + +En la pestaña **Vista general**, se crean las visualizaciones y se aplican los filtros y medidas para analizar los grafos. En **Laboratorio de datos** se trabaja con los datos que generan los grafos, pudiéndose importar o introducir directamente, modificar y exportar. En el apartado de **Previsualización** se realizan los últimos ajustes para generar y exportar las visualizaciones (grafos) en formato de imagen `.svg`, `.pdf` o `.png`. + +Comencemos a trabajar: +1. En la barra de opciones superior, haz clic en **Espacio de trabajo** > **Nuevo** para crear un nuevo espacio de trabajo. +2. Renombra los dos espacios creados. Dentro de cada espacio, haz clic en **Espacio de trabajo** > _Renombrar_. Denomina al primero 'Coaparición en escena', y al segundo, 'Interacción lingüística'. +3. Guarda el proyecto en **Archivo** > _Guardar como_, y denomínalo `bizarrias.gephi`. + +### El laboratorio de datos: importación de aristas y nodos + +Ahora vamos a importar nuestros datos. Lo haremos en paralelo con los dos grafos, pues te ayudará a no perderte. Primero las aristas del grafo de coaparición de personajes en escena: +1\. En el espacio de trabajo 'Coaparición en escena', dirígete al **Laboratorio de datos** y haz clic en _Importar hoja de cálculo_. +2\. Busca y selecciona el archivo `aristas-coaparicion_bizarrias.csv` y haz clic en _Abrir_. +3\. Se abrirá una primera ventana de **Opciones generales de CSV**. Seguramente Gephi ha detectado que se trata de una tabla de aristas, que el separador es la coma y que el formato de codificación de caracteres es UTF-8. Si no, selecciona estas opciones en los desplegables y haz clic en _Siguiente_. + +{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-03.png" alt="Captura de pantalla de la ventana de importación de hojas de cálculo con las opciones generales de importación desde archivos CSV para la lista de aristas" caption="Figura 3. 
Ventana de importación de hojas de cálculo con las opciones generales para la lista de aristas" %} + +4\. En la siguiente ventana, **Parámetros de importación**, deja seleccionadas todas las casillas, pues queremos importar nuestras cinco columnas. Gephi reconoce el tipo de datos: `double` (números) para el peso y `string` (cadena de caracteres) para las etiquetas. Haz clic en _Terminar_. +5\. Ahora te aparecerá la última ventana del proceso: el **Informe de importación**. Verás que Gephi ha detectado que se trata de un grafo 'no dirigido' con 11 nodos y 42 aristas, y que no encuentra ningún problema en el archivo. Muy importante: cambia la selección de **Nuevo espacio de trabajo** a **Añadir al espacio de trabajo existente**. Queremos que nos importe los datos en el espacio en el que estamos trabajando, **Coaparición en escena**. Cuando lo hagas, haz clic en _Aceptar_. + +{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-04.png" alt="Captura de pantalla del informe de importación de una lista de aristas, con opciones finales como seleccionar el tipo de grafo o en qué espacio de trabajo se quiere realizar la importación" caption="Figura 4. Ventana con el informe de importación de la lista de aristas" %} + +Verás que ha aparecido una tabla con los `id` de los personajes en la pestaña **Nodos** y una tabla con las relaciones en la pestaña **Aristas**. Gephi ha extraido esta información de nuestra lista de aristas, asignando además un `id` a cada arista. + +Ahora vamos a importar las aristas del grafo de interacciones lingüísticas directas, siguiendo los mismos pasos: +1. Dentro del espacio de trabajo **Interacción lingüística** dirígete al **Laboratorio de datos** y haz clic en _Importar hoja de cálculo_. +2. Busca y selecciona el archivo `aristas-interaccion_bizarrias.csv` y haz clic en _Abrir_. +3. Se abrirá una primera ventana de **Opciones generales de CSV**. 
Seguramente Gephi ha detectado que se trata de una matriz, que el separador es la coma y que el formato de codificación de caracteres es UTF-8. Si no, selecciona estas opciones en los desplegables y haz clic en _Siguiente_. +4. En la siguiente ventana, **Parámetros de importación**, simplemente haz clic en _Terminar_. Ahora no hay columnas entre las que poder elegir. +5. Por último te aparecerá la ventana **Informe de importación**. Verás que Gephi ha detectado que se trata de un grafo 'dirigido' con 11 nodos y 51 aristas, y que no encuentra ningún problema en el archivo. Muy importante: cambia la selección de **Nuevo espacio de trabajo** a **Añadir al espacio de trabajo existente**. Como antes, queremos que nos importe los datos en el espacio en el que estamos trabajando, **Interacción lingüística**. Cuando lo hagas, haz clic en _Aceptar_. + +Gephi ha importado nuestra matriz y la ha transformado en una lista de aristas con un nodo de origen, otro de destino, un tipo de relación, un peso y un `id`. Además, ha creado 11 nodos utilizando como etiqueta el `id` numérico que les asignamos. + +En la nueva lista de aristas importada, que puedes ver en la pestaña **Aristas** del **Laboratorio de datos**, verás que nos faltan los atributos (‘Label’, etiqueta) que sí pudimos importar en el grafo de coaparición en escena, pues venían ya en nuestro archivo CSV. Nos faltan las relaciones entre los personajes: amor correspondido, amistad, servidumbre, etc. Para poder visualizarlas en este grafo tendremos que introducirlas manualmente en la columna correspondiente (‘Label’, etiqueta). Puedes coger esta información de la lista de aristas del grafo no dirigido, teniendo en cuenta que ahora las relaciones están duplicadas y también tendrás, por tanto, que duplicar las etiquetas. Es decir, etiqueta como `amor correspondido` la relación de Belisa (nodo 1) a Don Juan (nodo 6) y también un `amor correspondido` de Don Juan (nodo 6) a Belisa (nodo 1). 
Y una relación de `amistad` de Belisa (nodo 1) a Celia (nodo 3) y otra relación de `amistad` de Celia (nodo 3) a Belisa (nodo 1). Cuando termines, tu lista de aristas dirigidas debería verse así: + +{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-05.png" alt="Captura de pantalla del laboratorio de datos en la pestaña de aristas, ya con todos los datos introducidos" caption="Figura 5. Pestaña de aristas después de introducir manualmente las etiquetas de las relaciones" %} + +Con las aristas preparadas, ahora vamos a importar los datos referentes a los nodos de los dos grafos. Los pasos ahora son exactamente los mismos para los dos grafos, así que hazlo primero en un espacio de trabajo y luego en el otro: + +1. Dentro del **Laboratorio de datos** de cada espacio de trabajo vuelve a hacer clic en _Importar hoja de cálculo_. +2. Ahora busca y selecciona el archivo [`nodos_bizarrias.csv`](/assets/analisis-redes-sociales-teatro-1/nodos_bizarrias.csv) y haz clic en _Abrir_. +3. En esta ocasión Gephi habrá detectado que se trata de una 'tabla de nodos', que nuevamente el separador es la coma y que la codificación de caracteres es UTF-8. Si no, selecciona estas opciones en los desplegables y haz clic en _Siguiente_. +4. En la ventana **Parámetros de importación**, mantén seleccionadas todas las casillas; queremos que importe las cuatro columnas. Ahora ha detectado que tanto la columna `género` como `función` son cadenas de caracteres. Haz clic en _Terminar_. +5. En la última ventana, **Informe de importación**, cerciórate de que ha identificado 11 nodos y que no hay problemas en la importación. En el desplegable referente al tipo de grafo, selecciona **No dirigido** o **Dirigido** en función del grafo al que estés importando los nodos. Importante: cambia una vez más la opción de **Nuevo espacio de trabajo** a **Añadir al espacio de trabajo existente**. Después, haz clic en _Aceptar_. 
+ +{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-06.png" alt="Captura de pantalla de la ventana con el informe de importación de la lista de nodos" caption="Figura 6. Ventana con el informe de importación de la lista de nodos del grafo de coaparición de personajes en escena" %} + +Gephi ha importado la lista de nodos y ha combinado la nueva información con los nodos que creó antes a partir de la lista de aristas o la matriz de adyacencia. Este es el motivo por el que era importante sustituir los nombres de los personajes por su `id` antes de exportar las hojas de cálculo a CSV. Así, Gephi ha podido identificar quién es quién y fusionar los datos de ambos archivos. + +¡Enhorabuena! Hemos terminado la importación de los datos de los dos grafos, ahora podemos pasar a trabajar en la pestaña **Vista general**. + +### La vista general +La **Vista general** es donde modificaremos la visualización de nuestros grafos (que se ve en el centro del programa) y donde aplicaremos las medidas y métricas de análisis. A la izquierda tienes las opciones de visualización (los paneles **Apariencia** y **Distribución**), y a la derecha están el panel con información sobre el grafo (**Contexto**) y los paneles **Filtros** y **Estadísticas** para consultar y analizar el grafo: + +{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-07.png" alt="Captura de pantalla de la vista general del espacio de trabajo con una primera visualización del grafo, aún sin cambiar parámetros de visualización" caption="Figura 7. Vista general de nuestro espacio de trabajo" %} + +Las opciones de visualización y análisis son muy numerosas y no las cubriremos todas en esta lección, así que para explorar e introducirnos en Gephi vamos a crear una visualización sencilla y aplicar solo algunas medidas básicas. A partir de ahora todos los pasos que des en un espacio de trabajo puedes replicarlos en el otro. 
Así, repetir los mismos pasos dos veces te servirá además para aprender a usar el programa. Después, te animo a continuar probando todas las demás opciones y configuraciones por tu cuenta. + +#### Modificar la apariencia y distribución del grafo + +En el centro de la **Vista general**, en el panel llamado **Grafo**, nos ha tenido que aparecer una red con nodos y aristas en negro. Seguramente, el grafo de la captura de arriba (es el de coaparición en escena) no es exactamente igual al que te ha aparecido a ti. Es normal, se ha generado con una distribución de nodos aleatoria. Comencemos a dar forma y color a nuestra red de personajes: + +1. Para desenmarañar la red empezaremos por aplicar un 'algoritmo de distribución'. En el panel de abajo a la izquierda, **Distribución** elige el algoritmo `ForceAtlas 2` y modifica estos parámetros: escalado 2500 y activar _Evitar el solapamiento_. Lo demás puedes dejarlo como está por defecto. Haz clic en _Ejecutar_ y cuando el grafo se estabilice y deje de moverse, haz clic en _Parar_. ¿Qué ha ocurrido? Los nodos han comenzado a repelerse (alejarse) entre ellos a la vez que las aristas que los conectan los han intentado atraer. Así, se ha generado un movimiento que ha terminado convergiendo en una posición balanceada para cada nodo en la que aquellos personajes más conectados entre sí han quedado más cerca y los menos conectados más alejados. El objetivo de este algoritmo de distribución no es otro que colocar los nodos de forma que nos ayude a entender e interpretar mejor el grafo [^4]. Además de `ForceAtlas 2` existen otros algoritmos, como puedes comprobar en el desplegable, pero este nos ofrece buenos resultados y es uno de los más extendidos. +2. Ahora haz clic en el icono 'T' negro que se encuentra en la cinta de opciones inferior, a la derecha de la cámara fotográfica, en la parte inferior del panel del Grafo. Has activado las etiquetas (label) de los nodos, es decir, los nombres de los personajes. 
Puedes modificar el tamaño, tipografía y color en el resto de opciones de la cinta. +3. Vamos a modificar ahora el color y el tamaño de los nodos y aristas. Para ello, ve al panel **Apariencia** (arriba a la izquierda) y sigue estas indicaciones: +a. En **Nodos-Color** (icono de la paleta de pintura), selecciona **Partición** y escoge el atributo `Función`. Gephi asigna un color distinto a cada valor del atributo, puedes modificar la paleta de colores o dejar los colores por defecto y hacer clic en _Aplicar_. Los nodos del grafo se han coloreado y también lo han hecho las aristas. Ve a la cinta de opciones inferior y deselecciona la opción **Las aristas tienen el color del nodo de origen**, su icono es una línea con un arcoíris. Ahora las aristas serán todas de un mismo color gris. +b. En **Nodos-Tamaño** (icono de los círculos), selecciona **Ranking** y escoge el atributo `Grado` (Gephi calcula automáticamente el grado de los nodos). Cambia el tamaño mínimo a 10 y el máximo a 40 y haz clic en _Aplicar_. Ahora los nodos tienen un tamaño relativo a su grado, es decir, a la cantidad de nodos con los que están relacionados. A mayor número de personajes con los que comparte escena un personaje -> mayor grado del nodo que representa el personaje -> mayor diámetro del nodo en la visualización. +c. En **Aristas-Color** (icono de la paleta de pintura), selecciona **Ranking** y escoge el atributo `Peso`. Te aparecerá un gradiente de color. Puedes cambiar la paleta de colores o dejarlo en verde y hacer clic en _Aplicar_. Ahora el color de las aristas está más o menos intenso en función de su peso, es decir, del número de escenas que comparten dos personajes o de sus interacciones lingüísticas. Si las ves muy finas, puedes cambiar el tamaño de las aristas en la cinta de opciones inferior, están por defecto más o menos gruesas también según el peso. 
+ +Seguramente te ha quedado algo muy similar a esto en el caso del grafo de coaparición de personajes en escena: + +{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-08.png" alt="Captura de pantalla de la vista general del espacio de trabajo con la visualización del grafo una vez aplicados los parámetros de visualización escogidos" caption="Figura 8. Visualización del grafo de coaparición de personajes en escena, resultado de aplicar los parámetros indicados" %} + +¡Enhorabuena! Ahora puedes ver cuáles son los personajes más relacionados (`grado`) por el tamaño de los nodos, la `función` de estos personajes por el color de los nodos y la cantidad de veces que dos personajes coinciden en escena o interactúan entre ellos (`peso`) por el grosor y la intensidad de color de sus aristas. Si comparas la captura con tu vista del grafo de coaparición en escena puede que tu grafo tenga otra disposición. En realidad tus nodos y los míos están colocados en el mismo sitio y a la misma distancia, solo que están rotados en otro sentido. En el panel de **Distribución** puedes utilizar la opción **Rotar** (en el desplegable) y buscar una disposición que te guste más. No cambiará la distribución que creó el algoritmo `ForceAtlas 2`. Otras opciones que puedes explorar son **Contracción** y **Expansión**, o **Ajuste de etiquetas** si alguna está superpuesta. + +Una vez repitas los pasos también en el espacio de trabajo del grafo de interacciones lingüísticas y hayas modificado su apariencia verás que en este caso las aristas tienen flechas que nos indican la dirección de las relaciones, se trata de un grafo dirigido: + +{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-09.png" alt="Captura de pantalla de la vista general del espacio de trabajo con la visualización del grafo una vez aplicados los parámetros de visualización escogidos" caption="Figura 9. 
Visualización del grafo de interacciones lingüísticas entre personajes, resultado de aplicar los parámetros indicados" %} + +También puedes activar las etiquetas de las aristas, haciendo clic en la 'T' blanca en la cinta de opciones de debajo del grafo. El color de las etiquetas y su tamaño deberás modificarlo en **Apariencia**, en la pestaña **Aristas-A subrayada** (color) y en la pestaña **Aristas-tT** (tamaño): + +{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-10.png" alt="Captura de pantalla de la vista general del espacio de trabajo con la visualización del grafo según los parámetros escogidos y con las etiquetas de las aristas visibles" caption="Figura 10. Visualización del grafo de coaparición de personajes en escena con las etiquetas de las aristas activadas" %} + +#### El contexto y los filtros + +Nos quedan por explorar los paneles de configuración de la derecha. El de **Contexto** nos da información sobre el grafo en pantalla. Por ejemplo, en el de interacciones lingüísticas nos dice que se trata de un 'grafo dirigido' con 11 nodos y 51 aristas. + +Vamos a probar los filtros, por ejemplo, filtrando cualquiera de los grafos según el género de los personajes: +1. En el panel **Filtros**, despliega las carpetas **Atributos** y **Partición** (dentro de la primera). +2. Selecciona el atributo `género (Nodo)` y arrástralo al panel de **Consultas**. +3. Haz clic en _Mujer (45,45 %)_ y en _Filtrar_. + +Verás algo similar a esto, un grafo solo con los personajes clasificados por ti como **Mujer**: + +{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-11.png" alt="Captura de pantalla de la vista general del espacio de trabajo con el resultado de filtrar el grafo según el atributo 'mujer'" caption="Figura 11. Grafo resultante de filtrar por el atributo 'Mujer'" %} + +Puedes hacer lo mismo con los personajes **Hombre** o utilizar otro atributo para el filtrado, como la función de los personajes. 
Con cada filtro que apliques verás que la información del **Contexto** cambia. Para volver atrás, elimina el filtro con el botón derecho _Suprimir_ sobre el filtro o haciendo clic en _Restaurar_. + +#### Medidas, métricas y algoritmos de análisis + +Ahora vamos a aplicar algunas medidas en el panel **Estadísticas**. Te dejaré explicaciones de cada una. Gephi ha simplificado al máximo el análisis de los grafos, pues es tan fácil como hacer clic en _Ejecutar_ en la medida o algoritmo que queramos implementar. Algunas de estas medidas abrirán una ventana emergente al ejecutarlas, un pequeño informe que podemos descargar u opciones de configuración. Otras, simplemente añadirán columnas en nuestra tabla de nodos del **Laboratorio de datos**. Estos nuevos datos, generados gracias a la aplicación de medidas, nos dan más información sobre nuestro grafo, nos permiten modificar la visualización en base a ellos (son como nuevos atributos) y exportándolos podremos procesarlos en otra herramienta o programa. En esta lección no nos adentraremos ahí, pero quiero que sepas que a partir de aquí las posibilidades se multiplican. + +En el apartado **Visión general de la red** lo primero que encontramos es el ['grado medio'](https://perma.cc/M8B7-34LD), es decir, la media de los grados de todos los nodos del grafo. Recordemos que el grado es el número de nodos con los que un nodo está conectado. En el caso de los grafos dirigidos, obtendremos además el 'grado medio de entrada' y el 'grado medio de salida'. Después, el 'grado medio con pesos', que tiene en cuenta el peso de las aristas conectadas a un nodo y no simplemente la cantidad de nodos con los que se conecta. De nuevo, habrá un 'grado medio con pesos de entrada' y un 'grado medio con pesos de salida'. 
Al ejecutar estas dos estadísticas, se añadirán dos columnas nuevas en la tabla de nodos del **Laboratorio de datos** con los valores de grado y grado con peso de cada nodo: + +{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-12.png" alt="Captura de pantalla del laboratorio de datos del grafo de interacciones lingüísticas con columnas resultantes de aplicar las medidas de grado" caption="Figura 12. Laboratorio de datos del grafo de interacciones lingüísticas con las nuevas columnas de grado" %} + +El 'diámetro de la red' es una de las medidas de tamaño o distancia. Para entenderlo, primero has de saber que en análisis de redes se entiende por 'camino' una secuencia de nodos conectados por aristas. Esta noción de camino nos permite calcular las métricas de distancia y tamaño de la red. Por otro lado, se entiende por ['distancia'](https://perma.cc/YYA3-ZLG9) o 'longitud' de un camino el número de aristas (no de nodos) que deben cruzarse para ir de un nodo a otro (siempre por el camino más corto). El ['diámetro'](https://perma.cc/2EU8-J4ZR) es, entonces, la distancia entre los nodos más alejados de una red: + +{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-13.png" alt="Grafo explicativo del concepto 'diámetro', con las aristas que sirven para medir el diámetro coloreadas" caption="Figura 13. Ejemplo del diámetro de una red" %} + +Haz clic en _Ejecutar_ el diámetro: +1. En la ventana que se ha abierto encontrarás definiciones de las métricas de distancia: distancia media, diámetro y las medidas de centralidad de intermediación, cercanía y excentricidad. Al ejecutar esta función, no solo se calcula el diámetro sino todas esas métricas relacionadas con la distancia. +2. Gephi te permite normalizar las centralidades (ahora veremos lo que son) en un rango [0,1], lo que facilita después la comparación de grafos de obras distintas. Marca esta opción y haz clic en _Aceptar_.
+ +{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-14.png" alt="Captura de pantalla de la ventana de parámetros que se abre para ejecutar las medidas de distancia de un grafo" caption="Figura 14. Ventana de parámetros de distancia del grafo de coaparición de personajes en escena" %} + +Si comparas el diámetro de los dos grafos verás que hay diferencias: en uno es 2 y en el otro 4. Es normal la diferencia, nos habla de que hay personajes que comparten escena pero que no interactúan entre ellos. + +Si te diriges al **Laboratorio de datos**, verás que se han añadido varias columnas más en la tabla de nodos, ahora con los resultados de las 'medidas de centralidad'. La 'centralidad' en ARS tiene que ver con el lugar que ocupan los nodos en el conjunto de una red y nos ayuda a entender la 'importancia' de los nodos dentro del sistema que analizamos[^5]. Estas son algunas de las medidas de centralidad, pero hay unas cuantas más: +- El 'grado' o el 'grado con pesos' pueden ser medidas de centralidad, pues valores más altos indican mayor conectividad. En ese caso, nos referimos a ellas como ['centralidad de grado'](https://perma.cc/2SW2-LZT4) (degree centrality) y 'centralidad de grado con pesos' (weighted degree centrality). +- La ['centralidad de cercanía'](https://perma.cc/7E9Y-CH68) (closeness centrality) de un nodo se obtiene midiendo la distancia media que guarda dicho nodo con todos los demás del grafo. Dicho de otra forma, nos ayuda a encontrar el nodo más cercano a todos los demás, que no tiene por qué ser el de mayor grado (el más conectado). +- La ['centralidad de intermediación'](https://perma.cc/5YSB-9KVX) (betweenness centrality) de un nodo se halla calculando la cantidad de veces que dicho nodo se encuentra en el camino más corto entre todos los otros nodos. La importancia de los nodos depende, en este caso, de su labor de intermediación, de puente conector entre nodos separados. 
Si faltan estos nodos, la estructura de un grafo suele verse muy afectada. + +Por ejemplo, en la comedia con la que estamos trabajando, *Las bizarrías de Belisa*, ningún personaje tiene una centralidad de intermediación normalizada demasiado alta. No hay ningún nodo que eliminándolo provoque un 'grafo disconexo' en el que ciertos nodos queden desconectados del núcleo principal. + +Siguiendo en el panel de **Estadísticas** nos encontramos la **Densidad**. La ['densidad'](https://perma.cc/E5C7-XVX8) mide el nivel de conectividad entre todos los nodos de un grafo. Por ejemplo, un grafo tendría una densidad del 100% cuando todos los nodos están conectados entre sí. Matemáticamente la densidad se calcula a través de la proporción de aristas que tiene una red frente al total de aristas posibles, expresado el resultado en un rango [0,1]: cerca de 1 se dice que es un grafo 'denso'; cuanto más cerca de 0 se habla de un grafo 'disperso'. Haz clic en _Ejecutar_: +1. Se abrirá una ventana que nos permite seleccionar si nuestro grafo es dirigido o no dirigido. +2. Selecciona tu opción y haz clic en _Aceptar_. + +Nuevamente, hay diferencia entre la densidad del grafo de coaparición en escena y la del grafo de interacciones lingüísticas por el mismo motivo: hay personajes que comparten escena pero que no intercambian palabra. + +Vamos a saltar ahora al apartado **Community Detection**. En ARS se entiende por ['comunidad'](https://perma.cc/CJ23-HB7M) un grupo de nodos que están densamente interconectados entre sí y que a su vez están poco conectados con los nodos de otra comunidad: + +{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-15.png" alt="Grafo explicativo del concepto 'comunidad' con los nodos coloreados según la comunidad a la que pertenecen" caption="Figura 15.
Ejemplo de grafo con comunidades coloreadas en dos colores distintos" %} + +Las distintas comunidades de un grafo se hallan implementando un 'algoritmo de [modularidad](https://perma.cc/PY99-MBVB)' que Gephi incorpora, que podemos utilizar simplemente haciendo clic en _Ejecutar_. +1. Se abrirá una ventana de **Parámetro de Modularid**. No es necesario que modifiques nada: utiliza la opción de aleatoriedad y de incorporar los pesos de las aristas, y deja la resolución en 1 (modularidad estándar). +2. El algoritmo va a numerar las comunidades a partir del 0, pero si quieres que comience a contar en 1, simplemente cambia la opción **Classes start at: 1** y dale a _Aceptar_. + +Si implementas el algoritmo de modularidad en el grafo de interacciones lingüísticas directas comprobarás que se detectan tres comunidades de nodos. Puedes ver qué comunidad ha sido asignada a cada nodo en la nueva columna del **Laboratorio de datos**. Para visualizar las comunidades en el grafo, ve al panel **Apariencia** de la **Vista general** y cambia el color de los nodos eligiendo la partición **Modularity Class**, haciendo clic en _Aplicar_ con los colores por defecto o modificándolos. Debería quedarte un grafo similar a este: + +{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-16.png" alt="Captura de pantalla de la vista general del espacio de trabajo con la visualización del grafo de interacciones lingüísticas con los nodos coloreados según la comunidad a la que pertenecen: morado, verde o naranja" caption="Figura 16. Grafo de interacciones lingüísticas con los nodos coloreados según la comunidad a la que pertenecen, detectadas gracias al algoritmo de modularidad" %} + +Cuando has desplegado el menú de **Partición** en el color de los nodos habrás visto que han aparecido muchas más opciones de las que teníamos al principio, y es que puedes utilizar los resultados de las medidas que has ido implementando para colorear y dar tamaño a los nodos y aristas.
Por ejemplo, utilizando la opción **Ranking** puedes poner el diámetro de los nodos en función de su centralidad de intermediación y el color graduado en intensidad según su grado. Esto te permitiría a golpe de vista comparar la diferencia entre ambas medidas para cada nodo. ¿Ves cómo las opciones se multiplican? + +### La previsualización: últimos ajustes y exportación de visualizaciones + +Para finalizar con el trabajo en Gephi, vamos a exportar alguna visualización en la pestaña de **Previsualización**. Al entrar, verás un panel grande en gris vacío: es donde aparecerá el grafo una vez introduzcas los parámetros en el panel de configuración de la izquierda. Haz una prueba: entra a la previsualización del espacio de trabajo **Coaparición en escena**, haz clic en _Refrescar_ y mira cómo se ve tu grafo con los parámetros que vienen por defecto. Estarás viendo el mismo grafo de la **Vista general** pero con algunos ajustes de visualización. Ahora modifica estos parámetros y deja el resto como están por defecto: +- Nodos: + - Ancho de borde: 0.0 +- Etiquetas de nodos: + - Mostrar etiqueta: activado + - Fuente: Arial 24 Sin Formato + - Tamaño proporcional: desactivado +- Aristas: + - Grosor: 20 + - Reescalar pesos: activado + - Color: original (es decir, el gradiente que pusimos en la vista general) +- Etiquetas de aristas + - Mostrar etiquetas: activado + - Fuente: Arial 14 Sin Formato + - Color: específico: #000000 + +Haz clic en _Refrescar_ de nuevo y debería aparecerte un grafo similar a este, quizá con otra rotación: + +{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-17.png" alt="Captura de pantalla de la pestaña de previsualización, con la columna de opciones finales de visualización a la izquierda y el grafo resultante a la derecha" caption="Figura 17. 
Visualización final del grafo de coaparición de personajes en escena" %} + +Ahora puedes exportar la visualización haciendo clic en _Exportar SVG/PDF/PNG_ en la parte inferior del panel de la izquierda. Como bien deduces, esos son los tres formatos que permite exportar Gephi. [PNG](https://perma.cc/3CAF-NZTD) es un buen formato de imagen, y podrás insertarlo en un documento de texto, utilizarlo para crear un póster o una presentación de diapositivas. Si seleccionas en el desplegable `Files of type` la opción `Archivos PNG (*.png)` y accedes al menú de **Opciones**, Gephi te permitirá configurar la resolución de la imagen, el margen alrededor del grafo y si quieres fondo transparente o no. + +{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-18.png" alt="Captura de pantalla de las ventanas del menú de exportación de visualizaciones" caption="Figura 18. Menú de exportación de visualizaciones" %} + +Otra buena opción es exportar en [SVG](https://perma.cc/EBJ4-C2KZ), el formato de gráficos vectoriales escalables que se suele utilizar en diseño gráfico, ya que son manipulables por ejemplo con [CSS](https://perma.cc/6M8D-Q4MS) y [JavaScript](https://perma.cc/2M3K-JRT8). Si quieres utilizar tus visualizaciones en un sitio web, puede que este formato sea el que más te convenga. Además, este formato lo puedes abrir y editar con programas de código abierto como [Inkscape](https://inkscape.org/es/) o [LibreOffice Draw](https://documentation.libreoffice.org/assets/Uploads/Documentation/es/DG76/PDF/DG76-Guia-de-Draw.pdf) o privativos como [Adobe Illustrator](https://www.adobe.com/es/products/illustrator.html). + +Si repites lo mismo con el grafo de interacción lingüística directa ahora podrás seleccionar si quieres aristas curvas (que marcan la dirección en el sentido de las agujas de un reloj) o rectas con flechas.
Por ejemplo, reutiliza los parámetros anteriores y modifica estos: +- Aristas: + - Curvas: desactivado +- Flechas de aristas: + - Tamaño: 3.0 +- Etiquetas de aristas: + - Mostrar etiquetas: desactivado + +Haz clic en _Refrescar_ y verás algo así (con los nodos coloreados según su comunidad porque antes aplicamos este cambio en la vista general): + +{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-19.png" alt="Captura de pantalla de la pestaña de previsualización, con la columna de opciones finales de visualización a la izquierda y el grafo resultante a la derecha" caption="Figura 19. Visualización final del grafo de interacciones lingüísticas entre personajes" %} + +## Paso 4. Interpretación de los resultados + +Hemos generado visualizaciones y aplicado medidas a los grafos construidos gracias a los datos que primero extrajimos de *Las bizarrías de Belisa*. Las visualizaciones ya nos pueden ayudar en el análisis de una obra, por ejemplo, ilustrando un análisis de los personajes más 'tradicional'. Pero si has llegado hasta aquí seguramente lo que te interesa es tener en consideración los datos obtenidos de la aplicación de medidas, métricas y algoritmos. + +Primero creo que es necesario incidir en que los datos obtenidos de un análisis de redes sociales como el que hemos llevado a cabo deben analizarse cuidadosamente y no utilizarse para confirmar hipótesis sin una valoración crítica. En realidad, todo el proceso que has llevado a cabo, desde la elección del corpus hasta la creación de visualizaciones, debe considerarse parte del proceso crítico de investigación. Piensa, por ejemplo, en la tediosa extracción de datos y todas las decisiones interpretativas que has tomado. ¡Cualquier otra decisión variaría los resultados! Por eso debes insistir en ser consistente con el procedimiento y criterios de análisis que elijas, y comunicarlos con detalle para contextualizar tus resultados.
 + +Vamos entonces a explorar los datos y grafos obtenidos de nuestro análisis de redes sociales de *Las bizarrías de Belisa*. Mi primera recomendación es que, después de aplicar las medidas y algoritmos que te interesen, vayas al **Laboratorio de datos** y hagas clic en _Exportar tabla_ para exportar la tabla de nodos pero ahora con las nuevas columnas agregadas con más datos sobre los personajes. Gracias a este CSV podrás procesar los resultados cómodamente con lenguajes de programación como [R](https://perma.cc/7ESJ-S5K4) (enfocado al análisis estadístico) o [Python](https://perma.cc/BT4G-U7FE), o incluso con el mismo programa de hojas de cálculo que utilizaste para recoger tus datos. + +Hagamos esto último. Abre un nuevo archivo de hojas de cálculo e importa la tabla de nodos CSV del grafo de interacción lingüística que acabas de exportar de Gephi. Puedes llamar a este nuevo archivo `analisis-datos_Bizarrias`. ¿Qué podemos hacer ahora? Primero analicemos el grado de los personajes que, recordemos, cuantifica lo conectado que está un nodo con el resto de nodos de la red social. Los nodos además de 'grado' (a secas) también tienen 'grado con peso'. El primero tiene que ver con el número de personajes con los que habla un nodo (en un sentido y otro) y el segundo tiene en cuenta además la cantidad de interacciones. Fijémonos en las diferencias entre una y otra medida, observando estos gráficos generados en la hoja de cálculo mediante las opciones que ofrece Google Sheets: + +{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-20.png" alt="Gráficos de barras verticales con los grados y grados con pesos de los personajes de la comedia analizada, ordenados de mayor a menor grado" caption="Figura 20.
Grados y grados con pesos de los personajes de 'Las bizarrías de Belisa' según sus interacciones lingüísticas directas" %} + +Don Juan ha resultado ser el personaje que más interactúa, logrando el grado más alto de toda la red social (15) y superando a Belisa por un punto, la indiscutible protagonista femenina que incluso da nombre a la comedia. ¿Por qué? Si vamos a nuestro grafo podremos ver cómo Don Juan interactúa con Octavio y Julio, mientras que Belisa, aunque se enfrenta a ellos vestida de hombre y con espada, no cruza palabra durante dicho enfrentamiento. Sin embargo, si vemos los datos del grafo de coaparición en escena, son Belisa y su criada Finea quienes logran el grado más alto, convirtiéndose en los dos únicos personajes de la comedia que comparten escena al menos una vez con todos los demás personajes (por eso su grado es 10). Pero recordemos, compartir escena no significa necesariamente compartir diálogo, como nos demuestra el grafo dirigido. ¿Y en cuanto al grado con peso? Si volvemos al gráfico de barras, ahora sí Belisa logra la primera posición, y supera con creces a Don Juan. Su grado con peso es 318, es decir, se dirige 157 veces a otros personajes y es receptora de 161 intervenciones. Como vemos, en función de qué nos interese estudiar de un texto teatral, puede interesarnos más un criterio de análisis u otro. + +Veamos por último un dato global de los grafos: su densidad. El grafo de coaparición en escena (no dirigido) tiene una densidad de 0,764, mientras que el de interacción lingüística alcanza tan solo 0,464. ¿Qué nos aporta esta información? *Las bizarrías de Belisa* se trata de una comedia bastante densa en cuanto a la coaparición de personajes en escena (cuanto más cerca de 1, mayor densidad). Son pocos personajes, tan solo diez, y la configuración de la acción genera que compartan muchas escenas. 
Lope escribió una comedia urbana del gusto de la época, alejado ya de sus primeras incursiones al género en las que el reparto superaba los 20 personajes y las acciones estaban más dispersas. Sin embargo, la densidad del grafo dirigido no llega al medio punto, lo que nos demuestra que aunque los personajes coinciden en escena, no significa que necesariamente dialoguen. La diferencia entre la densidad de los dos tipos de grafo en esta comedia podemos explicarla principalmente por la situación particular de Octavio, galán rival de don Juan (por ser pretendiente también de Lucinda, la segunda dama). Aunque sabemos que Octavio visita a Lucinda (le vemos salir de su casa), esta pareja nunca interactúa en el escenario. Es una situación quizá algo atípica pero que entendemos por el desdoblamiento de galanes rivales: don Juan y el Conde pretenden a Belisa, y don Juan y Octavio pretenden a Lucinda. Dado que la acción amorosa principal es la de Belisa, Lope no dedica demasiados versos al desarrollo de la relación entre Octavio y Lucinda. + +No podemos explorar todos los resultados del análisis practicado sobre *Las bizarrías de Belisa*, así que sirva lo dicho para comprender el tipo de conclusiones a las que nos llevan los datos y grafos generados. Por último, apuntar las posibilidades del análisis comparado de redes sociales, es decir, a partir de un corpus de dos o más obras. Por ejemplo, este es un gráfico en el que se compara el grado con pesos normalizado (sobre 1) de los primeros galanes y primeras damas de ocho comedias urbanas de Lope de Vega (en orden cronológico), entre las que se incluye la que hemos utilizado en esta lección: + +{% include figure.html filename="es-or-analisis-redes-sociales-teatro-2-21.png" alt="Diagrama de dispersión de puntos con líneas de tendencia comparando el grado con pesos normalizado de los primeros galanes y primeras damas de ocho comedias urbanas de Lope de Vega" caption="Figura 21. 
Gráfico comparativo del grado con pesos normalizado de los primeros galanes y primeras damas de ocho comedias urbanas de Lope de Vega (elaboración propia, Merino Recalde (2022))" %} + +## Recapitulación final + +Terminemos esta lección anotando las cuestiones elementales que deberás tener en cuenta cuando realices un análisis de redes sociales de textos teatrales: +1. Divide el proceso en cuatro partes diferenciadas: + a. Creación del corpus + b. Extracción y estructuración de datos + c. Visualizaciones y análisis + d. Interpretación de los resultados (datos y grafos) +2. Documenta el proceso y la toma de decisiones. Sé consistente en ello. Procura basarte siempre en criterios preestablecidos, ya sean provenientes de otras investigaciones que trabajen con el mismo tipo de obras o diseñados por ti en función de tus objetivos y del corpus de análisis. +3. Procura guardar tus datos finales en [formatos abiertos](https://perma.cc/M2XM-DYUZ) que garanticen el acceso a los datos a largo plazo, como el CSV (`.csv`). Si únicamente guardas tus datos en formato Excel (`.xlsx`) o en la extensión del propio Gephi (`.gephi`) puede que tu archivo termine corrompiéndose o fallando. Un CSV tiene una vida más larga, es más fácil de preservar y rápidamente puedes importarlo, transformarlo y volver sobre tus datos para reconstruir tus grafos y análisis. +4. Cuando generes visualizaciones anota los parámetros que utilizaste (tamaño de los nodos, colores, algoritmo de distribución, etc.). Es importante que acompañes tus resultados de esta información, pues ayuda a entender y contextualizar las representaciones. + +Y sobre todo, no tengas miedo de probar y explorar todas las posibilidades que nos ofrece el análisis de redes para estudiar la literatura teatral. + +## Notas + +[^1]: Existen otros programas y herramientas de análisis de redes que podemos mencionar.
Por ejemplo, [Cytoscape](https://cytoscape.org/) es otro programa de código abierto y libre descarga, muy utilizado en bioinformática. También hay aplicaciones web: [Palladio](https://hdlab.stanford.edu/palladio/), desarrollada por el Humanities+Design Research Lab de la Stanford University y pensada para la investigación histórica; o [ONODO](https://onodo.org/), una aplicación muy sencilla que permite crear redes e implementar medidas fácilmente. +[^2]: Esta lección se ha preparado con la versión 0.9.7 de Gephi. En 2022, y tras cinco años sin actualizaciones, se han publicado 5 versiones nuevas corrigiendo errores (bug fixes) y añadiendo mejoras. Por ejemplo, desde la versión 0.9.3 ya no es necesario instalar Java para que Gephi funcione en Windows y Linux, lo que causaba numerosos problemas en Windows. Durante las revisiones de esta lección se han publicado las versiones 0.10 y 0.10.1, pero sus actualizaciones no impiden el correcto seguimiento de esta lección. Puedes leer más acerca de las actualizaciones de Gephi en [https://gephi.wordpress.com/2022/05/11/transition-to-semantic-versioning/](https://perma.cc/XPF2-ZKJY) y en [https://github.com/gephi/gephi/releases](https://perma.cc/NQL4-77P2). +[^3]: Por ejemplo, este estupendo videotutorial en 5 partes de Salvador Sánchez, disponible en YouTube: [https://www.youtube.com/playlist?list=PLIvIcfwy1T6IDiW3K10TplK3rvdwMLOb2](https://www.youtube.com/playlist?list=PLIvIcfwy1T6IDiW3K10TplK3rvdwMLOb2). O la *introducción rápida a Gephi* de José Manuel Galán, también en YouTube: [https://www.youtube.com/watch?v=sX5XYec4tWo](https://www.youtube.com/watch?v=sX5XYec4tWo). +[^4]: Si te interesa conocer más sobre cómo funciona `ForceAtlas 2` y sabes inglés, te recomiendo este artículo de sus desarrolladores: Jacomy, Mathieu, Tommaso Venturini, Sebastien Heymann, y Mathieu Bastian. «ForceAtlas2, a Continuous Graph Layout Algorithm for Handy Network Visualization Designed for the Gephi Software».
PLoS ONE 9, n.º 6 (2014): e98679. [https://doi.org/10.1371/journal.pone.0098679](https://doi.org/10.1371/journal.pone.0098679). +[^5]: 'Importancia' es un concepto algo complejo. Debemos diferenciar la importancia de los nodos según su centralidad (una importancia cuantitativa derivada del ARS) y la importancia que le otorgamos a los personajes (una importancia cualitativa, por ejemplo: protagonista, secundario, terciario, etc.). La correlación entre estos dos tipos de importancia no siempre se da, como demuestran Santa María Fernández et al. en un estudio de 2020. Te recomiendo este artículo para explorar en profundidad las implicaciones de las medidas de centralidad: Santa María Fernández, Teresa, José Calvo Tello, y Concepción María Jiménez Fernández. «¿Existe correlación entre importancia y centralidad? Evaluación de personajes con redes sociales en obras teatrales de la Edad de Plata». Digital Scholarship in the Humanities 36, n.º June (2020): i81-i88. [https://doi.org/10.1093/llc/fqaa015](https://doi.org/10.1093/llc/fqaa015). diff --git a/es/lecciones/analisis-temporal-red.md b/es/lecciones/analisis-temporal-red.md index 67c5769915..e6f88b29d6 100644 --- a/es/lecciones/analisis-temporal-red.md +++ b/es/lecciones/analisis-temporal-red.md @@ -388,7 +388,7 @@ Vamos a dar un paso atrás y reflexionar sobre lo que hemos aprendido. En este m Si hay algo que espero que hayas aprendido con este tutorial es la idea de que agregar datos temporales a los nodos y a los vínculos transforma una herramienta general de las ciencias sociales en un método útil para la argumentación histórica. La comparación de estructuras de red y las métricas para comparar intervalos de tiempo les da significación histórica que puede ser difícil o imposible de discernir en los análisis de redes sociales estáticos tradicionales. -Este tutorial ha presentado solo algunas de las muchas herramientas y técnicas que se pueden usar para el análisis de redes temporal. 
Un área especialmente interesante de este campo es la simulación dinámica que modela la transmisión de algo como, por ejemplo, una enfermedad o una idea entre individuos dentro de una red temporal. Si eso te suena interesante, echa un vistazo al paquete [EpiModel](http://www.epimodel.org) (en inglés) u otras herramientas creadas por los epidemiólogos para modelar la difusión dentro de redes dinámicas. +Este tutorial ha presentado solo algunas de las muchas herramientas y técnicas que se pueden usar para el análisis de redes temporal. Un área especialmente interesante de este campo es la simulación dinámica que modela la transmisión de algo como, por ejemplo, una enfermedad o una idea entre individuos dentro de una red temporal. Si eso te suena interesante, echa un vistazo al paquete [EpiModel](https://www.epimodel.org) (en inglés) u otras herramientas creadas por los epidemiólogos para modelar la difusión dentro de redes dinámicas. Dependiendo de los datos históricos con los que estés trabajando, el análisis de redes temporal te puede ofrecer ideas importantes sobre cómo las propiedades de los nodos, sus vínculos y la red en su conjunto cambian a lo largo del tiempo. Tanto si decides o no dar el salto al análisis de redes temporal, es útil recordar que las redes de todo tipo son fenómenos históricos que emergen, se desarrollan, se transforman más allá de su reconocimiento y desaparecen con el transcurso del tiempo. @@ -399,7 +399,7 @@ Si has hecho este tutorial pero todavía te sientes más cómodo/a usando una in * [Convertir una red con fechas en una red dinámica](https://seinecle.github.io/gephi-tutorials/generated-html/converting-a-network-with-dates-into-dynamic.html) (en inglés) de Clément Levallois. 
* Ken Cherven hace un buen recorrido por el Análisis de Redes Dinámico con Gephi en su libro *Mastering Gephi Network Visualization* (2015) -Si tienes más ganas de realizar análisis de redes temporal con R, [este tutorial](https://web.archive.org/web/20180423112846/http://statnet.csde.washington.edu/workshops/SUNBELT/current/ndtv/ndtv_workshop.html) (en inglés) de Skye Bender-deMoll explica funciones adicionales y propiedades de los paquetes que hemos usado. Me sirvió como guía para aprender sobre el análisis de redes temporal, inspirándome a escribir este tutorial. +Si tienes más ganas de realizar análisis de redes temporal con R, [este tutorial](https://web.archive.org/web/20180423112846/https://statnet.csde.washington.edu/workshops/SUNBELT/current/ndtv/ndtv_workshop.html) (en inglés) de Skye Bender-deMoll explica funciones adicionales y propiedades de los paquetes que hemos usado. Me sirvió como guía para aprender sobre el análisis de redes temporal, inspirándome a escribir este tutorial. También puedes adentrarte en la documentación de los paquetes [networkDynamic](https://cran.r-project.org/web/packages/networkDynamic/index.html), [TSNA](https://cran.r-project.org/web/packages/tsna/index.html) y [NDTV](https://cran.r-project.org/web/packages/networkDynamic/index.html). diff --git a/es/lecciones/analisis-voyant-tools.md b/es/lecciones/analisis-voyant-tools.md index b030675b31..b2fe7f6f34 100644 --- a/es/lecciones/analisis-voyant-tools.md +++ b/es/lecciones/analisis-voyant-tools.md @@ -30,9 +30,9 @@ En este tutorial se aprenderá cómo organizar un conjunto de textos para la inv ## Análisis de corpus -El análisis de corpus es un tipo de [análisis de contenido](http://vocabularios.caicyt.gov.ar/portalthes/42/term/26) que permite hacer comparaciones a gran escala sobre un conjunto de textos o corpus. 
+El análisis de corpus es un tipo de [análisis de contenido](https://vocabularios.caicyt.gov.ar/portalthes/42/term/26) que permite hacer comparaciones a gran escala sobre un conjunto de textos o corpus. -Desde el inicio de la informática, tanto lingüistas computacionales como especialistas de la [recuperación de la información](http://vocabularios.caicyt.gov.ar/portalthes/42/term/178) han creado y utilizado software para apreciar patrones que no son evidentes en una lectura tradicional o bien para corroborar hipótesis que intuían al leer ciertos textos pero que requerían de trabajos laboriosos, costosos y mecánicos. Por ejemplo, para obtener los patrones de uso y decaimiento de ciertos términos en una época dada era necesario contratar a personas que revisaran manualmente un texto y anotaran cuántas veces aparecía el término buscado. Muy pronto, al observar las capacidades de "contar" que tenían las computadoras, estos especialistas no tardaron en escribir programas que facilitaran la tarea de crear listas de frecuencias o tablas de concordancia (es decir, tablas con los contextos izquierdos y derechos de un término). El programa que aprenderás a usar en este tutorial, se inscribe en este contexto. +Desde el inicio de la informática, tanto lingüistas computacionales como especialistas de la [recuperación de la información](https://vocabularios.caicyt.gov.ar/portalthes/42/term/178) han creado y utilizado software para apreciar patrones que no son evidentes en una lectura tradicional o bien para corroborar hipótesis que intuían al leer ciertos textos pero que requerían de trabajos laboriosos, costosos y mecánicos. Por ejemplo, para obtener los patrones de uso y decaimiento de ciertos términos en una época dada era necesario contratar a personas que revisaran manualmente un texto y anotaran cuántas veces aparecía el término buscado. 
Muy pronto, al observar las capacidades de "contar" que tenían las computadoras, estos especialistas no tardaron en escribir programas que facilitaran la tarea de crear listas de frecuencias o tablas de concordancia (es decir, tablas con los contextos izquierdos y derechos de un término). El programa que aprenderás a usar en este tutorial, se inscribe en este contexto. ## Qué aprenderás en este tutorial @@ -63,7 +63,7 @@ Lo primero que debes hacer es buscar la información que te interesa. Para este ### 2. Copiar en editor de texto plano Una vez localizada la información, el segundo paso es copiar el texto que te interesa desde la primera palabra dicha hasta la última y guardarla en un editor de texto sin formato. Por ejemplo: -* en Windows podría guardarse en [Bloc de Notas](https://web.archive.org/web/20091013225307/http://windows.microsoft.com/en-us/windows-vista/Notepad-frequently-asked-questions) +* en Windows podría guardarse en [Bloc de Notas](https://web.archive.org/web/20091013225307/https://windows.microsoft.com/en-us/windows-vista/Notepad-frequently-asked-questions) * en Mac, en [TextEdit](https://support.apple.com/es-mx/guide/textedit/welcome/mac); * y en Linux, en [Gedit](https://wiki.gnome.org/Apps/Gedit). @@ -156,8 +156,8 @@ Este corpus tiene _ documentos con un total de palabras de _ y _ palabras única #### Extensión de documentos Lo segundo que vemos es la sección de "extensión del documento". 
Ahí aparece lo siguiente: -- Más largo: [2008_cl_bachelet](https://voyant-tools.org/?corpus=b6f0e2c5ee1bc9b644ffda6b86a93740&panels=cirrus,reader,trends,summary,contexts#) (20702); [2007_ar_kircher](https://voyant-tools.org/?corpus=b6f0e2c5ee1bc9b644ffda6b86a93740&panels=cirrus,reader,trends,summary,contexts#) (20390); [2006_ar_kircher](https://voyant-tools.org/?corpus=b6f0e2c5ee1bc9b644ffda6b86a93740&panels=cirrus,reader,trends,summary,contexts#) (18619); [2010_cl_pinera](https://voyant-tools.org/?corpus=b6f0e2c5ee1bc9b644ffda6b86a93740&panels=cirrus,reader,trends,summary,contexts#) (16982); [2007_cl_bachelet](https://voyant-tools.org/?corpus=b6f0e2c5ee1bc9b644ffda6b86a93740&panels=cirrus,reader,trends,summary,contexts#) (15514) -- Más corto: [2006_pe_toledo](https://voyant-tools.org/?corpus=b6f0e2c5ee1bc9b644ffda6b86a93740&panels=cirrus,reader,trends,summary,contexts#) (1289); [2006_mx_fox](https://voyant-tools.org/?corpus=b6f0e2c5ee1bc9b644ffda6b86a93740&panels=cirrus,reader,trends,summary,contexts#) (2450); [2008_mx_calderon](https://voyant-tools.org/?corpus=b6f0e2c5ee1bc9b644ffda6b86a93740&panels=cirrus,reader,trends,summary,contexts#) (3317); [2006_co_uribe](https://voyant-tools.org/?corpus=b6f0e2c5ee1bc9b644ffda6b86a93740&panels=cirrus,reader,trends,summary,contexts#) (4709); [2009_co_uribe](https://voyant-tools.org/?corpus=b6f0e2c5ee1bc9b644ffda6b86a93740&panels=cirrus,reader,trends,summary,contexts#) (5807) +- Más largo: `2008_cl_bachelet (20702)`; `2007_ar_kircher (20390)`; `2006_ar_kircher (18619)`; `2010_cl_pinera (16982)`; `2007_cl_bachelet (15514)` +- Más corto: `2006_pe_toledo (1289)`; `2006_mx_fox (2450)`; `2008_mx_calderon (3317)`; `2006_co_uribe (4709)`; `2009_co_uribe (5807)` ##### *Actividad 2* 1. ¿Qué podemos concluir sobre los textos más largos y los más cortos considerando los metadatos en el nombre del archivo (año, país, presidente)? 
@@ -186,10 +186,8 @@ La densidad de vocubulario se mide dividiendo el número de palabras únicas ent **2)** Lee los datos de densidad léxica de los documentos de nuestro corpus, ¿qué te dicen? - -- Más alto: [2006_pe_toledo](https://voyant-tools.org/?corpus=b6f0e2c5ee1bc9b644ffda6b86a93740&panels=cirrus,reader,trends,summary,contexts#) (0.404); [2006_co_uribe](https://voyant-tools.org/?corpus=b6f0e2c5ee1bc9b644ffda6b86a93740&panels=cirrus,reader,trends,summary,contexts#) (0.340); [2009_co_uribe](https://voyant-tools.org/?corpus=b6f0e2c5ee1bc9b644ffda6b86a93740&panels=cirrus,reader,trends,summary,contexts#) (0.336); [2008_co_uribe](https://voyant-tools.org/?corpus=b6f0e2c5ee1bc9b644ffda6b86a93740&panels=cirrus,reader,trends,summary,contexts#) (0.334); [2006_mx_fox](https://voyant-tools.org/?corpus=b6f0e2c5ee1bc9b644ffda6b86a93740&panels=cirrus,reader,trends,summary,contexts#) (0.328) -- Más bajo: [2008_cl_bachelet](https://voyant-tools.org/?corpus=b6f0e2c5ee1bc9b644ffda6b86a93740&panels=cirrus,reader,trends,summary,contexts#) (0.192); [2007_mx_calderon](https://voyant-tools.org/?corpus=b6f0e2c5ee1bc9b644ffda6b86a93740&panels=cirrus,reader,trends,summary,contexts#) (0.192); [2007_ar_kircher](https://voyant-tools.org/?corpus=b6f0e2c5ee1bc9b644ffda6b86a93740&panels=cirrus,reader,trends,summary,contexts#) (0.206); [2007_pe_garcia](https://voyant-tools.org/?corpus=b6f0e2c5ee1bc9b644ffda6b86a93740&panels=cirrus,reader,trends,summary,contexts#) (0.214); [2010_ar_fernandez](https://voyant-tools.org/?corpus=b6f0e2c5ee1bc9b644ffda6b86a93740&panels=cirrus,reader,trends,summary,contexts#) (0.217) - +- Más alto: `2006_pe_toledo (0.404)`; `2006_co_uribe (0.340)`; `2009_co_uribe (0.336)`; `2008_co_uribe (0.334)`; `2006_mx_fox (0.328)` +- Más bajo: `2008_cl_bachelet (0.192)`; `2007_mx_calderon (0.192)`; `2007_ar_kircher (0.206)`; `2007_pe_garcia (0.214)`; `2010_ar_fernandez (0.217)` **3)** Compáralos con la información sobre su extensión, ¿qué notas? 
@@ -248,7 +246,7 @@ Voyant tiene ya cargada una lista de _stop words_ o palabras vacías del españo #### Frecuencias con palabras vacías filtradas Volvamos entonces a esta sección del sumario. Como dijimos en el iniciso anterior las palabras filtradas afectan otros campos de Voyant. En este caso, si dejaste seleccionada la caja de "Aplicar a todo", en la lista que aparece debajo de la leyenda: **Palabra más frecuente en el corpus** , se mostrarán las palabras que se repiten más **sin contar** aquéllas que fueron filtradas. En mi caso, muestra: ->[social](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (437); [nacional](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (427); [nuestro](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (393); [inversión](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (376); [ley](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (369) +>`social (437)`; `nacional (427)`; `nuestro (393)`; `inversión (376)`; `ley (369)` ##### *Actividad 7* @@ -307,26 +305,26 @@ Frecuencia Bruta (tf) / Número de Palabras (N) * log10 ( Número de Documento Observa las **palabras diferenciadas (comparado con el resto del corpus)** de cada uno de los documentos y anota qué hipótesis puedes derivar de ellas -1. [2006_ar_kircher](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#): [uruguay](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (12), [2004](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (13), [2005](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (31), [plata](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (7), [inclusión](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (16). -2. 
[2006_cl_bachelet](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#): [innovación](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (15), [rodrigo](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (8), [alegremente](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (4), [barrios](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (9), [cobre](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (10). -3. [2006_co_uribe](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#): [tutela](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (5), [reelección](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (6), [regalías](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (7), [iva](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (6), [publicación](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (5). -4. [2006_mx_fox](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#): [atenta](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (5), [apego](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (5), [federalismo](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (3), [intransigencia](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (2), [fundamento](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (3). -5. 
[2006_pe_toledo](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#): [entrego](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (5), [señor](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (14), [señora](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (5), [amigo](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (5), [tracemos](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (2). -6. [2007_ar_kircher](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#): [2006](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (65), [mercosur](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (12), [uruguay](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (9), [provincias](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (16), [interanual](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (5). -7. [2007_cl_bachelet](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#): [macrozona](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (7), [deudores](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (12), [cuna](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (9), [subvención](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (10), [pesimismo](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (4). -8. 
[2007_co_uribe](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#): [guerrilla](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (10), [sindicalistas](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (7), [paramilitares](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (8), [inversionista](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (10), [despeje](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (7). -9. [2007_mx_calderon](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#): [igualar](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (9), [transformar](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (19), [tortilla](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (4), [acuíferos](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (4), [miseria](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (10). -10. [2007_pe_garcia](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#): [huancavelica](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (9), [redistribución](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (10), [callao](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (8), [407](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (4), [lima](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (7). -11. 
[2008_ar_fernandez](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#): [abordar](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (17), [capítulo](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (12), [presupone](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (5), [lesa](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (8), [articular](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (5). -12. [2008_cl_bachelet](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#): [desafío](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (18), [mirada](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (10), [aprobamos](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (6), [adulto](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (6), [diez](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (11). -13. [2008_co_uribe](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#): [ecopetrol](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (6), [revaluación](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (4), [juegos](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (4), [desatrasar](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (3), [billones](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (6). -14. 
[2008_mx_calderon](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#): [cártel](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (5), [noches](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (3), [mexicanas](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (6), [controlaba](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (3), [federales](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (6). -15. [2008_pe_garcia](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#): [poblados](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (11), [kilómetros](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (52), [lima](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (11), [carreteras](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (21), [mineros](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (4). -16. [2009_ar_fernandez](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#): [sosteniendo](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (7), [dirigencia](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (5), [coparticipación](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (6), [catamarca](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (7), [pbi](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (9). -17. 
[2009_cl_bachelet](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#): [sello](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (5), [fortalecidos](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (5), [crisis](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (48), [gente](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (24), [aplauso](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (4). -18. [2009_co_uribe](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#): [colombia](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (20), [calzada](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (6), [contributivo](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (5), [desplazados](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (6), [notificado](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (3). -19. [2009_mx_calderon](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#): [federal](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (27), [organizado](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (10), [cambiar](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (13), [propongo](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (8), [policiacos](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (4). -20. 
[2009_pe_garcia](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#): [lima](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (11), [1,500](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (6), [tingo](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (4), [pampas](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (4), [desorden](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (6). +1. 2006_ar_kircher: `uruguay (12)`, `2004 (13)`, `2005 (31)`, `plata (7)`, `inclusión (16)`. +2. 2006_cl_bachelet: `innovación (15)`, `rodrigo (8)`, `alegremente (4)`, `barrios (9)`, `cobre (10)`. +3. 2006_co_uribe: `tutela (5)`, `reelección (6)`, `regalías (7)`, `iva (6)`, `publicación (5)`. +4. 2006_mx_fox: `atenta (5)`, `apego (5)`, `federalismo (3)`, `intransigencia (2)`, `fundamento (3)`. +5. 2006_pe_toledo: `entrego (5)`, `señor (14)`, `señora (5)`, `amigo (5)`, `tracemos (2)`. +6. 2007_ar_kircher: `2006 (65)`, `mercosur (12)`, `uruguay (9)`, `provincias (16)`, `interanual (5)`. +7. 2007_cl_bachelet: `macrozona (7)`, `deudores (12)`, `cuna (9)`, `subvención (10)`, `pesimismo (4)`. +8. 2007_co_uribe: `guerrilla (10)`, `sindicalistas (7)`, `paramilitares (8)`, `inversionista (10)`, `despeje (7)`. +9. 2007_mx_calderon: `igualar (9)`, `transformar (19)`, `tortilla (4)`, `acuíferos (4)`, `miseria (10)`. +10. 2007_pe_garcia: `huancavelica (9)`, `redistribución (10)`, `callao (8)`, `407 (4)`, `lima (7)`. +11. 2008_ar_fernandez: `abordar (17)`, `capítulo (12)`, `presupone (5)`, `lesa (8)`, `articular (5)`. +12. 2008_cl_bachelet: `desafío (18)`, `mirada (10)`, `aprobamos (6)`, `adulto (6)`, `diez (11)`. +13. 2008_co_uribe: `ecopetrol (6)`, `revaluación (4)`, `juegos (4)`, `desatrasar (3)`, `billones (6)`. +14. 2008_mx_calderon: `cártel (5)`, `noches (3)`, `mexicanas (6)`, `controlaba (3)`, `federales (6)`. +15. 
2008_pe_garcia: `poblados (11)`, `kilómetros (52)`, `lima (11)`, `carreteras (21)`, `mineros (4)`. +16. 2009_ar_fernandez: `sosteniendo (7)`, `dirigencia (5)`, `coparticipación (6)`, `catamarca (7)`, `pbi (9)`. +17. 2009_cl_bachelet: `sello (5)`, `fortalecidos (5)`, `crisis (48)`, `gente (24)`, `aplauso (4)`. +18. 2009_co_uribe: `colombia (20)`, `calzada (6)`, `contributivo (5)`, `desplazados (6)`, `notificado (3)`. +19. 2009_mx_calderon: `federal (27)`, `organizado (10)`, `cambiar (13)`, `propongo (8)`, `policiacos (4)`. +20. 2009_pe_garcia: `lima (11)`, `1,500 (6)`, `tingo (4)`, `pampas (4)`, `desorden (6)`. ### Palabras en contexto @@ -384,7 +382,7 @@ Este corpus tiene 2 documentos con un total de palabras de 4 y 3 palabras única ### Actividad 3 -**1)** La primera estrofa tiene 23 palabras y 20 son palabras únicas, por lo que 20/23 da igual a una densidad de vocabulario de 0.870; en realidad de 0.869 pero Voyant Tools redondea estos números: https://voyant-tools.org/?corpus=b6b17408eb605cb1477756ce412de78e. La segunda estrofa tiene 24 palabras y 20 son palabras únicas, por lo que 20/24 da igual a una densidad de vocabulario de 0.833: https://voyant-tools.org/?corpus=366630ce91f54ed3577a0873d601d714. +**1)** La primera estrofa tiene 23 palabras y 20 son palabras únicas, por lo que 20/23 da igual a una densidad de vocabulario de 0.870; en realidad de 0.869 pero Voyant Tools redondea estos números. La segunda estrofa tiene 24 palabras y 20 son palabras únicas, por lo que 20/24 da igual a una densidad de vocabulario de 0.833. Como podemos observar, entre un verso de Sor Juana Inés de la Cruz y otro compuesto por Érika Ender, Daddy Yankee y Luis Fonsi hay una diferencia de densidad de 0.037, que no es muy alta. Debemos tener cuidado al interpretar estos números pues sólo son un indicador cuantitativo de la riqueza del vocabulario y no incluye parámetros como la complejidad de la rima o de los términos. 
@@ -396,7 +394,7 @@ Parece haber una correspondencia entre los discursos más cortos y los más dens Estos resultados parecen indicar que la presidenta Kirchner, además de tener los discursos más largos es la que hace frases más largas; sin embargo tenemos que tener cuidado con las conclusiones de este tipo pues se trata de discursos orales en los que la puntuación depende de quien transcribe el texto. ### Actividad 5 -1. [a](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (5943); [más](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (1946); [no](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (1694); [mil](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (1045); [millones](https://voyant-tools.org/?corpus=77227f21c006f5ef083d820d77667627#) (971) +1. `a (5943)`; `más (1946)`; `no (1694)`; `mil (1045)`; `millones (971)` 2. La primera palabra es una preposición, la segunda un adverbio de comparación y la tercera un adverbio de negación. Estas palabras podrían ser significativas si lo que se busca comprender es el uso de este tipo de palabras funcionales. Sin embargo, si lo que se busca son más bien sustantivos, habrá que hacer un filtrado (ver sección: "Palabras más frecuentes") @@ -407,9 +405,9 @@ Hockey, Susan. 2004 “The History of Humanities Computing”. _A Companion to D Peña, Gilberto Anguiano, y Catalina Naumis Peña. 2015. «Extracción de candidatos a términos de un corpus de la lengua general». _Investigación Bibliotecológica: Archivonomía, Bibliotecología e Información_ 29 (67): 19-45. [https://doi.org/10.1016/j.ibbai.2016.02.035](https://www.sciencedirect.com/science/article/pii/S0187358X16000368). -Sinclair, Stéfan and Geoffrey Rockwell, 2016. _Voyant Tools_. Web. [http://voyant-tools.org/](http://voyant-tools.org/). +Sinclair, Stéfan and Geoffrey Rockwell, 2016. _Voyant Tools_. Web. [https://voyant-tools.org/](https://voyant-tools.org/). -Terras, Melissa, 2013. 
"For Ada Lovelace Day – Father Busa’s Female Punch Card Operatives". _Melissa Terras' Blog_. Web. [http://melissaterras.blogspot.com/2013/10/for-ada-lovelace-day-father-busas.html](http://melissaterras.blogspot.com/2013/10/for-ada-lovelace-day-father-busas.html). +Terras, Melissa, 2013. "For Ada Lovelace Day – Father Busa’s Female Punch Card Operatives". _Melissa Terras' Blog_. Web. [https://melissaterras.blogspot.com/2013/10/for-ada-lovelace-day-father-busas.html](https://melissaterras.blogspot.com/2013/10/for-ada-lovelace-day-father-busas.html).
    Este tutorial fue escrito gracias al apoyo de la Academia Británica y preparado durante el Taller de escritura de The Programming Historian en la Universidad de los Andes en Bogotá, Colombia, del 31 de julio al 3 de agosto de 2018. diff --git a/es/lecciones/construir-repositorio-de-fuentes.md b/es/lecciones/construir-repositorio-de-fuentes.md index 6b87cf5e75..da9bbd4039 100644 --- a/es/lecciones/construir-repositorio-de-fuentes.md +++ b/es/lecciones/construir-repositorio-de-fuentes.md @@ -40,7 +40,7 @@ Algunos de los ejercicios planteados en esta lección requieren un repositorio c La abundancia de información documental que tenemos a disposición es cada vez mayor. Las fuentes primarias que se encuentran publicadas por archivos y bibliotecas han facilitado significativamente nuestro trabajo de recolección de información histórica. Sin embargo, esto ha conllevado un problema de abundancia de digitalizaciones y transcripciones de documentos que muchas veces quedan almacenadas anárquicamente en nuestros ordenadores. En esta lección aprovecharemos las capacidades de Omeka para desarrollar repositorios, individuales o colaborativos, para almacenar, analizar y exhibir fuentes primarias; con el propósito de presentar una opción para la sistematización de documentación primaria que posteriormente podrá ser utilizada en tareas de investigación o exhibiciones Web. -Para seguir esta lección requieres una instalación de Omeka. Puedes seguir la lección [*Installing Omeka*](/lessons/installing-omeka) disponible en el sitio en inglés (la versión en español se encuentra en proceso de traducción). En caso de que necesites conocer el funcionamiento básico de la plataforma es importante que entiendas como [crear sitios, elementos, colecciones](/es/lecciones/poniendo-omeka-a-funcionar) y [exhibiciones](/es/lecciones/crear-exposicion-con-omeka). +Para seguir esta lección requieres una instalación de Omeka. 
Puedes seguir la lección [*Installing Omeka*](/en/lessons/installing-omeka) disponible en el sitio en inglés (la versión en español se encuentra en proceso de traducción). En caso de que necesites conocer el funcionamiento básico de la plataforma es importante que entiendas como [crear sitios, elementos, colecciones](/es/lecciones/poniendo-omeka-a-funcionar) y [exhibiciones](/es/lecciones/crear-exposicion-con-omeka). Asimismo es importante que en caso de querer ampliar la información sobre el funcionamiento de Omeka recurras al manual de usuario de la versión clásica. También es posible que consultes o participes en el foro para obtener información adicional o respuesta a un problema específico. @@ -79,8 +79,8 @@ Tras la instalación, la primera acción que debemos realizar será activar los {% include figure.html filename="img_1.1-modact.jpg" caption="Módulos XAMPP activados" %} -Para probar que todo funciona correctamente, ingresa desde tu navegador a la dirección <http://localhost/dashboard/> o <http://127.0.0.1/dashboard>. Si la instalación es correcta te mostrará la pantalla de inicio: - +Para probar que todo funciona correctamente, ingresa desde tu navegador a la dirección `http://localhost/dashboard/` o `http://127.0.0.1/dashboard`. Si la instalación es correcta te mostrará la pantalla de inicio: + {% include figure.html filename="img_1.1-xampp-dashboard.jpg" caption="Pantalla de inicio (dashboard) de XAMPP" %} Deberás tener en el menú de inicio de Windows un menú de XAMPP con tres opciones desplegables. Las más útiles para nuestro trabajo serán "XAMPP Control Panel", que abre el panel de control para activar o desactivar los módulos, y "XAMPP htdocs folder", un enlace al directorio donde se guardarán los archivos de Omeka para realizar la instalación, por lo general es `C:\xampp\htdocs` para Windows. En Linux este directorio se encuentra en la ruta `/opt/lampp/htdocs`. 
@@ -95,7 +95,7 @@ Para la instalación de Omeka es necesario crear una base de datos que albergar Para crear la base de datos es posible utilizar los métodos explicados en el paso 2 de [*Installing Omeka*](/en/lessons/installing-omeka#step-2-install-your-server-and-database). También podemos utilizar *phpMyAdmin* para crear la base de datos de la instalación e incluso para editarla después. -El primer paso consiste en ingresar al entorno de *phpMyAdmin* a través de la dirección XAMPP te dejará ingresar sin contraseña, pero otros servicios (como Bitnami) te exigirán permisos de usuario para ingresar.[^bitnami_ingreso] La página de inicio te mostrará una página con la configuración general del servidor de la base de datos, el servidor web y de la aplicación. Esta pantalla será importante al momento de requerir la versión de MySQL ("Servidor de base de datos >> Versión del servidor"), la versión de PHP ("Servidor web >> Versión de PHP"), o incluso el nombre de usuario del servidor (por lo general "root@localhost"). Esta pantalla es útil no sólo en instalaciones locales, servirá también para comprobar que algún servicio de alojamiento web corresponda con la tecnología necesaria para ejecutar ciertas aplicaciones. +El primer paso consiste en ingresar al entorno de *phpMyAdmin* a través de la dirección `http://localhost/phpmyadmin`. XAMPP te dejará ingresar sin contraseña, pero otros servicios (como Bitnami) te exigirán permisos de usuario para ingresar.[^bitnami_ingreso] La página de inicio te mostrará una página con la configuración general del servidor de la base de datos, el servidor web y de la aplicación. Esta pantalla será importante al momento de requerir la versión de MySQL ("Servidor de base de datos >> Versión del servidor"), la versión de PHP ("Servidor web >> Versión de PHP"), o incluso el nombre de usuario del servidor (por lo general "root@localhost"). 
Esta pantalla es útil no sólo en instalaciones locales, servirá también para comprobar que algún servicio de alojamiento web corresponda con la tecnología necesaria para ejecutar ciertas aplicaciones. En *phpMyAdmin* seleccionaremos la pestaña "Bases de datos" donde veremos un pequeño formulario para crear la base de datos, sólo tenemos que ingresar el *nombre de la base de datos* e indicar el *cotejamiento*. Seleccionaremos el cotejamiento `utf8_spanish_ci` ya que representará una mayor precisión al momento de ordenar los elementos (*items*) en Omeka.[^collate] Esto es particularmente relevante en las instalaciones en Linux que suelen seleccionar de manera predeterminada un cotejamiento `latin1_`. @@ -134,7 +134,7 @@ Es opcional, aunque muy recomendable, que en tanto el repositorio se encuentre e # Un vistazo al "esqueleto" de Omeka -Si vamos a [phpMyAdmin](http://localhost/phpmyadmin) veremos que la base de datos vacía está ahora llena con 19 tablas interdependientes. La estructura de la base de datos (*database schema*) puede describirse de manera sintética agrupando las tablas en cinco grupos de información: datos para los elementos y colecciones, etiquetas, metatados de los tipos de elementos, información de usuarios, texto para búsqueda, y tablas para procesos del sistema. Un mapa resumido de las interdependencias entre las tablas se puede ver en la siguiente imagen: +Si vamos a [phpMyAdmin](https://localhost/phpmyadmin) veremos que la base de datos vacía está ahora llena con 19 tablas interdependientes. La estructura de la base de datos (*database schema*) puede describirse de manera sintética agrupando las tablas en cinco grupos de información: datos para los elementos y colecciones, etiquetas, metatados de los tipos de elementos, información de usuarios, texto para búsqueda, y tablas para procesos del sistema. 
Un mapa resumido de las interdependencias entre las tablas se puede ver en la siguiente imagen: {% include figure.html filename="img_2.1-omeka_mysql_schema.png" caption="Esquema de la interdependencia de la base de datos de Omeka" %} @@ -189,7 +189,7 @@ Para muchos la palabra "Metadatos" suena oscura y "metafísica", algo que está Los metadatos son independientes del lenguaje de máquina o de programación, es decir, son categorías completamente personalizables que funcionan de manera independiente de la plataforma. Esta libertad conlleva una gran desventaja y es que si cada usuario creara sus elementos de manera arbitraria no habría manera de intercambiar información entre sistemas. Por esa razón, se creó una estrategia de estandarización de los conjuntos de metadatos de tal manera que facilite la interacción entre plataformas, la actualización del software y, sobre todo, el compartir y encontrar información en grandes repositorios. -Omeka Classic se fundamenta en el estándar *Dublin Core*, específicamente en el esquema básico de 15 descriptores Dublin Core Metadata Element Set Version 1.1: +Omeka Classic se fundamenta en el estándar *Dublin Core*, específicamente en el esquema básico de 15 descriptores Dublin Core Metadata Element Set Version 1.1: Título (title) Autor (creator) @@ -241,7 +241,7 @@ Si estamos construyendo un sitio personal no es necesario (aunque sería lo idea # Plugins o complementos -Un plugin es un pequeño programa que añade una función específica a otro programa, por ejemplo, un CMS tipo Wordpress o Joomla puede incorporar una casilla de comentarios, pero un plugin puede hacer que esta casilla se conecte con las redes sociales y comentar desde su perfil de Facebook o Twitter. En esta lección sólo veremos cómo añadir plugins a nuestra instalación de Omeka[^omeka.net], si desea profundizar en la manera de desarrollar un complemento lo más recomendable es consultar la documentación disponible en la página de Omeka. 
+Un plugin es un pequeño programa que añade una función específica a otro programa, por ejemplo, un CMS tipo Wordpress o Joomla puede incorporar una casilla de comentarios, pero un plugin puede hacer que esta casilla se conecte con las redes sociales y comentar desde su perfil de Facebook o Twitter. En esta lección sólo veremos cómo añadir plugins a nuestra instalación de Omeka[^omeka.net], si desea profundizar en la manera de desarrollar un complemento lo más recomendable es consultar la documentación disponible en la página de Omeka. Las dos fuentes principales de plugins para Omeka son el repositorio oficial de complementos y Github. Ambos listados son dinámicos, por lo que recomendamos visitar periódicamente estos lugares para conocer novedades y actualizaciones. @@ -278,7 +278,7 @@ Para usar Omeka no es realmente necesario ningún complemento, sin embargo, el c La selección de plugins dependerá en buena medida de los objetivos del repositorio, para nuestro caso se requerirán complementos que permitan: 1. Mostrar los documentos y hacerlos legibles a los usuarios, ya sean imágenes, archivos PDF u otros. Por ejemplo, PDF Embed y Universal Viewer. -2. Gestionar los metadatos de cada documento: procedencia, cobertura, fechas, nombres, etc. Por ejemplo Dublin Core Extended y Hide Elements. +2. Gestionar los metadatos de cada documento: procedencia, cobertura, fechas, nombres, etc. Por ejemplo Dublin Core Extended y Hide Elements. 3. Buscar información: complementos que contribuyan a ampliar las capacidades de las búsquedas de Omeka. Por ejemplo Search by Metadata y PDF Text. 4. Interrelacionar elementos y colecciones. Por ejemplo Item Relations, Collection Tree y Geolocation, Reference. 5. Permitir la transcripción de documentos. 
Scripto es la opción más recomendada, pero también es posible apoyar el proceso de transcripción con el plugin Contribution diff --git a/es/lecciones/contar-frecuencias.md b/es/lecciones/contar-frecuencias.md index c8c802aa82..168409932d 100644 --- a/es/lecciones/contar-frecuencias.md +++ b/es/lecciones/contar-frecuencias.md @@ -20,8 +20,8 @@ translation-reviewer: - Antonio Rojas Castro review-ticket: https://github.com/programminghistorian/ph-submissions/issues/47 layout: lesson -next: crear-y-ver-archivos-html-con-python -previous: normalizar-datos +next: /es/lecciones/crear-y-ver-archivos-html-con-python +previous: /es/lecciones/normalizar-datos original: counting-frequencies difficulty: 2 activity: analyzing @@ -459,8 +459,8 @@ Para seguir a lo largo de las lecciones futuras es importante que tengas los arc - python-es-lecciones5.zip ([zip sync][]) - [lista por comprensión]: http://docs.python.org/tutorial/datastructures.html#list-comprehensions - [informáticos de Glasgow]: http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words - [Regular Expressions]: https://web.archive.org/web/20180416143856/http://www.diveintopython.net/regular_expressions/index.html + [lista por comprensión]: https://docs.python.org/tutorial/datastructures.html#list-comprehensions + [informáticos de Glasgow]: https://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words + [Regular Expressions]: https://web.archive.org/web/20180416143856/https://www.diveintopython.net/regular_expressions/index.html [zip]: /assets/python-es-lecciones4.zip [zip sync]: /assets/python-es-lecciones5.zip diff --git a/es/lecciones/corpus-paralelo-lfaligner.md b/es/lecciones/corpus-paralelo-lfaligner.md index 378ed3e610..338a967258 100644 --- a/es/lecciones/corpus-paralelo-lfaligner.md +++ b/es/lecciones/corpus-paralelo-lfaligner.md @@ -27,7 +27,7 @@ doi: 10.46430/phes0044 Un corpus paralelo o *bitexto* consiste en la recopilación de varias versiones de un texto. 
En este tutorial aprenderás a alinear el texto original con sus traducciones para poder cotejarlos con facilidad. ## Introducción -LF Aligner es un programa gratuito, basado en un [algoritmo de código abierto de alineación de oraciones](https://github.com/danielvarga/hunalign), que pertenece al conjunto de herramientas digitales llamadas ***CATs*** (*Computer Assisted Translation Tools*, por sus siglas en inglés) o herramientas de traducción asistida. Principalmente, se usa para la creación de bitextos que facilitan la búsqueda de términos especializados y sus traducciones. Sitios como [Linguee](https://www.linguee.es/) utilizan este tipo de herramientas para crear enormes corpus paralelos que el usuario puede consultar fácilmente. En ciencias sociales y humanidades podemos aprovechar este programa para crear textos que faciliten las tareas de lectura distante y [análisis estilístico](http://vocabularios.caicyt.gov.ar/portalthes/42/term/134). La aplicación puede importar texto de documentos en múltiples formatos y de memorias de traducción generadas con programas de código libre o privativo. En este tutorial nos centraremos en la importación de texto de fuentes digitales usadas comunmente por los investigadores como páginas web o documentos de texto plano, ya que, además, agilizan el proceso de alineación del corpus. +LF Aligner es un programa gratuito, basado en un [algoritmo de código abierto de alineación de oraciones](https://github.com/danielvarga/hunalign), que pertenece al conjunto de herramientas digitales llamadas ***CATs*** (*Computer Assisted Translation Tools*, por sus siglas en inglés) o herramientas de traducción asistida. Principalmente, se usa para la creación de bitextos que facilitan la búsqueda de términos especializados y sus traducciones. Sitios como [Linguee](https://www.linguee.es/) utilizan este tipo de herramientas para crear enormes corpus paralelos que el usuario puede consultar fácilmente. 
En ciencias sociales y humanidades podemos aprovechar este programa para crear textos que faciliten las tareas de lectura distante y [análisis estilístico](https://vocabularios.caicyt.gov.ar/portalthes/42/term/134). La aplicación puede importar texto de documentos en múltiples formatos y de memorias de traducción generadas con programas de código libre o privativo. En este tutorial nos centraremos en la importación de texto de fuentes digitales usadas comunmente por los investigadores como páginas web o documentos de texto plano, ya que, además, agilizan el proceso de alineación del corpus. Para este tutorial necesitarás los siguientes materiales y conocimientos: --- @@ -36,9 +36,9 @@ Para este tutorial necesitarás los siguientes materiales y conocimientos: * Un texto de partida -digitalizado- y por lo menos una traducción de este. En este caso, alinearemos distintas traducciones de un documento que desde 1948 guía el quehacer y la convivencia humana en todos los ámbitos de la vida pública y privada, la [Declaración Universal de Derechos Humanos](https://es.wikipedia.org/wiki/Declaraci%C3%B3n_Universal_de_los_Derechos_Humanos): en [español](/assets/corpus-paralelo-lfaligner/DDHH_es.txt), [inglés](/assets/corpus-paralelo-lfaligner/DDHH_en.txt), [francés](/assets/corpus-paralelo-lfaligner/DDHH_fr.txt) y [portugués](/assets/corpus-paralelo-lfaligner/DDHH_pt.txt) * Conocimiento básico de las lenguas de traducción, ya que en algunos casos tendremos que modificar algunos de los segmentos alineados. -Adicionalmente, podemos utilizar este programa para alinear distintas versiones de un texto en una misma lengua, lo que es útil para [análisis relacional](http://vocabularios.caicyt.gov.ar/portalthes/42/term/136), pero hay otras iniciativas que cumplen mejor con esta tarea como [Collatex](https://collatex.net/). 
+Adicionalmente, podemos utilizar este programa para alinear distintas versiones de un texto en una misma lengua, lo que es útil para [análisis relacional](https://vocabularios.caicyt.gov.ar/portalthes/42/term/136), pero hay otras iniciativas que cumplen mejor con esta tarea como [Collatex](https://collatex.net/). -Es importante ser sistemático con la clasificación de los documentos. El nombre de nuestros archivos txt debe acompañarse con el código que alude a la lengua del texto. Con ello aseguramos que la información con la que trabajamos siga convenciones oficiales que serán útiles a la hora de comunicar los resultados de nuestra investigación Para ello nos basaremos en el código [ISO 639-1](http://utils.mucattu.com/iso_639-1.html) que identifica a cada lengua con dos letras. Así, el español se identifica con *es*, el inglés con *en*, el francés con *fr* y el portugués con *pt*. +Es importante ser sistemático con la clasificación de los documentos. El nombre de nuestros archivos txt debe acompañarse con el código que alude a la lengua del texto. Con ello aseguramos que la información con la que trabajamos siga convenciones oficiales que serán útiles a la hora de comunicar los resultados de nuestra investigación. Para ello nos basaremos en el código [ISO 639-1](https://utils.mucattu.com/iso_639-1.html) que identifica a cada lengua con dos letras. Así, el español se identifica con *es*, el inglés con *en*, el francés con *fr* y el portugués con *pt*. Si trabajas con lenguas que no estén incluidas en ese código, puedes recurrir al código [ISO 639-3](https://es.wikipedia.org/wiki/ISO_639-3) que utiliza descriptores de 3 letras y abarca la totalidad de las lenguas del mundo. 
diff --git a/es/lecciones/creando-diagramas-de-redes-desde-fuentes-historicas.md b/es/lecciones/creando-diagramas-de-redes-desde-fuentes-historicas.md index 9dc85d473c..980a25c057 100644 --- a/es/lecciones/creando-diagramas-de-redes-desde-fuentes-historicas.md +++ b/es/lecciones/creando-diagramas-de-redes-desde-fuentes-historicas.md @@ -36,7 +36,7 @@ doi: 10.46430/phes0002 Introducción ------------ -La visualizaciones de redes pueden ayudar a los humanistas a revelar patrones complejos escondidos y estructuras en fuentes textuales. Este tutorial explica cómo extraer datos en red (personas, instituciones, lugares, etcétera.) de fuentes históricas a través del uso de métodos no especializados desarrollados en el marco del análisis de datos qualitativos (Qualitative Data Analysis, QDA) y el análisis de redes sociales (Social Network Analysis, SNA), y cómo visualizar estos datos con [*Palladio*](http://hdlab.stanford.edu/palladio/), una aplicación independiente de plataforma y que es particularmente fácil de usar. +Las visualizaciones de redes pueden ayudar a los humanistas a revelar patrones complejos escondidos y estructuras en fuentes textuales. Este tutorial explica cómo extraer datos en red (personas, instituciones, lugares, etcétera) de fuentes históricas a través del uso de métodos no especializados desarrollados en el marco del análisis de datos cualitativos (Qualitative Data Analysis, QDA) y el análisis de redes sociales (Social Network Analysis, SNA), y cómo visualizar estos datos con [*Palladio*](https://hdlab.stanford.edu/palladio/), una aplicación independiente de plataforma y que es particularmente fácil de usar. {% include figure.html caption="Figura 1: Una visualización de redes en Palladio y lo que vas a poder crear al final de este tutorial." 
filename="diagramas-de-redes-01.png" %} @@ -44,7 +44,7 @@ La gráfica anterior muestra un fragmento de la red de Ralph Neumann, en particu En general, el análisis de redes provee las herramientas para explorar constelaciones muy complejas de relaciones entre entidades. Piensa en tus amigos: sería fácil mapear quiénes son cercanos y quiénes no se llevan bien. Ahora, imagina que quieres explicar estas relaciones a alguien que no conoce a ninguno de tus amigos, o que quieres incluir las relaciones entre los amigos de tus amigos. En situaciones como esta el lenguaje y nuestra capacidad de comprender estructuras sociales llega a sus límites rápidamente. Las visualizaciones gráficas pueden ser medios para comunicar y explorar efectivamente estas complejas constelaciones. En general tu puedes pensar el análisis de redes sociales (ARS) como un medio para transformar la complejidad de un problema en un objeto de investigación. A menudo, los nodos en una red representan humanos conectados con otros humanos por todos los tipos de relaciones sociales imaginables. Pero casi que cualquier cosa puede ser entendida como un nodo: una película, un lugar, un título laboral, un punto en el tiempo, un lugar de reunión. En forma similar el concepto de vínculo (también llamado arista) entre nodos es igualmente flexible: dos teatros pueden estar conectados por una película mostrada en ambos, o por co-propiedad, proximidad geográfica, o haber empezado a funcionar el mismo año. Todo esto depende de tus intereses de investigación y cómo los expresas en forma de nodos y relaciones en una red. -Esta lección no reemplaza ninguno de los muchos manuales genéricos de análisis de redes, como el libro de [John Scott _Social Network Analysis_](https://uk.sagepub.com/en-gb/eur/the-sage-handbook-of-social-network-analysis/book277881). 
Para una introducción general al campo y sus dificultades para los humanistas recomiendo [ ](https://web.archive.org/web/20240203222438/https://www.scottbot.net/HIAL/index.html@p=6279.html)[*la serie de blog posts de Scott Weingart "Networks Demystified"*](https://web.archive.org/web/20240203222438/https://www.scottbot.net/HIAL/index.html@p=6279.html) así como también[ ](http://hal.archives-ouvertes.fr/docs/00/64/93/16/PDF/lemercier_A_zg.pdf)[*el artículo de Claire Lemercier "Formal network methods in history: why and how?"*](http://hal.archives-ouvertes.fr/docs/00/64/93/16/PDF/lemercier_A_zg.pdf). También podrías querer explorar la bibliografía y calendario de eventos en [_Historical Network Research_](http://historicalnetworkresearch.org/) para darte una idea de cómo los historiadores han usado las redes en sus investigaciones. +Esta lección no reemplaza ninguno de los muchos manuales genéricos de análisis de redes, como el libro de [John Scott _Social Network Analysis_](https://uk.sagepub.com/en-gb/eur/the-sage-handbook-of-social-network-analysis/book277881). Para una introducción general al campo y sus dificultades para los humanistas recomiendo [ ](https://web.archive.org/web/20240203222438/https://www.scottbot.net/HIAL/index.html@p=6279.html)[*la serie de blog posts de Scott Weingart "Networks Demystified"*](https://web.archive.org/web/20240203222438/https://www.scottbot.net/HIAL/index.html@p=6279.html) así como también[ ](https://hal.archives-ouvertes.fr/docs/00/64/93/16/PDF/lemercier_A_zg.pdf)[*el artículo de Claire Lemercier "Formal network methods in history: why and how?"*](https://hal.archives-ouvertes.fr/docs/00/64/93/16/PDF/lemercier_A_zg.pdf). También podrías querer explorar la bibliografía y calendario de eventos en [_Historical Network Research_](https://historicalnetworkresearch.org/) para darte una idea de cómo los historiadores han usado las redes en sus investigaciones. 
Este tutorial se enfoca en la extracción de datos de un texto desestrucurado y muestra una forma de visulizarlos utilizando Palladio. Está diseñado a propósito para ser lo más simple y robusto posible. Por el alcance limitado de este tutorial, es suficiente decir que un actor se refiere a las personas, instituciones, etcétera., que son el objeto de estudio y que están conectadas por relaciones. Dentro del contexto del Análisis de Redes Sociales (ARS) (también llamada gráfica o grafo de red), a los actores o puntos centrales en cuestión, les llamamos nodos, y a las conexiones que existen entre ellos, les llamamos lazos o vínculos. En todos los casos es importante recordar que los nodos y los lazos son modelos drásticamente simplificados utilizados para representar la complejidad de eventos pasados y en sí mismos muchas veces no son suficientes para generar conocimiento. Pero es posible que el gráfico resalte algunos aspectos interesantes, desafíe tu hipótesis, y/o te lleve a generar nuevas hipótesis. *Los digramas de redes se vuelven más significativos cuando son parte de un diálogo con datos y otras fuentes de información*. @@ -56,7 +56,7 @@ En otras palabras, el reto es sistematizar la interpretación textual. Las redes Sobre el caso de estudio -------------------- -El caso de estudio que utilizo para este tutorial es una narrativa en primera persona de Ralph Neumann, un judío que sobrevivió al Holocausto. Puedes encontrar en texto en [*internet*](http://web.archive.org/web/20180422010025/http://www.gdw-berlin.de/fileadmin/bilder/publ/publikationen_in_englischer_sprache/2006_Neuman_eng.pdf). El esquema de codificación que presento abajo es una versión simplificada del que desarrollé durante [*mi proyecto doctoral sobre redes de apoyo encubierto durante la Segunda Guerra Mundial*](http://martenduering.com/research/covert-networks-during-the-holocaust/). 
Mi investigación estuvo guiada por tres preguntas: ¿En qué medida las relaciones sociales pueden ayudar a explicar por qué personas comunes tomaron los riesgos asociados a ayudar a otros? ¿Cómo dichas relaciones permitieron a la gente prestar ayuda dado que tenían a su disposición recursos muy limitados? ¿Cómo ayudaron las relaciones sociales a los refugiados judíos a sobrevivir clandestinamente? +El caso de estudio que utilizo para este tutorial es una narrativa en primera persona de Ralph Neumann, un judío que sobrevivió al Holocausto. Puedes encontrar en texto en [*internet*](https://web.archive.org/web/20180422010025/https://www.gdw-berlin.de/fileadmin/bilder/publ/publikationen_in_englischer_sprache/2006_Neuman_eng.pdf). El esquema de codificación que presento abajo es una versión simplificada del que desarrollé durante [*mi proyecto doctoral sobre redes de apoyo encubierto durante la Segunda Guerra Mundial*](https://martenduering.com/research/covert-networks-during-the-holocaust/). Mi investigación estuvo guiada por tres preguntas: ¿En qué medida las relaciones sociales pueden ayudar a explicar por qué personas comunes tomaron los riesgos asociados a ayudar a otros? ¿Cómo dichas relaciones permitieron a la gente prestar ayuda dado que tenían a su disposición recursos muy limitados? ¿Cómo ayudaron las relaciones sociales a los refugiados judíos a sobrevivir clandestinamente? En este proyecto las visualizaciones en red me ayudaron a descubrir intermediarios hasta el momento olvidados pero muy importantes, resaltar la importancia general de los refugiados judíos como intermediarios, y navegar los casi 5,000 actos de ayuda que conectaron alrededor de 1,400 personas entre 1942 y 1945. @@ -142,7 +142,7 @@ Los siguientes pasos explican cómo visualizar datos en red en Palladio, pero ta Paso a paso: -**1. Palladio.** Entra a [*http://hdlab.stanford.edu/palladio/*](http://hdlab.stanford.edu/palladio/)*.* +**1. 
Palladio.** Entra a [*https://hdlab.stanford.edu/palladio/*](https://hdlab.stanford.edu/palladio/)*.* **2. Comienza.** En el sitio web haz clic en el botón "Start". @@ -164,7 +164,7 @@ Paso a paso: {% include figure.html caption="Figura 9: Enlanzando personas y relaciones." filename="diagramas-de-redes-09.png" %} -**7. Identifica datos temporales.** Palladio tiene una característica especial para visualizar tiempo. La puedes usar si sabes cuándo empieza y cuando termmina cada relación. La muestra de datos contiene dos columnas con los datos necesarios para la categoría de tiempo. Haz clic en "Tiempo en que paso comienza" y selecciona el tipo de datos "Date" (Fecha). Haz lo mismo para "Tiempo en que paso termina" (Figura 10). El equipo de Palladio recomienda que tus datos estén en el formato de YYYY-MM-DD (AAAA-MM-DD), pero mi tiempo en formato más abstracto funciona bien. Si quisieras cargar coordenadas geográficas (no cubiertas en este tutorial pero disponible acá: [*Palladio Simple Map Scenario*](http://hdlab.stanford.edu/doc/scenario-simple-map.pdf)) tendrías que seleccionar el tipo de datos "Coordinates". +**7. Identifica datos temporales.** Palladio tiene una característica especial para visualizar tiempo. La puedes usar si sabes cuándo empieza y cuándo termina cada relación. La muestra de datos contiene dos columnas con los datos necesarios para la categoría de tiempo. Haz clic en "Tiempo en que paso comienza" y selecciona el tipo de datos "Date" (Fecha). Haz lo mismo para "Tiempo en que paso termina" (Figura 10). El equipo de Palladio recomienda que tus datos estén en el formato de YYYY-MM-DD (AAAA-MM-DD), pero mi tiempo en formato más abstracto funciona bien. Si quisieras cargar coordenadas geográficas (no cubiertas en este tutorial pero disponible acá: [*Palladio Simple Map Scenario*](https://hdlab.stanford.edu/doc/scenario-simple-map.pdf)) tendrías que seleccionar el tipo de datos "Coordinates". 
{% include figure.html caption="Figura 10: Cambiando el tipo de datos a 'Date' (Fecha)" filename="diagramas-de-redes-10.png"%} @@ -210,7 +210,7 @@ Ten en cuenta que si quisieras ver "Proveedor" y "Receptor" como un tipo de nodo {% include figure.html caption="Figure 18: Vizualización de pasos en el tiempo en línea del tiempo." filename="diagramas-de-redes-18.png" %} -**15. Tamaño del nodo.** Palladio te deja cambiar el tamaño de tus nodos con base en los atributos de los actores. Ten en cuenta que esto no tiene sentido para los datos de la muestra dado que los valores numéricos representan categorías. Sin embargo, los tamaños de los nodos puedes ser útiles si fueras a representar las suma de los actos de ayuda de una persona, lo que en este caso correspondería a su [*Grado de salida*](http://en.wikipedia.org/wiki/Directed_graph#Indegree_and_outdegree), el número de relaciones salientes para un nodo. +**15. Tamaño del nodo.** Palladio te deja cambiar el tamaño de tus nodos con base en los atributos de los actores. Ten en cuenta que esto no tiene sentido para los datos de la muestra dado que los valores numéricos representan categorías. Sin embargo, los tamaños de los nodos puedes ser útiles si fueras a representar las suma de los actos de ayuda de una persona, lo que en este caso correspondería a su [*Grado de salida*](https://en.wikipedia.org/wiki/Directed_graph#Indegree_and_outdegree), el número de relaciones salientes para un nodo. **16. Exporta tu visualización.** Palladio te deja exportar tus redes como archivos .svg, un formato de imagen hecho con vectores. Utiliza tu navegador preferido para abrirlas. 
@@ -245,15 +245,15 @@ Finalmente, cualquiera de las visualizaciones que puedes crear con el conjunto d Otras herramientas de visualización para tener en cuenta ------------------------------------------------------------ -[*Nodegoat*](http://nodegoat.net/) – similar a Palladio en cuanto que hace fácil la recolección de datos, el mapeo y la visualización en gráficas. Permite confirgurar fácilmente bases de datos relacionales y deja a los usuarios almacenar sus datos en servidores. [*El tutorial está disponible acá*](http://nodegoat.net/cms/UPLOAD/AsmallguidebyYanan11082014.pdf). +[*Nodegoat*](https://nodegoat.net/) – similar a Palladio en cuanto que hace fácil la recolección de datos, el mapeo y la visualización en gráficas. Permite confirgurar fácilmente bases de datos relacionales y deja a los usuarios almacenar sus datos en servidores. [*El tutorial está disponible acá*](https://nodegoat.net/cms/UPLOAD/AsmallguidebyYanan11082014.pdf). -[*NodeXL*](https://www.smrfoundation.org/nodexl/) – capaz de hacer varias tareas comunes en el análisis de redes sociales, fácil de usar, de código abierto pero requiere Windows y MS Office 2007 o más nuevo para correr.[ ](https://www.youtube.com/watch?v=pwsImFyc0lE)[*Tutorial 1*](https://www.youtube.com/watch?v=pwsImFyc0lE), [*Tutorial 2*](http://www.youtube.com/watch?v=xKhYGRpbwOc). +[*NodeXL*](https://www.smrfoundation.org/nodexl/) – capaz de hacer varias tareas comunes en el análisis de redes sociales, fácil de usar, de código abierto pero requiere Windows y MS Office 2007 o más nuevo para correr.[ ](https://www.youtube.com/watch?v=pwsImFyc0lE)[*Tutorial 1*](https://www.youtube.com/watch?v=pwsImFyc0lE), [*Tutorial 2*](https://www.youtube.com/watch?v=xKhYGRpbwOc). -[*Gephi*](https://gephi.github.io/) – programa de código abierto para cualquier plataforma. Es la más versátil y mejor conocida herramienta de visualización excepto por una curva de aprendizaje muy alta. 
Los desarrolladores anuncian soporte para lados paralelos en la versión 1.0. Tutoriales: por [*Clement Levallois*](http://www.clementlevallois.net/training.html) y [*Sebastien Heymann*](http://www.youtube.com/watch?v=L6hHv6y5GsQ). +[*Gephi*](https://gephi.github.io/) – programa de código abierto para cualquier plataforma. Es la más versátil y mejor conocida herramienta de visualización excepto por una curva de aprendizaje muy alta. Los desarrolladores anuncian soporte para lados paralelos en la versión 1.0. Tutoriales: por [*Clement Levallois*](https://www.clementlevallois.net/training.html) y [*Sebastien Heymann*](https://www.youtube.com/watch?v=L6hHv6y5GsQ). [*VennMaker*](https://www.vennmaker.com) – es independiente de plataforma y puede probarse de manera gratuita. VennMaker invierte el proceso de recolección de datos: los usuarios comienzan con un lienzo personalizable y dibujan los nodos auto-definidos en él. La herramienta recolecta los datos correspondientes tras bastidores. -Las herramientas más comunmente utilizadas para análisis más matemáticos son [*UCINET*](https://sites.google.com/site/ucinetsoftware/home) (tiene licencia y turoriales disponibles en su página web) y [*Pajek*](http://pajek.imfm.si/doku.php) (gratuito) por el cual existe un muy buen [*libro de guía*](http://www.cambridge.org/us/academic/subjects/sociology/research-methods-sociology-and-criminology/exploratory-social-network-analysis-pajek-2nd-edition). Ambos fueron desarrollados por Windows pero corren bien en otros sistemas utilizando Wine. 
+Las herramientas más comunmente utilizadas para análisis más matemáticos son [*UCINET*](https://sites.google.com/site/ucinetsoftware/home) (tiene licencia y turoriales disponibles en su página web) y [*Pajek*](https://pajek.imfm.si/doku.php) (gratuito) por el cual existe un muy buen [*libro de guía*](https://www.cambridge.org/us/academic/subjects/sociology/research-methods-sociology-and-criminology/exploratory-social-network-analysis-pajek-2nd-edition). Ambos fueron desarrollados por Windows pero corren bien en otros sistemas utilizando Wine. Para usuarios de Python el muy bien documentado paquete[ ](https://networkx.github.io/)[*Networkx*](https://networkx.github.io/) es un gran punto de partida; existen otros paquetes para otros lenguajes de programación. diff --git a/es/lecciones/crear-y-ver-archivos-html-con-python.md b/es/lecciones/crear-y-ver-archivos-html-con-python.md index c11060195f..5296404d76 100644 --- a/es/lecciones/crear-y-ver-archivos-html-con-python.md +++ b/es/lecciones/crear-y-ver-archivos-html-con-python.md @@ -19,8 +19,8 @@ translation-reviewer: - Antonio Rojas Castro review-ticket: https://github.com/programminghistorian/ph-submissions/issues/48 layout: lesson -next: salida-de-datos-como-archivo-html -previous: contar-frecuencias +next: /es/lecciones/salida-de-datos-como-archivo-html +previous: /es/lecciones/contar-frecuencias original: creating-and-viewing-html-files-with-python difficulty: 2 activity: presenting @@ -150,7 +150,7 @@ Para seguir a lo largo de las lecciones futuras es importante que tengas los arc - python-es-lecciones6.zip [zip sync] [archivo zip de las lecciones anteriores]: /assets/python-es-lecciones5.zip - [Zotero]: http://zotero.org - [tutorial de HTML de W3 Schools]: http://www.w3schools.com/html/default.asp - [declaración doctype]: http://www.w3schools.com/tags/tag_doctype.asp + [Zotero]: https://zotero.org + [tutorial de HTML de W3 Schools]: https://www.w3schools.com/html/default.asp + [declaración doctype]: 
https://www.w3schools.com/tags/tag_doctype.asp [zip sync]: /assets/python-es-lecciones6.zip diff --git a/es/lecciones/datos-de-investigacion-con-unix.md b/es/lecciones/datos-de-investigacion-con-unix.md index 2aaebcc8a3..63f6c076a7 100644 --- a/es/lecciones/datos-de-investigacion-con-unix.md +++ b/es/lecciones/datos-de-investigacion-con-unix.md @@ -25,7 +25,7 @@ activity: transforming topics: [data-manipulation] review-ticket: https://github.com/programminghistorian/ph-submissions/issues/138 abstract: "En esta lección aprenderás cómo los datos de tu investigación pueden ser contados y extraídos mediante el shell Unix, cuando están organizados de manera clara y predecible." -previous: introduccion-a-bash +previous: /es/lecciones/introduccion-a-bash avatar_alt: Grabado en blanco y negro de un minero trabajando sobre una plataforma adentro de una mina. doi: 10.46430/phes0004 --- @@ -49,7 +49,7 @@ _____ ## *Software* y configuración -Los usuarios de Windows deben instalar Git Bash. Lo pueden hacer descargando el más reciente instalador de la [página web de Git para Windos](http://msysgit.github.io/). Las instrucciones para su instalación están disponibles en [Open Hatch](https://web.archive.org/web/20190114082523/https://openhatch.org/missions/windows-setup/install-git-bash) (en inglés). +Los usuarios de Windows deben instalar Git Bash. Lo pueden hacer descargando el más reciente instalador de la [página web de Git para Windows](https://msysgit.github.io/). Las instrucciones para su instalación están disponibles en [Open Hatch](https://web.archive.org/web/20190114082523/https://openhatch.org/missions/windows-setup/install-git-bash) (en inglés). Los usuarios de OS X y Linux necesitarán utilizar la Terminal, o intérprete de línea de comandos, como se explica en la "[Introducción a la línea de comandos de Bash](/es/lecciones/introduccion-a-bash)." 
@@ -57,7 +57,7 @@ Esta lección se escribió utilizando Git Bash 1.9.0 en sistema operativo Window Los archivos utilizados en esta lección están disponibles en "[Figshare](https://doi.org/10.6084/m9.figshare.1172094)". Estos contienen metadatos de artículos académicos catalogados en el rubro 'Historia' en la base de datos ESTAR de la Biblioteca Británica. Los datos son distribuidos bajo una renuncia de derechos de autor CC0. -Descarga los datos requeridos en tu ordenador y descomprime el archivo zip. Si no cuentas con un software adecuado para descomprimir archivos .zip, te recomendamos [7-zip](http://www.7-zip.org/). En Windows, te aconsejamos descomprimir la carpeta en tu disco C: para que los archivos queden en tu directorio `c:\proghist\`. No obstante, cualquier locación trabajará bien, pero entonces es posible que tengas que ajustar tus comandos conforme vayas siguiendo la lección. En OS X o Linux, también te aconsejamos descomprimir en tu directorio de usuario para que aparezcan en `/user/NOMBREDEUSUARIO/proghist/`. En ambos casos, esto significa que cuando abras una nueva ventana de tu terminal, con solamente teclear `cd proghist` te podrás mover al directorio correcto. +Descarga los datos requeridos en tu ordenador y descomprime el archivo zip. Si no cuentas con un software adecuado para descomprimir archivos .zip, te recomendamos [7-zip](https://www.7-zip.org/). En Windows, te aconsejamos descomprimir la carpeta en tu disco C: para que los archivos queden en tu directorio `c:\proghist\`. No obstante, cualquier locación trabajará bien, pero entonces es posible que tengas que ajustar tus comandos conforme vayas siguiendo la lección. En OS X o Linux, también te aconsejamos descomprimir en tu directorio de usuario para que aparezcan en `/user/NOMBREDEUSUARIO/proghist/`. En ambos casos, esto significa que cuando abras una nueva ventana de tu terminal, con solamente teclear `cd proghist` te podrás mover al directorio correcto. 
_____ @@ -73,9 +73,9 @@ Escribe `ls` y oprime Enter. Esto imprime o muestra una lista que incluye dos ar Los archivos en este directorio son: el conjunto de datos `2014-01_JA.csv` que contiene los metadatos de los artículos académicos y un archivo con documentación acerca de `2014-01_JA.csv`, llamado `2014-01_JA.txt`. -El subdirectorio se llama `derived_data`. Contiene cuatro archivos [.tsv](http://en.wikipedia.org/wiki/Tab-separated_values) derivados del archivo `2014-01_JA.csv`. Cada uno de estos incluye los datos en los que aparece una palabra clave como `africa` o `america` en el campo 'Title' de `2014-01_JA.csv`. El directorio `derived_data` también incluye un subdirectorio llamado `results`. +El subdirectorio se llama `derived_data`. Contiene cuatro archivos [.tsv](https://en.wikipedia.org/wiki/Tab-separated_values) derivados del archivo `2014-01_JA.csv`. Cada uno de estos incluye los datos en los que aparece una palabra clave como `africa` o `america` en el campo 'Title' de `2014-01_JA.csv`. El directorio `derived_data` también incluye un subdirectorio llamado `results`. -*Nota: Los archivos [CSV](http://en.wikipedia.org/wiki/Comma-separated_values) son aquellos en los que las unidades de datos, o celdas de una tabla, están separados por comas (valores separados por comas) y los archivos TSV son aquellos en los que están separados por tabuladores. Ambos se pueden leer en cualquier editor de texto o en programas de hoja de cálculo como Libre Office Calc o Microsoft Excel.* +*Nota: Los archivos [CSV](https://en.wikipedia.org/wiki/Comma-separated_values) son aquellos en los que las unidades de datos, o celdas de una tabla, están separados por comas (valores separados por comas) y los archivos TSV son aquellos en los que están separados por tabuladores. 
Ambos se pueden leer en cualquier editor de texto o en programas de hoja de cálculo como Libre Office Calc o Microsoft Excel.* Antes de que comiences a trabajar con estos archivos debes moverte al directorio en el que están almacenados. Navega a `c:\proghist\data\derived_data` en Windows o a `~/users/NOMBREDEUSUARIO/proghist/data/derived_data` en OS X. diff --git a/es/lecciones/datos-tabulares-en-r.md b/es/lecciones/datos-tabulares-en-r.md index 4ba9addb19..c27310dcbb 100644 --- a/es/lecciones/datos-tabulares-en-r.md +++ b/es/lecciones/datos-tabulares-en-r.md @@ -550,10 +550,10 @@ Para más información sobre R, visita el [Manual de R](https://cran.r-project. También hay numerosos tutoriales de R online, incluyendo: -* [R: A self-learn tutorial](http://web.archive.org/web/20191015004305/https://www.nceas.ucsb.edu/files/scicomp/Dloads/RProgramming/BestFirstRTutorial.pdf) (en inglés) - este tutorial cubre varias funciones y provee ejercicios para practicar. +* [R: A self-learn tutorial](https://web.archive.org/web/20191015004305/https://www.nceas.ucsb.edu/files/scicomp/Dloads/RProgramming/BestFirstRTutorial.pdf) (en inglés) - este tutorial cubre varias funciones y provee ejercicios para practicar. * [DataCamp Introducción a R](https://www.datacamp.com/community/open-courses/introduccion-a-r) (en español) - este es un curso online gratuito que te ofrece comentarios sobre tu código para ayudarte a identificar errores y aprender a escribir código más eficientemente. -Finalmente, un buen recurso para los historiadores digitales es el libro [_Digital History Methods in R_](http://dh-r.lincolnmullen.com) de Lincoln Mullen. +Finalmente, un buen recurso para los historiadores digitales es el libro [_Digital History Methods in R_](https://dh-r.lincolnmullen.com) de Lincoln Mullen. 
## Notas diff --git a/es/lecciones/datos-urbanos-demograficos-r-ggplot2.md b/es/lecciones/datos-urbanos-demograficos-r-ggplot2.md index 10cd1fad36..f5f84a56d9 100644 --- a/es/lecciones/datos-urbanos-demograficos-r-ggplot2.md +++ b/es/lecciones/datos-urbanos-demograficos-r-ggplot2.md @@ -38,7 +38,7 @@ Después de la Segunda Guerra Mundial, las ciudades europeas se enfrentaron a un Las relaciones entre ciudades hermanadas enfrentan a los historiadores con oportunidades y desafíos. La oportunidad radica en su potencial para revelar patrones de reconciliación y diplomacia posbélica. El desafío proviene de su escala y complejidad: hay cientos de ciudades en Europa y cada una podría haber formado decenas de acuerdos de hermanamientos a lo largo de múltiples décadas. Al convertir estas complejas redes de relaciones de hermanamiento en patrones visuales, podemos explorar preguntas difíciles de responder únicamente con métodos tradicionales. Por ejemplo, ¿prefirieron las ciudades de [Alemania Occidental](https://perma.cc/K5TT-Z876) establecer relaciones con ciudades francesas inmediatamente después de la guerra? ¿Creó el [Telón de Acero](https://perma.cc/J383-CSC2) patrones distintos de relaciones entre Europa del Este y del Oeste? ¿Cómo influyeron el tamaño de la ciudad y la distancia geográfica en las conexiones diplomáticas? Este caso es un buen ejemplo de cómo puede ser útil la visualización de datos para la investigación histórica. -El paquete de R [ggplot2 (en inglés)](http://ggplot2.tidyverse.org) proporciona herramientas poderosas para investigar preguntas de esta índole a través de la visualización de datos. Aunque las hojas de cálculo y los gráficos básicos pueden ocultar patrones, las capacidades de visualización avanzadas de ggplot2 permiten a los historiadores descubrir relaciones ocultas en los datos. 
Por ejemplo, los [gráficos de dispersión](https://perma.cc/P9DH-CCSR) pueden revelar correlaciones entre variables numéricas como tamaños poblacionales y distancias geográficas, los [gráficos de barras](https://perma.cc/QP3N-VP2N) pueden mostrar la distribución de los hermanamientos en diferentes categorías de ciudades, y los [histogramas](https://perma.cc/MNY7-4FC4) pueden exponer patrones en los datos demográficos que de otro modo podrían permanecer invisibles. +El paquete de R [ggplot2 (en inglés)](https://ggplot2.tidyverse.org) proporciona herramientas poderosas para investigar preguntas de esta índole a través de la visualización de datos. Aunque las hojas de cálculo y los gráficos básicos pueden ocultar patrones, las capacidades de visualización avanzadas de ggplot2 permiten a los historiadores descubrir relaciones ocultas en los datos. Por ejemplo, los [gráficos de dispersión](https://perma.cc/P9DH-CCSR) pueden revelar correlaciones entre variables numéricas como tamaños poblacionales y distancias geográficas, los [gráficos de barras](https://perma.cc/QP3N-VP2N) pueden mostrar la distribución de los hermanamientos en diferentes categorías de ciudades, y los [histogramas](https://perma.cc/MNY7-4FC4) pueden exponer patrones en los datos demográficos que de otro modo podrían permanecer invisibles. Esta lección se diferencia de las guías estándar de ggplot2 porque se enfoca específicamente en las necesidades de los historiadores urbanos. En lugar de utilizar conjuntos de datos generales, trabajaremos con datos históricos sobre relaciones entre ciudades hermanadas para demostrar cómo las técnicas visuales pueden iluminar patrones y procesos históricos. A través de este enfoque, aprenderás a crear visualizaciones que revelen alianzas complejas y hacer que los procesos históricos sean más accesibles a un público más amplio. 
@@ -192,7 +192,7 @@ ggplot(data = eudata.porcentaje, aes(x = tipopais, y = porcentaje)) + Hay una diferencia importante entre el primer gráfico (Figura 1) y este. En el primero, ggplot2 contó el número de ciudades en cada grupo (doméstico, UE, no-UE). En el segundo, el tibble ya contiene el valor numérico de cada barra, almacenado en la columna **porcentaje**. Por esta razón especificamos `y = porcentaje` como un parámetro de `aes()` (es decir, `aesthetics`). De forma predeterminada, `geom_bar()` utiliza el parámetro `stat = "count"`. Esto significa que contará cuántas veces aparece cada valor. En otras palabras, agrupará los datos para ti. Sin embargo, puedes informar a ggplot2 que ya has calculado tus valores utilizando el parámetro `stat = "identity"`. -El gráfico 2 muestra que la mayoría de las ciudades hermanas son de un país diferente al de origen, aún así dentro de la UE (cerca del 68%). Esto podría deberse a la proximidad geográfica, similitudes culturales o vínculos económicos dentro de la Unión Europea. Puedes obtener más detalle agregando el nombre de cada país de origen al gráfico. Puedes decidir visualizar esto, por ejemplo, dividiendo cada barra en porcentajes por país de origen (gráfico 3), o creando gráficos separados para cada uno (esto se llama 'faceting' en el lenguaje ggplot2, que [abordaremos más abajo](#faceteando-un-grafico)). Intentemos la primera opción, agrupando los datos por país y por tipo de país y agregando una nueva columna con porcentajes: +El gráfico 2 muestra que la mayoría de las ciudades hermanas son de un país diferente al de origen, aún así dentro de la UE (cerca del 68%). Esto podría deberse a la proximidad geográfica, similitudes culturales o vínculos económicos dentro de la Unión Europea. Puedes obtener más detalle agregando el nombre de cada país de origen al gráfico. 
Puedes decidir visualizar esto, por ejemplo, dividiendo cada barra en porcentajes por país de origen (gráfico 3), o creando gráficos separados para cada uno (esto se llama 'faceting' en el lenguaje ggplot2, que [abordaremos más abajo](#facetando-un-gráfico)). Intentemos la primera opción, agrupando los datos por país y por tipo de país y agregando una nueva columna con porcentajes: ``` eudata.porcentaje.pais <- eudata %>% @@ -318,7 +318,7 @@ ggplot(data = eudata.sample, aes(x = log(origenpoblacion), y = log(destinopoblacion))) + geom_point(size = 0.8, color = "#4B0000") + labs(title = "Población de la ciudad de origen y destino", - caption = "Datos: [www.wikidata.org](http://www.wikidata.org)", + caption = "Datos: [www.wikidata.org](https://www.wikidata.org)", x = "Población de la ciudad de origen (log)", y = "Población de la ciudad destino (log)") ``` @@ -345,7 +345,7 @@ ggplot(data = eudata.sample, aes(x = log(origenpoblacion), y = log(destinopoblacion))) + geom_point(size = 0.8, alpha = 0.7, aes( color = tipopais )) + labs(title = "Población de la ciudad de origen y destino", - caption = "Datos: [www.wikidata.org](http://www.wikidata.org)", + caption = "Datos: [www.wikidata.org](https://www.wikidata.org)", x = "Población de la ciudad de origen (log)", y = "Población de la ciudad destino (log)") ``` @@ -373,7 +373,7 @@ p1 <- ggplot(data = eudata.sample, aes(x = log(origenpoblacion), y = log(destinopoblacion))) + geom_point(size = 0.8, alpha = 0.7, aes( color = tipopais )) + labs(title = "Población de la ciudad de origen y destino", - caption = "Datos: [www.wikidata.org](http://www.wikidata.org)", + caption = "Datos: [www.wikidata.org](https://www.wikidata.org)", x = "Población de la ciudad de origen (log)", y = "Población de la ciudad destino (log)") ``` @@ -387,7 +387,7 @@ p1 + {% include figure.html filename="es-tr-datos-urbanos-demograficos-r-ggplot2-11.png" alt="Gráfico de dispersión que usa scale_colour_manual() para cambiar los colores de los puntos." 
caption="Figura 11. Uso de scale_colour_manual() para especificar los colores de los puntos." %} -Sin embargo, también puedes basarte en escalas de colores predefinidas, como las paletas [de color brewer (en inglés)](http://colorbrewer2.org). Es mejor utilizar estas cuando sea posible, porque elegir los colores adecuados para las visualizaciones es un problema muy complicado (por ejemplo, evitar colores que no son distinguibles para personas con visión deficiente). Afortunadamente, ggplot2 incluye la función `scale_colour_brewer()` ya [integrada (en inglés)](https://perma.cc/BST9-7GMG). +Sin embargo, también puedes basarte en escalas de colores predefinidas, como las paletas [de color brewer (en inglés)](https://colorbrewer2.org). Es mejor utilizar estas cuando sea posible, porque elegir los colores adecuados para las visualizaciones es un problema muy complicado (por ejemplo, evitar colores que no son distinguibles para personas con visión deficiente). Afortunadamente, ggplot2 incluye la función `scale_colour_brewer()` ya [integrada (en inglés)](https://perma.cc/BST9-7GMG). 
``` p1 + @@ -404,7 +404,7 @@ p2 <- ggplot(data = eudata.sample, geom_point(size = 0.8, aes( color = log(dist) )) + labs(title = "Población de la ciudad de origen y destino", subtitle = "Coloreado según la distancia entre ciudades", - caption = "Datos: [www.wikidata.org](http://www.wikidata.org)", + caption = "Datos: [www.wikidata.org](https://www.wikidata.org)", x = "Población de la ciudad de origen (log)", y = "Población de la ciudad destino (log)") @@ -497,7 +497,7 @@ p3 + theme_wsj() ### Extendiendo ggplot2 con otros paquetes -Una de las fortalezas de ggplot2 es su amplia colección de [extensiones (en inglés)](http://www.ggplot2-exts.org/) que pueden ayudar a enriquecer tu análisis con visualizaciones especializadas como gráficos de red (útiles para mostrar relaciones entre ciudades, por ejemplo), series de tiempo (para rastrear cambios demográficos a lo largo del tiempo), y gráficos de ridgeline, también llamados gráficos de cresta en español (para comparar distribuciones poblacionales en diferentes áreas urbanas). +Una de las fortalezas de ggplot2 es su amplia colección de [extensiones (en inglés)](https://www.ggplot2-exts.org/) que pueden ayudar a enriquecer tu análisis con visualizaciones especializadas como gráficos de red (útiles para mostrar relaciones entre ciudades, por ejemplo), series de tiempo (para rastrear cambios demográficos a lo largo del tiempo), y gráficos de ridgeline, también llamados gráficos de cresta en español (para comparar distribuciones poblacionales en diferentes áreas urbanas). Vamos a explorar un ejemplo que muestra un paquete de extensión de ggplot2 capaz de crear gráficos más avanzados e impactantes. En este caso, vamos a crear un [gráfico de ridgeline (en inglés)](https://perma.cc/D9Z2-XHAV) – también conocido como 'joyplot' – diseñado para visualizar los cambios en las distribuciones a lo largo del tiempo, en distintas categorías. 
Los gráficos de ridgeline son particularmente efectivos para comparar múltiples distribuciones de manera compacta y atractiva. @@ -513,7 +513,7 @@ ggplot(eudata, aes(x=log(origenpoblacion), y = origenpais)) + geom_density_ridges() + theme_ridges() + labs(title = "Población (log) de las ciudades de origen", - caption = "Datos: [www.wikidata.org](http://www.wikidata.org)", + caption = "Datos: [www.wikidata.org](https://www.wikidata.org)", x = "Población (log)", y = "País") ``` diff --git a/es/lecciones/de-html-a-lista-de-palabras-1.md b/es/lecciones/de-html-a-lista-de-palabras-1.md index b8fa34e098..51fc92281b 100644 --- a/es/lecciones/de-html-a-lista-de-palabras-1.md +++ b/es/lecciones/de-html-a-lista-de-palabras-1.md @@ -20,8 +20,8 @@ translation-reviewer: - Antonio Rojas Castro review-ticket: https://github.com/programminghistorian/ph-submissions/issues/44 layout: lesson -next: de-html-a-lista-de-palabras-2 -previous: manipular-cadenas-de-caracteres-en-python +next: /es/lecciones/de-html-a-lista-de-palabras-2 +previous: /es/lecciones/manipular-cadenas-de-caracteres-en-python original: from-html-to-list-of-words-1 python_warning: false difficulty: 2 @@ -156,10 +156,10 @@ Para seguir a lo largo de las lecciones futuras es importante que tengas los arc -[transcripción del juicio criminal contra Benjamin Bowsey de 1780]: http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33 +[transcripción del juicio criminal contra Benjamin Bowsey de 1780]: https://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33 [Descargar páginas web con Python]: /es/lecciones/trabajar-con-paginas-web -[HTML]: http://www.w3schools.com/html/ -[1]: http://www.w3schools.com/html/ +[HTML]: https://www.w3schools.com/html/ +[1]: https://www.w3schools.com/html/ [Manipular cadenas de caracteres en Python]: /es/lecciones/manipular-cadenas-de-caracteres-en-python [Reutilizacion de código y modularidad]: /es/lecciones/reutilizacion-de-codigo-y-modularidad [zip]: 
/assets/python-es-lecciones2.zip diff --git a/es/lecciones/de-html-a-lista-de-palabras-2.md b/es/lecciones/de-html-a-lista-de-palabras-2.md index abbcbbf76b..a1dabe52e7 100644 --- a/es/lecciones/de-html-a-lista-de-palabras-2.md +++ b/es/lecciones/de-html-a-lista-de-palabras-2.md @@ -20,8 +20,8 @@ translation-reviewer: - Antonio Rojas Castro review-ticket: https://github.com/programminghistorian/ph-submissions/issues/45 layout: lesson -next: normalizar-datos -previous: de-html-a-lista-de-palabras-1 +next: /es/lecciones/normalizar-datos +previous: /es/lecciones/de-html-a-lista-de-palabras-1 original: from-html-to-list-of-words-2 python_warning: false difficulty: 2 @@ -258,7 +258,7 @@ Para seguir a lo largo de las lecciones futuras es importante que tengas los arc - python-es-lecciones3.zip ([zip sync][]) [De HTML a lista de palabras (parte 1)]: /es/lecciones/de-html-a-lista-de-palabras-1 - [entero]: http://docs.python.org/2.4/lib/typesnumeric.html - [tipos]: http://docs.python.org/3/library/types.html + [entero]: https://docs.python.org/2.4/lib/typesnumeric.html + [tipos]: https://docs.python.org/3/library/types.html [zip]: /assets/python-es-lecciones2.zip [zip sync]: /assets/python-es-lecciones3.zip diff --git a/es/lecciones/descarga-automatizada-con-wget.md b/es/lecciones/descarga-automatizada-con-wget.md index 50e31a64ec..e69f12ffba 100644 --- a/es/lecciones/descarga-automatizada-con-wget.md +++ b/es/lecciones/descarga-automatizada-con-wget.md @@ -191,7 +191,7 @@ En este punto, las personas usuarias de las tres plataformas deben estar en la m La documentación completa para wget se puede encontrar en la página del [Manual de wget de GNU][]. -Tomemos un ejemplo de conjunto de datos. Digamos que deseas descargar todos los documentos alojados en el sitio web ActiveHistory.ca. Todos están ubicados en: ; en el sentido de que están todos contenidos en el directorio `/papers/`. Por ejemplo, el noveno documento publicado en el sitio web es . 
Piensa en esta estructura de la misma forma que los directorios en tu propia computadora. Si tienes una carpeta con la etiqueta `/Historia/`, es probable que contenga varios archivos dentro de ella. La misma estructura es válida para los sitios web y estamos usando esta lógica para decirle a nuestra computadora qué archivos queremos descargar. +Tomemos un ejemplo de conjunto de datos. Digamos que deseas descargar todos los documentos alojados en el sitio web ActiveHistory.ca. Todos están ubicados en: ; en el sentido de que están todos contenidos en el directorio `/papers/`. Por ejemplo, el noveno documento publicado en el sitio web es . Piensa en esta estructura de la misma forma que los directorios en tu propia computadora. Si tienes una carpeta con la etiqueta `/Historia/`, es probable que contenga varios archivos dentro de ella. La misma estructura es válida para los sitios web y estamos usando esta lógica para decirle a nuestra computadora qué archivos queremos descargar. Si deseas descargarlos todos manualmente deberás escribir un programa personalizado o hacer clic derecho en cada papel para hacerlo. Si los archivos están organizados de una manera que se ajuste a tus necesidades de investigación, wget es el abordaje más rápido. @@ -228,7 +228,7 @@ Saving to: `index.html.1' 2012-05-15 15:50:26 (374 KB/s) - `index.html.1' saved [37668] ``` -Wget descargó la primera página de , que contiene el índice de los documentos, a tu nuevo directorio. Si lo abres, verás el texto principal en la página de inicio de ActiveHistory.ca. Así que de un golpe ya hemos descargado algo rápidamente. +Wget descargó la primera página de , que contiene el índice de los documentos, a tu nuevo directorio. Si lo abres, verás el texto principal en la página de inicio de ActiveHistory.ca. Así que de un golpe ya hemos descargado algo rápidamente. Pero lo que queremos hacer ahora es descargar cada uno de los papeles. Así que necesitamos agregar algunos comandos a wget. 
@@ -244,7 +244,7 @@ Acabamos de aprender cosas sobre el componente [URL] en el ejemplo anterior, ya -r ``` -La recuperación recursiva es la parte más importante de wget. Lo que esto significa es que el programa comienza siguiendo los enlaces del sitio web y también los descarga. Entonces, por ejemplo, tiene un enlace a , por lo que también se descargará si utilizamos la recuperación recursiva. Sin embargo, también seguirá a cualquier otro enlace: si hubiera un enlace a en algún lugar de esa página, seguiría eso y lo descargaría también. De forma predeterminada, `-r` envía wget a una profundidad de cinco sitios después del primero. Esto es siguiendo los enlaces, hasta un límite de cinco clics después del primer sitio web. En este punto, será bastante indiscriminado. Así que necesitamos más comandos: +La recuperación recursiva es la parte más importante de wget. Lo que esto significa es que el programa comienza siguiendo los enlaces del sitio web y también los descarga. Entonces, por ejemplo, tiene un enlace a , por lo que también se descargará si utilizamos la recuperación recursiva. Sin embargo, también seguirá a cualquier otro enlace: si hubiera un enlace a en algún lugar de esa página, seguiría eso y lo descargaría también. De forma predeterminada, `-r` envía wget a una profundidad de cinco sitios después del primero. Esto es siguiendo los enlaces, hasta un límite de cinco clics después del primer sitio web. En este punto, será bastante indiscriminado. Así que necesitamos más comandos: ``` bash @@ -253,7 +253,7 @@ La recuperación recursiva es la parte más importante de wget. Lo que esto sign (El doble guión indica el texto completo de un comando. Todos los comandos también tienen una versión corta, éste podría iniciarse usando -np). -Esto es muy importante. Quiere decir que wget debe seguir los enlaces pero no más allá del último directorio principal. 
En nuestro caso, eso significa que no irá a ninguna sitio que no sea parte de la jerarquía de http://activehistory.ca/papers/. Si se tratara de una ruta larga como http://niche-canada.org/projects/events/new-events/not-yet-happened-events/, solo encontraría archivos en la carpeta `/not-yet-happened-events/`. Es un comando crítico para delimitar tu búsqueda. +Esto es muy importante. Quiere decir que wget debe seguir los enlaces pero no más allá del último directorio principal. En nuestro caso, eso significa que no irá a ningún sitio que no sea parte de la jerarquía de https://activehistory.ca/papers/. Si se tratara de una ruta larga como https://niche-canada.org/projects/events/new-events/not-yet-happened-events/, solo encontraría archivos en la carpeta `/not-yet-happened-events/`. Es un comando crítico para delimitar tu búsqueda. Aquí una representación gráfica: @@ -320,12 +320,12 @@ Aquí solo he dado una instantánea de algunas de las funcionalidades de wget. P [^2]: La versión más reciente es wget 1.19, desde el 3 de febrero de 2017. 
-[Command Line Bootcamp]: http://praxis.scholarslab.org/scratchpad/bash/ +[Command Line Bootcamp]: https://praxis.scholarslab.org/scratchpad/bash/ [descargar XCode a través de este enlace]: https://itunes.apple.com/us/app/xcode/id497799835?mt=12 [sitio web de desarrolladores de Apple]: https://developer.apple.com/xcode/ [Ver descargas]: https://developer.apple.com/downloads/ -[sitio web de GNU]: http://www.gnu.org/software/wget/ -[HTTP]: http://ftp.gnu.org/gnu/wget/ +[sitio web de GNU]: https://www.gnu.org/software/wget/ +[HTTP]: https://ftp.gnu.org/gnu/wget/ [FTP]: ftp://ftp.gnu.org/gnu/wget/ -[ugent website]: http://users.ugent.be/~bpuype/wget/ -[Manual de wget de GNU]: http://www.gnu.org/software/wget/manual/wget.html +[ugent website]: https://users.ugent.be/~bpuype/wget/ +[Manual de wget de GNU]: https://www.gnu.org/software/wget/manual/wget.html diff --git a/es/lecciones/descarga-multiples-registros-usando-cadenas-de-consulta.md b/es/lecciones/descarga-multiples-registros-usando-cadenas-de-consulta.md index 3bffccfe1e..a5eca66661 100644 --- a/es/lecciones/descarga-multiples-registros-usando-cadenas-de-consulta.md +++ b/es/lecciones/descarga-multiples-registros-usando-cadenas-de-consulta.md @@ -24,7 +24,7 @@ difficulty: 2 activity: acquiring topics: [web-scraping, python] abstract: "Descargar un solo registro de un sitio web es fácil, aunque la descarga de muchos registros a la vez –una necesidad cada vez más frecuente– es mucho más eficiente utilizando un lenguaje de programación como Python. En esta lección escribiremos un programa que descargará una serie de registros del Old Bailey Online usando criterios de búsqueda personalizada y los guardará en un directorio de nuestro equipo." 
-previous: salida-palabras-clave-contexto-ngrams +previous: /es/lecciones/salida-palabras-clave-contexto-ngrams review-ticket: https://github.com/programminghistorian/ph-submissions/issues/205 avatar_alt: Figuras trabajando en una mina, empujando carros doi: 10.46430/phes0059 diff --git a/es/lecciones/editar-audio-con-audacity.md b/es/lecciones/editar-audio-con-audacity.md index df452247a1..57fc5ee841 100644 --- a/es/lecciones/editar-audio-con-audacity.md +++ b/es/lecciones/editar-audio-con-audacity.md @@ -33,7 +33,7 @@ doi: 10.46430/phes0007 Para aquellos interesados en audio, las habilidades básicas de edición de sonido les serán de mucha ayuda. Ser capaz de manipular los materiales puede ayudarte a dominar tu objeto de estudio: puedes ampliar y extraer momentos específicos para analizar, procesar el audio, y subir los materiales a un servidor para complementar la entrada de un blog en la materia. En un nivel más práctico, estas habilidades te permitirán grabar y comprimir grabaciones, tuyas o de otros, para su distribución. ¿Esa conferencia de un profesor invitado a tu facultad? ¡Grábala y edítala tú mismo! Hacerlo así es una forma sencilla de distribuir recursos entre varias instituciones, y también ayuda a hacer los materiales más accesibles pera lectores y escuchas con una amplia variedad de necesidades de aprendizaje. -En esta lección aprenderás a utilizar [Audacity](http://www.audacityteam.org/) para cargar, grabar, editar, mezclar y exportar archivos de audio. Con frecuencia, las plataformas de edición de audio son costosas y ofrecen numerosas funciones que pueden ser abrumadoras para el usuario que no tiene experiencia previa, al contrario, *Audacity* es una alternativa gratuita y de código abierto que ofrece gran funcionalidad y fácil acceso para editar archivos de audio. +En esta lección aprenderás a utilizar [Audacity](https://www.audacityteam.org/) para cargar, grabar, editar, mezclar y exportar archivos de audio. 
Con frecuencia, las plataformas de edición de audio son costosas y ofrecen numerosas funciones que pueden ser abrumadoras para el usuario que no tiene experiencia previa, al contrario, *Audacity* es una alternativa gratuita y de código abierto que ofrece gran funcionalidad y fácil acceso para editar archivos de audio. Para esta lección vamos a trabajar con dos archivos de audio: una grabación de las [Variaciones Goldberg de Bach](/assets/editing-audio-with-audacity/bach-goldberg-variations.mp3), y otra grabación de tu propia voz que se hará en el transcurso de la lección. @@ -46,7 +46,7 @@ Primero, descarga los archivos necesarios. Vas a necesitar el [archivo en .mp3 de las Variaciones Goldberg de Bach](/assets/editing-audio-with-audacity/bach-goldberg-variations.mp3). Para descargarlo, haz click con el botón derecho [aquí](/assets/editing-audio-with-audacity/bach-goldberg-variations.mp3) y selecciona "guardar como" para guardar el archivo en tu computadora como un MP3. -A continuación, descarga e instala *Audacity*, que está disponible en el [sitio del proyecto]( http://www.audacityteam.org/). *Audacity* puede utilizarse en Mac OSX, Windows o Linux. +A continuación, descarga e instala *Audacity*, que está disponible en el [sitio del proyecto]( https://www.audacityteam.org/). *Audacity* puede utilizarse en Mac OSX, Windows o Linux. Descarga el programa y haz doble clic para instalar. @@ -58,7 +58,7 @@ La interfaz cargará y mostrará los archivos cargados: *Audacity* convierte el sonido en un diagrama de onda, una forma frecuentemente utilizada para representar sonido. El eje horizontal representa el tiempo en forma de segundos (o minutos y segundos, dependiendo de la extensión del clip). El inicio del sonido se visualiza del lado izquierdo de la interfaz y *Audacity* coloca marcadores a lo largo de la onda hacia la derecha. 
Si damos clic en el botón de reproducir *Audacity* se moverá sobre el sonido de izquierda a derecha, entre tanto una línea vertical representará nuestra posición en el clip de audio. -El eje vertical representa la amplitud, que experimentamos como intensidad sonora o volumen. De manera predeterminada, el eje vertical mide el volumen en una regla vertical de -1 a 1: los extremos de -1 y 1 representan la intensidad sonora posible de la grabación sin distorsión, mientras que 0 representa silencio. Así, el silencio comienza como una línea plana desde la cual el sonido será más alto y más profundo a medida que aumente su intensidad. Para mayor información acerca del porqué algunos de los números son negativos, revisa la [**introducción a la acústica**](http://web.archive.org/web/20161119231053/http://www.indiana.edu:80/~emusic/acoustics/amplitude.htm) de Jeffrey Hass (en inglés). +El eje vertical representa la amplitud, que experimentamos como intensidad sonora o volumen. De manera predeterminada, el eje vertical mide el volumen en una regla vertical de -1 a 1: los extremos de -1 y 1 representan la intensidad sonora posible de la grabación sin distorsión, mientras que 0 representa silencio. Así, el silencio comienza como una línea plana desde la cual el sonido será más alto y más profundo a medida que aumente su intensidad. Para mayor información acerca del porqué algunos de los números son negativos, revisa la [**introducción a la acústica**](https://web.archive.org/web/20161119231053/http://www.indiana.edu:80/~emusic/acoustics/amplitude.htm) de Jeffrey Hass (en inglés). La representación de tiempo y amplitud de *Audacity* es tu primer y más fácil punto de referencia para la edición de sonido, y la herramienta facilita la navegación por el mismo. Sigo llamándole a esto una onda, pero aún no se parece mucho a una. Vamos a echar un vistazo más de cerca al seleccionar una parte de la pieza de audio. 
@@ -96,7 +96,7 @@ Se mostrará algo parecido a esto: Nuestra grabación original de “Bach” se mantiene en la parte superior de la interface, mientras que nuestra nueva grabación está por debajo de ella. De forma predeterminada, *Audacity* no sobreescribirá una grabación anterior. Por el contrario, aísla ambos sonidos o pistas, permitiéndonos manipular componentes separados antes de mezclarlos en una grabación final. Podemos hacer cambios a uno sin afectar al otro. Observa cómo, con respecto al tiempo, la nueva pista se grabó de manera predeterminada al principio del proyecto de Audacity. Por ahora, las pistas de “Bach” y la vocal comienzan al mismo tiempo. Existen otras imperfecciones potenciales en tu grabación única, algunas de las cuales podemos corregir. -Finalmente, observa cómo en mi ejemplo existen dos formas de onda para la grabación de Bach, pero solo una para la grabación de mi voz. La grabación de Bach fue hecha en estéreo, lo que significa que había dos canales de entrada, mientras que la grabación de mi voz fue hecha en *monoauraL*. *Audacity* permite grabar en ambos, y cualquiera de las dos funcionará para esta lección, así que no te preocupes si tu grabación aparece en estéreo. Puedes cambiar de mono a estéreo y viceversa desde “Editar”, disponible en la sección “Barra de herramientas” del menú “ver”. Para más información sobre mono contra estéreo, revista esta [*lectura*](http://www.diffen.com/difference/Mono_vs_Stereo/) (en inglés). +Finalmente, observa cómo en mi ejemplo existen dos formas de onda para la grabación de Bach, pero solo una para la grabación de mi voz. La grabación de Bach fue hecha en estéreo, lo que significa que había dos canales de entrada, mientras que la grabación de mi voz fue hecha en *monoauraL*. *Audacity* permite grabar en ambos, y cualquiera de las dos funcionará para esta lección, así que no te preocupes si tu grabación aparece en estéreo. 
Puedes cambiar de mono a estéreo y viceversa desde “Editar”, disponible en la sección “Barra de herramientas” del menú “ver”. Para más información sobre mono contra estéreo, revista esta [*lectura*](https://www.diffen.com/difference/Mono_vs_Stereo/) (en inglés). Aparte: a menudo puede ser de utilidad convertir la salida de sonido de tu laptop en entrada, para que puedas grabar los sonidos que se reproducen en tu computadora sin preocuparte del ruido externo o volver a grabar audio digital. Para obtener información sobre cómo llevar a cabo éste proceso, consulta [*Soundflower*](https://github.com/mattingalls/Soundflower/). @@ -152,7 +152,7 @@ Pero eventualmente vamos a querer cambiar el enfoque de la pista por completo de - Seleccionar “Crossfade Tracks”, del menú Efecto, esto le indicará a Audacity que realice el desvanecimiento de salida de la pista superior mientras hace el desvanecimiento de entrada de la pista inferior; en este caso, el posicionamiento de las pistas es importante. -*Audacity* te ofrecerá opciones para el *crossfade* de la pista, pero por ahora está bien mantener la configuración preestablecida en “Fade type:constant gain”. Ésta configuración garantiza que ambas pistas se desvanecerán o alinearán (para mayor información, revisa la documentación de *["crossfades” de Audacity](http://manual.audacityteam.org/man/crossfade_clips.html)* +*Audacity* te ofrecerá opciones para el *crossfade* de la pista, pero por ahora está bien mantener la configuración preestablecida en “Fade type:constant gain”. 
Ésta configuración garantiza que ambas pistas se desvanecerán o alinearán (para mayor información, revisa la documentación de *["crossfades” de Audacity](https://manual.audacityteam.org/man/crossfade_clips.html)* ![Post-crossfade](/images/editing-audio-with-audacity/editing-audio-with-audacity-13.png) @@ -166,6 +166,6 @@ De forma predeterminada, todo lo que hagas en *Audacity* es guardado en el forma Al hacer esto, mezclarás las múltiples pistas en un solo archivo de audio, y te dará la oportunidad de proporcionar metadatos a tu trabajo. -Existe un rango de diferentes opciones para refinar el proceso de exportación, pero el más importante es “tipo de archivo”. MP3 y Ogg son buenas opciones para el audio destinado a ser mostrado en la web, ya que ambos comprimen los archivos para que sean rápidos de cargar. Para mejores resultados, puedes incluir ambos formatos y sólo mostrar uno como una alternativa cuando alguno no sea compatible con el navegador web del usuario. Para mayor información, *NCH Software* ofrece un [buen desglose técnico para sus diferentes opciones](http://www.nch.com.au/acm/formats.html), mientras que Jonathan Sterne ha hecho un [trabajo fascinante](https://www.dukeupress.edu/mp3/) sobre las implicaciones culturales de tales decisiones de formato. Y la W3Schools ofrece una [buena comparación](https://www.w3schools.com/html/html5_audio.asp) de estos formatos usados en el desarrollo web. +Existe un rango de diferentes opciones para refinar el proceso de exportación, pero el más importante es “tipo de archivo”. MP3 y Ogg son buenas opciones para el audio destinado a ser mostrado en la web, ya que ambos comprimen los archivos para que sean rápidos de cargar. Para mejores resultados, puedes incluir ambos formatos y sólo mostrar uno como una alternativa cuando alguno no sea compatible con el navegador web del usuario. 
Para mayor información, *NCH Software* ofrece un [buen desglose técnico para sus diferentes opciones](https://www.nch.com.au/acm/formats.html), mientras que Jonathan Sterne ha hecho un [trabajo fascinante](https://www.dukeupress.edu/mp3/) sobre las implicaciones culturales de tales decisiones de formato. Y la W3Schools ofrece una [buena comparación](https://www.w3schools.com/html/html5_audio.asp) de estos formatos usados en el desarrollo web. ¡Felicidades! Has producido exitosamente un pequeño podcast. Puede que no parezca mucho, pero con frecuencia yo uso estas mismas recomendaciones para presentaciones, sitios web y cuestiones académicas. De ninguna manera esta lección pretende agotar los múltiples temas al respecto, pero debe haberte proporcionado algunas herramientas básicas para trabajar con sonido en proyectos de humanidades digitales. diff --git a/es/lecciones/escritura-sostenible-usando-pandoc-y-markdown.md b/es/lecciones/escritura-sostenible-usando-pandoc-y-markdown.md index 18dab2d5a8..144d5caeda 100644 --- a/es/lecciones/escritura-sostenible-usando-pandoc-y-markdown.md +++ b/es/lecciones/escritura-sostenible-usando-pandoc-y-markdown.md @@ -57,7 +57,7 @@ Aquí es donde brilla Markdown. Markdown es una sitaxis para el marcado semánti Escribir en esta forma libera al autor de la herramienta. Markdown se puede escribir en cualquier editor de texto y ofrece un rico ecosistema de *software* que puede representar ese texto en documentos con aspecto atractivo. Por esta razón, Markdown está experimentando un periodo de crecimiento, no solamente como un medio para la escritura de documentos académicos sino como una convención para la edición en línea en general. -Los editores de texto para todo prósito más populares incluyen [Atom](https://atoms.io/) (para todas las plataformas) y [Notepad++](http://notepad-plus-plus.org) (para Windows). 
+Los editores de texto para todo propósito más populares incluyen [Atom](https://atoms.io/) (para todas las plataformas) y [Notepad++](https://notepad-plus-plus.org) (para Windows). Es importante entender que Markdown no es más que una convención. Los archivos Markdown se almacenan como texto plano, además de añadir la flexibilidad del formato. Los archivos de texto plano han existido desde los tiempos de las máquinas de escribir eléctrónicas. La longevidad de este estándar hace, de manera inherente, que sean más sostenibles y más estables que los formatos propietarios. Mientras que los archivos producidos hace diez años en Microsfot Word o en Pages de Apple pueden causar serios problemas cuando se abren con la última versión del programa, aún es posible abrir un archivo de texto plano escrito en alguno de los editores de texto "muertos", del pasado, muchas décadas después: AlphaPlus, Perfect Writer, Text Wizard, Spellbinder, WordStar o SCRIPSIT2.0, el favorito de Isaac Asimov producido por Radio Shack. Escribir en texto plano te garantiza que tus archivos permanecerán legibles diez, quince o veinte años a partir de ahora. En esta lección se describe un flujo de trabajo que libera al investigador de programas de procesamiento de texto propietarios y archivos de formatos frágiles. @@ -186,7 +186,7 @@ Y como veremos en breve, este archivo de texto plano se puede representar como u {% include figure.html filename="Screen-Shot-2014-11-06.png" caption="Captura de pantalla de un PDF interpretado por Pandoc" %} -Si quieres tener una idea de cómo serán interpretado en un fomato HTML este tipo de marcado, prueba este [sitio de prueba en línea](http://daringfireball.net/projects/markdown/dingus) y juega con varios tipos de sintaxis. Recuerda que ciertos elementos del *Pandoc-flavored markdown* (como el bloque de título o las notas al pie) no funcionan en esta versión web ya que solamente acepta lo básico.
+Si quieres tener una idea de cómo será interpretado en un formato HTML este tipo de marcado, prueba este [sitio de prueba en línea](https://daringfireball.net/projects/markdown/dingus) y juega con varios tipos de sintaxis. Recuerda que ciertos elementos del *Pandoc-flavored markdown* (como el bloque de título o las notas al pie) no funcionan en esta versión web ya que solamente acepta lo básico. En este punto, deberás ocupar algún tiempo explorando algunas de las características de Markdown como las citas de texto (referidas con el símbolo `>`), los listados que empiezan con `*` o `-`, los saltos de línea literales que empiezan con `|` (útiles para poesía), las tablas y algunas otras funciones señaladas en la página sobre Markdown de Pandoc. @@ -314,7 +314,7 @@ El filtro "citeproc" compila todas tus etiquetas de citas. El resultado debe ser ## Cambiar los estilos de citación -El estilo de citación por defecto en Pandoc es el de Chicago Autor-fecha. Podemos especificar un estilo diferente utilizando una hoja de estilo escrita en "lenguaje de estilo de citación" (CSL por *citation style language*, otra convención en texto plano utilizada para describir estilos de citas) y que es designado por la extensión de archivo `.csl`. Afortunadamente, el proyecto CSL mantiene un repositorio de estilos de citaciones comunes, algunas incluso ajustadas a ciertas revistas en específico. Visita para encontrar el archivo `.csl` para el estilo Modern Language Association (MLA), descarga el archivo `modern-language-association.csl` y guárdalo en la carpeta de tu proyecto como `mla.csl`. Ahora, necesitamos indicarle a Pandoc que utilice la hoja de estilo de MLA en vez de la de Chicago que tiene por defecto. Haremos esto actualizando el encabezado o bloque YAML: +El estilo de citación por defecto en Pandoc es el de Chicago Autor-fecha.
Podemos especificar un estilo diferente utilizando una hoja de estilo escrita en "lenguaje de estilo de citación" (CSL por *citation style language*, otra convención en texto plano utilizada para describir estilos de citas) y que es designado por la extensión de archivo `.csl`. Afortunadamente, el proyecto CSL mantiene un repositorio de estilos de citaciones comunes, algunas incluso ajustadas a ciertas revistas en específico. Visita para encontrar el archivo `.csl` para el estilo Modern Language Association (MLA), descarga el archivo `modern-language-association.csl` y guárdalo en la carpeta de tu proyecto como `mla.csl`. Ahora, necesitamos indicarle a Pandoc que utilice la hoja de estilo de MLA en vez de la de Chicago que tiene por defecto. Haremos esto actualizando el encabezado o bloque YAML: ``` --- @@ -349,19 +349,19 @@ Trata tus archivos de origen como versiones autorizadas de tu texto y los archiv ## Recursos útiles -En caso de meterte en problemas no hay un mejor lugar para empezar a buscar soluciones que el [sitio web de Pandoc](https://pandoc.org/) de John MacFarlane y la [lista de correos](https://groups.google.com/forum/#!forum/pandoc-discuss) afiliada (en inglés). Al menos en dos sitios de tipo "Pregunta y respuesta" puedes encontrar respuestas a preguntas sobre Pandoc: [Stack Overflow](http://stackoverflow.com/questions/tagged/pandoc) y [Digital Humanities Q&A](http://web.archive.org/web/20190203062832/http://digitalhumanities.org/answers/). Puedes hacer preguntas en vivo en Freenode IRC, \#Pandoc channel, frecuentado por un amistoso grupo de asiduos. A medida que aprendas más acerca de Pandoc, puedes explorar una de sus particularidades más poderosa: [filtros](https://github.com/jgm/pandoc/wiki/Pandoc-Filters). 
+En caso de meterte en problemas no hay un mejor lugar para empezar a buscar soluciones que el [sitio web de Pandoc](https://pandoc.org/) de John MacFarlane y la [lista de correos](https://groups.google.com/forum/#!forum/pandoc-discuss) afiliada (en inglés). Al menos en dos sitios de tipo "Pregunta y respuesta" puedes encontrar respuestas a preguntas sobre Pandoc: [Stack Overflow](https://stackoverflow.com/questions/tagged/pandoc) y [Digital Humanities Q&A](https://web.archive.org/web/20190203062832/https://digitalhumanities.org/answers/). Puedes hacer preguntas en vivo en Freenode IRC, \#Pandoc channel, frecuentado por un amistoso grupo de asiduos. A medida que aprendas más acerca de Pandoc, puedes explorar una de sus particularidades más poderosa: [filtros](https://github.com/jgm/pandoc/wiki/Pandoc-Filters). -Aunque te sugerimos comenzar con un simple editor de texto plano, hay muchas más alternativas (más de 70, de acuerdo con [esta entrada de blog](http://web.archive.org/web/20140120195538/http://mashable.com/2013/06/24/markdown-tools/) a MS Word para trabajar específicamente con Markdown, disponibles en línea y a menudo sin costo. Para las autónomas nos gustan [Mou](http://mouapp.com/), [Write Monkey](https://web.archive.org/web/20260327163157/http://writemonkey.com/), y [Sublime Text](http://www.sublimetext.com/). Varias plataformas web que han surgido recientemente proporcionan interfaces gráficas adecuadas para desarrollar una escritura colaborativa con seguimiento de cambios en las versiones utilizando Markdown. Éstas incluyen: [prose.io](http://prose.io), [Authorea](http://www.authorea.com), [Draft](http://www.draftin.com), y [StackEdit](https://stackedit.io). 
+Aunque te sugerimos comenzar con un simple editor de texto plano, hay muchas más alternativas (más de 70, de acuerdo con [esta entrada de blog](https://web.archive.org/web/20140120195538/http://mashable.com/2013/06/24/markdown-tools/) a MS Word para trabajar específicamente con Markdown, disponibles en línea y a menudo sin costo. Para las autónomas nos gustan [Mou](https://mouapp.com/), [Write Monkey](https://web.archive.org/web/20260327163157/http://writemonkey.com/), y [Sublime Text](https://www.sublimetext.com/). Varias plataformas web que han surgido recientemente proporcionan interfaces gráficas adecuadas para desarrollar una escritura colaborativa con seguimiento de cambios en las versiones utilizando Markdown. Éstas incluyen: [prose.io](https://prose.io), [Authorea](https://www.authorea.com), [Draft](https://www.draftin.com), y [StackEdit](https://stackedit.io). -Pero el ecosistema no está limitado sólo a editores. [Gitit](http://gitit.net/) e [Ikiwiki](https://github.com/dubiousjim/pandoc-iki) soportan escritura en Markdown utilizando Pandoc como compilador. A esta lista se puede agregar una serie de herramientas que generan páginas web estáticas de manera rápida: [Yst](https://github.com/jgm/yst), [Jekyll](http://github.com/fauno/jekyll-pandoc-multiple-formats), [Hakyll](http://jaspervdj.be/hakyll/) y [bash shell script](https://github.com/wcaleb/website) por el historiador Caleb McDaniel. +Pero el ecosistema no está limitado sólo a editores. [Gitit](https://gitit.net/) e [Ikiwiki](https://github.com/dubiousjim/pandoc-iki) soportan escritura en Markdown utilizando Pandoc como compilador. A esta lista se puede agregar una serie de herramientas que generan páginas web estáticas de manera rápida: [Yst](https://github.com/jgm/yst), [Jekyll](https://github.com/fauno/jekyll-pandoc-multiple-formats), [Hakyll](https://jaspervdj.be/hakyll/) y [bash shell script](https://github.com/wcaleb/website) por el historiador Caleb McDaniel. 
-Finalmente, se están creando plataformas de publicación enteras basadas en el uso de Markdown. La plataforma de mercado [Leanpub](https://leanpub.com) puede ser una alternativa interesante al modelo tradicional de publicación y nosotros mismos estamos experimentando con el diseño de una revista académica en GitHub y [readthedocs.org](http://readthedocs.org) (herramientas que suelen utilizarse para técnicas de documentación). +Finalmente, se están creando plataformas de publicación enteras basadas en el uso de Markdown. La plataforma de mercado [Leanpub](https://leanpub.com) puede ser una alternativa interesante al modelo tradicional de publicación y nosotros mismos estamos experimentando con el diseño de una revista académica en GitHub y [readthedocs.org](https://readthedocs.org) (herramientas que suelen utilizarse para técnicas de documentación). [^1]: ¡No te preocupes si no entiendes aún esta terminología! [^2]: [GitHub](https://github.com/dhcolumbia/pandoc-workflow). Utiliza la opción "raw" cuando lo veas en GitHub para observar la fuente de Markdown. Los autores queremos agradecer a Alex Gil y sus colegas del Columbia's Digital Humanities Center, y a los participantes de openLab en el Studio de la Bilioteca Butler por probar el código de este tutorial en diversas plataformas. -[^3]: Véase la excelente discusión sobre este tema, por Charlie Stross, en [Why Microsoft Word Must Die](http://www.antipope.org/charlie/blog-static/2013/10/why-microsoft-word-must-die.html). +[^3]: Véase la excelente discusión sobre este tema, por Charlie Stross, en [Why Microsoft Word Must Die](https://www.antipope.org/charlie/blog-static/2013/10/why-microsoft-word-must-die.html). [^4]: Considera que la extensión `.bib` debe estar "vinculada" a Zotero en tu sistema operativo. Esto significa que si haces doble click en un archivo `.bib`, es probable que Zotero intente abrir el archivo mientras que nosotros queremos abrirlo con un editor de texto. 
Es posible que en el futuro quieras asociar la extensión `.bib` a tu editor de texto. diff --git a/es/lecciones/exhibicion-con-collection-builder.md b/es/lecciones/exhibicion-con-collection-builder.md index 8cb992a095..5fa42e92e9 100644 --- a/es/lecciones/exhibicion-con-collection-builder.md +++ b/es/lecciones/exhibicion-con-collection-builder.md @@ -64,7 +64,7 @@ El objetivo de la metodología Lib-STATIC es utilizar la tecnología de webs est CollectionBuilder-GH (CB-GH) es una de las alternativas de tipo computación mínima (minimal computing), que se refiere a "la actividad computacional que se realiza bajo restricciones significativas de hardware, software, educación, capacidad de red, energía u otros factores" ([_Minimal Computing: a working group of GO::DH_](https://go-dh.github.io/mincomp/about/)). Precisamente, por su formato, las exhibiciones digitales creadas con CollectionBuilder en GitHub necesitarán de menos hardware o tecnología y menos ancho de banda de Internet. Además, está totalmente adaptada a sistemas celulares. Este sistema es una buena alternativa a sistemas de exhibiciones digitales como [Omeka](https://es.wikipedia.org/wiki/Omeka) y al algo más complejo sistema [Wax](https://minicomp.github.io/wax/), para aquellos que no tengan recursos informáticos avanzados a su alcance, que no dispongan del tiempo para aprender a utilizar algo más complicado y, en resumidas cuentas, que quieran reutilizar de forma rápida las colecciones digitalizadas en sus archivos para dar acceso a su comunidad. -El trabajo para crear CB-GH está financiada por una beca National Leadership Grants for Libraries Planning Grant ofrecida por el Instituto de Servicios de Museos y Bibliotecas ([IMLS](https://www.imls.gov), por sus siglas en inglés). 
Varias bibliotecas y museos ya han utilizado esta herramienta para la diseminación de sus colecciones u otros propósitos, como son [Colors of Ozu](https://drodz11.github.io/colors-of-ozu/), de Dave Rodriguez o la [Namibia Heritage Week 2020](http://dna.nust.na/heritage_week/), de Namibia University of Science and Technology. +El trabajo para crear CB-GH está financiada por una beca National Leadership Grants for Libraries Planning Grant ofrecida por el Instituto de Servicios de Museos y Bibliotecas ([IMLS](https://www.imls.gov), por sus siglas en inglés). Varias bibliotecas y museos ya han utilizado esta herramienta para la diseminación de sus colecciones u otros propósitos, como son [Colors of Ozu](https://drodz11.github.io/colors-of-ozu/), de Dave Rodriguez o la [Namibia Heritage Week 2020](https://dna.nust.na/heritage_week/), de Namibia University of Science and Technology. ## 1. Preparar los archivos básicos para la colección @@ -72,7 +72,7 @@ CollectionBuilder-GH está basado en cuatro componentes básicos que generan la ### Colección de objetos -CollectionBuilder-GH está pensado para ser utilizado con colecciones pequeñas. Para poder utilizar la versión gratuita de GitHub, el total de los archivos de la exhibición no puede superar 1GB de peso. Por eso, se recomienda que la carpeta de imágenes no supere los 500MB. Puedes realizar la edición necesaria en imágenes de alta resolución que ya tengas con cualquier software de editado de imágenes, como [GIMP](http://www.gimp.org.es/descargar-gimp.html). Es importante tener en cuenta las siguientes consideraciones: +CollectionBuilder-GH está pensado para ser utilizado con colecciones pequeñas. Para poder utilizar la versión gratuita de GitHub, el total de los archivos de la exhibición no puede superar 1GB de peso. Por eso, se recomienda que la carpeta de imágenes no supere los 500MB. 
Puedes realizar la edición necesaria en imágenes de alta resolución que ya tengas con cualquier software de editado de imágenes, como [GIMP](https://www.gimp.org.es/descargar-gimp.html). Es importante tener en cuenta las siguientes consideraciones: - Formato de los objetos: GitHub y esta herramienta aceptan los formatos más comunes de imágenes y audio con los que ya estarás familiarizado: jpg, png y mp3. También puedes utilizar enlaces externos a objetos en YouTube o Vimeo, pero estos no aparecerán dentro de la exhibición diff --git a/es/lecciones/generadores-aventura.md b/es/lecciones/generadores-aventura.md index c1de01084b..a9246573f9 100644 --- a/es/lecciones/generadores-aventura.md +++ b/es/lecciones/generadores-aventura.md @@ -63,7 +63,7 @@ En esta lección nos concentraremos en un tipo particular de literatura electró Existe una larga tradición del uso de procesos mecánicos combinados con elementos aleatorios para la creación en las artes y la literatura. Con mecánicos nos referimos a que están guiados por sistemas de reglas claramente definidos, es decir, si usamos términos computacionales, por algoritmos. Y por aleatorios queremos decir que alguna parte del proceso creativo está definido por una fuente de incertidumbre, como el lanzamiento de una moneda o la selección espontánea de elementos de una lista. Esta combinación creativa entre orden y azar permite un equilibrio entre el control sobre los resultados de una obra creativa y la sorpresa con respecto a la configuración final de la misma obra. -Pensemos, por ejemplo, en el ejercicio [S + 7](https://perma.cc/S6LR-U5AN) propuesto por el poeta surrealista Jean Lescure en los años sesenta: el ejercicio consiste en tomar un texto preexistente, por ejemplo un poema, y reemplazar cada sustantivo por la séptima palabra que se encuentre después de este en un diccionario de sustantivos. En [este enlace](http://www.spoonbill.org/n+7/) encuentras un programa en inglés que genera textos con el ejercicio S + 7. 
Aquí podemos ver que hay una parte mecánica, las reglas que definen cómo proceder con el ejercicio, y una parte aleatoria, el resultado impredecible de cómo resultará el nuevo texto causado por el orden fortuito del diccionario usado. +Pensemos, por ejemplo, en el ejercicio [S + 7](https://perma.cc/S6LR-U5AN) propuesto por el poeta surrealista Jean Lescure en los años sesenta: el ejercicio consiste en tomar un texto preexistente, por ejemplo un poema, y reemplazar cada sustantivo por la séptima palabra que se encuentre después de este en un diccionario de sustantivos. En [este enlace](https://www.spoonbill.org/n+7/) encuentras un programa en inglés que genera textos con el ejercicio S + 7. Aquí podemos ver que hay una parte mecánica, las reglas que definen cómo proceder con el ejercicio, y una parte aleatoria, el resultado impredecible de cómo resultará el nuevo texto causado por el orden fortuito del diccionario usado. Este tipo de estrategias creativas, que en principio no requieren de un computador, han sido posteriormente adaptadas por la literatura electrónica, pues sus autoras y autores comúnmente aprovechan los sistemas algorítmicos que permiten los lenguajes de programación y el azar que proveen los generadores de números aleatorios para dar lugar a la conjunción mecánica-aleatoria. Un ejemplo concreto de esta estrategia en el campo computacional es la producción de generadores de texto —y otros medios como imágenes o sonido— por medio de sistemas algorítmicos llamados gramáticas libres de contexto; este es justamente el sistema que usaremos en esta lección. Cabe anotar que existen otros métodos para la generación de textos, como las [cadenas de Márkov](https://perma.cc/Y7FK-FM3X) o los modelos de lenguaje basados en [aprendizaje automático](https://perma.cc/D73Q-MMXM), pero no nos ocuparemos de ellos aquí. 
diff --git a/es/lecciones/georreferenciacion-visualizacion-con-recogito-y-visone.md b/es/lecciones/georreferenciacion-visualizacion-con-recogito-y-visone.md index 4f5c9777c9..dc1de7e62a 100644 --- a/es/lecciones/georreferenciacion-visualizacion-con-recogito-y-visone.md +++ b/es/lecciones/georreferenciacion-visualizacion-con-recogito-y-visone.md @@ -155,7 +155,7 @@ Visone permite generar y visualizar diferentes tipos de redes. Las redes son est ### Descarga e instala Visone -A diferencia de Recogito, necesitaremos instalar [Visone](http://visone.ethz.ch/html/download.html). La versión de descarga recomendada para todos los sistemas operativos es **visone-2.26.jar**. +A diferencia de Recogito, necesitaremos instalar [Visone](https://visone.ethz.ch/html/download.html). La versión de descarga recomendada para todos los sistemas operativos es **visone-2.26.jar**.
    Antes de inciar la instalación de Visone, debemos asegurarnos de tener instalado en nuestra computadora Java 8 o posterior. Si no tienes Java instalado en tu computadora puedes descargarlo aquí. @@ -255,7 +255,7 @@ Hay muchos tutoriales adicionales disponibles para Recogito y Visone. Te recomen - El sitio oficial de Visone tiene [varios tutoriales en inglés](https://visone.info/wiki/index.php/Tutorials#Basic_tutorials) sobre las diferentes aplicaciones de esta herramienta. -- El tutorial de Recogito de Gimena del Río y Valeria Vitale, [Recogito-in-a-Box: From Annotation to Digital Edition](http://dx.doi.org/10.3828/mlo.v0i0.299) (en inglés). +- El tutorial de Recogito de Gimena del Río y Valeria Vitale, [Recogito-in-a-Box: From Annotation to Digital Edition](https://dx.doi.org/10.3828/mlo.v0i0.299) (en inglés). ### Nota diff --git a/es/lecciones/georreferenciar-qgis.md b/es/lecciones/georreferenciar-qgis.md index 20ad982fdf..b4e8bc8c89 100644 --- a/es/lecciones/georreferenciar-qgis.md +++ b/es/lecciones/georreferenciar-qgis.md @@ -205,17 +205,17 @@ En procesos más avanzados, puedes incluso cubrir esta imagen georreferenciada c *Este tutorial es parte de [Geospatial Historian][].* - [Introducción a Google Maps y Google Earth]: /es/lecciones/intro-a-google-maps-y-google-earth - [rubber-sheeting]: http://en.wikipedia.org/wiki/Rubbersheeting - [National Topographic System Maps]: http://maps.library.utoronto.ca/datapub/digital/3400s_63_1929/maptile/Halifax/googlemaps.html - [1]: http://maps.library.utoronto.ca/datapub/PEI/NTS/west/ - [2]: http://maps.library.utoronto.ca/datapub/PEI/NTS/east/ - [Coordinate Reference System]: http://en.wikipedia.org/wiki/Spatial_reference_system - [Installing QGIS 2.0 and adding Layers]: /lessons/qgis-layers - [can be downloaded here]: http://geospatialhistorian.files.wordpress.com/2013/02/pei_lakemap1863.jpg - [Island Imagined]: 
https://web.archive.org/web/20180922004858/http://www.islandimagined.ca:80/fedora/repository/imagined:208687 - [in Atlantic Canada]: http://books.google.ca/books?id=TqCNZYXWXAUC&dq=tilting&source=gbs_navlinks_s - [world file]: http://en.wikipedia.org/wiki/World_file - [Tif]: http://en.wikipedia.org/wiki/Tagged_Image_File_Format - [Creating New Vector Layers in QGIS]: /lessons/vector-layers-qgis - [Geospatial Historian]: http://geospatialhistorian.wordpress.com/ + [Introducción a Google Maps y Google Earth]: /es/lecciones/intro-a-google-maps-y-google-earth + [rubber-sheeting]: https://en.wikipedia.org/wiki/Rubbersheeting + [National Topographic System Maps]: https://maps.library.utoronto.ca/datapub/digital/3400s_63_1929/maptile/Halifax/googlemaps.html + [1]: https://maps.library.utoronto.ca/datapub/PEI/NTS/west/ + [2]: https://maps.library.utoronto.ca/datapub/PEI/NTS/east/ + [Coordinate Reference System]: https://en.wikipedia.org/wiki/Spatial_reference_system + [Installing QGIS 2.0 and adding Layers]: /en/lessons/qgis-layers + [can be downloaded here]: https://geospatialhistorian.files.wordpress.com/2013/02/pei_lakemap1863.jpg + [Island Imagined]: https://web.archive.org/web/20180922004858/http://www.islandimagined.ca:80/fedora/repository/imagined:208687 + [in Atlantic Canada]: https://books.google.ca/books?id=TqCNZYXWXAUC&dq=tilting&source=gbs_navlinks_s + [world file]: https://en.wikipedia.org/wiki/World_file + [Tif]: https://en.wikipedia.org/wiki/Tagged_Image_File_Format + [Creating New Vector Layers in QGIS]: /en/lessons/vector-layers-qgis + [Geospatial Historian]: https://geospatialhistorian.wordpress.com/ \ No newline at end of file diff --git a/es/lecciones/gestionar-fuentes-primarias-digitales-con-tropy.md b/es/lecciones/gestionar-fuentes-primarias-digitales-con-tropy.md index 9e98761db5..ac3a0cb9d4 100644 --- a/es/lecciones/gestionar-fuentes-primarias-digitales-con-tropy.md +++ b/es/lecciones/gestionar-fuentes-primarias-digitales-con-tropy.md
@@ -35,7 +35,7 @@ La meta de este tutorial es aprender a crear proyectos en Tropy basados tanto en - Tropy fue lanzado en 2016 por el Roy Rosenzweig Center for History and New Media en George Mason University ([RRCHNM](https://perma.cc/3TPV-4DFM))  en Fairfax, Virgínia, (EE. UU.). Su desarrollo prosiguió con el apoyo del RRCHNM y el Center for Contemporary and Digital Scholarship ([C2DH](https://perma.cc/ZQK6-7VEM)) de la Universidad de Luxemburgo, y [Digital Scholar](https://perma.cc/Z6ML-CXYV), la misma organización sin fines de lucro que administra otras herramientas de código abierto establecidas como Omeka y Zotero. -- Tropy cuenta con diferentes recursos para orientar al principiante, la mayoría de los cuales son accesibles por la página de web principal, [tropy.org](http://tropy.org). El código de Tropy reside en [el repositorio del proyecto en GitHub](https://github.com/tropy). Tropy tiene una [documentación completa](https://docs.tropy.org/) en inglés que proporciona una orientación textual a la herramienta. Además, es posible hacer preguntas, comentarios o sugerencias a través del [foro comunitario](https://forums.tropy.org/). El [canal de Youtube](https://www.youtube.com/tropy) de Tropy aloja videotutoriales en inglés y español — además, es posible encontrar grabaciones de talleres, tutoriales y webinars hechos por otros usuarios. +- Tropy cuenta con diferentes recursos para orientar al principiante, la mayoría de los cuales son accesibles por la página de web principal, [tropy.org](https://tropy.org). El código de Tropy reside en [el repositorio del proyecto en GitHub](https://github.com/tropy). Tropy tiene una [documentación completa](https://docs.tropy.org/) en inglés que proporciona una orientación textual a la herramienta. Además, es posible hacer preguntas, comentarios o sugerencias a través del [foro comunitario](https://forums.tropy.org/). 
El [canal de Youtube](https://www.youtube.com/tropy) de Tropy aloja videotutoriales en inglés y español — además, es posible encontrar grabaciones de talleres, tutoriales y webinars hechos por otros usuarios. - Tropy es una herramienta útil para historiadores y para estudiosos de humanidades digitales. También es de gran utilidad para cualquier investigador que trabaje con archivos, incluso sin conocimientos de programación. Aprender a usarlo es intuitivo, en especial porque este organizador de fotografías fue diseñado para historiadores. @@ -49,7 +49,7 @@ A modo de ejemplo, este tutorial utilizará una colección de expedientes judici ### Instalación -Tropy es una aplicación de escritorio. Para instalarla, basta entrar a [la página web principal](http://tropy.org) y hacer clic en el botón _Download Tropy for [nombre de sistema auto-detectado]_. Después, busca el instalador en tu disco duro y sigue las instrucciones según tu sistema operativo (Mac, Windows o Linux). Siempre encontrarás la versión más actualizada junto con versiones previas y beta en [el repositorio de GitHub](https://github.com/tropy/tropy/releases) (busca la etiqueta **Latest**). +Tropy es una aplicación de escritorio. Para instalarla, basta entrar a [la página web principal](https://tropy.org) y hacer clic en el botón _Download Tropy for [nombre de sistema auto-detectado]_. Después, busca el instalador en tu disco duro y sigue las instrucciones según tu sistema operativo (Mac, Windows o Linux). Siempre encontrarás la versión más actualizada junto con versiones previas y beta en [el repositorio de GitHub](https://github.com/tropy/tropy/releases) (busca la etiqueta **Latest**). ### Tipo de Proyecto  diff --git a/es/lecciones/instalar-modulos-python-pip.md b/es/lecciones/instalar-modulos-python-pip.md index a9bbb023e9..017b0d49ae 100644 --- a/es/lecciones/instalar-modulos-python-pip.md +++ b/es/lecciones/instalar-modulos-python-pip.md @@ -110,6 +110,6 @@ sudo pip install requests ¡Listo para trabajar! 
[pip]: https://pip.pypa.io/en/stable/ -[curl]: http://www.thegeekstuff.com/2012/04/curl-examples/ +[curl]: https://www.thegeekstuff.com/2012/04/curl-examples/ [aquí]: https://bootstrap.pypa.io/get-pip.py -[StackOverflow]: http://stackoverflow.com/questions/4750806/how-to-install-pip-on-windows +[StackOverflow]: https://stackoverflow.com/questions/4750806/how-to-install-pip-on-windows diff --git a/es/lecciones/intro-a-google-maps-y-google-earth.md b/es/lecciones/intro-a-google-maps-y-google-earth.md index 55a50ae3d0..6438a99651 100644 --- a/es/lecciones/intro-a-google-maps-y-google-earth.md +++ b/es/lecciones/intro-a-google-maps-y-google-earth.md @@ -29,7 +29,7 @@ topics: [mapping] abstract: "Google My Maps y Google Earth son una buena manera de comenzar a crear mapas digitales. Con una cuenta de Google puedes crear y editar mapas personales haciendo clic en Mis Sitios" -next: /lessons/qgis-layers +next: /en/lessons/qgis-layers doi: 10.46430/phes0036 --- @@ -69,13 +69,13 @@ Earth o Quantum GIS. - Identifícate con tu cuenta de Google si no estás conectado ya (si es necesario, sigue las sencillas instrucciones para crear una cuenta). -{% include figure.html caption="Figura 1" filename="geo-es1.png" %} +{% include figure.html caption="Figura 1" filename="es-tr-intro-a-google-maps-y-google-earth-01.png" %} - Haz clic en el signo de interrogación en la esquina inferior derecha y luego en "Visita guiada" para una introducción de cómo funciona My Maps. -{% include figure.html caption="Figura 2" filename="geo-es2.png" %} +{% include figure.html caption="Figura 2" filename="es-tr-intro-a-google-maps-y-google-earth-02.png" %} - En la esquina superior izquierda aparece un menú con el título "Mapa sin nombre". Haciendo clic en el título puedes renombrarlo como "Mi mapa de @@ -93,14 +93,14 @@ Earth o Quantum GIS. los nombres modernos de los lugares para evitar el riesgo de que Google elija la Constantinopla equivocada. 
-{% include figure.html filename="geo-es3.png" caption="Figura 3" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-03.png" caption="Figura 3" %} -{% include figure.html filename="geo-es4.png" caption="Figura 4" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-04.png" caption="Figura 4" %} - Luego, puedes importar un set de datos. Haz clic en "Importar" debajo de "Capa sin título". -{% include figure.html filename="geo-es5.png" caption="Figura 5" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-05.png" caption="Figura 5" %} - Se abrirá una nueva ventana que te dará la opción de importar un archivo CSV (valores separados por coma), un XLSX (Microsoft Excel), un KML @@ -110,30 +110,30 @@ Earth o Quantum GIS. También puedes utilizar una hoja de cálculo de Google a través de tu cuenta de Drive. -{% include figure.html filename="geo-es6.png" caption="Figura 6" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-06.png" caption="Figura 6" %} - A continuación, descarga el - [Archivo CSV del Suministro global de grasa de Reino Unido][] + [Archivo CSV del Suministro global de grasa de Reino Unido](/assets/intro-a-google-maps-y-google-earth/Suministro_global_de_grasa_de_Reino_Unido_1894_1896.zip) y guárdalo en tu computadora. Si abres el archivo en Excel u otro programa de hojas de cálculo, encontrarás un set de datos sencillo de dos columnas con una lista de diferentes tipos de grasas con los lugares asociados. Estos datos fueron construidos utilizando tablas de importaciones británicas de 1896. -{% include figure.html filename="geo-es7.png" caption="Figura 7" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-07.png" caption="Figura 7" %} - Arrastra el archivo al recuadro provisto por Google Maps. - Te pedirá que indiques qué columna debe utilizar Google para colocar las marcas de posición. Elige "Lugar". 
-{% include figure.html filename="geo-es8.png" caption="Figura 8" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-08.png" caption="Figura 8" %} - Luego, te solicitará que que elijas qué columna utilizar para los marcadores. Elige "Producto". - Ahora deberías tener un mapa global de los mayores exportadores de grasa a Gran Bretaña a mediados de la década de 1890. -{% include figure.html filename="geo-es9.png" caption="Figura 9: Clic para ver imagen en tamaño completo" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-09.png" caption="Figura 9" %} - A continuación puedes explorar los datos en mayor detalle y modificar el Estilo para distinguir entre tipos diferentes de grasas. @@ -142,9 +142,9 @@ Earth o Quantum GIS. Producto". A la izquierda, la leyenda mostrará la cantidad de ocurrencias de cada estilo entre paréntesis, por ejemplo: "Semillas de lino (4)". -{% include figure.html filename="geo-es10.png" caption="Figura 10" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-10.png" caption="Figura 10" %} -{% include figure.html filename="geo-es11.png" caption="Figura 11" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-11.png" caption="Figura 11" %} - Sigue jugando con las opciones. - Esta funcionalidad es una herramienta poderosa para mostrar sets de datos @@ -154,7 +154,7 @@ Earth o Quantum GIS. items. [//]: # PENDIENTE -{% include figure.html filename="geo12.png" caption="Figura 12" %} +{% include figure.html filename="en-or-googlemaps-googleearth-12.png" caption="Figura 12" %} ### Crear capas de vectores @@ -172,7 +172,7 @@ crece tu investigación con mapas digitales, pero no es un problema cuando está comenzando. En Google Maps puedes agregar un marcador, un texto de descripción y enlaces a un sitio web o una foto. Encontrarás más información acerca de cómo crear vectores históricos en un SIG completo en -[Creating New Vector Layers in QGIS 2.0][]. 
+[Creating New Vector Layers in QGIS 2.0](/en/lessons/vector-layers-qgis) (en inglés). - Para agregar una capa puedes utilizar la que ya ha sido creada con el nombre "Capa sin título", haciendo clic en ella y renombrándola a "Capa 1". @@ -180,7 +180,7 @@ de cómo crear vectores históricos en un SIG completo en se creará una nueva "Capa sin título" que podrás renombrar como "Capa 2". Debería verse así: -{% include figure.html filename="geo-es13.png" caption="Figura 13" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-13.png" caption="Figura 13" %} - Fíjete que a la izquierda de la capa hay una casilla de verificación: al desmarcarla se desactiva una capa (es decir, deja de verse en el mapa) y @@ -207,13 +207,13 @@ de cómo crear vectores históricos en un SIG completo en la parte superior de la ventana y, a continuación, haz clic en el lugar del mapa donde quieres que aparezca el marcador. -{% include figure.html filename="geo-es14.png" caption="Figura 14" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-14.png" caption="Figura 14" %} - Aparecerá un recuadro para etiquetar el marcador y agregar una descripción en un campo de texto. Agregamos Charlottetown y anotamos en la descripción que fue fundada en 1765. -{% include figure.html filename="geo-es15.png" caption="Figura 15" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-15.png" caption="Figura 15" %} - Agrega algunos puntos más, incluyendo etiquetas y descripciones. @@ -227,7 +227,7 @@ de cómo crear vectores históricos en un SIG completo en texto "Estilos individuales" que abre un menú para controlar distintos aspectos de la apariencia de la capa. -{% include figure.html filename="geo-es16.png" caption="Figura 16" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-16.png" caption="Figura 16" %} - Ahora agregaremos algunas líneas y formas (llamadas polígonos en el programa de SIG). 
Agregar líneas y polígonos es un proceso bastante similar. @@ -239,7 +239,7 @@ de cómo crear vectores históricos en un SIG completo en - Haz clic en el ícono de "Trazar una línea" a la derecha del símbolo de marcador y luego en "Agregar línea o forma": -{% include figure.html filename="geo-es17.png" caption="Figura 17" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-17.png" caption="Figura 17" %} - Elige una calle y haz clic con el mouse a lo largo de ella, calcando un poco la ruta. Aprieta "Enter" cuando quieras terminar la línea. @@ -250,7 +250,7 @@ de cómo crear vectores históricos en un SIG completo en busca la calle que acabas de dibujar en la Capa 2 en el menú y haz clic a la derecha de su nombre. -{% include figure.html filename="geo-es18.png" caption="Figura 18" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-18.png" caption="Figura 18" %} - Para crear un polígono (una forma) puedes conectar los puntos de la línea hasta alcanzar una forma cerrada. Para hacer esto, comienza a dibujar y @@ -259,9 +259,9 @@ de cómo crear vectores históricos en un SIG completo en los límites de una ciudad (ver ejemplos abajo). Te recomendamos experimentar por tu cuenta creando líneas y polígonos. -{% include figure.html filename="geo-es19.png" caption="Figura 19" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-19.png" caption="Figura 19" %} -{% include figure.html filename="geo-es20.png" caption="Figura 20" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-20.png" caption="Figura 20" %} - Al igual que con los marcadores y líneas, puedes cambiar el nombre y la descripción de un polígono. También puedes cambiar el color y el ancho de la @@ -306,9 +306,9 @@ de cómo crear vectores históricos en un SIG completo en la lección aquí. Si no, a continuación, aprenderás acerca de Google Earth y, en la lección 2, sobre Quantum GIS. 
-{% include figure.html filename="geo-es21.png" caption="Figura 21" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-21.png" caption="Figura 21" %} -{% include figure.html filename="geo-es22.png" caption="Figura 22" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-22.png" caption="Figura 22" %} ## Google Earth @@ -328,13 +328,13 @@ en la nube, aunque los mapas que crees pueden ser exportados. fronteras políticas (polígonos), rutas (líneas) y lugares (puntos). Mira las flechas rojas en la siguiente imagen para ver la ubicación de estas capas. -{% include figure.html filename="geo-es23.png" caption="Figura 23: Clic para ver la imagen en tamaño completo" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-23.png" caption="Figura 23" %} - Fíjate que bajo el título "Capas" en el costado inferior izquierdo del margen de la ventana, Google ofrece una serie de capas listas para usar que se activan seleccionando la casilla correspondiente. -{% include figure.html filename="geo-es24.png" caption="Figura 24" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-24.png" caption="Figura 24" %} - Google Earth también incluye algunos mapas históricos escaneados y fotografías aéreas (en SIG este tipo de mapas, que están hechos de píxeles, @@ -348,22 +348,22 @@ en la nube, aunque los mapas que crees pueden ser exportados. Google Earth). Tómate un tiempo para explorar algunos mapas históricos. Verifica si hay algún mapa incluido en la colección Rumsey que pueda ser útil para tu investigación o tus clases. (Para obtener más mapas digitalizados - pero no georeferenciados, visita [www.davidrumsey.com][].) + pero no georeferenciados, visita [www.davidrumsey.com](https://www.davidrumsey.com/).) 
-{% include figure.html filename="geo-es25.png" caption="Figura 25" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-25.png" caption="Figura 25" %} - Posiblemente necesites hacer zoom para ver todos los íconos de mapas. ¿Puedes encontrar el globo terráqueo de 1812? -{% include figure.html filename="geo-es26.png" caption="Figura 26" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-26.png" caption="Figura 26" %} - Al hacer clic en un ícono se abre un panel de información. Haz clic en la miniatura del mapa para verlo adherido al globo terráqueo digital. - Aprenderás a georeferenciar mapas correctamente en [Georeferencing in QGIS 2.0][]. + Aprenderás a georeferenciar mapas correctamente en [Georreferenciar con QGIS 2.0](/es/lecciones/georreferenciar-qgis). -{% include figure.html filename="geo-es27.png" caption="Figura 27" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-27.png" caption="Figura 27" %} -{% include figure.html filename="geo-es28.png" caption="Figura 28: Clic para ver imagen en tamaño completo" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-28.png" caption="Figura 28" %} ## KML: archivos de Keyhole Markup Language @@ -380,10 +380,10 @@ en la nube, aunque los mapas que crees pueden ser exportados. - Por ejemplo, puedes importar los datos de Google Maps Engine Lite. Si creaste un mapa en el ejercicio anterior, lo encontrarás haciendo clic - en "Mi mapa de prueba" en la página de inicio de [Maps Engine Lite][]. Haz + en "Mi mapa de prueba" en la página de inicio de [Maps Engine Lite](https://mapsengine.google.com). Haz clic en el ícono con tres puntos a la derecha del título del mapa y luego selecciona "Exportar a KML". (También puedes descargar y explorar el - [mapa de la vía marítima][] de Dan Macfarlane para esta parte del ejercicio). 
+ [mapa de la vía marítima](/assets/googlemaps-googleearth/seaway.zip) de Dan Macfarlane para esta parte del ejercicio). **Importar tu archivo KML en Google Earth** @@ -396,7 +396,7 @@ Las actualizaciones realizadas en Google Earth desde la publicación de esta lec - Haz doble clic en el archivo KML en tu carpeta de Descargas. - Busca los datos en la carpeta de "Lugares Temporales" de Google Earth. -{% include figure.html filename="geo-es29.png" caption="Figura 29: Clic para ver imagen en tamaño completo" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-29.png" caption="Figura 29" %} - Ahora puedes explorar estos recursos cartográficos en 3D o agregar nuevas líneas, puntos y polígonos utilizando los distintos íconos ubicados @@ -412,7 +412,7 @@ Las actualizaciones realizadas en Google Earth desde la publicación de esta lec (cuando se selecciona el ícono de "Guarda un viaje" las opciones de grabación aparecen en la sección inferior izquierda de la ventana). -{% include figure.html filename="geo-es30.png" caption="Figura 30" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-30.png" caption="Figura 30" %} - Prueba agregar un nuevo recurso a los datos de la vía marítima de Dan. Hemos creado un polígono (en la terminología de SIG, un polígono es una forma @@ -421,9 +421,9 @@ Las actualizaciones realizadas en Google Earth desde la publicación de esta lec imagen. Busca el lago St. Clair (al este de Detroit) e intenta agregar un polígono. -{% include figure.html filename="geo-es31.png" caption="Figura 31: Clic para ver la imagen en tamaño completo" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-31.png" caption="Figura 31" %} -{% include figure.html filename="geo-es32.png" caption="Figura 32" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-32.png" caption="Figura 32" %} - Etiqueta el nuevo recurso como Lago St Claire. 
Luego, arrástralo encima de los datos de la vía marítima de Dan y agregalo a la colección. @@ -432,18 +432,18 @@ Las actualizaciones realizadas en Google Earth desde la publicación de esta lec estos datos a QGIS. Utiliza la opción de buscar haciendo clic derecho en la colección de la vía marítima y elige "Guardar lugar como". -{% include figure.html filename="geo-es33.png" caption="Figura 33" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-33.png" caption="Figura 33" %} -{% include figure.html filename="geo-es34.png" caption="Figura 34" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-34.png" caption="Figura 34" %} -{% include figure.html filename="geo-es35.png" caption="Figura 35" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-35.png" caption="Figura 35" %} ## Agregar mapas históricos escaneados Google Earth permite utilizar una copia digital de un mapa histórico. Éste puede ser un mapa que ha sido escaneado o una imagen que ya está en formato digital (para consejos sobre cómo encontrar mapas históricos en línea vea: -[Mobile Mapping and Historical GIS in the Field][]). El principal objetivo de +[Mobile Mapping and Historical GIS in the Field](https://niche-canada.org/2011/12/14/mobile-mapping-and-historical-gis-in-the-field/) – en inglés). El principal objetivo de utilizar un mapa digital, desde un punto de vista histórico, es ubicarlo encima de una imagen de Google Earth en el navegador, lo cual se conoce como superposición (*overlay*). Realizar superposiciones nos permite realizar comparaciones útiles @@ -459,19 +459,19 @@ de cambios a través del tiempo. imágenes históricas" en la barra superior y luego ajustando el control deslizable de la escala temporal que aparecerá. 
-{% include figure.html filename="geo-es36.png" caption="Figura 36" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-36.png" caption="Figura 36" %} -{% include figure.html filename="geo-es37.png" caption="Figura 37" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-37.png" caption="Figura 37" %} - Una vez que hayas identificado las imágenes que quieres utilizar, haz clic en el ícono de "Añadir superposición de imagen" en la barra superior. -{% include figure.html filename="geo-es38.png" caption="Figura 38" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-38.png" caption="Figura 38" %} - Aparecerá una nueva ventana. Comienza poniéndole un título diferente si lo deseas (por defecto es "Superposición de imágenes sin título"). -{% include figure.html filename="geo-es39.png" caption="Figura 39: Clic en la imagen para ver en tamaño completo" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-39.png" caption="Figura 39" %} - Haz clic en el botón "Examinar", a la derecha del campo "Vínculo", para seleccionar de tus archivos el mapa que desees que sea la imagen a superponer. @@ -484,14 +484,14 @@ de cambios a través del tiempo. - Hay marcadores en verde fosforescente en el medio y en los bordes del mapa subido, que pueden ser utilizados para estirar, achicar y mover el mapa para que se alinee correctamente con la imagen del satélite. Éste es un modo - sencillo de georeferenciar (mira [Georeferencing in QGIS 2.0][]). La imagen + sencillo de georeferenciar (mira [Georreferenciar con QGIS 2.0](/es/lecciones/georreferenciar-qgis)). La imagen de abajo muestra los pasos anteriores utilizando un viejo mapa de la ciudad de Aultsville superpuesto a imágenes satelitales de Google de 2008 en el cual se ven los restos de las calles y los cimientos de los edificios en el río St. 
Lawrence (Aultsville fue uno de los "pueblos perdidos" que fueron inundados por el proyecto de Vía Marítima y Energía de St. Lawrence). -{% include figure.html filename="geo-es40.png" caption="Figura 40: Clic en la imagen para ver en tamaño completo" %} +{% include figure.html filename="es-tr-intro-a-google-maps-y-google-earth-40.png" caption="Figura 40" %} - Volviendo a la ventana de "Nueva Superposición de Imágenes", fíjate que hay una serie de opciones para seleccionar ("Descripción", "Ver", "Altitud", @@ -518,52 +518,3 @@ de cambios a través del tiempo. trabajo!** *Esta lección es parte de [Geospatial Historian](https://geospatialhistorian.wordpress.com/)* - - [Google Maps Engine Lite]: https://mapsengine.google.com - [geo-es1]: /images/intro-a-google-maps-y-google-earth/geo-es1.png - [geo-es2]: /images/intro-a-google-maps-y-google-earth/geo-es2.png - [geo-es3]: /images/intro-a-google-maps-y-google-earth/geo-es3.png - [geo-es4]: /images/intro-a-google-maps-y-google-earth/geo-es4.png - [geo-es5]: /images/intro-a-google-maps-y-google-earth/geo-es5.png - [geo-es6]: /images/intro-a-google-maps-y-google-earth/geo-es6.png - [Archivo CSV del Suministro global de grasa de Reino Unido]: /assets/intro-a-google-maps-y-google-earth/Suministro_global_de_grasa_de_Reino_Unido_1894_1896.zip - [geo-es7]: /images/intro-a-google-maps-y-google-earth/geo-es7.png - [geo-es8]: /images/intro-a-google-maps-y-google-earth/geo-es8.png - [geo-es9]: /images/intro-a-google-maps-y-google-earth/geo-es9.png - [geo-es10]: /images/intro-a-google-maps-y-google-earth/geo-es10.png - [geo-es11]: /images/intro-a-google-maps-y-google-earth/geo-es11.png - [geo-es12]: /images/intro-a-google-maps-y-google-earth/geo-es12.png - [Creating New Vector Layers in QGIS 2.0]: /lessons/vector-layers-qgis - [geo-es13]: /images/intro-a-google-maps-y-google-earth/geo-es13.png - [geo-es14]: /images/intro-a-google-maps-y-google-earth/geo-es14.png - [geo-es15]: /images/intro-a-google-maps-y-google-earth/geo-es15.png - 
[geo-es16]: /images/intro-a-google-maps-y-google-earth/geo-es16.png - [geo-es17]: /images/intro-a-google-maps-y-google-earth/geo-es17.png - [geo-es18]: /images/intro-a-google-maps-y-google-earth/geo-es18.png - [geo-es19]: /images/intro-a-google-maps-y-google-earth/geo-es19.png - [geo-es20]: /images/intro-a-google-maps-y-google-earth/geo-es20.png - [geo-es21]: /images/intro-a-google-maps-y-google-earth/geo-es21.png - [geo-es22]: /images/intro-a-google-maps-y-google-earth/geo-es22.png - [geo-es23]: /images/intro-a-google-maps-y-google-earth/geo-es23.png - [geo-es24]: /images/intro-a-google-maps-y-google-earth/geo-es24.png - [www.davidrumsey.com]: http://www.davidrumsey.com/ - [geo-es25]: /images/intro-a-google-maps-y-google-earth/geo-es25.png - [geo-es26]: /images/intro-a-google-maps-y-google-earth/geo-es26.png - [Georeferencing in QGIS 2.0]: /lessons/georeferencing-qgis - [geo-es27]: /images/intro-a-google-maps-y-google-earth/geo-es27.png - [geo-es28]: /images/intro-a-google-maps-y-google-earth/geo-es28.png - [Maps Engine Lite]: https://mapsengine.google.com/map/ - [mapa de la vía marítima]: /assets/googlemaps-googleearth/seaway.zip - [geo-es29]: /images/intro-a-google-maps-y-google-earth/geo-es29.png - [geo-es30]: /images/intro-a-google-maps-y-google-earth/geo-es30.png - [geo-es31]: /images/intro-a-google-maps-y-google-earth/geo-es31.png - [geo-es32]: /images/intro-a-google-maps-y-google-earth/geo-es32.png - [geo-es33]: /images/intro-a-google-maps-y-google-earth/geo-es33.png - [geo-es34]: /images/intro-a-google-maps-y-google-earth/geo-es34.png - [geo-es35]: /images/intro-a-google-maps-y-google-earth/geo-es35.png - [Mobile Mapping and Historical GIS in the Field]: http://niche-canada.org/2011/12/14/mobile-mapping-and-historical-gis-in-the-field/ - [geo-es36]: /images/intro-a-google-maps-y-google-earth/geo-es36.png - [geo-es37]: /images/intro-a-google-maps-y-google-earth/geo-es37.png - [geo-es38]: /images/intro-a-google-maps-y-google-earth/geo-es38.png - [geo-es39]: 
/images/intro-a-google-maps-y-google-earth/geo-es39.png - [geo-es40]: /images/intro-a-google-maps-y-google-earth/geo-es40.png diff --git a/es/lecciones/introduccion-a-bash.md b/es/lecciones/introduccion-a-bash.md index b28fb882fc..534e8e01b8 100644 --- a/es/lecciones/introduccion-a-bash.md +++ b/es/lecciones/introduccion-a-bash.md @@ -42,7 +42,7 @@ Muchas de las lecciones en *The Programming Historian en español* requieren que {% include figure.html filename="en-or-intro-to-bash-01.png" caption="Figura 1. GUI de la computadora de Ian Milligan" %} -Las interfaces de línea de comandos ofrecen ventajas para los usuarios de computadoras que necesitan mayor precisión en su trabajo -como los historiadores digitales. Permiten un uso más detallado a la hora de ejecutar algunos programas, ya que puedes agregar parámetros para especificar *exactamente* cómo deseas ejecutar tu programa. Además, se pueden automatizar procesos fácilmente mediante [scripts](http://www.tldp.org/LDP/Bash-Beginners-Guide/html/chap_01.html), que son esencialmente recetas de órdenes escritas en un archivo de texto. +Las interfaces de línea de comandos ofrecen ventajas para los usuarios de computadoras que necesitan mayor precisión en su trabajo -como los historiadores digitales. Permiten un uso más detallado a la hora de ejecutar algunos programas, ya que puedes agregar parámetros para especificar *exactamente* cómo deseas ejecutar tu programa. Además, se pueden automatizar procesos fácilmente mediante [scripts](https://www.tldp.org/LDP/Bash-Beginners-Guide/html/chap_01.html), que son esencialmente recetas de órdenes escritas en un archivo de texto. Hay dos interfaces de línea de comandos principales, o *shells*, que utilizan muchos historiadores digitales. En OS X, así como en muchas de las distribuciones de Linux, el *shell* se conoce como `bash` (*Bourne-again shell*). 
Para los usuarios de sistemas Windows, la interfaz de línea de comandos está basada en MS-DOS por defecto, y aunque utiliza diferentes comandos y [sintaxis](https://es.wikipedia.org/wiki/Sintaxis), puede realizar tareas similares. Este tutorial proporciona una introducción básica a la terminal `bash`. Los usuarios de Windows pueden seguir instalando algún *shell* popular como [Cygwin](https://www.cygwin.com/) o Git Bash (ver más adelante). @@ -64,7 +64,7 @@ Cuando lo ejecutes verás esto en la ventana: {% include figure.html filename="en-or-intro-to-bash-03.png" caption="Figura 3. Pantalla de Terminal en blanco en nuestra estación de trabajo de OS X" %} -Quizá quieras cambiar la apariencia que por defecto tiene la terminal para no esforzarte de más al mirar continuamente texto negro sobre fondo blanco. En la aplicación por defecto de OS X puedes abrir el menú 'Perfiles' en 'Preferencias', bajo 'Terminal'. Haz clic en la pestaña 'Perfiles' y cámbialo por un nuevo esquema de color. Personalmente preferimos algo con menor contraste entre el fondo y el primer plano, pues lo estarás viendo durante mucho tiempo. 'Novel' es uno muy relajante ya que es la paleta de colores de la popular *suite* [Solarized](http://ethanschoonover.com/solarized). Los usuarios de Windows pueden obtener un efecto similar utilizando la pestaña 'Properties' de Git bash. Para llegar a ella, haz click con el botón derecho en cualquier lugar de la barra superior y seleciona 'Properties'. +Quizá quieras cambiar la apariencia que por defecto tiene la terminal para no esforzarte de más al mirar continuamente texto negro sobre fondo blanco. En la aplicación por defecto de OS X puedes abrir el menú 'Perfiles' en 'Preferencias', bajo 'Terminal'. Haz clic en la pestaña 'Perfiles' y cámbialo por un nuevo esquema de color. Personalmente preferimos algo con menor contraste entre el fondo y el primer plano, pues lo estarás viendo durante mucho tiempo. 
'Novel' es uno muy relajante ya que es la paleta de colores de la popular *suite* [Solarized](https://ethanschoonover.com/solarized). Los usuarios de Windows pueden obtener un efecto similar utilizando la pestaña 'Properties' de Git bash. Para llegar a ella, haz clic con el botón derecho en cualquier lugar de la barra superior y selecciona 'Properties'. {% include figure.html filename="en-or-intro-to-bash-04.png" caption="Figura 4. Pantalla de configutación en Terminal de OS X" %} @@ -136,7 +136,7 @@ Cuando quieres utilizar dos banderas puedes simplemente ejecutarlas juntas. Así obtendrás una salida en un formato legible para seres humanos; aprenderás que 6020 bits son también 5.9KB, que otro archivo tiene 1 megabite y así sucesivamente. -Estas opciones son *muy* importantes. Lo verás en otras lecciones de *The Programming Historian en español*. [Wget](/lessons/applied-archival-downloading-with-wget), [MALLET](/lessons/topic-modeling-and-mallet) y [Pandoc](/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown) utilizan la misma sintaxis. Afortunadamente no necesitas memorizar la sintaxis; en lugar de ello, mantén estas lecciones a mano para que puedas echar un vistazo rápido si es necesario ajustar algo. Estas lecciones se pueden hacer en cualquier orden. +Estas opciones son *muy* importantes. Lo verás en otras lecciones de *The Programming Historian en español*. [Wget](/en/lessons/applied-archival-downloading-with-wget), [MALLET](/en/lessons/topic-modeling-and-mallet) y [Pandoc](/en/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown) utilizan la misma sintaxis. Afortunadamente no necesitas memorizar la sintaxis; en lugar de ello, mantén estas lecciones a mano para que puedas echar un vistazo rápido si es necesario ajustar algo. Estas lecciones se pueden hacer en cualquier orden. Ya has estado mucho tiempo en tu directorio personal.
Vamos a otro lugar; puedes hacerlo a través del comando `cd` que significa 'Cambiar de directorio'. @@ -168,7 +168,7 @@ O en Windows algo como: `cd c:\mallet-2.0.7\` -e ir a nuestro directorio MALLET para [modelado de tópicos](/lessons/topic-modeling-and-mallet). +e ir a nuestro directorio MALLET para [modelado de tópicos](/en/lessons/topic-modeling-and-mallet). Finalmente, prueba: @@ -182,7 +182,7 @@ en Windows. Este comando abrirá tu GUI en el directorio actual. Asegúrate de d ## Interactuar con archivos -Además de navegar por directorios, puedes interactuar con archivos a través de la línea de comandos: puedes leerlos, abrirlos, ejecutarlos e incluso editarlos sin tener que salir de la interfaz. Hay cierto debate sobre por qué alguien querría hacer todo esto; la razón principal es la extrema comodidad de trabajar con la línea de comandos: nunca tienes que tocar el ratón o el *track pad* de la computadora y, aunque tiene una curva de aprendizaje pronunciada, eventualmente puede convertirse en el único entorno de escritura. Además, muchos programas requieren la utilización de la línea de comandos para operar con ellos. Puesto que vas a utilizar programas a través de la línea de comandos, a menudo puede ser más rápido crear pequeñas ediciones sin necesidad de cambiar a un programa separado. Para algunos de estos argumentos véase el texto de Jon Beltran de Heredia, ["Why, oh WHY, do those #?@! nutheads use vi?"](http://www.viemu.com/a-why-vi-vim.html). +Además de navegar por directorios, puedes interactuar con archivos a través de la línea de comandos: puedes leerlos, abrirlos, ejecutarlos e incluso editarlos sin tener que salir de la interfaz. 
Hay cierto debate sobre por qué alguien querría hacer todo esto; la razón principal es la extrema comodidad de trabajar con la línea de comandos: nunca tienes que tocar el ratón o el *track pad* de la computadora y, aunque tiene una curva de aprendizaje pronunciada, eventualmente puede convertirse en el único entorno de escritura. Además, muchos programas requieren la utilización de la línea de comandos para operar con ellos. Puesto que vas a utilizar programas a través de la línea de comandos, a menudo puede ser más rápido crear pequeñas ediciones sin necesidad de cambiar a un programa separado. Para algunos de estos argumentos véase el texto de Jon Beltran de Heredia, ["Why, oh WHY, do those #?@! nutheads use vi?"](https://www.viemu.com/a-why-vi-vim.html). A continuación, presentaremos unas formas básicas de interactuar con archivos. @@ -194,7 +194,7 @@ Esto crea un directorio llamado (¡adivinaste!) `ProgHist-Textos`. En general, e Pero ¡espera! Hay un truco para hacer las cosas un poco más rápido. Ve arriba un directorio (`cd ..`, lo cual te llevará de regreso al escritorio). Para navegar al directorio `ProgHist-Textos` puedes escribir `cd ProgHist-Textos`. Alternativamente puedes escribir `cd Prog` y luego pulsar la tecla de tabulador. Te darás cuenta de que la interfaz completa la línea como `cd ProgHist-Textos`. **Si pulsas el tabulador en cualquier momento dentro del *shell* le pedirás que intente completar automáticamente la línea en función de los archivos o subdirectorios que estén en el directorio actual. Sin embargo, la función es sensible a mayúsculas (así, en el ejemplo anterior, `cd prog` no podrá autocompletarse como `cd ProgHist-Textos`). En donde haya dos archivos con los mismos caracteres, autocompletar solamente llenará la línea hasta el primer punto de diferencia. Sugerimos utilizar este método a lo largo de la lección para ver cómo se comporta.** -Ahora necesitas encontrar un archivo de texto básico para que nos ayude con el ejemplo. 
¿Por qué no utilizar un libro que sabes que es largo, como la épica "Guerra y Paz" de Leon Tolstói? El archivo de texto está disponible en [Project Gutenberg](http://www.gutenberg.org/ebooks/2600). Si ya instalaste [wget](/lessons/applied-archival-downloading-with-wget), puedes escribir: +Ahora necesitas encontrar un archivo de texto básico para que nos ayude con el ejemplo. ¿Por qué no utilizar un libro que sabes que es largo, como la épica "Guerra y Paz" de Leon Tolstói? El archivo de texto está disponible en [Project Gutenberg](https://www.gutenberg.org/ebooks/2600). Si ya instalaste [wget](/en/lessons/applied-archival-downloading-with-wget), puedes escribir: `wget http://www.gutenberg.org/files/2600/2600-0.txt` @@ -272,7 +272,7 @@ Verás aparecer Vim frente a ti, un editor de texto en línea de comandos. {% include figure.html filename="en-or-intro-to-bash-06.png" caption="Figura 6. Vim" %} -Si quieres aprender más de Vim, aquí tienes una [buena guía](http://vimdoc.sourceforge.net/htmldoc/quickref.html) disponible. +Si quieres aprender más de Vim, aquí tienes una [buena guía](https://vimdoc.sourceforge.net/htmldoc/quickref.html) disponible. El uso de Vim para leer archivos es relativamente simple. Puedes usar las teclas de flechas para navegar alrededor y teóricamente leer *Guerra y Paz* a través de línea de comandos (lo cual sería todo un logro, por cierto). A continuación hay algunos comandos de navegación básica: @@ -308,7 +308,7 @@ Para abandonar Vim o guardar cambios, tienes que introducir una serie de comando Si deseas salir del programa, escribe de nuevo `:` y luego `q`. Esto te regresará a la línea de comandos. Al igual que con el resto de *bash*, también podrías haber combinado los dos comandos. Presionando `:` y luego poniendo `wq` habríamos guardado el archivo y luego habríamos salido del programa. O, si querías salir **sin** guardar, `q!`, habrías salido de Vim y cancelado la preferencia de sobreescribir, por defecto, para guardar tus cambios. 
-Vim es diferente a los procesadores de texto a los que estás acostumbrado y requerirá más trabajo y práctica para llegar a tener fluidez en su uso. Pero si estás ajustando cosas menores en archivos, es una buena manera de empezar. A medida que te sientas más cómodo podrías incluso escribir documentos con él, aprovechando el potencial de [formar y poner notas a pie de Pandoc y Markdown](/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown). +Vim es diferente a los procesadores de texto a los que estás acostumbrado y requerirá más trabajo y práctica para llegar a tener fluidez en su uso. Pero si estás ajustando cosas menores en archivos, es una buena manera de empezar. A medida que te sientas más cómodo podrías incluso escribir documentos con él, aprovechando el potencial de [formar y poner notas a pie de Pandoc y Markdown](/en/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown). ## Mover, copiar y borrar archivos diff --git a/es/lecciones/introduccion-a-ffmpeg.md b/es/lecciones/introduccion-a-ffmpeg.md index 9d4650c9c0..6d0959856e 100644 --- a/es/lecciones/introduccion-a-ffmpeg.md +++ b/es/lecciones/introduccion-a-ffmpeg.md @@ -1,445 +1,445 @@ ---- -title: Introducción a la transcodificación, edición y visualización de datos audiovisuales con FFmpeg -authors: -- Dave Rodriguez -editors: -- Brandon Walsh -reviewers: -- Tesla Cariani -- Josh Romphf -original: introduction-to-ffmpeg -date: 2018-12-20 -translator: -- Dave Rodriguez -- Sebastian Fiori -translation_date: 2020-12-11 -translation-editor: -- Antonio Rojas Castro -translation-reviewer: -- Jennifer Isasi -- José Antonio Motilla -original: introduction-to-ffmpeg -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/302 -difficulty: 2 -activity: analyzing -topics: [data-manipulation, data-visualization] -abstract: Esta lección introduce las funciones básicas de FFmpeg, una herramienta libre de línea de comandos utilizada para manipular y analizar 
materiales audiovisuales. -avatar_alt: Una cámara antigua -doi: 10.46430/phes0049 -layout: lesson ---- - -{% include toc.html %} - -# Introducción -Históricamente, las Humanidades Digitales se han enfocado casi exclusivamente en el análisis de fuentes textuales a través de métodos computacionales (Hockey, 2004). Sin embargo, hay un interés creciente en el campo de la utilización de métodos computacionales para el análisis de materiales audiovisuales de patrimonio cultural, como refleja la creación de la [Alianza de Organizaciones de Humanidades Digitales Grupo de Interés Especial: Materiales audiovisuales en Humanidades Digitales](https://avindhsig.wordpress.com/) y [el aumento de las presentaciones relacionadas con temas audiovisuales en la conferencia global de AOHD](https://figshare.com/articles/AV_in_DH_State_of_the_Field/5680114) en los años anteriores. Investigaciones recientes, tal como [Distant Viewing TV](https://distantviewing.org), indican un cambio en el campo hacia proyectos relacionados con el uso de técnicas computacionales para ampliar el alcance de los materiales que los y las humanistas digitales pueden explorar. Como afirma Erik Champion, "la audiencia de Humanidades Digitales no siempre está enfocada en la literatura o está interesada en las formas tradicionales de alfabetización" y la aplicación de metodologías digitales para estudiar cultura audiovisual es una faceta emergente y emocionante de las humanidades digitales (Champion, 2017, traducido por el autor). Hay muchas herramientas valiosas, gratuitas y de código abierto disponibles para aquellos interesados en trabajar con materiales audiovisuales (por ejemplo, el tutorial de _Programming Historian_ [Editar Audio con Audacity](/es/lecciones/editar-audio-con-audacity)). Este tutorial presentará otra: FFmpeg. 
- -[FFmpeg](https://www.ffmpeg.org/about.html) es el _framework_ multimedia de código abierto líder para transcodificar, editar, filtrar y reproducir casi cualquier tipo de formato audiovisual digital (sitio web de FFmpeg - "About"). Muchos programas comunes y sitios web usan FFmpeg para leer y escribir archivos audiovisuales, por ejemplo, VLC, Google Chrome, YouTube y [muchos más](https://trac.ffmpeg.org/wiki/Projects). Además de ser una herramienta de programa y de desarrollo web, FFmpeg se puede usar en la interfaz de la línea de comandos para realizar muchas tareas comunes, complejas e importantes, relacionadas con la gestión, modificación y análisis de archivos audiovisuales. Estos tipos de procesos, tales como editar, transcodificar o extraer los metadatos de archivos, generalmente requieren acceso a otro programa (tal como editores de vídeo no lineal, como Adobe Premiere o Final Cut Pro); sin embargo, FFmpeg permite a un usuario operar directamente en archivos audiovisuales sin el uso de interfaces o programa de terceros. Como tal, el conocimiento del _framework_ permite a los usuarios manipular materiales audiovisuales para satisfacer sus necesidades con una solución de código abierto y gratuita, que ofrece gran parte de la funcionalidad de un costoso programa de audio y vídeo. Este tutorial ofrece una introducción a la lectura y escritura de comandos de FFmpeg y una guía paso a paso a partir de un caso práctico para aprender a utilizar el _framework_ en un trabajo específico para los humanistas digitales. Específicamente, se mostrará cómo FFmpeg puede ser utilizado para extraer y analizar datos de color en un video archivístico. 
- -## Objetivos de aprendizaje -* Instalar FFmpeg en tu computadora o usar una versión "demo" en el navegador web -* Comprender la estructura básica y la sintaxis de los comandos de FFmpeg -* Aprender varios comandos útiles, tales como: - * "Re-wrap" (cambiar el contenedor) y transcodificar (recodificar archivos) - * "Demux" de archivos (separar audio y vídeo) - * Recortar/Editar archivos - * Usar FFplay para reproducir archivos - * Crear vectorscopios para visualizar los datos de color - * Usar FFprobe para generar informes de los datos de color -* Introducir recursos para mayor exploración y experimentación - -## Requisitos previos -Antes de comenzar con este tutorial, es necesario que localices la [Terminal](https://es.wikipedia.org/wiki/Terminal_(macOS)) de tu computadora u otra interfaz de línea de comandos, ya que ahí es donde ingresarás y ejecutarás los comandos de FFmpeg. Si necesitas instrucción para acceder y usar la interfaz de línea de comandos, te recomendamos la lección de _Programming Historian_ [Introducción a la línea de comandos en Bash](/es/lecciones/introduccion-a-bash) para usarios de Mac y Linux o, para usarios de Windows, [Introducción a la línea de comandos de Windows con PowerShell](/es/lecciones/introduccion-a-powershell). Adicionalmente, será de utilidad tener conocimientos básicos de [códecs](https://es.wikipedia.org/wiki/C%C3%B3dec) y [contenedores](https://es.wikipedia.org/wiki/Formato_contenedor) audiovisuales para entender con mayor detalle el funcionamiento de FFmpeg. Proporcionaremos información adicional y revisaremos con mayor detalle sobre códecs y contenedores en la sección sobre ejemplos de comandos preliminares de este tutorial. - -# Cómo instalar FFmpeg -La instalación de FFmpeg es posiblemente la parte más difícil de usar esta herramienta. Afortunadamente, existen algunas guías y recursos disponibles para instalar el _framework_ para cada sistema operativo. - -
    -Nuevas versiones de FFmpeg son lanzadas aproximadamente cada seis meses. Para mantenerse al tanto de ellas, es recomendable seguir a FFmpeg en Twitter o en su sitio web. Las nuevas versiones de FFmpeg generalmente contienen características tales como filtros nuevos y actualizados, compatibilidades de códecs y corrección de errores. La sintaxis de FFmpeg no cambia con estas actualizaciones y las capacidades antiguas rara vez se eliminan. Puedes aprender más sobre estas actualizaciones consultando los anuncios de actualizaciones anteriores en la sección de News en el sitio web de FFmpeg. -
    - -## Para usuarios de Mac OS -La opción más simple es usar un administrador de paquetes como [Homebrew](https://brew.sh/) para instalar FFmpeg y asegurar que permanezca en la versión más reciente. Para completar este tipo de instalación, sigue estos pasos: -* Instala Homebrew de acuerdo a las instrucciones en el enlace de arriba -* Para comenzar con una instalación básica, ejecuta `brew install ffmpeg` en tu Terminal - **Nota**: generalmente se recomienda instalar FFmpeg con opciones adicionales a las incluidas en la instalación básica; esto proporcionará acceso a más herramientas y funciones. [La Guía de Instalación de Apple de Reto Kromer](https://avpres.net/FFmpeg/install_Apple.html) proporciona un buen conjunto de opciones adicionales: - - ```bash - brew install ffmpeg --with-freetype --with-openjpeg --with-x265 --with-rubberband --with-tesseract - ``` - - * Para una explicación de estas opciones adicionales, revisa [La Guía FFmpeg de Ashley Blewer](https://training.ashleyblewer.com/presentations/ffmpeg.html#10). - * Además, puedes ejecutar `brew options ffmpeg` para ver qué características están o han estado disponibles en la versión actual de FFmpeg - * Para actualizar tu instalación a la versión más reciente, ejecuta: - - ```bash - brew update && brew upgrade ffmpeg - ``` - -* Para más opciones de instalación para Mac OS, revisa [La Guía de Compilación de FFmpeg para Mac OS](https://trac.ffmpeg.org/wiki/CompilationGuide/macOS) (la guía solo está disponible en inglés). - -## Para usuarios de Windows -Los usuarios de Windows pueden usar el administrador de paquetes [Chocolatey](https://chocolatey.org/) para instalar y mantener FFmpeg. [La Guía de Instalación de Windows de Reto Kromer](https://avpres.net/FFmpeg/install_Windows.html) proporciona toda la información necesaria para usar Chocolatey o construir el _framework_ a partir del código fuente (la guía solo está disponible en inglés). 
- -## Para usuarios de Linux -[Linuxbrew](http://linuxbrew.sh/) es un programa similar a Homebrew que se puede utilizar para instalar y mantener FFmpeg en Linux. Reto Kromer también proporciona una guía útil, [la Guía de Instalación de Linux](https://avpres.net/FFmpeg/install_Linux.html), que es similar a la instalación en Mac OS. Tu distribución de Linux puede tener su [propio administrador de paquetes](https://www.linode.com/docs/tools-reference/linux-package-management/) que incluye paquetes FFmpeg (la guía solo está disponible en inglés). Dependiendo de tu distribución de Linux (Ubuntu, Fedora, Arch Linux, etc.) estas versiones pueden variar, así que usar Linuxbrew podría ser útil para asegurar que la versión es la misma independientemente del tipo de Linux que utilices. - -## Otros recursos de instalación - -* [Descarga de paquetes](https://www.ffmpeg.org/download.html) - * FFmpeg permite el acceso a archivos binarios, código fuente y versiones estáticas para Mac, Windows y Linux directamente en su sitio web. Los usuarios pueden construir el _framework_ sin un administrador de paquetes con estos recursos. Es probable que solo los usuarios avanzados quieran usar esta opción. -* [La Guía de Compilación de FFmpeg](https://trac.ffmpeg.org/wiki/CompilationGuide) - * La página Wiki de FFmpeg también proporciona un compendio de guías y estrategias para instalar FFmpeg en tu computadora (la guía solo está disponible en inglés). - -## Probando la instalación -* Para asegurarte de que FFmpeg se haya instalado correctamente, ejecuta: - - ```bash - ffmpeg -version - ``` - -* Si ves una lista larga con información, ¡la instalación fue exitosa! 
Debe ser similar a lo siguiente: - -```bash -ffmpeg version 4.0.1 Copyright (c) 2000-2018 the FFmpeg developers -built with Apple LLVM version 9.1.0 (clang-902.0.39.1) -configuration: --prefix=/usr/local/Cellar/ffmpeg/4.0.1 --enable-shared --enable-pthreads --enable-version3 --enable-hardcoded-tables --enable-avresample --cc=clang --host-cflags= --host-ldflags= --enable-gpl --enable-ffplay --enable-libfreetype --enable-libmp3lame --enable-librubberband --enable-libtesseract --enable-libx264 --enable-libx265 --enable-libxvid --enable-opencl --enable-videotoolbox --disable-lzma --enable-libopenjpeg --disable-decoder=jpeg2000 --extra-cflags=-I/usr/local/Cellar/openjpeg/2.3.0/include/openjpeg-2.3 -libavcodec 58. 18.100 / 58. 18.100 -libavformat 58. 12.100 / 58. 12.100 -libavdevice 58. 3.100 / 58. 3.100 -libavfilter 7. 16.100 / 7. 16.100 -libavresample 4. 0. 0 / 4. 0. 0 -libswscale 5. 1.100 / 5. 1.100 -libswresample 3. 1.100 / 3. 1.100 -libpostproc 55. 1.100 / 55. 1.100 -``` - -* Si el sistema arroja `-bash: ffmpeg: command not found`, algo ha ido mal. - * Nota: Si estás usando un administrador de paquetes, es improbable que encuentres este mensaje de error. Sin embargo, si hay un problema después de instalar con un administrador de paquetes, es probable que haya un problema con el administrador de paquetes y no con FFmpeg. Consulta la solución de problemas en [Homebrew](https://docs.brew.sh/Troubleshooting), [Chocolatey](https://chocolatey.org/docs/troubleshooting), o [Linuxbrew](http://linuxbrew.sh/) para asegurar que el administrador de paquetes está funcionando correctamente en tu computadora (las guías solo está disponible en inglés). Si estás intentando instalar sin un administrador de paquetes y ves este mensaje de error, haz una referencia cruzada de tu método con la La Guía de Compilación de FFmpeg anterior. 
- -## Usando FFmpeg en el navegador -Si no quieres instalar FFmpeg en tu computadora pero te gustaría familiarizarte con el _framework_ y usarlo en la interfaz de línea de comandos, [videoconverter.js](https://bgrins.github.io/videoconverter.js/demo/) de Brian Grinstead proporciona un método para ejecutar los comandos FFmpeg en tu navegador (la interfaz está en inglés). -
    - Esta interfaz del navegador no tiene las funcionalidades como para completar todo este tutorial, pero es útil para aprender los comandos esenciales de FFmpeg. Adicionalmente, este recurso opera en una versión anterior de FFmpeg y posiblemente no tenga todas las características de la versión más reciente. -
    - -## Estructura básica y sintaxis de los comandos FFmpeg -El comando básico tiene cuatro partes: - -```bash -[Símbolo del Sistema] [Archivo de Entrada] [Banderas/Acciones] [Archivo de Salida] -``` - -* Cada comando comenzará con un símbolo del sistema. Dependiendo del uso, este será `ffmpeg` (cambiar archivos), `ffprobe` (generar metadatos de archivos) o `ffplay` (reproducir archivos). -* Los archivos de entrada son los archivos que están siendo leídos, editados o examinados. -* Las banderas y acciones son las cosas que le estás diciendo a FFmpeg que haga con los archivos de entrada. La mayoría de los comandos contendrán múltiples banderas y acciones de complejidad variable. -* Los archivos de salida son los archivos creados por el comando o los informes creados por los comandos de `ffprobe`. - -Escrito genéricamente, el comando básico es parecido a lo siguiente: - -```bash - ffmpeg -i /ruta_de_archivo/archivo_de_entrada.ext -bandera alguna_acción /ruta_de_archivo/archivo_de_salida.ext - ``` -
    -Como con cualquier interfaz de línea de comandos, tendrás que escribir las rutas de los archivos de entrada y de salida dependiendo de las ubicaciones de tus directorios de trabajo. En los ejemplos proporcionados en este tutorial, las rutas de archivos no estarán escritas completamente y se supone que el usuario ha navegado al directorio de trabajo para ejecutar los comandos.
    - - -A continuación, examinaremos algunos ejemplos de varios comandos diferentes que usan esta estructura y sintaxis. Adicionalmente, estos comandos demostrarán algunas de las características más útiles de FFmpeg y nos permitirán familiarizarnos con la forma en que se construyen los archivos audiovisuales digitales. - -# Para empezar -Para este tutorial, utilizaremos una película archivística que se llama [*Destination Earth*](https://archive.org/details/4050_Destination_Earth_01_47_33_28) como nuestro objeto de estudio. Esta película está publicada por los [Archivos Prelinger](https://es.wikipedia.org/wiki/Archivos_Prelinger) y en el [Internet Archive](https://archive.org/). Esta película, estrenada en 1956 y producida por [El American Petroleum Institute](https://es.wikipedia.org/wiki/American_Petroleum_Institute) y [John Sutherland Productions](https://en.wikipedia.org/wiki/John_Sutherland_(producer)), es un excelente ejemplo de la propaganda de la época de la Guerra Fría que exalta las virtudes del capitalismo y el estilo de vida estadounidense. Utilizando el proceso de [Technicolor](https://es.wikipedia.org/wiki/Technicolor), este corto animado de ciencia ficción cuenta la historia de una sociedad marciana que vive bajo un gobierno opresivo y sus esfuerzos para mejorar sus métodos industriales. Envían un emisario a la Tierra que descubre que la clave para esto es la refinación de petróleo y la libre empresa. Utilizaremos el vídeo para introducir algunas de las funcionalidades básicas de FFmpeg y analizar sus propiedades de color con relación a su retórica propagandística. 
- -{% include figure.html filename="destEarth_titlecard.png" caption="Destination Earth (1956)" %} - -En este tutorial se llevarán a cabo los siguientes pasos: -* Navegar a la página de [*Destination Earth*](https://archive.org/details/4050_Destination_Earth_01_47_33_28) en el Internet Archive -* Descargar dos archivos vídeos: las versiones "MPEG4" (extensión de archivo `.m4v`) y "OGG" (extensión de archivo `.ogv`) de la película -* Guardar estos archivos en la misma carpeta en algún lugar de tu computadora. Guárdalos con los nombres de archivos `destEarth`, seguido por su extensión. - -Tómate unos minutos para ver el vídeo y tener una idea de su estructura, mensaje y motivos visuales antes de continuar con los siguientes comandos. - -# Ejemplos de comandos preliminares - -## Ver metadatos básicos con FFprobe -Antes de comenzar a manipular nuestros archivos `destEarth`, usemos FFmpeg para examinar información básica sobre el archivo utilizando un simple comando de `ffprobe`. Esto ayudará a comprender cómo se construyen los archivos audiovisuales digitales y proporcionará una base para el resto del tutorial. Navega hasta el directorio del archivo y ejecuta: - -```bash -ffprobe destEarth.ogv -``` - -Verás los metadatos técnicos básicos del archivo impresos en `stdout`: - -{% include figure.html filename="ffprobe_ogg_es.png" caption="El output de un comando básico `ffprobe` con destEarth.ogv" %} - -La línea `Input # 0` del informe identifica el **contenedor** como [ogg](https://es.wikipedia.org/wiki/Ogg). Los contenedores (también llamados "envoltorios" o "wrappers", en inglés) proporcionan al archivo la estructura de sus diversas pistas. Los diferentes contenedores (otros más comunes incluyen `.mkv`, `.avi` y `.flv`) tienen diferentes características y compatibilidad con diversos programas. Examinaremos cómo y por qué es posible que desees cambiar el contenedor de un archivo en el siguiente comando. 
- -Las líneas `Stream #0:0` y `Stream #0:1` proporcionan información sobre las pistas del archivo (es decir, el contenido que ves en la pantalla y escuchas a través de sus altavoces) y también identifican el **códec** de cada pista. Los códecs especifican cómo se codifica/comprime (se escribe y almacena) y se decodifica (se reproduce) la información. La pista vídeo (`Stream #0:0`) de nuestro archivo `.ogv` usa el códec [theora](https://es.wikipedia.org/wiki/Theora) y la pista audio (`Stream #0:1`) usa el códec [vorbis](https://es.wikipedia.org/wiki/Vorbis). Estas líneas también proporcionan información importante relacionada con el espacio de color de la pista de vídeo (`yuv420p`), resolución (`400x300`) y marcos por segundo (`29.97 fps`). Adicionalmente, proporcionan información de audio como la tasa de muestreo (`44100 Hz`) y la tasa de bits (`128 kb/s`). - -Los códecs, en mayor medida que los contenedores, determinan la calidad y la compatibilidad de un archivo audiovisual con diferentes programas y plataformas (otros códecs comunes incluyen `DNxHD` y` ProRes` para vídeo y `mp3` y` FLAC` para audio). Examinaremos cómo y por qué es posible que también desees cambiar el códec de un archivo en el siguiente comando. - -Ejecuta otro comando de `ffprobe`, esta vez con el archivo `.m4v`: - -```bash -ffprobe destEarth.m4v -``` - -Una vez más, verás los metadatos técnicos básicos impresos en el `stdout`: - -{% include figure.html filename="ffprobe_mp4_es.png" caption="El output de un comando básico `ffprobe` con destEarth.m4v" %} - -También notarás que el informe para el archivo `.m4v` contiene múltiples contenedores en la línea `Input # 0` como `mov` y `m4a`. No es necesario profundizar en los detalles para los fines de este tutorial, pero ten en cuenta que los contenedores `mp4` y` mov` se presentan en múltiples "sabores" y diferentes extensiones de archivo. 
Sin embargo, todos son muy similares en su construcción técnica y, como tal, pueden verse agrupados en metadatos técnicos. De manera similar, el archivo `ogg` tiene la extensión` .ogv`, un "sabor" o variante del formato `ogg`. - -Al igual que en nuestro comando anterior, las líneas `Stream # 0: 0` y` Stream # 0: 1` identifican el códec de cada pista. Podemos ver que nuestro archivo `.m4v` usa el códec vídeo [H.264](https://es.wikipedia.org/wiki/H.264/MPEG-4_AVC) y el códec audio [aac](https://es.wikipedia.org/wiki/Advanced_Audio_Coding). Ten en cuenta que se nos proporcionan metadatos similares a nuestro archivo `.ogv`, pero algunas características importantes relacionadas con el análisis visual (como la resolución) son significativamente diferentes. Nuestro `.m4v` tiene una resolución más alta (`640x480`) y, por lo tanto, utilizaremos esta versión de *Destination Earth* como nuestro vídeo de origen. - -Ahora que sabemos más sobre la composición técnica de nuestro archivo, podemos comenzar a explorar las características y funcionalidades transformadoras de FFmpeg (volveremos a utilizar `ffprobe` más adelante en el tutorial para realizar una extracción de metadatos de color más avanzada). - -## Cambiar el contenedor (volver a envolver, "re-wrap") -Dependiendo de tu sistema operativo, puedes tener uno o más reproductores de medios instalados. Para efectos de demostración veamos qué sucede si intentas abrir `destEarth.ogv` usando el reproductor de medios QuickTime que viene con Mac OSX: - -{% include figure.html filename="QT_fail.png" caption="Los reproductores multimedia patentados como Quicktime a menudo están limitados en los tipos de archivos con los que pueden trabajar" %} - -Una opción cuando te enfrentas a un mensaje de este tipo es simplemente usar otro reproductor de medios. 
[VLC](https://www.videolan.org/vlc/index.es.html), que está construido con FFmpeg, es una excelente alternativa de código abierto, pero simplemente "usar otro programa" puede no ser siempre una solución viable (y es posible que no siempre tengas otra versión de archivo con la que trabajar). Muchos editores de vídeo populares, como Adobe Premiere, Final Cut Pro y DaVinci Resolve, tienen sus propias limitaciones en cuanto a los tipos de formatos con los que son compatibles. Además, las diferentes plataformas web y sitios de alojamiento/transmisión, como Vimeo, [también tienen sus propios requisitos.](https://help.vimeo.com/hc/es/articles/12426043233169-Video-and-audio-compression-guidelines) Por lo tanto, es importante poder volver a envolver y transcodificar tus archivos para cumplir con las diversas especificaciones para la reproducción, edición, publicación digital y ajuste de archivos a los estándares requeridos por las plataformas de archivo o preservación digital. - -
    -Para obtener una lista completa de los códecs y contenedores compatibles con tu instalación de FFmpeg, ejecuta ffmpeg -codecs y ffmpeg -formats, respectivamente, para ver la lista impresa de tu stdout. -
    - - -Como un ejercicio para aprender la sintaxis básica de FFmpeg y aprender a transcodificar entre formatos, comenzaremos con nuestro archivo `destEarth.ogv` y escribiremos un nuevo archivo con vídeo codificado en` H.264`, audio en `AAC` y envuelto en un contenedor `.mp4`, una combinación muy común y altamente portátil de códecs y contenedores que es prácticamente idéntico al archivo` .m4v` que originalmente descargamos. Aquí está el comando que ejecutarás, junto con una explicación de cada parte de la sintaxis: - -```bash -ffmpeg -i destEarth.ogv -c:v libx264 -c:a aac destEarth_transcoded.mp4 -``` - -* `ffmpeg` = comienza el comando -* `-i destEarth.ogv` = especifica el archivo de entrada -* `-c:v libx264` = transcodifica la pista de vídeo al codec H.264 -* `-c:a aac` = transcodifica la pista de audio al codec AAC -* `destEarth_transcoded.mp4` = especifica el archivo de salida. Ten en cuenta que aquí es donde se especifica el nuevo tipo de contenedor. - -Si ejecutas como está escrito y en el mismo directorio que `destEarth.ogv`, verás un nuevo archivo llamado` destEarth_transcoded.mp4`, que aparecerá en el directorio. Si estás operando en Mac OSX, también podrás reproducir este nuevo archivo con QuickTime. Una exploración completa de los convenios de códecs, contenedores, compatibilidad y extensión de archivos está más allá del alcance de este tutorial; sin embargo, este conjunto de ejemplos preliminares debería darles a aquellos que no estén familiarizados con la forma en que se construyen los archivos audiovisuales digitales un conjunto de conocimientos de referencia que les permitirá completar el resto del tutorial. - -## Creación de extractos y "demuxing" de audio y vídeo -Ahora que tenemos un mejor entendimiento de las pistas, códecs, y contenedores, veamos formas en que FFmpeg puede trabajar con materiales de vídeo a un nivel más granular. 
Para este tutorial, examinaremos dos secciones separadas de *Destination Earth* para comparar cómo se usa el color en relación con la retórica propagandística de la película. Crearemos y prepararemos estos extractos para el análisis utilizando un comando que realiza dos funciones diferentes simultáneamente: - -* Primero, el comando creará dos extractos de `destEarth.m4v`. -* Segundo, el comando eliminará ("demux") los componentes de audio (`Stream # 0: 1`) de estos extractos. -
    - Estamos eliminando el audio para ahorrar espacio de almacenamiento (la información de audio no es necesaria para el análisis de color). Esto probablemente será útil si esperas utilizar este tipo de análisis a escalas más grandes. Cerca del final del tutorial, se discutirá más información sobre la ampliación del análisis de color. -
    - -El primer extracto que haremos contiene una secuencia correspondiente al comienzo de la película que describe las difíciles condiciones y la vida oprimida de la sociedad marciana. El siguiente comando especifica los puntos de inicio y finalización del extracto, le dice a FFmpeg que retenga toda la información en la pista de vídeo sin transcodificar nada y le indica que escriba nuestro nuevo archivo sin la pista de audio: - -```bash -ffmpeg -i destEarth.m4v -ss 00:01:00 -to 00:04:35 -c:v copy -an destEarth_Mars_video.mp4 -``` -* `ffmpeg` = comienza el comando -* `-i destEarth.m4v` = especifica el archivo de entrada -* `-ss 00:01:00` = establece el punto de inicio a 1 minuto del inicio del archivo -* `-to 00:04:35` = establece el punto final a 4 minutos y 35 segundos desde el inicio del archivo -* `-c:v copy` = copia la pista de vídeo directamente, sin transcodificar -* `-an` = le dice a FFmpeg que ignore la pista de audio al escribir el archivo de salida. -* `destEarth_Mars_video.mp4` = especifica el archivo de salida - -{% include figure.html filename="Mars_screenshot.png" caption="Vida en Marte" %} - -Ahora, ejecutaremos un comando similar para crear un extracto de "Tierra". Esta parte de la película tiene una secuencia similar que describe las maravillas de la vida en la Tierra y la riqueza de su sociedad gracias al capitalismo de libre empresa y al uso de petróleo y productos derivados de este: - -```bash -ffmpeg -i destEarth.m4v -ss 00:07:30 -to 00:11:05 -c:v copy -an destEarth_Earth_video.mp4 -``` - -{% include figure.html filename="Earth_screenshot.png" caption="La abundancia de la Tierra" %} - - -Ahora deberías tener dos archivos nuevos en tu directorio llamados `destEarth_Mars_video.mp4` y `destEarth_Earth_video.mp4`. Puedes probar uno o ambos archivos (o cualquiera de los otros archivos en el directorio) usando la función `ffplay` de FFmpeg. 
Simplemente ejecuta: - -```bash -ffplay destEarth_Mars_video.mp4 -``` - -y/o - -```bash -ffplay destEarth_Earth_video.mp4 -``` - -Verás una ventana abierta y el vídeo comenzará en el punto de inicio especificado. Se reproducirá una vez y luego la ventana se cerrará (además, notarás que no hay sonido en tu vídeo). También notarás que los comandos `ffplay` no requieren que se especifique una entrada (`-i`) o una salida porque la reproducción en sí misma es la salida. -
    -FFplay es un reproductor multimedia muy versátil que viene con una serie de opciones para personalizar la reproducción. Por ejemplo, si agregas `-loop 0` al comando se reproducirá en bucle indefinidamente.
    - - -Ahora hemos creado nuestros dos extractos para el análisis. Si vemos estos clips por separado, parece haber diferencias significativas en la forma en que se utilizan el color y la variedad de colores. En la siguiente parte del tutorial examinaremos y extraeremos datos de los archivos de vídeo para cuantificar y apoyar esta hipótesis. - -## Análisis de datos de color -El uso de herramientas digitales para analizar la información de color en películas es otra faceta emergente de las Humanidades Digitales que se superpone con los estudios cinematográficos tradicionales. En particular, el proyecto [FilmColors](https://filmcolors.org/) de la Universidad de Zurich cuestiona la intersección crítica de las "características estéticas formales de los aspectos semánticos, históricos y tecnológicos" de su producción, recepción y difusión a través del uso de herramientas de análisis y anotación digital (Flueckiger, 2017, traducido por el autor). Aunque no hay un método estandarizado para este tipo de investigación, en el momento de escribir esta lección el comando `ffprobe` que se describe a continuación es una herramienta útil para extraer información de color que se puede usar en el análisis computacional. Primero, veamos otra manera estandarizada de representar la información de color que informa este enfoque cuantitativo, basado en datos, para el análisis de color: los vectorscopios. - -### Vectorscopios -Durante años, profesionales del vídeo han confiado en los [vectorscopios](https://es.wikipedia.org/wiki/Vectorscopio) para ver la información del color de una manera estandarizada y fácilmente legible. Un vectorscopio grafica información de color en una gratícula circular. La posición del gráfico corresponde a los [tonos](https://es.wikipedia.org/wiki/Tono_(color)) particulares encontrados en una señal de vídeo. Otros factores, como la saturación, determinan también el tamaño de un gráfico. 
A continuación se presenta un ejemplo de un vectorscopio que muestra los valores de color de las barras SMPTE. - -{% include figure.html filename="vectorscope.png" caption="Una lectura de vectorescopio que representa las barras SMPTE NTSC estándar. Fuente: Wikimedia Commons" %} - -{% include figure.html filename="smpte_bars.png" caption="Las barras SMPTE. Fuente: Wikimedia Commons" %} - -FFmpeg se puede utilizar para reproducir y crear archivos de vídeo con vectorscopios integrados en ellos para proporcionar una referencia en tiempo real para la información de color del vídeo. Los siguientes comandos `ffplay` incorporarán un vectorscopio en la esquina inferior derecha del marco. A medida que se reproduce el vídeo, notarás el cambio en el gráfico del vectorscopio a medida que cambia el color en pantalla: - -```bash -ffplay destEarth_Mars_video.mp4 -vf "split=2[m][v], [v]vectorscope=b=0.7:m=color3:g=green[v],[m][v]overlay=x=W-w:y=H-h" -``` - -* `ffplay` = comienza el comando -* `-i entrada_archivo.ext` = la ruta y el nombre del archivo de entrada -* `-vf` = crea un [*filter-graph*](https://trac.ffmpeg.org/wiki/FilteringGuide) para usar con las pistas -* `"` = una comilla para comenzar el *filter-graph.* La información entre las comillas - especifica los parámetros de la apariencia y posición del vectorscopio -* `split=2[m][v]` = divide la entrada en dos salidas idénticas llamadas `[m]` y `[v]` -* `,` = la coma indica que viene otro parámetro -* `[v]vectorscope=b=0.7:m=color3:g=green[v]` = asigna la salida `[v]` al filtro del vectorscopio -* `[m][v]overlay=x=W-w:y=H-h` = superpone el vectorscopio encima de la imagen de vídeo en una cierta ubicación (en este caso, en la esquina inferior derecha de la pantalla) -* `"` = termina el *filter-graph* - -
    -Para obtener más información sobre las diversas opciones para crear vectorscopios, consulta la documentación oficial y la página Wiki FFmpeg Vectorscope. Además, puedes encontrar más información sobre cómo colocar las superposiciones en la documentación del filtro de superposición FFmpeg. -
    - -{% include figure.html filename="Mars_screenshot_vector.png" caption="Captura de pantalla de la ventana de FFplay con vectorscopio incorporado" %} - -Y para el extracto de "Tierra": - -```bash -ffplay destEarth_Earth_video.mp4 -vf "split=2[m][v], [v]vectorscope=b=0.7:m=color3:g=green[v],[m][v]overlay=x=W-w:y=H-h" -``` - -{% include figure.html filename="Earth_screenshot_vector.png" caption="Captura de pantalla de la ventana de FFplay con vectorscopio incorporado" %} - -También podemos ajustar este comando para escribir nuevos archivos de vídeo con vectorscopios: - -```bash -ffmpeg -i destEarth_Mars_video.mp4 -vf "split=2[m][v], [v]vectorscope=b=0.7:m=color3:g=green[v],[m][v]overlay=x=W-w:y=H-h" -c:v libx264 destEarth_Mars_vectorscope.mp4 -``` - -```bash -ffmpeg -i destEarth_Earth_video.mp4 -vf "split=2[m][v], [v]vectorscope=b=0.7:m=color3:g=green[v],[m][v]overlay=x=W-w:y=H-h" -c:v libx264 destEarth_Earth_vectorscope.mp4 -``` - -Nota los pequeños pero importantes cambios en sintaxis: - * Hemos agregado una bandera de `-i` porque es un comando de `ffmpeg` - * Hemos especificado el códec del vídeo del archivo de salida como [H.264](https://es.wikipedia.org/wiki/H.264/MPEG-4_AVC) con la bandera `-c:v libx264` y no estamos recodificando el códec de audio (`-c:a copy`), aunque puedes especificar otro códec de audio si lo necesitas. - * Hemos definido el nombre del archivo de salida - -Tómate unos minutos para ver estos vídeos con los vectorscopios integrados en ellos. Observa cuán dinámicos (o no) son los cambios entre los extractos de "Marte" y "Tierra". Compara lo que ves en el vectorscopio con tus propias impresiones del vídeo mismo. 
Podríamos usar las observaciones de estos vectorscopios para hacer determinaciones sobre qué tonos de color aparecen de manera más regular o intensa en el vídeo, o podemos comparar diferentes formatos uno al lado del otro para ver cómo el color se codifica o representa de manera diferente en función de diferentes códecs, resoluciones, etc. - -Aunque los vectorscopios proporcionan una representación útil y en tiempo real de la información del color, es posible que también deseemos acceder a los datos sin procesar que se encuentran debajo de ellos. Luego, podemos usar estos datos para desarrollar visualizaciones más flexibles que no dependan de ver el archivo de vídeo simultáneamente y que ofrezcan un enfoque más cuantitativo para el análisis de color. En nuestros próximos comandos, utilizaremos `ffprobe` para producir un conjunto tabular de datos que pueda usarse para crear un gráfico de datos de color. - -### Extracción de datos de color con FFprobe -Al comienzo de este tutorial, utilizamos un comando `ffprobe` para ver los metadatos básicos de nuestro archivo impresos en el `stdout`. En los siguientes ejemplos, utilizaremos `ffprobe` para extraer datos de color de nuestros extractos de vídeo y enviar esta información a archivos` .csv`. Dentro de nuestro comando `ffprobe`, vamos a utilizar el filtro` signalstats` para crear reportes `.csv` de información de tono de color medio para cada marco en la secuencia de vídeo de` destEarth_Mars_video.mp4` y `destEarth_Earth_video.mp4`, respectivamente. - -```bash -ffprobe -f lavfi -i movie=destEarth_Mars_video.mp4,signalstats -show_entries frame=pkt_pts_time:frame_tags=lavfi.signalstats.HUEMED -print_format csv > destEarth_Mars_hue.csv -``` - -* `ffprobe` = comienza el comando -* `-f lavfi` = especifica el dispositivo de entrada virtual [libavfilter](https://ffmpeg.org/ffmpeg-devices.html#lavfi) como el formato elegido. Esto es necesario cuando se usa `signalstats` y muchos filtros en comandos FFmpeg más complejos. 
-* `-i movie=destEarth_Mars_video.mp4` = nombre del archivo de entrada -* `,signalstats` = especifica el uso del filtro `signalstats` con el archivo de entrada -* `-show_entries` = establece una lista de entradas que se mostrarán en el informe. Estos se especifican en las siguientes opciones. -* `frame=pkt_pts_time` = especifica mostrar cada marco con tu correspondiente `pkt_pts_time`, creando una entrada única para cada marco de vídeo -* `:frame_tags=lavfi.signalstats.HUEMED` = crea una etiqueta para cada marco que contiene el valor de tono medio -* `-print_format csv` = especifica el formato del informe de metadatos -* `> destEarth_Mars_hue.csv` = escribe un nuevo archivo `.csv` que contiene el informe de metadatos usando`> `, un [operador de redireccionamiento de Bash](https://www.gnu.org/software/bash/manual/html_node/Redirections.html). Este operador toma el comando que lo precede y "redirige" la salida a otra ubicación. En este caso, está escribiendo la salida en un nuevo archivo `.csv`. La extensión de archivo proporcionada aquí también debe coincidir con el formato especificado por el indicador `print_format`. - -A continuación, ejecuta el mismo comando para el extracto de "Tierra": - -```bash -ffprobe -f lavfi -i movie=destEarth_Earth_video.mp4,signalstats -show_entries frame=pkt_pts_time:frame_tags=lavfi.signalstats.HUEMED -print_format csv > destEarth_Earth_hue.csv -``` - -
    -Para obtener más información sobre el filtro de signalstats y las diversas métricas que se pueden extraer de las transmisiones de vídeo, consulta la documentación del filtro FFmpeg. -
    - - -Ahora deberías tener dos archivos `.csv` en tu directorio. Si los abres en un editor de texto o en un programa de hoja de cálculo, verás tres columnas de datos: - -{% include figure.html filename="csv_head.png" caption="Las primeras filas de nuestro informe de color en formato .csv" %} - -Comenzando a la izquierda y moviéndose a la derecha, las dos primeras columnas nos dan información sobre dónde estamos en el vídeo. Los números decimales representan fracciones de segundo que también corresponden aproximadamente a la base de tiempo de vídeo de 30 marcos por segundo. Cada fila en nuestro `.csv` corresponde a un marco de vídeo. La tercera columna lleva un número entero entre 0-360, valor que representa el tono medio para ese marco de vídeo. Estos números son los datos cuantitativos subyacentes del diagrama de vectorscopio y corresponden a su posición (en grados) en la gratícula circular. Haciendo referencia a nuestra imagen de vectorscopio de antes, puedes ver que comenzando en la parte inferior del círculo (0 grados) y moviéndose a la izquierda, los "verdes" comienzan alrededor de los 38 grados, los "amarillos" en los 99 grados, los "rojos" en los 161 grados, los "magentas" en los 218 grados, los "azules" en los 279 grados y los "cianes" en los 341 grados. Una vez que comprendas estos "rangos" de tono, puedes hacerte una idea de cuál es el valor de tono medio para un marco de vídeo con solo mirar este valor numérico. - -Además, ten en cuenta que este valor extraído por el filtro `signalstats` no es una medida absoluta o completa de las cualidades de color de una imagen, sino simplemente un punto de referencia significativo desde el cual podemos explorar una estrategia basada en datos para el análisis de color. 
La percepción del color y la teoría del color son [áreas complejas y en evolución de la investigación académica](https://colourturn.net/) que incorporan muchas estrategias diferentes de las humanidades, las ciencias sociales y las ciencias cognitivas. Es por eso que debemos tener en cuenta que cualquier estrategia analítica debe tomarse dentro del contexto de estos discursos más amplios y con un espíritu colaborativo y generativo. - -### Visualizando datos de color -Los dos archivos `.csv` que creamos con los comandos anteriores ahora se pueden usar para crear gráficos que visualicen los datos. Hay una serie de plataformas (tanto propietarias como de código abierto) que se pueden usar para lograr esto, como [Microsoft Excel](https://www.wikihow.com/Create-a-Graph-in-Excel), [RawGraphs](https://rawgraphs.io/) y/o [plotly](https://plotly.com/graphing-libraries/). Una discusión en profundidad sobre cómo usar cualquiera de estas plataformas está fuera del alcance de este tutorial; sin embargo, a continuación se muestra la visualización final de los comandos anteriores, que se creó con los archivos `.csv` y plotly. - -{% include figure.html filename="Final_Graph_plotly.png" caption="Gráfico que incluye datos de tono medio de ambos extractos de vídeo" %} - -### Conclusiones -Al observar el gráfico, podemos ver que las trazas de Marte y la Tierra tienen rangos dinámicos muy diferentes en sus valores de tono medio. La traza de Marte es muy limitada y se mantiene dentro de los rangos rojo y amarillo (aproximadamente entre 100 y 160) en la mayoría del extracto. Esto sugiere algo sobre el uso del color en la película como un dispositivo retórico que sirve como mensaje propagandístico. Recuerda que esta sección presenta una visión antipática de la forma de vida y el sistema político marcianos: una población uniforme e infeliz, que depende de tecnología y transporte ineficientes mientras se les exige que observen la obediencia total a un gobernante supremo totalitario. 
La película conecta esta experiencia negativa con una paleta de tonos relativamente opacos de rojo y amarillo. También deberíamos considerar el público objetivo original de esta película, los jóvenes ciudadanos de los Estados Unidos en la década de 1950, y cómo probablemente habrían interpretado estas imágenes y usos del color en ese momento histórico. En particular, podemos considerar este uso del color en el contexto de las crecientes tensiones geopolíticas entre la Unión Soviética y los Estados Unidos y sus aliados en Europa occidental. El color rojo, específicamente, se usaba comúnmente en los medios impresos y de difusión para describir [la "amenaza" del comunismo global](https://es.wikipedia.org/wiki/Temor_rojo) durante esta era de la historia mundial. Además, la elección de presentar al líder totalitario marciano con una apariencia muy similar al icónico líder soviético [Joseph Stalin](https://es.wikipedia.org/wiki/I%C3%B3sif_Stalin) puede leerse como una señal visual y cultural explícita para la audiencia. Así, esta representación de Marte parece ser una caricatura alegórica de la vida bajo el velo del comunismo, tal como la percibe un observador externo y un oponente político/ideológico. Esta caricatura emplea no solo una paleta de colores limitada, sino una que está cargada con otras referencias culturales. El uso del color aprovecha los prejuicios y asociaciones que están presentes en el imaginario de la audiencia y, por lo tanto, está ligado estrechamente al argumento central de la película, que sostiene que el comunismo no es un sistema político viable. - -En contraste con el uso limitado del color en nuestro extracto de Marte, la traza de la Tierra cubre un rango dinámico mucho más amplio de valores de tono. En este pasaje, el emisario marciano está aprendiendo sobre el maravilloso y rico estilo de vida de los terrícolas gracias a un sistema capitalista y a la explotación de petroleo y de productos derivados de este. 
La secuencia enfatiza la riqueza material y la libertad empresarial ofrecida bajo un sistema capitalista usando una variedad y vivacidad de color mucho mayor que en el extracto de Marte. Los productos comerciales y las personas se representan utilizando el espectro completo del proceso Technicolor, creando asociaciones positivas entre los resultados de la industria petrolera y el estilo de vida acomodado de quienes se benefician de él. Al igual que el extracto de Marte, a la audiencia se le ofrece una caricatura unilateral de un sistema político y una forma de vida, pero en esta sección la representación reduccionista es laudable y próspera en lugar de desoladora y opresiva. - -Como una pieza de propaganda, *Destination Earth* se basa en estas distinciones poderosas pero demasiado simplistas entre dos sistemas políticos para influir en la opinión pública y promover el consumo de productos derivados del petróleo. La manera en que se usa (o no se usa) el color es una herramienta importante para elaborar y enfatizar este mensaje. Además, una vez que podemos extraer datos de color y visualizarlos utilizando técnicas gráficas simples, podemos ver que la disparidad en el rango dinámico proporciona una medida cuantitativa para vincular el uso técnico y estético del color en esta película animada con la retórica propagandística presentada por sus productores. - -{% include figure.html filename="lovely_oil.png" caption="El petróleo y los ideales estadounidenses de riqueza y prosperidad se expresan en esplendor colorido" %} - -### Escalando el análisis de color con FFprobe -Uno de los límites de esta metodología es que estamos generando manualmente informes de color en un solo archivo a la vez. Si quisiéramos adoptar un enfoque de [visión distante](https://distantviewing.org/) más en línea con las metodologías tradicionales de Humanidades Digitales, podríamos emplear un script de Bash para ejecutar nuestro comando `ffprobe` en todos los archivos en un determinado directorio. 
Esto es útil si, por ejemplo, un(a) investigador(a) está interesado en realizar un análisis similar en [todas las películas animadas de John Sutherland encontradas en la colección de Archivos Prelinger](https://archive.org/details/prelinger&tab=collection?and%5B%5D=john+sutherland&sin=) u otro conjunto de material de vídeo de archivo. - -Una vez que tengas un conjunto de material para trabajar guardado en un solo lugar, puedes guardar el siguiente [bucle _for_ de Bash o "for loop"](https://www.shellscript.sh/loops.html) dentro del directorio y ejecutarlo para generar archivos `.csv` que contengan los mismos datos de tono medio a nivel de fotograma que extrajimos de nuestros extractos de *Destination Earth*. - -```bash -for file in *.m4v; do -ffprobe -f lavfi -i movie="$file",signalstats -show_entries frame=pkt_pts_time:frame_tags=lavfi.signalstats.HUEMED -print_format csv > "${file%.m4v}.csv"; -done -``` - -* `for file in *.m4v; do` = inicia el bucle _for_. Esta primera línea le dice a Bash "para todos los archivos en este directorio con la extensión `.m4v`, ejecuta el siguiente comando." -* El `*` es un [comodín de Bash](http://tldp.org/LDP/GNU-Linux-Tools-Summary/html/x11655.htm) adjunto a un tipo de archivo dado para especificarlos como archivos de entrada. -* La palabra `file` es una variable arbitraria que representará cada archivo a medida que se ejecuta a través del bucle. -* `ffprobe -f lavfi -i movie="$file",signalstats -show_entries frame=pkt_pts_time:frame_tags=lavfi.signalstats.HUEMED -print_format csv > "${file%.m4v}.csv"; done` = el mismo comando de extracción de metadatos de color que ejecutamos en nuestros dos extractos de *Destination Earth*, con algunas pequeñas modificaciones en la sintaxis para explicar su uso en varios archivos en un directorio: - * `"$file"` = recuerda cada variable. Las comillas aseguran que se conserva el nombre de archivo original. 
- * `> "${file%.m4v}.csv";` = conserva el nombre de archivo original al escribir los archivos de salida `.csv`. Esto asegurará que los nombres de los archivos de vídeo originales coincidan con sus correspondientes reportes en `.csv`. - * `done` = termina el script una vez que se hayan completado todos los archivos del directorio. - -
    -También puedes usar signalstats para obtener otra información valiosa relacionada con el color. Consulta la documentación del filtro para obtener una lista completa de las métricas visuales disponibles. -
    - -Una vez que ejecutas este script, verás que cada archivo de vídeo en el directorio ahora tiene un archivo `.csv` correspondiente que contiene el conjunto de datos especificado. - -# En resumen -En este tutorial, hemos aprendido: - * cómo instalar FFmpeg en diferentes sistemas operativos y cómo acceder al _framework_ en el navegador web - * cuál es la sintaxis básica y la estructura de los comandos FFmpeg - * cómo visualizar metadatos técnicos básicos de un archivo audiovisual - * cómo transformar un archivo audiovisual a través de la transcodificación y el "re-wrapping" - * cómo analizar y editar ese archivo audiovisual separando sus componentes ("demux") y crear extractos - * cómo reproducir archivos audiovisuales usando `ffplay` - * cómo crear nuevos archivos de vídeo con vectorscopios integrados - * cómo exportar datos tabulares relacionados con el color de una pista de vídeo usando `ffprobe` - * cómo crear un bucle _for_ de Bash para extraer información de datos de color de múltiples archivos de vídeo con un solo comando - -A un nivel más amplio, este tutorial aspira a proporcionar una introducción informada y atractiva sobre cómo se pueden incorporar las herramientas y metodologías audiovisuales en los proyectos y las prácticas de Humanidades Digitales. Con herramientas abiertas y potentes como FFmpeg, existe un gran potencial para expandir el alcance del campo para incluir tipos de medios y análisis más ricos y complejos que nunca. - -# Más recursos -FFmpeg tiene una comunidad grande y bien apoyada de usuarios a través de todo el mundo. Como tal, hay muchos recursos gratuitos y de código abierto para descubrir nuevos comandos y técnicas para trabajar con materiales audiovisuales. Por favor, contacta al autor con cualquier adición a esta lista, especialmente si se trata de recursos educativos en español para aprender FFmpeg. 
- -* [La documentación oficial de FFmpeg](https://www.ffmpeg.org/ffmpeg.html) -* [FFmpeg Wiki](https://trac.ffmpeg.org/wiki/WikiStart) -* [ffmprovisr](https://amiaopensource.github.io/ffmprovisr/) de [La Asociación de Archivistas de Imágenes en Movimiento](https://amianet.org/?lang=es) -* [Entrenamiento de preservación audiovisual de Ashley Blewer](https://training.ashleyblewer.com/) -* [La presentación de Andrew Weaver: "Demystifying FFmpeg"](https://github.com/privatezero/NDSR/blob/master/Demystifying_FFmpeg_Slides.pdf) -* [FFmpeg: Presentación de Ben Turkus](https://docs.google.com/presentation/d/1NuusF948E6-gNTN04Lj0YHcVV9-30PTvkh_7mqyPPv4/present?ueb=true&slide=id.g2974defaca_0_231) -* [FFmpeg Cookbook for Archivists de Reto Kromer](https://avpres.net/FFmpeg/) - -## Programas de código abierto de análisis audiovisual que usan FFmpeg - -* [MediaInfo](https://mediaarea.net/en/MediaInfo) -* [QC Tools](https://bavc.org/preserve-media/preservation-tools) - -# Referencias - -* Champion, E. (2017) “Digital Humanities is text heavy, visualization light, and simulation poor,” Digital Scholarship in the Humanities 32(S1), i25-i32 - -* Hockey, S. (2004) “The History of Humanities Computing,” A Companion to Digital Humanities, ed. Susan Schreibman, Ray Siemens, John Unsworth. Oxford: Blackwell - -Este tutorial fue posible gracias al apoyo de la Academia Británica y fue escrito durante el Taller de _Programming Historian_ desarrollado en la Universidad de Los Andes en Bogotá, Colombia, entre el 31 de julio y 3 de agosto de 2018. 
+--- +title: Introducción a la transcodificación, edición y visualización de datos audiovisuales con FFmpeg +authors: +- Dave Rodriguez +editors: +- Brandon Walsh +reviewers: +- Tesla Cariani +- Josh Romphf +original: introduction-to-ffmpeg +date: 2018-12-20 +translator: +- Dave Rodriguez +- Sebastian Fiori +translation_date: 2020-12-11 +translation-editor: +- Antonio Rojas Castro +translation-reviewer: +- Jennifer Isasi +- José Antonio Motilla +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/302 +difficulty: 2 +activity: analyzing +topics: [data-manipulation, data-visualization] +abstract: Esta lección introduce las funciones básicas de FFmpeg, una herramienta libre de línea de comandos utilizada para manipular y analizar materiales audiovisuales. +avatar_alt: Una cámara antigua +doi: 10.46430/phes0049 +layout: lesson +--- + +{% include toc.html %} + +# Introducción +Históricamente, las Humanidades Digitales se han enfocado casi exclusivamente en el análisis de fuentes textuales a través de métodos computacionales (Hockey, 2004). Sin embargo, hay un interés creciente en el campo de la utilización de métodos computacionales para el análisis de materiales audiovisuales de patrimonio cultural, como refleja la creación de la [Alianza de Organizaciones de Humanidades Digitales Grupo de Interés Especial: Materiales audiovisuales en Humanidades Digitales](https://avindhsig.wordpress.com/) y [el aumento de las presentaciones relacionadas con temas audiovisuales en la conferencia global de AOHD](https://figshare.com/articles/AV_in_DH_State_of_the_Field/5680114) en los años anteriores. Investigaciones recientes, tal como [Distant Viewing TV](https://distantviewing.org), indican un cambio en el campo hacia proyectos relacionados con el uso de técnicas computacionales para ampliar el alcance de los materiales que los y las humanistas digitales pueden explorar. 
Como afirma Erik Champion, "la audiencia de Humanidades Digitales no siempre está enfocada en la literatura o está interesada en las formas tradicionales de alfabetización" y la aplicación de metodologías digitales para estudiar cultura audiovisual es una faceta emergente y emocionante de las humanidades digitales (Champion, 2017, traducido por el autor). Hay muchas herramientas valiosas, gratuitas y de código abierto disponibles para aquellos interesados en trabajar con materiales audiovisuales (por ejemplo, el tutorial de _Programming Historian_ [Editar Audio con Audacity](/es/lecciones/editar-audio-con-audacity)). Este tutorial presentará otra: FFmpeg. + +[FFmpeg](https://www.ffmpeg.org/about.html) es el _framework_ multimedia de código abierto líder para transcodificar, editar, filtrar y reproducir casi cualquier tipo de formato audiovisual digital (sitio web de FFmpeg - "About"). Muchos programas comunes y sitios web usan FFmpeg para leer y escribir archivos audiovisuales, por ejemplo, VLC, Google Chrome, YouTube y [muchos más](https://trac.ffmpeg.org/wiki/Projects). Además de ser una herramienta de programa y de desarrollo web, FFmpeg se puede usar en la interfaz de la línea de comandos para realizar muchas tareas comunes, complejas e importantes, relacionadas con la gestión, modificación y análisis de archivos audiovisuales. Estos tipos de procesos, tales como editar, transcodificar o extraer los metadatos de archivos, generalmente requieren acceso a otro programa (tal como editores de vídeo no lineal, como Adobe Premiere o Final Cut Pro); sin embargo, FFmpeg permite a un usuario operar directamente en archivos audiovisuales sin el uso de interfaces o programa de terceros. Como tal, el conocimiento del _framework_ permite a los usuarios manipular materiales audiovisuales para satisfacer sus necesidades con una solución de código abierto y gratuita, que ofrece gran parte de la funcionalidad de un costoso programa de audio y vídeo. 
Este tutorial ofrece una introducción a la lectura y escritura de comandos de FFmpeg y una guía paso a paso a partir de un caso práctico para aprender a utilizar el _framework_ en un trabajo específico para los humanistas digitales. Específicamente, se mostrará cómo FFmpeg puede ser utilizado para extraer y analizar datos de color en un vídeo archivístico. + +## Objetivos de aprendizaje +* Instalar FFmpeg en tu computadora o usar una versión "demo" en el navegador web +* Comprender la estructura básica y la sintaxis de los comandos de FFmpeg +* Aprender varios comandos útiles, tales como: + * "Re-wrap" (cambiar el contenedor) y transcodificar (recodificar archivos) + * "Demux" de archivos (separar audio y vídeo) + * Recortar/Editar archivos + * Usar FFplay para reproducir archivos + * Crear vectorscopios para visualizar los datos de color + * Usar FFprobe para generar informes de los datos de color +* Introducir recursos para mayor exploración y experimentación + +## Requisitos previos +Antes de comenzar con este tutorial, es necesario que localices la [Terminal](https://es.wikipedia.org/wiki/Terminal_(macOS)) de tu computadora u otra interfaz de línea de comandos, ya que ahí es donde ingresarás y ejecutarás los comandos de FFmpeg. Si necesitas instrucción para acceder y usar la interfaz de línea de comandos, te recomendamos la lección de _Programming Historian_ [Introducción a la línea de comandos en Bash](/es/lecciones/introduccion-a-bash) para usuarios de Mac y Linux o, para usuarios de Windows, [Introducción a la línea de comandos de Windows con PowerShell](/es/lecciones/introduccion-a-powershell). Adicionalmente, será de utilidad tener conocimientos básicos de [códecs](https://es.wikipedia.org/wiki/C%C3%B3dec) y [contenedores](https://es.wikipedia.org/wiki/Formato_contenedor) audiovisuales para entender con mayor detalle el funcionamiento de FFmpeg. 
Proporcionaremos información adicional y revisaremos con mayor detalle sobre códecs y contenedores en la sección sobre ejemplos de comandos preliminares de este tutorial. + +# Cómo instalar FFmpeg +La instalación de FFmpeg es posiblemente la parte más difícil de usar esta herramienta. Afortunadamente, existen algunas guías y recursos disponibles para instalar el _framework_ para cada sistema operativo. + +
    +Nuevas versiones de FFmpeg son lanzadas aproximadamente cada seis meses. Para mantenerse al tanto de ellas, es recomendable seguir a FFmpeg en Twitter o en su sitio web. Las nuevas versiones de FFmpeg generalmente contienen características tales como filtros nuevos y actualizados, compatibilidades de códecs y corrección de errores. La sintaxis de FFmpeg no cambia con estas actualizaciones y las capacidades antiguas rara vez se eliminan. Puedes aprender más sobre estas actualizaciones consultando los anuncios de actualizaciones anteriores en la sección de News en el sitio web de FFmpeg. +
    + +## Para usuarios de Mac OS +La opción más simple es usar un administrador de paquetes como [Homebrew](https://brew.sh/) para instalar FFmpeg y asegurar que permanezca en la versión más reciente. Para completar este tipo de instalación, sigue estos pasos: +* Instala Homebrew de acuerdo a las instrucciones en el enlace de arriba +* Para comenzar con una instalación básica, ejecuta `brew install ffmpeg` en tu Terminal + **Nota**: generalmente se recomienda instalar FFmpeg con opciones adicionales a las incluidas en la instalación básica; esto proporcionará acceso a más herramientas y funciones. [La Guía de Instalación de Apple de Reto Kromer](https://avpres.net/FFmpeg/install_Apple.html) proporciona un buen conjunto de opciones adicionales: + + ```bash + brew install ffmpeg --with-freetype --with-openjpeg --with-x265 --with-rubberband --with-tesseract + ``` + + * Para una explicación de estas opciones adicionales, revisa [La Guía FFmpeg de Ashley Blewer](https://training.ashleyblewer.com/presentations/ffmpeg.html#10). + * Además, puedes ejecutar `brew options ffmpeg` para ver qué características están o han estado disponibles en la versión actual de FFmpeg + * Para actualizar tu instalación a la versión más reciente, ejecuta: + + ```bash + brew update && brew upgrade ffmpeg + ``` + +* Para más opciones de instalación para Mac OS, revisa [La Guía de Compilación de FFmpeg para Mac OS](https://trac.ffmpeg.org/wiki/CompilationGuide/macOS) (la guía solo está disponible en inglés). + +## Para usuarios de Windows +Los usuarios de Windows pueden usar el administrador de paquetes [Chocolatey](https://chocolatey.org/) para instalar y mantener FFmpeg. [La Guía de Instalación de Windows de Reto Kromer](https://avpres.net/FFmpeg/install_Windows.html) proporciona toda la información necesaria para usar Chocolatey o construir el _framework_ a partir del código fuente (la guía solo está disponible en inglés). 
+ +## Para usuarios de Linux +[Linuxbrew](https://linuxbrew.sh/) es un programa similar a Homebrew que se puede utilizar para instalar y mantener FFmpeg en Linux. Reto Kromer también proporciona una guía útil, [la Guía de Instalación de Linux](https://avpres.net/FFmpeg/install_Linux.html), que es similar a la instalación en Mac OS. Tu distribución de Linux puede tener su [propio administrador de paquetes](https://www.linode.com/docs/tools-reference/linux-package-management/) que incluye paquetes FFmpeg (la guía solo está disponible en inglés). Dependiendo de tu distribución de Linux (Ubuntu, Fedora, Arch Linux, etc.) estas versiones pueden variar, así que usar Linuxbrew podría ser útil para asegurar que la versión es la misma independientemente del tipo de Linux que utilices. + +## Otros recursos de instalación + +* [Descarga de paquetes](https://www.ffmpeg.org/download.html) + * FFmpeg permite el acceso a archivos binarios, código fuente y versiones estáticas para Mac, Windows y Linux directamente en su sitio web. Los usuarios pueden construir el _framework_ sin un administrador de paquetes con estos recursos. Es probable que solo los usuarios avanzados quieran usar esta opción. +* [La Guía de Compilación de FFmpeg](https://trac.ffmpeg.org/wiki/CompilationGuide) + * La página Wiki de FFmpeg también proporciona un compendio de guías y estrategias para instalar FFmpeg en tu computadora (la guía solo está disponible en inglés). + +## Probando la instalación +* Para asegurarte de que FFmpeg se haya instalado correctamente, ejecuta: + + ```bash + ffmpeg -version + ``` + +* Si ves una lista larga con información, ¡la instalación fue exitosa! 
Debe ser similar a lo siguiente: + +```bash +ffmpeg version 4.0.1 Copyright (c) 2000-2018 the FFmpeg developers +built with Apple LLVM version 9.1.0 (clang-902.0.39.1) +configuration: --prefix=/usr/local/Cellar/ffmpeg/4.0.1 --enable-shared --enable-pthreads --enable-version3 --enable-hardcoded-tables --enable-avresample --cc=clang --host-cflags= --host-ldflags= --enable-gpl --enable-ffplay --enable-libfreetype --enable-libmp3lame --enable-librubberband --enable-libtesseract --enable-libx264 --enable-libx265 --enable-libxvid --enable-opencl --enable-videotoolbox --disable-lzma --enable-libopenjpeg --disable-decoder=jpeg2000 --extra-cflags=-I/usr/local/Cellar/openjpeg/2.3.0/include/openjpeg-2.3 +libavcodec 58. 18.100 / 58. 18.100 +libavformat 58. 12.100 / 58. 12.100 +libavdevice 58. 3.100 / 58. 3.100 +libavfilter 7. 16.100 / 7. 16.100 +libavresample 4. 0. 0 / 4. 0. 0 +libswscale 5. 1.100 / 5. 1.100 +libswresample 3. 1.100 / 3. 1.100 +libpostproc 55. 1.100 / 55. 1.100 +``` + +* Si el sistema arroja `-bash: ffmpeg: command not found`, algo ha ido mal. + * Nota: Si estás usando un administrador de paquetes, es improbable que encuentres este mensaje de error. Sin embargo, si hay un problema después de instalar con un administrador de paquetes, es probable que haya un problema con el administrador de paquetes y no con FFmpeg. Consulta la solución de problemas en [Homebrew](https://docs.brew.sh/Troubleshooting), [Chocolatey](https://chocolatey.org/docs/troubleshooting), o [Linuxbrew](https://linuxbrew.sh/) para asegurar que el administrador de paquetes está funcionando correctamente en tu computadora (las guías solo están disponibles en inglés). Si estás intentando instalar sin un administrador de paquetes y ves este mensaje de error, haz una referencia cruzada de tu método con La Guía de Compilación de FFmpeg anterior. 
+ +## Usando FFmpeg en el navegador +Si no quieres instalar FFmpeg en tu computadora pero te gustaría familiarizarte con el _framework_ y usarlo en la interfaz de línea de comandos, [videoconverter.js](https://bgrins.github.io/videoconverter.js/demo/) de Brian Grinstead proporciona un método para ejecutar los comandos FFmpeg en tu navegador (la interfaz está en inglés). +
    + Esta interfaz del navegador no tiene las funcionalidades como para completar todo este tutorial, pero es útil para aprender los comandos esenciales de FFmpeg. Adicionalmente, este recurso opera en una versión anterior de FFmpeg y posiblemente no tenga todas las características de la versión más reciente. +
    + +## Estructura básica y sintaxis de los comandos FFmpeg +El comando básico tiene cuatro partes: + +```bash +[Símbolo del Sistema] [Archivo de Entrada] [Banderas/Acciones] [Archivo de Salida] +``` + +* Cada comando comenzará con un símbolo del sistema. Dependiendo del uso, este será `ffmpeg` (cambiar archivos), `ffprobe` (generar metadatos de archivos) o `ffplay` (reproducir archivos). +* Los archivos de entrada son los archivos que están siendo leídos, editados o examinados. +* Las banderas y acciones son las cosas que le estás diciendo a FFmpeg que haga con los archivos de entrada. La mayoría de los comandos contendrán múltiples banderas y acciones de complejidad variable. +* Los archivos de salida son los archivos creados por el comando o los informes creados por los comandos de `ffprobe`. + +Escrito genéricamente, el comando básico es parecido a lo siguiente: + +```bash + ffmpeg -i /ruta_de_archivo/archivo_de_entrada.ext -bandera alguna_acción /ruta_de_archivo/archivo_de_salida.ext + ``` +
    +Como con cualquier interfaz de línea de comandos, tendrás que escribir las rutas de los archivos de entrada y de salida dependiendo de las ubicaciones de tus directorios de trabajo. En los ejemplos proporcionados en este tutorial, las rutas de archivos no estarán escritas completamente y se supone que el usuario ha navegado al directorio de trabajo para ejecutar los comandos.
    + + +A continuación, examinaremos algunos ejemplos de varios comandos diferentes que usan esta estructura y sintaxis. Adicionalmente, estos comandos demostrarán algunas de las características más útiles de FFmpeg y nos permitirán familiarizarnos con la forma en que se construyen los archivos audiovisuales digitales. + +# Para empezar +Para este tutorial, utilizaremos una película archivística que se llama [*Destination Earth*](https://archive.org/details/4050_Destination_Earth_01_47_33_28) como nuestro objeto de estudio. Esta película está publicada por los [Archivos Prelinger](https://es.wikipedia.org/wiki/Archivos_Prelinger) y en el [Internet Archive](https://archive.org/). Esta película, estrenada en 1956 y producida por [El American Petroleum Institute](https://es.wikipedia.org/wiki/American_Petroleum_Institute) y [John Sutherland Productions](https://en.wikipedia.org/wiki/John_Sutherland_(producer)), es un excelente ejemplo de la propaganda de la época de la Guerra Fría que exalta las virtudes del capitalismo y el estilo de vida estadounidense. Utilizando el proceso de [Technicolor](https://es.wikipedia.org/wiki/Technicolor), este corto animado de ciencia ficción cuenta la historia de una sociedad marciana que vive bajo un gobierno opresivo y sus esfuerzos para mejorar sus métodos industriales. Envían un emisario a la Tierra que descubre que la clave para esto es la refinación de petróleo y la libre empresa. Utilizaremos el vídeo para introducir algunas de las funcionalidades básicas de FFmpeg y analizar sus propiedades de color con relación a su retórica propagandística. 
+ +{% include figure.html filename="destEarth_titlecard.png" caption="Destination Earth (1956)" %} + +En este tutorial se llevarán a cabo los siguientes pasos: +* Navegar a la página de [*Destination Earth*](https://archive.org/details/4050_Destination_Earth_01_47_33_28) en el Internet Archive +* Descargar dos archivos vídeos: las versiones "MPEG4" (extensión de archivo `.m4v`) y "OGG" (extensión de archivo `.ogv`) de la película +* Guardar estos archivos en la misma carpeta en algún lugar de tu computadora. Guárdalos con los nombres de archivos `destEarth`, seguido por su extensión. + +Tómate unos minutos para ver el vídeo y tener una idea de su estructura, mensaje y motivos visuales antes de continuar con los siguientes comandos. + +# Ejemplos de comandos preliminares + +## Ver metadatos básicos con FFprobe +Antes de comenzar a manipular nuestros archivos `destEarth`, usemos FFmpeg para examinar información básica sobre el archivo utilizando un simple comando de `ffprobe`. Esto ayudará a comprender cómo se construyen los archivos audiovisuales digitales y proporcionará una base para el resto del tutorial. Navega hasta el directorio del archivo y ejecuta: + +```bash +ffprobe destEarth.ogv +``` + +Verás los metadatos técnicos básicos del archivo impresos en `stdout`: + +{% include figure.html filename="ffprobe_ogg_es.png" caption="El output de un comando básico `ffprobe` con destEarth.ogv" %} + +La línea `Input # 0` del informe identifica el **contenedor** como [ogg](https://es.wikipedia.org/wiki/Ogg). Los contenedores (también llamados "envoltorios" o "wrappers", en inglés) proporcionan al archivo la estructura de sus diversas pistas. Los diferentes contenedores (otros más comunes incluyen `.mkv`, `.avi` y `.flv`) tienen diferentes características y compatibilidad con diversos programas. Examinaremos cómo y por qué es posible que desees cambiar el contenedor de un archivo en el siguiente comando. 
+ +Las líneas `Stream #0:0` y `Stream #0:1` proporcionan información sobre las pistas del archivo (es decir, el contenido que ves en la pantalla y escuchas a través de sus altavoces) y también identifican el **códec** de cada pista. Los códecs especifican cómo se codifica/comprime (se escribe y almacena) y se decodifica (se reproduce) la información. La pista vídeo (`Stream #0:0`) de nuestro archivo `.ogv` usa el códec [theora](https://es.wikipedia.org/wiki/Theora) y la pista audio (`Stream #0:1`) usa el códec [vorbis](https://es.wikipedia.org/wiki/Vorbis). Estas líneas también proporcionan información importante relacionada con el espacio de color de la pista de vídeo (`yuv420p`), resolución (`400x300`) y marcos por segundo (`29.97 fps`). Adicionalmente, proporcionan información de audio como la tasa de muestreo (`44100 Hz`) y la tasa de bits (`128 kb/s`). + +Los códecs, en mayor medida que los contenedores, determinan la calidad y la compatibilidad de un archivo audiovisual con diferentes programas y plataformas (otros códecs comunes incluyen `DNxHD` y` ProRes` para vídeo y `mp3` y` FLAC` para audio). Examinaremos cómo y por qué es posible que también desees cambiar el códec de un archivo en el siguiente comando. + +Ejecuta otro comando de `ffprobe`, esta vez con el archivo `.m4v`: + +```bash +ffprobe destEarth.m4v +``` + +Una vez más, verás los metadatos técnicos básicos impresos en el `stdout`: + +{% include figure.html filename="ffprobe_mp4_es.png" caption="El output de un comando básico `ffprobe` con destEarth.m4v" %} + +También notarás que el informe para el archivo `.m4v` contiene múltiples contenedores en la línea `Input # 0` como `mov` y `m4a`. No es necesario profundizar en los detalles para los fines de este tutorial, pero ten en cuenta que los contenedores `mp4` y` mov` se presentan en múltiples "sabores" y diferentes extensiones de archivo. 
Sin embargo, todos son muy similares en su construcción técnica y, como tal, pueden verse agrupados en metadatos técnicos. De manera similar, el archivo `ogg` tiene la extensión` .ogv`, un "sabor" o variante del formato `ogg`. + +Al igual que en nuestro comando anterior, las líneas `Stream # 0: 0` y` Stream # 0: 1` identifican el códec de cada pista. Podemos ver que nuestro archivo `.m4v` usa el códec vídeo [H.264](https://es.wikipedia.org/wiki/H.264/MPEG-4_AVC) y el códec audio [aac](https://es.wikipedia.org/wiki/Advanced_Audio_Coding). Ten en cuenta que se nos proporcionan metadatos similares a nuestro archivo `.ogv`, pero algunas características importantes relacionadas con el análisis visual (como la resolución) son significativamente diferentes. Nuestro `.m4v` tiene una resolución más alta (`640x480`) y, por lo tanto, utilizaremos esta versión de *Destination Earth* como nuestro vídeo de origen. + +Ahora que sabemos más sobre la composición técnica de nuestro archivo, podemos comenzar a explorar las características y funcionalidades transformadoras de FFmpeg (volveremos a utilizar `ffprobe` más adelante en el tutorial para realizar una extracción de metadatos de color más avanzada). + +## Cambiar el contenedor (volver a envolver, "re-wrap") +Dependiendo de tu sistema operativo, puedes tener uno o más reproductores de medios instalados. Para efectos de demostración veamos qué sucede si intentas abrir `destEarth.ogv` usando el reproductor de medios QuickTime que viene con Mac OSX: + +{% include figure.html filename="QT_fail.png" caption="Los reproductores multimedia patentados como Quicktime a menudo están limitados en los tipos de archivos con los que pueden trabajar" %} + +Una opción cuando te enfrentas a un mensaje de este tipo es simplemente usar otro reproductor de medios. 
[VLC](https://www.videolan.org/vlc/index.es.html), que está construido con FFmpeg, es una excelente alternativa de código abierto, pero simplemente "usar otro programa" puede no ser siempre una solución viable (y es posible que no siempre tengas otra versión de archivo con la que trabajar). Muchos editores de vídeo populares, como Adobe Premiere, Final Cut Pro y DaVinci Resolve, tienen sus propias limitaciones en cuanto a los tipos de formatos con los que son compatibles. Además, las diferentes plataformas web y sitios de alojamiento/transmisión, como Vimeo, [también tienen sus propios requisitos.](https://help.vimeo.com/hc/es/articles/12426043233169-Video-and-audio-compression-guidelines) Por lo tanto, es importante poder volver a envolver y transcodificar tus archivos para cumplir con las diversas especificaciones para la reproducción, edición, publicación digital y ajuste de archivos a los estándares requeridos por las plataformas de archivo o preservación digital. + +
    +Para obtener una lista completa de los códecs y contenedores compatibles con tu instalación de FFmpeg, ejecuta ffmpeg -codecs y ffmpeg -formats, respectivamente, para ver la lista impresa de tu stdout. +
    + + +Como un ejercicio para aprender la sintaxis básica de FFmpeg y aprender a transcodificar entre formatos, comenzaremos con nuestro archivo `destEarth.ogv` y escribiremos un nuevo archivo con vídeo codificado en` H.264`, audio en `AAC` y envuelto en un contenedor `.mp4`, una combinación muy común y altamente portátil de códecs y contenedores que es prácticamente idéntico al archivo` .m4v` que originalmente descargamos. Aquí está el comando que ejecutarás, junto con una explicación de cada parte de la sintaxis: + +```bash +ffmpeg -i destEarth.ogv -c:v libx264 -c:a aac destEarth_transcoded.mp4 +``` + +* `ffmpeg` = comienza el comando +* `-i destEarth.ogv` = especifica el archivo de entrada +* `-c:v libx264` = transcodifica la pista de vídeo al codec H.264 +* `-c:a aac` = transcodifica la pista de audio al codec AAC +* `destEarth_transcoded.mp4` = especifica el archivo de salida. Ten en cuenta que aquí es donde se especifica el nuevo tipo de contenedor. + +Si ejecutas como está escrito y en el mismo directorio que `destEarth.ogv`, verás un nuevo archivo llamado` destEarth_transcoded.mp4`, que aparecerá en el directorio. Si estás operando en Mac OSX, también podrás reproducir este nuevo archivo con QuickTime. Una exploración completa de los convenios de códecs, contenedores, compatibilidad y extensión de archivos está más allá del alcance de este tutorial; sin embargo, este conjunto de ejemplos preliminares debería darles a aquellos que no estén familiarizados con la forma en que se construyen los archivos audiovisuales digitales un conjunto de conocimientos de referencia que les permitirá completar el resto del tutorial. + +## Creación de extractos y "demuxing" de audio y vídeo +Ahora que tenemos un mejor entendimiento de las pistas, códecs, y contenedores, veamos formas en que FFmpeg puede trabajar con materiales de vídeo a un nivel más granular. 
Para este tutorial, examinaremos dos secciones separadas de *Destination Earth* para comparar cómo se usa el color en relación con la retórica propagandística de la película. Crearemos y prepararemos estos extractos para el análisis utilizando un comando que realiza dos funciones diferentes simultáneamente: + +* Primero, el comando creará dos extractos de `destEarth.m4v`. +* Segundo, el comando eliminará ("demux") los componentes de audio (`Stream # 0: 1`) de estos extractos. +
    + Estamos eliminando el audio para ahorrar espacio de almacenamiento (la información de audio no es necesaria para el análisis de color). Esto probablemente será útil si esperas utilizar este tipo de análisis a escalas más grandes. Cerca del final del tutorial, se discutirá más información sobre la ampliación del análisis de color. +
    + +El primer extracto que haremos contiene una secuencia correspondiente al comienzo de la película que describe las difíciles condiciones y la vida oprimida de la sociedad marciana. El siguiente comando especifica los puntos de inicio y finalización del extracto, le dice a FFmpeg que retenga toda la información en la pista de vídeo sin transcodificar nada y le indica que escriba nuestro nuevo archivo sin la pista de audio: + +```bash +ffmpeg -i destEarth.m4v -ss 00:01:00 -to 00:04:35 -c:v copy -an destEarth_Mars_video.mp4 +``` +* `ffmpeg` = comienza el comando +* `-i destEarth.m4v` = especifica el archivo de entrada +* `-ss 00:01:00` = establece el punto de inicio a 1 minuto del inicio del archivo +* `-to 00:04:35` = establece el punto final a 4 minutos y 35 segundos desde el inicio del archivo +* `-c:v copy` = copia la pista de vídeo directamente, sin transcodificar +* `-an` = le dice a FFmpeg que ignore la pista de audio al escribir el archivo de salida. +* `destEarth_Mars_video.mp4` = especifica el archivo de salida + +{% include figure.html filename="Mars_screenshot.png" caption="Vida en Marte" %} + +Ahora, ejecutaremos un comando similar para crear un extracto de "Tierra". Esta parte de la película tiene una secuencia similar que describe las maravillas de la vida en la Tierra y la riqueza de su sociedad gracias al capitalismo de libre empresa y al uso de petróleo y productos derivados de este: + +```bash +ffmpeg -i destEarth.m4v -ss 00:07:30 -to 00:11:05 -c:v copy -an destEarth_Earth_video.mp4 +``` + +{% include figure.html filename="Earth_screenshot.png" caption="La abundancia de la Tierra" %} + + +Ahora deberías tener dos archivos nuevos en tu directorio llamados `destEarth_Mars_video.mp4` y `destEarth_Earth_video.mp4`. Puedes probar uno o ambos archivos (o cualquiera de los otros archivos en el directorio) usando la función `ffplay` de FFmpeg. 
Simplemente ejecuta: + +```bash +ffplay destEarth_Mars_video.mp4 +``` + +y/o + +```bash +ffplay destEarth_Earth_video.mp4 +``` + +Verás una ventana abierta y el vídeo comenzará en el punto de inicio especificado. Se reproducirá una vez y luego la ventana se cerrará (además, notarás que no hay sonido en tu vídeo). También notarás que los comandos `ffplay` no requieren que se especifique una entrada (`-i`) o una salida porque la reproducción en sí misma es la salida. +
    +FFplay es un reproductor multimedia muy versátil que viene con una serie de opciones para personalizar la reproducción. Por ejemplo, si agregas `-loop 0` al comando se reproducirá en bucle indefinidamente.
    + + +Ahora hemos creado nuestros dos extractos para el análisis. Si vemos estos clips por separado, parece haber diferencias significativas en la forma en que se utilizan el color y la variedad de colores. En la siguiente parte del tutorial examinaremos y extraeremos datos de los archivos de vídeo para cuantificar y apoyar esta hipótesis. + +## Análisis de datos de color +El uso de herramientas digitales para analizar la información de color en películas es otra faceta emergente de las Humanidades Digitales que se superpone con los estudios cinematográficos tradicionales. En particular, el proyecto [FilmColors](https://filmcolors.org/) de la Universidad de Zurich cuestiona la intersección crítica de las "características estéticas formales de los aspectos semánticos, históricos y tecnológicos" de su producción, recepción y difusión a través del uso de herramientas de análisis y anotación digital (Flueckiger, 2017, traducido por el autor). Aunque no hay un método estandarizado para este tipo de investigación, en el momento de escribir esta lección el comando `ffprobe` que se describe a continuación es una herramienta útil para extraer información de color que se puede usar en el análisis computacional. Primero, veamos otra manera estandarizada de representar la información de color que informa este enfoque cuantitativo, basado en datos, para el análisis de color: los vectorscopios. + +### Vectorscopios +Durante años, profesionales del vídeo han confiado en los [vectorscopios](https://es.wikipedia.org/wiki/Vectorscopio) para ver la información del color de una manera estandarizada y fácilmente legible. Un vectorscopio grafica información de color en una gratícula circular. La posición del gráfico corresponde a los [tonos](https://es.wikipedia.org/wiki/Tono_(color)) particulares encontrados en una señal de vídeo. Otros factores, como la saturación, determinan también el tamaño de un gráfico. 
A continuación se presenta un ejemplo de un vectorscopio que muestra los valores de color de las barras SMPTE. + +{% include figure.html filename="vectorscope.png" caption="Una lectura de vectorscopio que representa las barras SMPTE NTSC estándar. Fuente: Wikimedia Commons" %} + +{% include figure.html filename="smpte_bars.png" caption="Las barras SMPTE. Fuente: Wikimedia Commons" %} + +FFmpeg se puede utilizar para reproducir y crear archivos de vídeo con vectorscopios integrados en ellos para proporcionar una referencia en tiempo real para la información de color del vídeo. Los siguientes comandos `ffplay` incorporarán un vectorscopio en la esquina inferior derecha del marco. A medida que se reproduce el vídeo, notarás el cambio en el gráfico del vectorscopio a medida que cambia el color en pantalla: + +```bash +ffplay destEarth_Mars_video.mp4 -vf "split=2[m][v], [v]vectorscope=b=0.7:m=color3:g=green[v],[m][v]overlay=x=W-w:y=H-h" +``` + +* `ffplay` = comienza el comando +* `-i entrada_archivo.ext` = la ruta y el nombre del archivo de entrada +* `-vf` = crea un [*filter-graph*](https://trac.ffmpeg.org/wiki/FilteringGuide) para usar con las pistas +* `"` = una comilla para comenzar el *filter-graph.* La información entre las comillas + especifica los parámetros de la apariencia y posición del vectorscopio +* `split=2[m][v]` = divide la entrada en dos salidas idénticas llamadas `[m]` y `[v]` +* `,` = la coma indica que viene otro parámetro +* `[v]vectorscope=b=0.7:m=color3:g=green[v]` = asigna la salida `[v]` al filtro del vectorscopio +* `[m][v]overlay=x=W-w:y=H-h` = superpone el vectorscopio encima de la imagen de vídeo en una cierta ubicación (en este caso, en la esquina inferior derecha de la pantalla) +* `"` = termina el *filter-graph* + +
    +Para obtener más información sobre las diversas opciones para crear vectorscopios, consulta la documentación oficial y la página Wiki FFmpeg Vectorscope. Además, puedes encontrar más información sobre cómo colocar las superposiciones en la documentación del filtro de superposición FFmpeg. +
    + +{% include figure.html filename="Mars_screenshot_vector.png" caption="Captura de pantalla de la ventana de FFplay con vectorscopio incorporado" %} + +Y para el extracto de "Tierra": + +```bash +ffplay destEarth_Earth_video.mp4 -vf "split=2[m][v], [v]vectorscope=b=0.7:m=color3:g=green[v],[m][v]overlay=x=W-w:y=H-h" +``` + +{% include figure.html filename="Earth_screenshot_vector.png" caption="Captura de pantalla de la ventana de FFplay con vectorscopio incorporado" %} + +También podemos ajustar este comando para escribir nuevos archivos de vídeo con vectorscopios: + +```bash +ffmpeg -i destEarth_Mars_video.mp4 -vf "split=2[m][v], [v]vectorscope=b=0.7:m=color3:g=green[v],[m][v]overlay=x=W-w:y=H-h" -c:v libx264 destEarth_Mars_vectorscope.mp4 +``` + +```bash +ffmpeg -i destEarth_Earth_video.mp4 -vf "split=2[m][v], [v]vectorscope=b=0.7:m=color3:g=green[v],[m][v]overlay=x=W-w:y=H-h" -c:v libx264 destEarth_Earth_vectorscope.mp4 +``` + +Nota los pequeños pero importantes cambios en sintaxis: + * Hemos agregado una bandera de `-i` porque es un comando de `ffmpeg` + * Hemos especificado el códec del vídeo del archivo de salida como [H.264](https://es.wikipedia.org/wiki/H.264/MPEG-4_AVC) con la bandera `-c:v libx264` y no estamos recodificando el códec de audio (`-c:a copy`), aunque puedes especificar otro códec de audio si lo necesitas. + * Hemos definido el nombre del archivo de salida + +Tómate unos minutos para ver estos vídeos con los vectorscopios integrados en ellos. Observa cuán dinámicos (o no) son los cambios entre los extractos de "Marte" y "Tierra". Compara lo que ves en el vectorscopio con tus propias impresiones del vídeo mismo. 
Podríamos usar las observaciones de estos vectorscopios para hacer determinaciones sobre qué tonos de color aparecen de manera más regular o intensa en el vídeo, o podemos comparar diferentes formatos uno al lado del otro para ver cómo el color se codifica o representa de manera diferente en función de diferentes códecs, resoluciones, etc. + +Aunque los vectorscopios proporcionan una representación útil y en tiempo real de la información del color, es posible que también deseemos acceder a los datos sin procesar que se encuentran debajo de ellos. Luego, podemos usar estos datos para desarrollar visualizaciones más flexibles que no dependan de ver el archivo de vídeo simultáneamente y que ofrezcan un enfoque más cuantitativo para el análisis de color. En nuestros próximos comandos, utilizaremos `ffprobe` para producir un conjunto tabular de datos que pueda usarse para crear un gráfico de datos de color. + +### Extracción de datos de color con FFprobe +Al comienzo de este tutorial, utilizamos un comando `ffprobe` para ver los metadatos básicos de nuestro archivo impresos en el `stdout`. En los siguientes ejemplos, utilizaremos `ffprobe` para extraer datos de color de nuestros extractos de vídeo y enviar esta información a archivos` .csv`. Dentro de nuestro comando `ffprobe`, vamos a utilizar el filtro` signalstats` para crear reportes `.csv` de información de tono de color medio para cada marco en la secuencia de vídeo de` destEarth_Mars_video.mp4` y `destEarth_Earth_video.mp4`, respectivamente. + +```bash +ffprobe -f lavfi -i movie=destEarth_Mars_video.mp4,signalstats -show_entries frame=pkt_pts_time:frame_tags=lavfi.signalstats.HUEMED -print_format csv > destEarth_Mars_hue.csv +``` + +* `ffprobe` = comienza el comando +* `-f lavfi` = especifica el dispositivo de entrada virtual [libavfilter](https://ffmpeg.org/ffmpeg-devices.html#lavfi) como el formato elegido. Esto es necesario cuando se usa `signalstats` y muchos filtros en comandos FFmpeg más complejos. 
+* `-i movie=destEarth_Mars_video.mp4` = nombre del archivo de entrada +* `,signalstats` = especifica el uso del filtro `signalstats` con el archivo de entrada +* `-show_entries` = establece una lista de entradas que se mostrarán en el informe. Estos se especifican en las siguientes opciones. +* `frame=pkt_pts_time` = especifica mostrar cada marco con tu correspondiente `pkt_pts_time`, creando una entrada única para cada marco de vídeo +* `:frame_tags=lavfi.signalstats.HUEMED` = crea una etiqueta para cada marco que contiene el valor de tono medio +* `-print_format csv` = especifica el formato del informe de metadatos +* `> destEarth_Mars_hue.csv` = escribe un nuevo archivo `.csv` que contiene el informe de metadatos usando`> `, un [operador de redireccionamiento de Bash](https://www.gnu.org/software/bash/manual/html_node/Redirections.html). Este operador toma el comando que lo precede y "redirige" la salida a otra ubicación. En este caso, está escribiendo la salida en un nuevo archivo `.csv`. La extensión de archivo proporcionada aquí también debe coincidir con el formato especificado por el indicador `print_format`. + +A continuación, ejecuta el mismo comando para el extracto de "Tierra": + +```bash +ffprobe -f lavfi -i movie=destEarth_Earth_video.mp4,signalstats -show_entries frame=pkt_pts_time:frame_tags=lavfi.signalstats.HUEMED -print_format csv > destEarth_Earth_hue.csv +``` + +
    +Para obtener más información sobre el filtro de signalstats y las diversas métricas que se pueden extraer de las transmisiones de vídeo, consulta la documentación del filtro FFmpeg. +
    + + +Ahora deberías tener dos archivos `.csv` en tu directorio. Si los abres en un editor de texto o en un programa de hoja de cálculo, verás tres columnas de datos: + +{% include figure.html filename="csv_head.png" caption="Las primeras filas de nuestro informe de color en formato .csv" %} + +Comenzando a la izquierda y moviéndose a la derecha, las dos primeras columnas nos dan información sobre dónde estamos en el vídeo. Los números decimales representan fracciones de segundo que también corresponden aproximadamente a la base de tiempo de vídeo de 30 marcos por segundo. Cada fila en nuestro `.csv` corresponde a un marco de vídeo. La tercera columna lleva un número entero entre 0-360, valor que representa el tono medio para ese marco de vídeo. Estos números son los datos cuantitativos subyacentes del diagrama de vectorscopio y corresponden a su posición (en grados) en la gratícula circular. Haciendo referencia a nuestra imagen de vectorscopio de antes, puedes ver que comenzando en la parte inferior del círculo (0 grados) y moviéndose a la izquierda, los "verdes" comienzan alrededor de los 38 grados, los "amarillos" en los 99 grados, los "rojos" en los 161 grados, los "magentas" en los 218 grados, los "azules" en los 279 grados y los "cianes" en los 341 grados. Una vez que comprendas estos "rangos" de tono, puedes hacerte una idea de cuál es el valor de tono medio para un marco de vídeo con solo mirar este valor numérico. + +Además, ten en cuenta que este valor extraído por el filtro `signalstats` no es una medida absoluta o completa de las cualidades de color de una imagen, sino simplemente un punto de referencia significativo desde el cual podemos explorar una estrategia basada en datos para el análisis de color. 
La percepción del color y la teoría del color son [áreas complejas y en evolución de la investigación académica](https://colourturn.net/) que incorporan muchas estrategias diferentes de las humanidades, las ciencias sociales y las ciencias cognitivas. Es por eso que debemos tener en cuenta que cualquier estrategia analítica debe tomarse dentro del contexto de estos discursos más amplios y con un espíritu colaborativo y generativo. + +### Visualizando datos de color +Los dos archivos `.csv` que creamos con los comandos anteriores ahora se pueden usar para crear gráficos que visualicen los datos. Hay una serie de plataformas (tanto propietarias como de código abierto) que se pueden usar para lograr esto, como [Microsoft Excel](https://www.wikihow.com/Create-a-Graph-in-Excel), [RawGraphs](https://rawgraphs.io/) y/o [plotly](https://plotly.com/graphing-libraries/). Una discusión en profundidad sobre cómo usar cualquiera de estas plataformas está fuera del alcance de este tutorial; sin embargo, a continuación se muestra la visualización final de los comandos anteriores, que se creó con los archivos `.csv` y plotly. + +{% include figure.html filename="Final_Graph_plotly.png" caption="Gráfico que incluye datos de tono medio de ambos extractos de vídeo" %} + +### Conclusiones +Al observar el gráfico, podemos ver que las trazas de Marte y la Tierra tienen rangos dinámicos muy diferentes en sus valores de tono medio. La traza de Marte es muy limitada y se mantiene dentro de los rangos rojo y amarillo (aproximadamente entre 100 y 160) en la mayoría del extracto. Esto sugiere algo sobre el uso del color en la película como un dispositivo retórico que sirve como mensaje propagandístico. Recuerda que esta sección presenta una visión antipática de la forma de vida y el sistema político marcianos: una población uniforme e infeliz, que depende de tecnología y transporte ineficientes mientras se les exige que observen la obediencia total a un gobernante supremo totalitario. 
La película conecta esta experiencia negativa con una paleta de tonos relativamente opacos de rojo y amarillo. También deberíamos considerar el público objetivo original de esta película, los jóvenes ciudadanos de los Estados Unidos en la década de 1950, y cómo probablemente habrían interpretado estas imágenes y usos del color en ese momento histórico. En particular, podemos considerar este uso del color en el contexto de las crecientes tensiones geopolíticas entre la Unión Soviética y los Estados Unidos y sus aliados en Europa occidental. El color rojo, específicamente, se usaba comúnmente en los medios impresos y de difusión para describir [la "amenaza" del comunismo global](https://es.wikipedia.org/wiki/Temor_rojo) durante esta era de la historia mundial. Además, la elección de presentar al líder totalitario marciano con una apariencia muy similar al icónico líder soviético [Joseph Stalin](https://es.wikipedia.org/wiki/I%C3%B3sif_Stalin) puede leerse como una señal visual y cultural explícita para la audiencia. Así, esta representación de Marte parece ser una caricatura alegórica de la vida bajo el velo del comunismo, tal como la percibe un observador externo y un oponente político/ideológico. Esta caricatura emplea no solo una paleta de colores limitada, sino una que está cargada con otras referencias culturales. El uso del color aprovecha los prejuicios y asociaciones que están presentes en el imaginario de la audiencia y, por lo tanto, está ligado estrechamente al argumento central de la película, que sostiene que el comunismo no es un sistema político viable. + +En contraste con el uso limitado del color en nuestro extracto de Marte, la traza de la Tierra cubre un rango dinámico mucho más amplio de valores de tono. En este pasaje, el emisario marciano está aprendiendo sobre el maravilloso y rico estilo de vida de los terrícolas gracias a un sistema capitalista y a la explotación de petróleo y de productos derivados de este.
La secuencia enfatiza la riqueza material y la libertad empresarial ofrecida bajo un sistema capitalista usando una variedad y vivacidad de color mucho mayor que en el extracto de Marte. Los productos comerciales y las personas se representan utilizando el espectro completo del proceso Technicolor, creando asociaciones positivas entre los resultados de la industria petrolera y el estilo de vida acomodado de quienes se benefician de él. Al igual que el extracto de Marte, a la audiencia se le ofrece una caricatura unilateral de un sistema político y una forma de vida, pero en esta sección la representación reduccionista es laudable y próspera en lugar de desoladora y opresiva. + +Como una pieza de propaganda, *Destination Earth* se basa en estas distinciones poderosas pero demasiado simplistas entre dos sistemas políticos para influir en la opinión pública y promover el consumo de productos derivados del petróleo. La manera en que se usa (o no se usa) el color es una herramienta importante para elaborar y enfatizar este mensaje. Además, una vez que podemos extraer datos de color y visualizarlos utilizando técnicas gráficas simples, podemos ver que la disparidad en el rango dinámico proporciona una medida cuantitativa para vincular el uso técnico y estético del color en esta película animada con la retórica propagandística presentada por sus productores. + +{% include figure.html filename="lovely_oil.png" caption="El petróleo y los ideales estadounidenses de riqueza y prosperidad se expresan en esplendor colorido" %} + +### Escalando el análisis de color con FFprobe +Uno de los límites de esta metodología es que estamos generando manualmente informes de color en un solo archivo a la vez. Si quisiéramos adoptar un enfoque de [visión distante](https://distantviewing.org/) más en línea con las metodologías tradicionales de Humanidades Digitales, podríamos emplear un script de Bash para ejecutar nuestro comando `ffprobe` en todos los archivos en un determinado directorio.
Esto es útil si, por ejemplo, un(a) investigador(a) está interesado(a) en realizar un análisis similar en [todas las películas animadas de John Sutherland encontradas en la colección de Archivos Prelinger](https://archive.org/details/prelinger&tab=collection?and%5B%5D=john+sutherland&sin=) u otro conjunto de material de vídeo de archivo. + +Una vez que tengas un conjunto de material para trabajar guardado en un solo lugar, puedes guardar el siguiente [bucle _for_ de Bash o "for loop"](https://www.shellscript.sh/loops.html) dentro del directorio y ejecutarlo para generar archivos `.csv` que contengan los mismos datos de tono medio a nivel de fotograma que extrajimos de nuestros extractos de *Destination Earth*. + +```bash +for file in *.m4v; do +ffprobe -f lavfi -i movie="$file",signalstats -show_entries frame=pkt_pts_time:frame_tags=lavfi.signalstats.HUEMED -print_format csv > "${file%.m4v}.csv"; +done +``` + +* `for file in *.m4v; do` = inicia el bucle _for_. Esta primera línea le dice a FFmpeg "para todos los archivos en este directorio con la extensión `.m4v`, ejecuta el siguiente comando." +* El `*` es un [comodín de Bash](https://tldp.org/LDP/GNU-Linux-Tools-Summary/html/x11655.htm) adjunto a un tipo de archivo dado para especificarlos como archivos de entrada. +* La palabra `file` es una variable arbitraria que representará cada archivo a medida que se ejecuta a través del bucle. +* `ffprobe -f lavfi -i movie="$file",signalstats -show_entries frame=pkt_pts_time:frame_tags=lavfi.signalstats.HUEMED -print_format csv > "${file%.m4v}.csv"; done` = el mismo comando de extracción de metadatos de color que ejecutamos en nuestros dos extractos de *Destination Earth*, con algunas pequeñas modificaciones en la sintaxis para explicar su uso en varios archivos en un directorio: + * `"$file"` = recuerda cada variable. Las comillas aseguran que se conserva el nombre de archivo original.
+ * `> "${file%.m4v}.csv";` = conserva el nombre de archivo original al escribir los archivos de salida `.csv`. Esto asegurará que los nombres de los archivos de vídeo originales coincidan con sus correspondientes reportes en `.csv`. + * `done` = termina el script una vez que se hayan completado todos los archivos del directorio. + +
    +También puedes usar signalstats para obtener otra información valiosa relacionada con el color. Consulta la documentación del filtro para obtener una lista completa de las métricas visuales disponibles. +
    + +Una vez que ejecutes este script, verás que cada archivo de vídeo en el directorio ahora tiene un archivo `.csv` correspondiente que contiene el conjunto de datos especificado. + +# En resumen +En este tutorial, hemos aprendido: + * cómo instalar FFmpeg en diferentes sistemas operativos y cómo acceder al _framework_ en el navegador web + * cuál es la sintaxis básica y la estructura de los comandos FFmpeg + * cómo visualizar metadatos técnicos básicos de un archivo audiovisual + * cómo transformar un archivo audiovisual a través de la transcodificación y el "re-wrapping" + * cómo analizar y editar ese archivo audiovisual separando sus componentes ("demux") y crear extractos + * cómo reproducir archivos audiovisuales usando `ffplay` + * cómo crear nuevos archivos de vídeo con vectorscopios integrados + * cómo exportar datos tabulares relacionados con el color de una pista de vídeo usando `ffprobe` + * cómo crear un bucle _for_ de Bash para extraer información de datos de color de múltiples archivos de vídeo con un solo comando + +A un nivel más amplio, este tutorial aspira a proporcionar una introducción informada y atractiva sobre cómo se pueden incorporar las herramientas y metodologías audiovisuales en los proyectos y las prácticas de Humanidades Digitales. Con herramientas abiertas y potentes como FFmpeg, existe un gran potencial para expandir el alcance del campo para incluir tipos de medios y análisis más ricos y complejos que nunca. + +# Más recursos +FFmpeg tiene una comunidad grande y bien apoyada de usuarios a través de todo el mundo. Como tal, hay muchos recursos gratuitos y de código abierto para descubrir nuevos comandos y técnicas para trabajar con materiales audiovisuales. Por favor, contacta al autor con cualquier adición a esta lista, especialmente si se trata de recursos educativos en español para aprender FFmpeg.
+ +* [La documentación oficial de FFmpeg](https://www.ffmpeg.org/ffmpeg.html) +* [FFmpeg Wiki](https://trac.ffmpeg.org/wiki/WikiStart) +* [ffmprovisr](https://amiaopensource.github.io/ffmprovisr/) de [La Asociación de Archivistas de Imágenes en Movimiento](https://amianet.org/?lang=es) +* [Entrenamiento de preservación audiovisual de Ashley Blewer](https://training.ashleyblewer.com/) +* [La presentación de Andrew Weaver: "Demystifying FFmpeg"](https://github.com/privatezero/NDSR/blob/master/Demystifying_FFmpeg_Slides.pdf) +* [FFmpeg: Presentación de Ben Turkus](https://docs.google.com/presentation/d/1NuusF948E6-gNTN04Lj0YHcVV9-30PTvkh_7mqyPPv4/present?ueb=true&slide=id.g2974defaca_0_231) +* [FFmpeg Cookbook for Archivists de Reto Kromer](https://avpres.net/FFmpeg/) + +## Programas de código abierto de análisis audiovisual que usan FFmpeg + +* [MediaInfo](https://mediaarea.net/en/MediaInfo) +* [QC Tools](https://bavc.org/preserve-media/preservation-tools) + +# Referencias + +* Champion, E. (2017) “Digital Humanities is text heavy, visualization light, and simulation poor,” Digital Scholarship in the Humanities 32(S1), i25-i32 + +* Hockey, S. (2004) “The History of Humanities Computing,” A Companion to Digital Humanities, ed. Susan Schreibman, Ray Siemens, John Unsworth. Oxford: Blackwell + +Este tutorial fue posible gracias al apoyo de la Academia Británica y fue escrito durante el Taller de _Programming Historian_ desarrollado en la Universidad de Los Andes en Bogotá, Colombia, entre el 31 de julio y 3 de agosto de 2018. 
diff --git a/es/lecciones/introduccion-a-imageplot-y-la-visualizacion-de-metadatos.md b/es/lecciones/introduccion-a-imageplot-y-la-visualizacion-de-metadatos.md index 4892098783..2d0399c5b3 100644 --- a/es/lecciones/introduccion-a-imageplot-y-la-visualizacion-de-metadatos.md +++ b/es/lecciones/introduccion-a-imageplot-y-la-visualizacion-de-metadatos.md @@ -41,7 +41,7 @@ Como paso final, utilizarás las medidas generadas a través de ImagePlot para c ### Información de trasfondo #### ¿Qué es ImagePlot? -Esta herramienta fue creada por el equipo de [Software Studies Initiative](http://lab.culturalanalytics.info/) para realizar visualizaciones que puedan abarcar la totalidad de una colección de imágenes y opera dentro de otro programa de libre acceso llamado [ImageJ](https://imagej.net/). Los creadores de ImagePlot utilizan el término "vista a distancia", *distant viewing* en inglés, para describir este tipo de visualización y análisis porque el punto de partida o enfoque principal es lo que se puede percibir de la totalidad de los elementos de la colección, en vez de uno o dos elementos a la vez. +Esta herramienta fue creada por el equipo de [Software Studies Initiative](https://lab.culturalanalytics.info/) para realizar visualizaciones que puedan abarcar la totalidad de una colección de imágenes y opera dentro de otro programa de libre acceso llamado [ImageJ](https://imagej.net/). Los creadores de ImagePlot utilizan el término "vista a distancia", *distant viewing* en inglés, para describir este tipo de visualización y análisis porque el punto de partida o enfoque principal es lo que se puede percibir de la totalidad de los elementos de la colección, en vez de uno o dos elementos a la vez. 
#### Visualización de metadatos @@ -51,7 +51,7 @@ No obstante, no necesitamos una colección inmensa para sacarle provecho a la vi #### La colección y los metadatos -Para esta lección utilizaremos una serie de imágenes provenientes de una colección de afiches del [Museo de la Palabra y la Imágen](https://web.archive.org/web/20201120143502/http://museo.com.sv/es/) en San Salvador, El Salvador. Los 394 afiches representados en los metadatos fueron creados durante los 12 años de la Guerra Civil de El Salvador (1980–1992), por más de 171 organizaciones de por lo menos 21 países. Los metadatos fueron preparados por personal de MUPI junto con el equipo de [Iniciativas Digitales de América Latina](http://ladi.lib.utexas.edu/)[^1] y la hoja de metadatos que utilizarás en esta lección es una versión modificada de la original. +Para esta lección utilizaremos una serie de imágenes provenientes de una colección de afiches del [Museo de la Palabra y la Imágen](https://web.archive.org/web/20201120143502/https://museo.com.sv/es/) en San Salvador, El Salvador. Los 394 afiches representados en los metadatos fueron creados durante los 12 años de la Guerra Civil de El Salvador (1980–1992), por más de 171 organizaciones de por lo menos 21 países. Los metadatos fueron preparados por personal de MUPI junto con el equipo de [Iniciativas Digitales de América Latina](https://ladi.lib.utexas.edu/)[^1] y la hoja de metadatos que utilizarás en esta lección es una versión modificada de la original. ## Imageplot Para utilizar ImagePlot debes comenzar descargando [ImageJ](https://imagej.net/ij/download.html) y luego descargar los cuatro macros de nuestro [repositorio en GitHub](https://github.com/programminghistorian/jekyll/tree/Issue-3275/assets/introduccion-a-imageplot-y-la-visualizacion-de-metadatos/)[^2]. Para organizar los archivos, puedes crear una carpeta con el nombre que prefieras para guardar los cuatro macros en un solo sitio. 
diff --git a/es/lecciones/introduccion-a-markdown.md b/es/lecciones/introduccion-a-markdown.md index 11b5a51ea6..338bb405a6 100644 --- a/es/lecciones/introduccion-a-markdown.md +++ b/es/lecciones/introduccion-a-markdown.md @@ -39,7 +39,7 @@ Dado que las lecciones de *The Programming Historian en español* deben ser envi ### ¿Qué es Markdown? -Markdown fue desarrollado en 2004 por [John Gruber](http://daringfireball.net/projects/markdown/), y se refiere tanto a (1) una manera de formar archivos de texto, como a (2) una utilidad del lenguaje de programación Perl para convertir archivos Markdown en HTML. En esta lección nos centraremos en la primera acepción y aprenderemos a escribir archivos utilizando la sintaxis de Markdown. +Markdown fue desarrollado en 2004 por [John Gruber](https://daringfireball.net/projects/markdown/), y se refiere tanto a (1) una manera de formar archivos de texto, como a (2) una utilidad del lenguaje de programación Perl para convertir archivos Markdown en HTML. En esta lección nos centraremos en la primera acepción y aprenderemos a escribir archivos utilizando la sintaxis de Markdown. Los archivos de texto plano tienen muchas ventajas sobre otro tipo de formato. Por un lado, se pueden leer prácticamente en todos los dispositivos. También han resistido la prueba del paso del tiempo mejor que otro tipo de archivos -si alguna vez has intentado abrir un documento guardado en un formato de [procesador de textos heredado](https://es.wikipedia.org/wiki/Sistema_heredado), estarás familiarizado con los problemas de compatibilidad que implican-. 
@@ -243,13 +243,13 @@ Los enlaces también se utilizan para crear notas a pie de página y son útiles Entonces puedes incluir el URL en otra parte del documento: -`[1]: http://programminghistorian.org/` +`[1]: https://programminghistorian.org/` Lo cual se despliega de la siguiente manera: Un ejemplo es el sitio *[The Programming Historian en español][1]* -[1]: http://programminghistorian.org/ +[1]: https://programminghistorian.org/ #### Imágenes @@ -325,7 +325,7 @@ Aunque Markdown se está haciendo cada vez más popular, particularmente para lo Markdown es un término medio muy útil entre los archivos de texto plano sin estilo y los documentos de procesadores de texto heredados. Su sintaxis simple se aprende rápidamente y es altamente legible en el mismo documento y cuando se transforma en HTML u otro tipo de documentos. En conclusión, escribir tus documentos en Markdown significa que serán capaces de ser utilizados y leídos a largo plazo. -[John Gruber]: http://daringfireball.net/projects/markdown/ +[John Gruber]: https://daringfireball.net/projects/markdown/ [Autoría sustentable utilizando Pandoc y Markdown]: /lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown [StackEdit]: https://stackedit.io [editor de StackEdit]: https://stackedit.io/editor diff --git a/es/lecciones/introduccion-a-powershell.md b/es/lecciones/introduccion-a-powershell.md index 0da782dce7..70140e41a9 100644 --- a/es/lecciones/introduccion-a-powershell.md +++ b/es/lecciones/introduccion-a-powershell.md @@ -33,9 +33,9 @@ doi: 10.46430/phes0037 # Introducción -En este tutorial aprenderás las bases de PowerShell de Windows, la interfaz de línea de comandos estándar de computadoras con Windows. Si eres usuario de Mac o Linux deberías consultar la [Introducción a la línea de comandos en Bash](/es/lecciones/introduccion-a-bash). 
Si ya estás familiarizado con el uso de Bash, es posible que puedas comenzar con PowerShell solamente con ver la [tabla al final de esta lección](#referencia-rapida). +En este tutorial aprenderás las bases de PowerShell de Windows, la interfaz de línea de comandos estándar de computadoras con Windows. Si eres usuario de Mac o Linux deberías consultar la [Introducción a la línea de comandos en Bash](/es/lecciones/introduccion-a-bash). Si ya estás familiarizado con el uso de Bash, es posible que puedas comenzar con PowerShell solamente con ver la [tabla al final de esta lección](#referencia-rápida). -El tutorial está dividido en dos secciones principales. En la primera sección, "[Para empezar](#para-empezar)", aprenderás a realizar tareas básicas de escritorio como crear y abrir archivos y carpetas con PowerShell. En la segunda sección, "[Haciendo más](#haciendo-más)", obtendrás un vistazo de algunas de las características que hacen que el trabajo en línea de comandos sea particularmente eficiente y aprendas lo básico para poder explorar más por tu cuenta. También te prepararás para [ejecutar *scripts* de Python desde la línea de comandos](#Utilización-de-herramientas-de-línea-de-comandos-y-ejecución-de-secuencias-de-comandos-en-Python). +El tutorial está dividido en dos secciones principales. En la primera sección, "[Para empezar](#para-empezar)", aprenderás a realizar tareas básicas de escritorio como crear y abrir archivos y carpetas con PowerShell. En la segunda sección, "[Haciendo más](#haciendo-más)", obtendrás un vistazo de algunas de las características que hacen que el trabajo en línea de comandos sea particularmente eficiente y aprendas lo básico para poder explorar más por tu cuenta. También te prepararás para [ejecutar *scripts* de Python desde la línea de comandos](#utilización-de-herramientas-de-línea-de-comandos-y-ejecución-de-secuencias-de-comandos-en-python). Este tutorial fue escrito para PowerShell 5.0. 
Si estás usando una versión anterior, encontrarás algunas pequeñas diferencias de sintaxis que debes ser capaz de superar con la pequeña ayuda de un buscador. diff --git a/es/lecciones/introduccion-a-tei-1.md b/es/lecciones/introduccion-a-tei-1.md index 280a85dd69..e180bd8f72 100644 --- a/es/lecciones/introduccion-a-tei-1.md +++ b/es/lecciones/introduccion-a-tei-1.md @@ -12,7 +12,7 @@ reviewers: - Rocío Méndez - Iñaki Cano review-ticket: https://github.com/programminghistorian/ph-submissions/issues/366 -next: introduccion-a-tei-2 +next: /es/lecciones/introduccion-a-tei-2 series_total: 2 lessons sequence: 1 difficulty: 2 diff --git a/es/lecciones/introduccion-a-tei-2.md b/es/lecciones/introduccion-a-tei-2.md index 1695d8236d..950ce5c1f8 100644 --- a/es/lecciones/introduccion-a-tei-2.md +++ b/es/lecciones/introduccion-a-tei-2.md @@ -12,7 +12,7 @@ editors: reviewers: - David Merino Recalde - Rosa María Muñoz Mendo -previous: introduccion-a-tei-1 +previous: /es/lecciones/introduccion-a-tei-1 series_total: 2 lessons sequence: 2 difficulty: 2 @@ -1429,7 +1429,7 @@ Con todo, OxGarage puede ser muy útil para extraer el texto de codificaciones d [^1]: Adoptaremos la convención de usar una `@` para denotar en esta lección un atributo de un elemento de XML. Sin embargo, ese signo no se usa en el código de XML, sino solo en la documentación (como esta lección). Por ejemplo, `@type` significa el atributo `type` en —digamos— `
    `. -[^2]: Al respecto véase la [primera lección](/es/lecciones/introduccion-a-tei-1#visualizaci%C3%B3n-vs-categorizaci%C3%B3n). +[^2]: Al respecto véase la [primera lección](/es/lecciones/introduccion-a-tei-1#visualización-vs-categorización). [^3]: Un "elemento de autocerrado" es un elemento de XML que no tiene contenido, por ejemplo: ``, que se abrevia así: ``. Nótese la barra invertida `/` *antes* del cierre de la etiqueta. Suelen usarse en TEI para los denominados elementos "hitos" (*milestones*), como los saltos de línea (``), saltos de página (``) y saltos de columna (``), que carecen de contenido y solo se usan para marcar un lugar preciso en el texto. Los procesadores de XML (como por ejemplo los navegadores web) automáticamente expanden estos elementos en su forma larga, de modo que son completamente sinónimos. diff --git a/es/lecciones/introduccion-al-web-scraping-usando-r.md b/es/lecciones/introduccion-al-web-scraping-usando-r.md index 41c1264d38..be12940c70 100644 --- a/es/lecciones/introduccion-al-web-scraping-usando-r.md +++ b/es/lecciones/introduccion-al-web-scraping-usando-r.md @@ -162,7 +162,7 @@ Existe otro lugar en el que podemos encontrar información sobre cómo interactu Este archivo está pensado principalmente para robots que hacen extracciones masivas del contenido de algunas páginas. Sin embargo, en él encontraremos información relevante para tareas más discretas, como las que realizaremos en esta serie de lecciones. -El documento robots.txt se encuentra en el directorio raíz de un sitio web, por lo tanto, en caso de estar disponible, podemos acceder a su contenido agregando "robots.txt" luego de la url principal. Por ejemplo, si quisiéramos revisar la versión de este archivo del sitio web del proyecto [Memoria Chilena](http://www.memoriachilena.gob.cl/) de la Biblioteca Nacional de Chile, tendríamos que escribir: `http://www.memoriachilena.gob.cl/robots.txt`. 
Eso nos llevará a [una página](https://perma.cc/37MD-HP8Y) con el siguiente contenido: +El documento robots.txt se encuentra en el directorio raíz de un sitio web, por lo tanto, en caso de estar disponible, podemos acceder a su contenido agregando "robots.txt" luego de la url principal. Por ejemplo, si quisiéramos revisar la versión de este archivo del sitio web del proyecto [Memoria Chilena](https://www.memoriachilena.gob.cl/) de la Biblioteca Nacional de Chile, tendríamos que escribir: `http://www.memoriachilena.gob.cl/robots.txt`. Eso nos llevará a [una página](https://perma.cc/37MD-HP8Y) con el siguiente contenido: ``` User-agent: * diff --git a/es/lecciones/introduccion-datos-abiertos-enlazados.md b/es/lecciones/introduccion-datos-abiertos-enlazados.md index 79f7235357..f285bc8503 100644 --- a/es/lecciones/introduccion-datos-abiertos-enlazados.md +++ b/es/lecciones/introduccion-datos-abiertos-enlazados.md @@ -47,7 +47,7 @@ Con el fin de proporcionar a los lectores una base sólida de los principios bá 1. La [web semántica](https://es.wikipedia.org/wiki/Web_sem%C3%A1ntica) y el [razonamiento semántico](https://en.wikipedia.org/wiki/Semantic_reasoner) de [conjuntos de datos](https://es.wikipedia.org/wiki/Conjunto_de_datos). Un razonador semántico deduciría que Jorge VI es el hermano o medio hermano de Eduardo VIII, dado el hecho de que a) Eduardo VIII es el hijo de Jorge V y b) Jorge VI es el hijo de Jorge V. Este tutorial no se centra en este tipo de tareas. -2. La creación y subida de conjuntos de datos abiertos enlazados a la [nube de datos enlazados](http://linkeddatacatalog.dws.informatik.uni-mannheim.de/state/). Compartir tus LOD es un principio importante, al que se anima más adelante. Sin embargo, los aspectos prácticos de contribuir con tus LOD a la nube de datos enlazados está fuera del alcance de esta lección. Al final de este tutorial hay algunos recursos disponibles que pueden ayudarte a comenzar con esta tarea. +2. 
La creación y subida de conjuntos de datos abiertos enlazados a la [nube de datos enlazados](https://linkeddatacatalog.dws.informatik.uni-mannheim.de/state/). Compartir tus LOD es un principio importante, al que se anima más adelante. Sin embargo, los aspectos prácticos de contribuir con tus LOD a la nube de datos enlazados está fuera del alcance de esta lección. Al final de este tutorial hay algunos recursos disponibles que pueden ayudarte a comenzar con esta tarea. ## Datos abiertos enlazados: ¿qué son? LOD es información estructurada en un formato destinado a las máquinas y, por tanto, no es necesariamente fácil de entender a primera vista. No te desanimes por esto, ya que una vez que entiendas los principios, puedes conseguir que una máquina los lea por ti. @@ -72,7 +72,7 @@ Vamos a crear un ejemplo con Jack Straw. Con este nombre propio podemos referirn persona=64183282 -A continuación, vamos a identificar al Jack Straw descrito por el *[Oxford Dictionary of National Biography](http://www.oxforddnb.com)* como 'el enigmático líder rebelde' con el número `33059614`. En consecuencia, su par atributo-valor sería el siguiente: +A continuación, vamos a identificar al Jack Straw descrito por el *[Oxford Dictionary of National Biography](https://www.oxforddnb.com)* como 'el enigmático líder rebelde' con el número `33059614`. En consecuencia, su par atributo-valor sería el siguiente: persona=33059614 @@ -82,7 +82,7 @@ Los pares atributo-valor también pueden almacenar información sobre otros tipo lugar=2655524 -En este momento podrías estar pensando, "esto es lo que hace el catálogo de la biblioteca". Es cierto que la idea clave aquí es la de [control de autoridades](https://es.wikipedia.org/wiki/Control_de_autoridades), que es central en biblioteconomía (un fichero de autoridad es una lista cerrada de términos que pueden ser utilizados en un contexto particular, por ejemplo cuando se cataloga un libro). 
En ambos ejemplos mencionados anteriormente, hemos utilizado los ficheros de autoridad para asignar los números (los identificadores únicos) a los Jacks y a Blackburn. Los números que utilizamos para los dos Jack Straws provienen del [Virtual International Authority File - Archivo de Autoridades Internacional Virtual](https://www.oclc.org/es/viaf.html) (VIAF), que es mantenido por un consorcio de bibliotecas de todo el mundo para tratar de abordar el problema de la miríada de formas en las que una misma persona podría ser nombrada. El identificador único que utilizamos para el distrito electoral de Blackburn provino de [GeoNames](http://www.geonames.org/), una base de datos geográfica gratuita. +En este momento podrías estar pensando, "esto es lo que hace el catálogo de la biblioteca". Es cierto que la idea clave aquí es la de [control de autoridades](https://es.wikipedia.org/wiki/Control_de_autoridades), que es central en biblioteconomía (un fichero de autoridad es una lista cerrada de términos que pueden ser utilizados en un contexto particular, por ejemplo cuando se cataloga un libro). En ambos ejemplos mencionados anteriormente, hemos utilizado los ficheros de autoridad para asignar los números (los identificadores únicos) a los Jacks y a Blackburn. Los números que utilizamos para los dos Jack Straws provienen del [Virtual International Authority File - Archivo de Autoridades Internacional Virtual](https://www.oclc.org/es/viaf.html) (VIAF), que es mantenido por un consorcio de bibliotecas de todo el mundo para tratar de abordar el problema de la miríada de formas en las que una misma persona podría ser nombrada. El identificador único que utilizamos para el distrito electoral de Blackburn provino de [GeoNames](https://www.geonames.org/), una base de datos geográfica gratuita. Pero intentemos ser más precisos por lo que entendemos por Blackburn en este caso. 
Jack Straw ejerció su cargo parlamentario en representación de Blackburn (que cuenta con un solo miembro en el parlamento británico). Los límites de Blackburn han cambiado con el paso del tiempo, así que en el proyecto '[Digging Into Linked Parliamentary Data](https://repository.jisc.ac.uk/6544/)' (Dilipad) (en el que trabajé) se crearon identificadores únicos para las afiliaciones a partidos y para los distritos electorales de cada miembro del parlamento. En este ejemplo, Jack Straw representó a la circunscripción conocida como 'Blackburn' en su encarnación posterior a 1955: @@ -126,18 +126,18 @@ En la sección anterior usamos dos números distintos para identificar nuestros El problema es que en todo el mundo hay muchas bases de datos que contienen personas con estos números, y probablemente sean personas diferentes. Fuera de nuestro contexto inmediato, estas cifras no identifican individuos únicos. Tratemos de arreglar eso. Aquí están estos mismos identificadores pero como URI: - http://viaf.org/viaf/64183282/ - http://viaf.org/viaf/33059614/ + https://viaf.org/viaf/64183282/ + https://viaf.org/viaf/33059614/ Así como el número único desambiguó nuestros dos Jack Straws, el URI completo anterior nos ayuda a eliminar la ambigüedad entre todos los diferentes archivos de autoridad que existen. En este caso, está claro que estamos usando VIAF como nuestro archivo de autoridad. Ya has visto esta forma de desambiguación muchas veces en la web. Hay muchos sitios web alrededor del mundo con páginas llamadas `/home` o `/faq`. Pero no hay confusión porque el [dominio](https://es.wikipedia.org/wiki/Dominio_de_Internet) (la primera parte del Localizador Uniforme de Recursos (URL) - por ejemplo,`bbc.co.uk`) es único y, por lo tanto, todas las páginas que son parte de ese dominio son únicas, diferenciándose de otras páginas `/faq` de otros sitios web. En la dirección `http://www.bbc.co.uk/faqs`, es la parte `bbc.co.uk` la que hace únicas las páginas siguientes. 
Esto es tan obvio para las personas que usan la web todo el tiempo que no piensan en ello. Probablemente también sepas que si quieres iniciar un sitio web llamado `bbc.co.uk` no puedes hacerlo, porque ese nombre ya se ha registrado con la autoridad correspondiente, que es el [Sistema de Nombres de Dominio](https://es.wikipedia.org/wiki/Sistema_de_nombres_de_dominio) (Domain Name System - DNS). El registro garantiza la unicidad. Los URIs también deben ser únicos. Si bien los ejemplos anteriores se parecen a las URLs, es posible también construir un URI que no se parezca en nada a una URL. Tenemos muchas maneras de identificar personas y cosas de manera única y rara vez lo pensamos o nos preocupamos de ello. Los códigos de barras, los números de pasaporte e incluso tu dirección postal están diseñados para ser únicos. En el mundo desarrollado los números de teléfono móvil se colocan con frecuencia en los carteles de las tiendas precisamente porque son únicos. Todos ellos podrían usarse como URIs. -Cuando quisimos crear URIs para las entidades descritas por el proyecto '[Tobias](http://www.history.ac.uk/projects/digital/tobias)', elegimos una estructura tipo URL y elegimos utilizar nuestro espacio web institucional, dejando de lado `data.history.ac.uk/tobias-project/` como un lugar dedicado a alojar estos URI. Al ponerlo en `data.history.ac.uk` en lugar de en `history.ac.uk`, hubo una separación clara entre los URI y las páginas del sitio web. Por ejemplo, uno de los URIs del proyecto Tobias era http://data.history.ac.uk/tobias-project/person/15601. Si bien el formato de los URI mencionados anteriormente es el mismo que el de una URL, no se vinculan a páginas web (intenta pegarlas en un navegador web). Muchas personas nuevas con los LOD encuentran esto confuso. Todas las URL son URI, pero no todas las URI son URL. Una URI puede describir cualquier cosa, mientras que una URL describe la ubicación de algo en la web. 
Es decir, una URL te dice la ubicación de una página web o un archivo o algo similar. Un URI simplemente hace el trabajo de identificar algo. Así como el Número Estándar Internacional de Libro, o [ISBN](https://www.iso.org/standard/36563.html) 978-0-1-873354-6 identifica de manera única una edición de tapa dura de _Bautismo, Hermandad y Creencias en la Reforma de Alemania_ por Kat Hill, pero no te dice dónde conseguir una copia. Para eso, necesitarías algo como una [signatura](https://www.upo.es/biblioteca/guia_loc_sig/signatura/index.html), que te da una ubicación exacta en un estante de una biblioteca específica. +Cuando quisimos crear URIs para las entidades descritas por el proyecto '[Tobias](https://www.history.ac.uk/projects/digital/tobias)', elegimos una estructura tipo URL y elegimos utilizar nuestro espacio web institucional, dejando de lado `data.history.ac.uk/tobias-project/` como un lugar dedicado a alojar estos URI. Al ponerlo en `data.history.ac.uk` en lugar de en `history.ac.uk`, hubo una separación clara entre los URI y las páginas del sitio web. Por ejemplo, uno de los URIs del proyecto Tobias era https://data.history.ac.uk/tobias-project/person/15601. Si bien el formato de los URI mencionados anteriormente es el mismo que el de una URL, no se vinculan a páginas web (intenta pegarlas en un navegador web). Muchas personas nuevas con los LOD encuentran esto confuso. Todas las URL son URI, pero no todas las URI son URL. Una URI puede describir cualquier cosa, mientras que una URL describe la ubicación de algo en la web. Es decir, una URL te dice la ubicación de una página web o un archivo o algo similar. Un URI simplemente hace el trabajo de identificar algo. Así como el Número Estándar Internacional de Libro, o [ISBN](https://www.iso.org/standard/36563.html) 978-0-1-873354-6 identifica de manera única una edición de tapa dura de _Bautismo, Hermandad y Creencias en la Reforma de Alemania_ por Kat Hill, pero no te dice dónde conseguir una copia. 
Para eso, necesitarías algo como una [signatura](https://www.upo.es/biblioteca/guia_loc_sig/signatura/index.html), que te da una ubicación exacta en un estante de una biblioteca específica. Hay un poco de jerga alrededor de los URIs. La gente habla de si son, o no, [desreferenciables](https://es.wikipedia.org/wiki/Referencia_(inform%C3%A1tica)). Eso solo significa que *¿se puede pasar desde una referencia abstracta a otra cosa?* Por ejemplo, si pegas un URI en la barra de direcciones de un navegador, ¿devolverá algo? El URI de VIAF para el historiador Simon Schama es: - http://viaf.org/viaf/46784579 + https://viaf.org/viaf/46784579 Si lo pones en el navegador, obtendrás una página web sobre Simon Schama que contiene datos estructurados sobre él y su historial de publicaciones. Esto es muy útil, pero, por otro lado, no es obvio desde la URI a quién o incluso a qué se refiere. Del mismo modo, si tratamos un número de teléfono móvil (con código internacional) como URI para una persona, entonces debería ser desreferenciable. Alguien podría responder el teléfono, e incluso podría ser Schama. @@ -162,10 +162,10 @@ Estamos poniendo ejemplos simplemente con el fin de ilustrar, pero si deseas enl Una ontología es más flexible porque no es jerárquica. Su objetivo es representar la fluidez del mundo real, donde las cosas se pueden relacionar entre sí de formas más complejas que las representadas por una estructura jerárquica de tipo arbóreo. En cambio, una ontología es más como una tela de araña. -Sea lo que sea que desees representar con los LOD, te sugerimos que busques un vocabulario existente y lo uses, en lugar de intentar escribir el tuyo propio. Esta página principal incluye [una lista de algunos de los vocabularios más populares](http://semanticweb.org/wiki/Main_Page.html) +Sea lo que sea que desees representar con los LOD, te sugerimos que busques un vocabulario existente y lo uses, en lugar de intentar escribir el tuyo propio. 
Esta página principal incluye [una lista de algunos de los vocabularios más populares](https://semanticweb.org/wiki/Main_Page.html) > N.T.: desplázate hacia la zona derecha/abajo de la página: "Popular Vocabularies" -Dado que nuestro anterior ejemplo se centra en los pianistas, sería una buena idea encontrar una ontología adecuada en lugar de crear nuestro propio sistema. De hecho, hay [una ontología para la música](http://web.archive.org/web/20170715094229/http://www.musicontology.com/). Además de una especificación bien desarrollada, tiene también algunos ejemplos útiles de su uso. Puedes echar un vistazo a las páginas de [Introducción](http://web.archive.org/web/20170718143925/http://musicontology.com/docs/getting-started.html) para tener una idea de cómo puedes usar esa ontología particular. +Dado que nuestro anterior ejemplo se centra en los pianistas, sería una buena idea encontrar una ontología adecuada en lugar de crear nuestro propio sistema. De hecho, hay [una ontología para la música](https://web.archive.org/web/20170715094229/https://www.musicontology.com/). Además de una especificación bien desarrollada, tiene también algunos ejemplos útiles de su uso. Puedes echar un vistazo a las páginas de [Introducción](https://web.archive.org/web/20170718143925/https://musicontology.com/docs/getting-started.html) para tener una idea de cómo puedes usar esa ontología particular. Lamentablemente, no encuentro nada que describa la relación entre un profesor y un alumno en Music Ontology. Pero la ontología se publica en abierto, así que puedo usarla para describir otras características de la música y luego crear mi propia extensión. Si luego publico mi extensión en abierto, otros pueden usarla si lo desean y puede convertirse en un estándar. Si bien el proyecto Music Ontology no tiene la relación que necesito, el proyecto [Linked Jazz](https://linkedjazz.org/) permite el uso de 'mentorDe', que parece que podría funcionar bien en nuestro caso. 
Aunque esta no es la solución ideal, conviene esforzarse por usar lo que ya existe. @@ -174,7 +174,7 @@ Ahora bien, si estuvieras estudiando la historia de los pianistas, querrías ide Dame los nombres de todos los pianistas enseñados por x donde x fue enseñado a tocar el piano por Liszt -La consulta devolvería todas las personas en el conjunto de datos que fueron alumnos de un alumno de Liszt. No nos entusiasmemos demasiado: esta consulta no nos dará a cada alumno de cada alumno de Liszt que haya existido alguna vez porque esa información probablemente no exista y no exista dentro de ningún grupo de tripletas existente. Lidiar con datos del mundo real muestra todo tipo de omisiones e inconsistencias que veremos cuando analicemos el mayor conjunto de LOD, [DBpedia](http://wiki.dbpedia.org), en la sección final. +La consulta devolvería todas las personas en el conjunto de datos que fueron alumnos de un alumno de Liszt. No nos entusiasmemos demasiado: esta consulta no nos dará a cada alumno de cada alumno de Liszt que haya existido alguna vez porque esa información probablemente no exista y no exista dentro de ningún grupo de tripletas existente. Lidiar con datos del mundo real muestra todo tipo de omisiones e inconsistencias que veremos cuando analicemos el mayor conjunto de LOD, [DBpedia](https://wiki.dbpedia.org), en la sección final. Si has utilizado [bases de datos relacionales](https://en.wikipedia.org/wiki/Relational_database), podrías pensar que pueden realizar la misma función. En el caso de Liszt, la información sobre pianistas descrita anteriormente podría organizarse en una [tabla](https://es.wikipedia.org/wiki/Base_de_datos_relacional) de base de datos llamada algo así como 'Alumnos'. 
@@ -211,19 +211,19 @@ Reconocer qué serialización estás viendo significa que puedes elegir las herr Turtle usa alias o atajos conocidos como [prefijos](https://www.w3.org/TeamSubmission/turtle/#sec-tutorial), lo que nos ahorra tener que escribir URIs completos todo el tiempo. Regresemos al URI que inventamos en la sección anterior: - http://data.history.ac.uk/tobias-project/person/15601 + https://data.history.ac.uk/tobias-project/person/15601 No queremos escribir esto cada vez que nos referimos a esta persona (Jack Straw, como recordarás). Entonces sólo tenemos que anunciar nuestro atajo: - @prefix toby: . + @prefix toby: . Así, Jack es `toby:15601`, que reemplaza el URI largo y es más fácil de leer. He elegido 'toby', pero podría haber elegido cualquier cadena de letras con la misma facilidad. Pasemos ahora de Jack Straw a William Shakespeare y usemos Turtle para describir algunos elementos sobre sus obras. Tendremos que decidir qué archivos de autoridad usar, un proceso que, como se mencionó anteriormente, se optimiza si consultamos otros conjuntos de LOD. Aquí usaremos [Dublin Core](https://es.wikipedia.org/wiki/Dublin_Core), un estándar de [metadatos](https://es.wikipedia.org/wiki/Metadato) usado por las bibliotecas, como uno de nuestros prefijos, el archivo de autoridad del [Número de control de la Biblioteca del Congreso](https://es.wikipedia.org/wiki/Library_of_Congress_Control_Number) para otro, y el último (VIAF) debería serte familiar. En conjunto, estos tres archivos de autoridad proporcionan identificadores únicos para todas las entidades que planeo usar en este ejemplo: - @prefix lccn: . - @prefix dc: . - @prefix viaf: . + @prefix lccn: . + @prefix dc: . + @prefix viaf: . lccn:n82011242 dc:creator viaf:96994048 . @@ -233,9 +233,9 @@ En el ejemplo anterior, lccn: n82011242 representa a Macbeth; dc: creator vincul Turtle también te permite listar tripletas sin molestarte en repetir cada URI cuando acabas de usarlo. 
Agreguemos la fecha en la que los expertos creen que Macbeth fue escrita utilizando el par atributo-valor de Dublin Core:`dc: created 'YYYY'` : - @prefix lccn: . - @prefix dc: . - @prefix viaf: . + @prefix lccn: . + @prefix dc: . + @prefix viaf: . lccn: n82011242 dc: creator viaf: 96994048 ; dc: created "1606" . @@ -252,11 +252,11 @@ Puedes usar un punto y coma si el sujeto es el mismo pero el predicado y el obje Aquí estamos diciendo que Shakespeare (96994048) y John Fletcher (12323361) fueron los creadores de la obra *Los dos nobles caballeros*. -Cuando anteriormente vimos las ontologías, sugerí que le echaras un vistazo a los ejemplos de la [Music Ontology](http://web.archive.org/web/20170718143925/http://musicontology.com/docs/getting-started.html). Espero que no te decepcionaran. Echa un vistazo de nuevo ahora. Todavía es algo complicado, pero ¿tiene más sentido ahora? +Cuando anteriormente vimos las ontologías, sugerí que le echaras un vistazo a los ejemplos de la [Music Ontology](https://web.archive.org/web/20170718143925/https://musicontology.com/docs/getting-started.html). Espero que no te decepcionaran. Echa un vistazo de nuevo ahora. Todavía es algo complicado, pero ¿tiene más sentido ahora? Una de las ontologías más accesibles es Friend of a Friend, o [FOAF](https://es.wikipedia.org/wiki/FOAF). Está diseñada para describir personas y es, quizás por esa razón, bastante intuitiva. Si, por ejemplo, deseas escribirme para decirme que este tutorial es lo mejor que has leído, aquí está mi dirección de correo electrónico expresada como tripletas en FOAF: - @prefix foaf: . + @prefix foaf: . :"Jonathan Blaney" foaf:mbox . #### RDF/XML @@ -267,10 +267,10 @@ En contraste con Turtle, RDF/XML puede parecer un poco pesado. 
Para empezar, con En RDF/XML, con los prefijos declarados dentro del fragmento XML, es así: - - - + + + @@ -278,29 +278,29 @@ El formato RDF/XML tiene la misma información básica que Turtle, pero se escri Pasemos a un ejemplo diferente para mostrar cómo RDF/XML combina tripletas y, al mismo tiempo, presentamos [SKOS](https://es.wikipedia.org/wiki/Simple_Knowledge_Organization_System) (Simple Knowledge Organization System - Sistema Simple de Organización del Conocimiento), que está diseñado para codificar tesauros o taxonomías. - + Abdication -Aquí estamos diciendo que el concepto SKOS `21250`, abdicación, tiene una etiqueta preferida de "abdicación". La forma en que funciona es que el elemento sujeto (incluida la parte de abdicación, que es un valor de atributo en términos XML) tiene el predicado y el objeto anidados dentro de él. El elemento anidado es el predicado y el nodo hoja [(*the leaf node*)](https://es.wikipedia.org/wiki/%C3%81rbol_(inform%C3%A1tica)#Terminolog.C3.ADa), es el objeto. Este ejemplo está tomado de un proyecto para publicar un [tesauro de historia británica e irlandesa](http://www.history.ac.uk/projects/digital/tobias). +Aquí estamos diciendo que el concepto SKOS `21250`, abdicación, tiene una etiqueta preferida de "abdicación". La forma en que funciona es que el elemento sujeto (incluida la parte de abdicación, que es un valor de atributo en términos XML) tiene el predicado y el objeto anidados dentro de él. El elemento anidado es el predicado y el nodo hoja [(*the leaf node*)](https://es.wikipedia.org/wiki/%C3%81rbol_(inform%C3%A1tica)#Terminolog.C3.ADa), es el objeto. Este ejemplo está tomado de un proyecto para publicar un [tesauro de historia británica e irlandesa](https://www.history.ac.uk/projects/digital/tobias). Al igual que con Turtle, podemos agregar más tripletas. Entonces, declaremos que el término más restringido en nuestra jerarquía de temas, uno más abajo de *Abdicación* será *Crisis de la abdicación (1936)*. 
- + Abdication - - + + ¿Recuerdas cómo los predicados y los objetos están anidados dentro del sujeto? Aquí lo hemos hecho dos veces con el mismo sujeto, por lo que podemos hacer esto menos detallado al anidar ambos conjuntos de predicados y objetos dentro de un sujeto: - + Abdication - + -Si estás familiarizado con XML, esto será muy fácil para ti. Si no lo estás, podrías preferir un formato como Turtle. Pero la ventaja aquí es que creando tu RDF/XML puedes usar las herramientas habituales disponibles para XML, como editores y analizadores XML, para verificar que tu RDF/XML esté formateado correctamente. Si no tienes experiencia con XML, recomiendo Turtle, para lo que puedes usar una herramienta [en línea](http://www.easyrdf.org/converter) para verificar que tu sintaxis sea correcta. +Si estás familiarizado con XML, esto será muy fácil para ti. Si no lo estás, podrías preferir un formato como Turtle. Pero la ventaja aquí es que creando tu RDF/XML puedes usar las herramientas habituales disponibles para XML, como editores y analizadores XML, para verificar que tu RDF/XML esté formateado correctamente. Si no tienes experiencia con XML, recomiendo Turtle, para lo que puedes usar una herramienta [en línea](https://www.easyrdf.org/converter) para verificar que tu sintaxis sea correcta. ## Consultas RDF con SPARQL @@ -356,7 +356,7 @@ Volvamos a los resultados de la consulta que ejecuté hace un momento: Puedo ver una larga lista en la columna etiquetada como _c_ . Estos son todos los atributos que Roper tiene en la DBpedia y nos ayudarán a encontrar otras personas con estos atributos. Por ejemplo, puedo ver ```http://dbpedia.org/class/yago/Historian110177150```. ¿Puedo usar esto para obtener una lista de historiadores? Voy a poner esto en mi consulta pero en tercer lugar (porque ahí es donde estaba cuando lo encontré en los resultados de Lyndal Roper). 
Mi consulta se ve así: SELECT * WHERE { - ?historian_name ?predicate + ?historian_name ?predicate } He hecho un pequeño cambio aquí. Si esta consulta funciona entonces espero que mis historiadores estén en la primera columna, porque "historiador" no parece ser un predicado: no funciona como un verbo en una oración; así que voy a llamar a mi primera columna de resultados 'nombre_historiador' y a mi segunda (de la que no sé nada) 'predicado'. @@ -370,8 +370,8 @@ Así que esto funciona para crear listas, lo cual es útil, pero sería mucho m SELECT ?name WHERE { - ?name ?b . - ?name ?b + ?name ?b . + ?name ?b } ¡Funciona! Obtengo cinco resultados. En el momento de escribir, hay cinco historiadoras británicas en *DBpedia*... @@ -380,7 +380,7 @@ Así que esto funciona para crear listas, lo cual es útil, pero sería mucho m ¿Solo cinco mujeres británicas historiadoras? Por supuesto que hay, en realidad, muchas más que eso, como podríamos demostrar fácilmente sustituyendo el nombre de, digamos, Alison Weir en nuestra primera consulta de Lyndal Roper. Esto nos lleva al problema con *Dbpedia* que mencioné anteriormente: no está marcado de manera consistente con información estructural del tipo que usa *DBpedia*. Nuestra consulta puede enumerar algunas historiadoras británicas, pero resulta que no podemos usarla para generar una lista significativa de personas en esta categoría. Todo lo que hemos encontrado es la gente en las entradas en Wikipedia que alguien ha decidido categorizar como "Historiador británico" y "mujer historiadora". -Con SPARQL en *DBpedia*, debes tener cuidado con las inconsistencias del material de múltiples fuentes. 
Podrías usar SPARQL exactamente de la misma manera en un conjunto de datos más mantenido, por ejemplo, los datos del gobierno del Reino Unido: [https://data-gov.tw.rpi.edu//sparql]() y esperar obtener resultados más sólidos (hay un breve resumen tutorial para este conjunto de datos aquí: [https://data-gov.tw.rpi.edu/wiki/A\_crash\_course\_in\_SPARQL]()). +Con SPARQL en *DBpedia*, debes tener cuidado con las inconsistencias del material de múltiples fuentes. Podrías usar SPARQL exactamente de la misma manera en un conjunto de datos más mantenido, por ejemplo, los datos del gobierno del Reino Unido: [https://data-gov.tw.rpi.edu//sparql](https://data-gov.tw.rpi.edu//sparql) y esperar obtener resultados más sólidos (hay un breve resumen tutorial para este conjunto de datos aquí: [https://data-gov.tw.rpi.edu/wiki/A\_crash\_course\_in\_SPARQL](https://data-gov.tw.rpi.edu/wiki/A\_crash\_course\_in\_SPARQL)). Sin embargo, a pesar de sus inconsistencias, *DBpedia* es un gran lugar para aprender SPARQL. Esto solo ha sido una breve introducción pero hay mucho más en ['Uso de SPARQL para acceder a datos abiertos enlazados'](/es/lecciones/sparql-datos-abiertos-enlazados). @@ -392,7 +392,7 @@ Sin embargo, a pesar de sus inconsistencias, *DBpedia* es un gran lugar para apr * Bob DuCharme, *Learning SPARQL*, O'Reilly, 2011 -* El blog de [Bob DuCharme](http://www.snee.com/bobdc.blog/) merece la pena leerlo también. +* El blog de [Bob DuCharme](https://www.snee.com/bobdc.blog/) merece la pena leerlo también. * Richard Gartner, *Metadata: Shaping Knowledge from Antiquity to the Semantic Web*, Springer, 2016 @@ -404,15 +404,15 @@ Sin embargo, a pesar de sus inconsistencias, *DBpedia* es un gran lugar para apr * Dominic Oldman, Martin Doerr y Stefan Gradmann, "Zen and the Art of Linked Data: New Strategies for a Semantic Web of Humanist Knowledge", in *A New Companion to Digital Humanities*, editado por Susan Schreibman et al. 
-* Max Schmachtenberg, Christian Bizer y Heiko Paulheim, [State of the LOD Cloud 2017](http://linkeddatacatalog.dws.informatik.uni-mannheim.de/state/) +* Max Schmachtenberg, Christian Bizer y Heiko Paulheim, [State of the LOD Cloud 2017](https://linkeddatacatalog.dws.informatik.uni-mannheim.de/state/) * David Wood, Marsha Zaidman y Luke Ruth, *Linked Data: Structured data on the Web*, Manning, 2014 -* Biblioteca del Congreso Nacional de Chile, [Linked Open Data: ¿Qué es?](http://datos.bcn.cl/es/informacion/que-es) +* Biblioteca del Congreso Nacional de Chile, [Linked Open Data: ¿Qué es?](https://datos.bcn.cl/es/informacion/que-es) * Ana-Isabel Torre-Bastida, Marta González-Rodríguez y Esther Villar-Rodríguez, [Datos abiertos enlazados (LOD) y su implantación en bibliotecas: iniciativas y tecnologías](https://web.archive.org/web/20180720105638/https://recyt.fecyt.es/index.php/EPI/article/download/epi.2015.mar.04/18804) ## Agradecimientos -El autor del tutorial agradece a los revisores del tutorial original, Matthew Lincoln y a Terhi Nurmikko-Fuller, y al editor, Admam Cyrmble, por dedicar tiempo generosamente a ayudarle a mejorar este tutorial con numerosas sugerencias, aclaraciones y correcciones. Esta lección se basa en un trabajo perteneciente al “Tesauro de historia Británica e Irlandsesa como SKOS” (proyecto [Tobias](http://www.history.ac.uk/projects/digital/tobias)), financiado por el [AHRC](http://www.ahrc.ac.uk/). Ha sido revisado para *The Programming Historian*. +El autor del tutorial agradece a los revisores del tutorial original, Matthew Lincoln y a Terhi Nurmikko-Fuller, y al editor, Adam Crymble, por dedicar tiempo generosamente a ayudarle a mejorar este tutorial con numerosas sugerencias, aclaraciones y correcciones. Esta lección se basa en un trabajo perteneciente al “Tesauro de historia Británica e Irlandesa como SKOS” (proyecto [Tobias](https://www.history.ac.uk/projects/digital/tobias)), financiado por el [AHRC](https://www.ahrc.ac.uk/). 
Ha sido revisado para *The Programming Historian*. diff --git a/es/lecciones/introduccion-e-instalacion.md b/es/lecciones/introduccion-e-instalacion.md index fb8fc408d5..0b53b06e5d 100644 --- a/es/lecciones/introduccion-e-instalacion.md +++ b/es/lecciones/introduccion-e-instalacion.md @@ -27,7 +27,7 @@ abstract: "Esta primera lección de nuestra sección sobre recursos en línea es original: introduction-and-installation avatar_alt: Grabado de una serpiente de cascabel doi: 10.46430/phes0016 -next: ver-archivos-html +next: /es/lecciones/ver-archivos-html sequence: 1 series_total: 15 --- diff --git a/es/lecciones/introduccion-map-warper.md b/es/lecciones/introduccion-map-warper.md index 4a16d548b0..62c6280a06 100644 --- a/es/lecciones/introduccion-map-warper.md +++ b/es/lecciones/introduccion-map-warper.md @@ -30,7 +30,7 @@ doi: 10.46430/phes0048 Map Warper fue diseñada para georreferenciar mapas antiguos -mapamundis, portulanos, cartas náuticas, planos topográficos, planos arquitectónicos, cartas geográficas-, fotografías aéreas y demás materiales cartográficos contenidos en las colecciones de caracter patrimonial. En tal sentido, la herramienta nos posibilita la generación de material georreferenciado para trabajo en escritorio -rásteres- o en linea -Map Server-, útiles para vincular a sistemas de información geográfico (QGIS, JOSM, ArcGIS, Google Earth, World Map, otros). Asimismo, la herramienta ayuda a descentralizar y agilizar los procesos de georreferenciación, catalogación y visualización, ya que su plataforma crea un entorno de colaboración abierta. 
-Gracias a sus características, la herramienta es útil a investigadores, profesores y estudiantes, como a instituciones que están desarrollando procesos de digitalización, visualización y experimentación del material cartográfico de sus colecciones, o para el desarrollo de proyectos en humanidades espaciales, como son los caso de la [Mapoteca Digital](http://bibliotecanacional.gov.co/es-co/colecciones/biblioteca-digital/mapoteca) de la Biblioteca Nacional de Colombia, [Cartografía de Bogotá](http://cartografia.bogotaendocumentos.com/) de la Universidad Nacional de Colombia, [Paisajes coloniales: redibujando los territorios andinos en el siglo XVII](https://paisajescoloniales.com/) de la Universidad de los Andes (Colombia). +Gracias a sus características, la herramienta es útil a investigadores, profesores y estudiantes, como a instituciones que están desarrollando procesos de digitalización, visualización y experimentación del material cartográfico de sus colecciones, o para el desarrollo de proyectos en humanidades espaciales, como son los casos de la [Mapoteca Digital](https://bibliotecanacional.gov.co/es-co/colecciones/biblioteca-digital/mapoteca) de la Biblioteca Nacional de Colombia, [Cartografía de Bogotá](https://cartografia.bogotaendocumentos.com/) de la Universidad Nacional de Colombia, [Paisajes coloniales: redibujando los territorios andinos en el siglo XVII](https://paisajescoloniales.com/) de la Universidad de los Andes (Colombia). ### Lo que aprenderás en este tutorial @@ -77,7 +77,7 @@ La pestaña “Metadata” visualiza la información cumplimentada en la etapa d En la versión de Map Warper que se encuentra actualmente disponible ya no es posible añadir un mapa base.
    -En este tutorial explicaremos el proceso de georreferenciación con el [Mapa Cafetero de la República de Colombia](http://catalogoenlinea.bibliotecanacional.gov.co/custom/web/content/mapoteca/fmapoteca_984_figac_16/fmapoteca_984_figac_16.html) de la Mapoteca Digital de la Biblioteca Nacional de Colombia. El documento cartográfico lo publicó la Federación Nacional de Cafeteros de Colombia en 1933, en una época en donde el café era la industria agrícola rectora de la economía colombiana, como resultado del primer censo cafetero del país realizado en 1932. +En este tutorial explicaremos el proceso de georreferenciación con el [Mapa Cafetero de la República de Colombia](https://catalogoenlinea.bibliotecanacional.gov.co/custom/web/content/mapoteca/fmapoteca_984_figac_16/fmapoteca_984_figac_16.html) de la Mapoteca Digital de la Biblioteca Nacional de Colombia. El documento cartográfico lo publicó la Federación Nacional de Cafeteros de Colombia en 1933, en una época en donde el café era la industria agrícola rectora de la economía colombiana, como resultado del primer censo cafetero del país realizado en 1932. Recordamos que en caso de no tener cargada cartografía alguna, se podrá utilizar los mapas del siguiente listado, y en caso de recurrir al [listado](/assets/introduccion-map-warper/map-warper.csv) resaltar el mapa seleccionado en el interior del listado. 
@@ -97,10 +97,10 @@ En este aspecto, para comprender mejor el desarrollo de esta acción técnica, d ~~~ Google Maps: https://mt1.google.com/vt/lyrs=r&x={x}&y={y}&z={z} -Google Satellite: http://www.google.cn/maps/vt?lyrs=s@189&gl=cn&x={x}&y={y}&z={z} -Bing Satélite: http://ecn.t3.tiles.virtualearth.net/tiles/a{q}.jpeg?g=0&dir=dir_n’ -CARTO dark: http://a.basemaps.cartocdn.com/dark_all/{z}/{x}/{y}.png -Stamen Watercolor: http://tile.stamen.com/watercolor/{z}/{x}/{y}.jpg +Google Satellite: https://www.google.cn/maps/vt?lyrs=s@189&gl=cn&x={x}&y={y}&z={z} +Bing Satélite: https://ecn.t3.tiles.virtualearth.net/tiles/a{q}.jpeg?g=0&dir=dir_n +CARTO dark: https://a.basemaps.cartocdn.com/dark_all/{z}/{x}/{y}.png +Stamen Watercolor: https://tile.stamen.com/watercolor/{z}/{x}/{y}.jpg ~~~ ![Add control point](/images/introduccion-map-warper/es-or-introduccion-map-warper-icon3.gif): El botón *Add control point* (agregar punto de control), ubicado en ambos recuadros, sirve para posicionar los puntos de control que relacionan el mapa vinculado o seleccionado con la capa base predeterminada o establecida. diff --git a/es/lecciones/lectura-escalable-de-datos-estructurados.md b/es/lecciones/lectura-escalable-de-datos-estructurados.md index 38b0c5a10a..57a54ec125 100644 --- a/es/lecciones/lectura-escalable-de-datos-estructurados.md +++ b/es/lecciones/lectura-escalable-de-datos-estructurados.md @@ -79,7 +79,7 @@ A continuación explicamos los tres pasos tanto en términos generales como espe Si deseas reproducir el análisis que presentamos a continuación, usando no solo el marco conceptual general sino también el código, asumimos que ya tienes un conjunto de datos que contiene datos de Twitter en formato JSON. Si todavía no tienes un conjunto de datos, puede conseguir uno de las siguientes maneras: -1. 
Utilizando una de las API de Twitter, por ejemplo, su API denominada "Essential" (Esencial), disponible gratuitamente, que usamos para construir el conjunto de datos utilizado en el ejemplo (ver más información sobre las API en esta sección [Introduction to Populating a Website with API Data](/en/lessons/introduction-to-populating-a-website-with-api-data#what-is-application-programming-interface-api)). Este enlace te llevará a las [Opciones de API de Twitter](https://developer.twitter.com/en/docs/twitter-api/getting-started/about-twitter-api). Puedes utilizar el paquete rtweet, con tu propia cuenta de Twitter para acceder a la API de Twitter a través de R, como describimos más adelante. +1. Utilizando una de las API de Twitter, por ejemplo, su API denominada "Essential" (Esencial), disponible gratuitamente, que usamos para construir el conjunto de datos utilizado en el ejemplo (ver más información sobre las API en esta sección [Introduction to Populating a Website with API Data](/en/lessons/introduction-to-populating-a-website-with-api-data)). Este enlace te llevará a las [Opciones de API de Twitter](https://developer.twitter.com/en/docs/twitter-api/getting-started/about-twitter-api). Puedes utilizar el paquete rtweet, con tu propia cuenta de Twitter para acceder a la API de Twitter a través de R, como describimos más adelante. 2. Usando la [Beginner's Guide to Twitter Data](/en/lessons/beginners-guide-to-twitter-data) de _Programming Historian_ (en inglés). En lugar de elegir CSV como formato de "output" (salida), elige JSON. En R se trabaja con paquetes, cada uno de los cuales añade numerosas funcionalidades a sus elementos básicos. Los paquetes suelen ser códigos creados por la comunidad, que se colocan a disposición para su reutilización. Cuando usamos paquetes nos estamos apoyando sobre los hombros de otros programadores. En nuestro ejemplo los paquetes relevantes son los siguientes: **rtweet**, **tidyverse**, **lubridate** y **jsonlite**. 
Para instalar paquetes en R consulta esta sección de la lección [Procesamiento básico de textos en R](/es/lecciones/procesamiento-basico-de-textos-en-r). Para utilizar los paquetes en R hay que cargarlos con la función `library()` como se indica a continuación: @@ -122,7 +122,7 @@ Si aún no has obtenido datos de Twitter y deseas seguir los ejemplos de código La exploración de las dimensiones cronológicas de un dataset puede facilitar la primera revisión analítica de tus datos. Si estás estudiando la evolución de un único fenómeno a lo largo del tiempo (como nuestro caso de estudio sobre los acontecimientos específicos que estimularon los debates en torno a *Plaza Sesamo*), entender cómo este fenómeno ganó tracción y/o cómo disminuyó el interés puede ser revelador en cuanto a su importancia. Este puede ser el primer paso para comprender cómo se relacionan todos los datos compilados con el fenómeno a lo largo del tiempo. El interés por la dispersión temporal también puede estar relacionado no con un acontecimiento, sino con la distribución total de un conjunto de datos basado en una serie de categorías. -Por ejemplo, si estuvieras trabajando con los datos de la National Gallery, podrías explorar la distribución de sus colecciones según los diferentes períodos de la historia del arte para establecer qué períodos están mejor representados en el conjunto de datos de la National Gallery. El conocimiento de la dispersión temporal del conjunto de datos puede ayudar a contextualizar los datos seleccionados para una lectura atenta en el [Paso 3](#paso-3:-selecion-sistematica-y-reproducible-de-datos-individuales-para-la-lectura-atenta), pues te dará una idea de cómo un dato específico se relaciona con la cronología del conjunto de datos completo. 
+Por ejemplo, si estuvieras trabajando con los datos de la National Gallery, podrías explorar la distribución de sus colecciones según los diferentes períodos de la historia del arte para establecer qué períodos están mejor representados en el conjunto de datos de la National Gallery. El conocimiento de la dispersión temporal del conjunto de datos puede ayudar a contextualizar los datos seleccionados para una lectura atenta en el [Paso 3](#paso-3-selección-sistemática-y-reproducible-de-datos-individuales-para-la-lectura-atenta), pues te dará una idea de cómo un dato específico se relaciona con la cronología del conjunto de datos completo. ### Ejemplo de dispersión temporal de un dataset: datos de Twitter @@ -199,7 +199,7 @@ En este momento deberías tener un gráfico que representa la dispersión puntua Utilizar una lógica binaria para explorar un conjunto de datos puede ser una forma relativamente sencilla, en comparación con otros métodos digitales, de encontrar relaciones importantes en tu conjunto de datos. Las relaciones binarias son fáciles de contar usando código computacional, y pueden revelar estructuras sistemáticas y definitorias de tus datos. En nuestro caso, nos interesaban las relaciones de poder en Twitter, y en la esfera pública de forma general. Por lo tanto, exploramos las diferencias entre las cuentas verificadas y las no verificadas, ya que las cuentas verificadas son designadas de esa forma debido a su estatus público fuera de la plataforma. Sin embargo, puede que a ti te interese otra cosa, como saber cuántos tweets eran originales o retweets. En ambos casos puedes utilizar los metadatos existentes registrados en tu conjunto de datos para formular una pregunta que pueda responderse utilizando una lógica binaria: el tweet procede de una cuenta verificada, ¿sí o no?; el tweet es un retweet, ¿sí o no?. Ahora supongamos que estás trabajando con datos de la National Gallery. 
En este caso podrías explorar el sesgo de género en las colecciones y si la institución ha favorecido la adquisición de obras de arte de personas registradas como hombres en su catálogo. Para responder esa pregunta, podrías organizar tu conjunto de datos de manera que puedas artistas hombres (este artista está registrado como un hombre, ¿sí o no?). O, por ejemplo, si te interesa la distribución de las colecciones de las artistas danesas frente a las artistas internacionales, los datos podrían organizarse en una estructura binaria que te permitiría responder si tal artista está registrada como danesa, ¿sí o no? -Las relaciones binarias pueden formar un contexto para tu lectura atenta de los datos seleccionados en el [Paso 3](#Paso-3:-Selección-sistemática-y-reproducible-de-datos-individuales-para-la-lectura-atenta). Conocer la distribución de los datos en dos categorías también te permitirá establecer la representatividad de un dato específico con respecto a la distribución de esta categoría en el conjunto de datos completo. Por ejemplo, si en el paso 3 eliges trabajar con los 20 tweets con más likes, podrás ver que aunque haya muchos tweets de cuentas verificadas en este grupo seleccionado, estas cuentas no están bien representadas en el conjunto de datos; los 20 tweets con más likes que has seleccionado no son, por tanto, representativos de los tweets de la mayoría de las cuentas de tu conjunto de datos, sino que representan un porcentaje pequeño, pero muy "likeado". Si decides trabajar con las 20 obras de arte más expuestas en un conjunto de datos de la National Gallery, una exploración binaria de las artistas danesas frente a las no danesas podría mostrarte que, aunque las 20 obras más expuestas eran de artistas internacionales, en general estas artistas estaban poco representadas en las colecciones de la National Gallery. 
+Las relaciones binarias pueden formar un contexto para tu lectura atenta de los datos seleccionados en el [Paso 3](#paso-3-selección-sistemática-y-reproducible-de-datos-individuales-para-la-lectura-atenta). Conocer la distribución de los datos en dos categorías también te permitirá establecer la representatividad de un dato específico con respecto a la distribución de esta categoría en el conjunto de datos completo. Por ejemplo, si en el paso 3 eliges trabajar con los 20 tweets con más likes, podrás ver que aunque haya muchos tweets de cuentas verificadas en este grupo seleccionado, estas cuentas no están bien representadas en el conjunto de datos; los 20 tweets con más likes que has seleccionado no son, por tanto, representativos de los tweets de la mayoría de las cuentas de tu conjunto de datos, sino que representan un porcentaje pequeño, pero muy "likeado". Si decides trabajar con las 20 obras de arte más expuestas en un conjunto de datos de la National Gallery, una exploración binaria de las artistas danesas frente a las no danesas podría mostrarte que, aunque las 20 obras más expuestas eran de artistas internacionales, en general estas artistas estaban poco representadas en las colecciones de la National Gallery. ### Ejemplo de una exploración binaria: datos de Twitter @@ -322,7 +322,7 @@ Esta visualización se parece mucho a los gráficos de barras anteriores, pero l ## Paso 3: Selección sistemática y reproducible de datos individuales para la lectura atenta -Una de las grandes ventajas de combinar la lectura atenta con la distante es la posibilidad de hacer una selección sistemática y reproducible de datos específicos para la lectura atenta. 
Una vez que hayas explorado tu conjunto de datos usando los dos tipos diferentes de lectura distante descritos en el [Paso 1](#Paso-1:-exploración-cronológica-de-un-conjunto-de-datos) y el [Paso 2](#Paso-2:-Explorando-un-conjunto-de-datos-mediante-la-creación-de-categorías-analíticas-binarias), podrás utilizar estos conocimientos para seleccionar sistemáticamente datos individuales para una lectura atenta. La lectura atenta te permitirá explorar tendencias interesantes de tus datos, y desentrañar tus tópicos elegidos para investigarlos en profundidad. +Una de las grandes ventajas de combinar la lectura atenta con la distante es la posibilidad de hacer una selección sistemática y reproducible de datos específicos para la lectura atenta. Una vez que hayas explorado tu conjunto de datos usando los dos tipos diferentes de lectura distante descritos en el [Paso 1](#paso-1-exploración-cronológica-de-un-conjunto-de-datos) y el [Paso 2](#paso-2-explorando-un-conjunto-de-datos-mediante-la-creación-de-categorías-analíticas-binarias), podrás utilizar estos conocimientos para seleccionar sistemáticamente datos individuales para una lectura atenta. La lectura atenta te permitirá explorar tendencias interesantes de tus datos, y desentrañar tus tópicos elegidos para investigarlos en profundidad. El número de datos individuales que decidas leer de forma atenta dependerá del fenómeno que estés investigando, del tiempo que tengas a disposición y de la complejidad de los datos. Por ejemplo, analizar obras de arte individuales puede requerir mucho más tiempo que la lectura de tweets individuales, sin embargo, esto puede cambiar de acuerdo a tu objetivo. Por lo tanto, es importante ser sistemático en la selección de los datos individuales para garantizar su conformidad con las preguntas de la investigación. 
En nuestro caso, queríamos saber cómo los tweets con más likes representaban a *Plaza Sésamo*; cómo estos hablaban del programa y su historia, cómo se relacionaban con otros medios, y cómo se representaba el programa visualmente, por ejemplo, con imágenes, enlaces a vídeos, memes, etc. Considerando la interesante relación entre la escasa representación y el alto nivel de interacción de los tweets de las cuentas verificadas, quisimos hacer una lectura atenta de los 20 tweets con más likes (verificados y no verificados), y también del top 20 de tweets publicados por cuentas no verificadas para ver si estas hablaban de la serie y su historia de forma diferente. Elegimos el top 20 porque nos pareció que era una tarea que podíamos llevar a cabo con el tiempo que disponíamos. @@ -445,7 +445,7 @@ Ahora estás lista para copiar las URL del dataframe y examinar los tweets indiv ## Conclusión: continuando con la lectura atenta -Cuando hayas seleccionado los datos individuales que deseas leer atentamente ([Paso 3](#paso-3-selecion-sistematica-y-reproducible-de-datos-individuales-para-la-lectura-atenta)), los métodos iniciales de la lectura distante exploratoria ([Paso 1](#Paso-1:-exploración-cronológica-de-un-conjunto-de-datos) y [Paso 2](#Paso-2:-Explorando-un-conjunto-de-datos-mediante-la-creación-de-categorías-analíticas-binarias)) podrán utilizarse de forma combinada, como un contexto altamente cualificado para tu análisis en profundidad. Volviendo a la exploración cronológica ([Paso 1](#Paso-1:-exploración-cronológica-de-un-conjunto-de-datos)), podrás observar dónde están situados los datos que seleccionaste para analizar individualmente en tu conjunto total. Con esta información puedes, por ejemplo, considerar qué diferencia puede tener si los datos están situados de forma inicial o tardía en comparación con la distribución general de los datos, o qué significa si los datos individuales seleccionados hacen parte de un pico. 
Con respecto a las estructuras binarias ([Paso 2](#Paso-2:-Explorando-un-conjunto-de-datos-mediante-la-creación-de-categorías-analíticas-binarias)), la lectura distante puede ayudarte a determinar si un dato individual es un "outlier", un dato individual que se desvía significativamente del resto en un conjunto, o sí es representativo de una tendencia más amplia en los datos, como también indagar que tan grande es la porción del conjunto de datos que representa con relación a una característica determinada. En el ejemplo de los datos de Twitter, demostramos cómo la lectura atenta de datos individuales seleccionados pueden ser contextualizados con la lectura distante. +Cuando hayas seleccionado los datos individuales que deseas leer atentamente ([Paso 3](#paso-3-selección-sistemática-y-reproducible-de-datos-individuales-para-la-lectura-atenta)), los métodos iniciales de la lectura distante exploratoria ([Paso 1](#paso-1-exploración-cronológica-de-un-conjunto-de-datos) y [Paso 2](#paso-2-explorando-un-conjunto-de-datos-mediante-la-creación-de-categorías-analíticas-binarias)) podrán utilizarse de forma combinada, como un contexto altamente cualificado para tu análisis en profundidad. Volviendo a la exploración cronológica ([Paso 1](#paso-1-exploración-cronológica-de-un-conjunto-de-datos)), podrás observar dónde están situados los datos que seleccionaste para analizar individualmente en tu conjunto total. Con esta información puedes, por ejemplo, considerar qué diferencia puede tener si los datos están situados de forma inicial o tardía en comparación con la distribución general de los datos, o qué significa si los datos individuales seleccionados hacen parte de un pico. 
Con respecto a las estructuras binarias ([Paso 2](#paso-2-explorando-un-conjunto-de-datos-mediante-la-creación-de-categorías-analíticas-binarias)), la lectura distante puede ayudarte a determinar si un dato individual es un "outlier", un dato individual que se desvía significativamente del resto en un conjunto, o si es representativo de una tendencia más amplia en los datos, como también indagar qué tan grande es la porción del conjunto de datos que representa con relación a una característica determinada. En el ejemplo de los datos de Twitter, demostramos cómo la lectura atenta de datos individuales seleccionados puede ser contextualizada con la lectura distante. La exploración cronológica puede ayudarte a determinar dónde están posicionados los tweets seleccionados para la lectura atenta con relación a un evento que te interese. Tal vez un tweet haya sido publicado antes que la mayoría, lo que indica que fue, tal vez, parte de una primera mirada sobre un determinado tema. Mientras que un tweet "tardío", tal vez sea más reflexivo o retrospectivo. Para determinar esto tendrás que realizar una lectura atenta y analizar los tweets seleccionados utilizando algunos métodos tradicionales de las humanidades, sin embargo, la lectura distante puede ayudarte a matizar y contextualizar tu análisis. Lo mismo ocurre con las estructuras binarias y los criterios utilizados para seleccionar los 20 tweets con más likes. Si sabes que un tweet proviene de una cuenta verificada o no, y si fue uno de los que más likes tuvo, entonces puedes compararlo con las tendencias generales de estos parámetros en el conjunto de datos cuando hagas tu lectura atenta. Esto te ayudará a robustecer tus argumentos en el caso de un análisis en profundidad de un dato individual, ya que sabrás lo que representa con relación al evento de forma general, al debate o al tema que estés investigando.
diff --git a/es/lecciones/limpieza-de-datos-con-OpenRefine.md b/es/lecciones/limpieza-de-datos-con-OpenRefine.md index bc4355cb12..73ac524f5f 100644 --- a/es/lecciones/limpieza-de-datos-con-OpenRefine.md +++ b/es/lecciones/limpieza-de-datos-con-OpenRefine.md @@ -39,7 +39,7 @@ doi: 10.46430/phes0017 ## Objetivos de la lección -No confíes ciegamente en tus datos. Ese es el mensaje clave de este tutorial que se centra en mostrar cómo los investigadores pueden diagnosticar y proceder sobre la exactitud de los datos. En esta lección aprenderás los principios y la práctica de la limpieza de datos, así como la forma de usar [*OpenRefine*](http://openrefine.org/) para realizar cuatro tareas esenciales que te ayudarán a limpiar tus datos: +No confíes ciegamente en tus datos. Ese es el mensaje clave de este tutorial que se centra en mostrar cómo los investigadores pueden diagnosticar y proceder sobre la exactitud de los datos. En esta lección aprenderás los principios y la práctica de la limpieza de datos, así como la forma de usar [*OpenRefine*](https://openrefine.org/) para realizar cuatro tareas esenciales que te ayudarán a limpiar tus datos: 1. Eliminar registros duplicados 2. Separar varios valores contenidos en el mismo campo @@ -58,9 +58,9 @@ Tiempo atrás los historiadores debieron confiar en los especialistas en tecnolo Las IDTs se asemejan a los programas de hojas de cálculo de escritorio con los que todos estamos familiarizados, con los que comparten algunas funcionalidades. Por ejemplo, puedes utilizar una aplicación como Microsoft Excel para ordenar los datos basándote en filtros numéricos, alfabéticos y desarrollados a medida, lo que te permite detectar errores con mayor facilidad. Configurar estos filtros en una hoja de cálculo puede resultar difícil, ya que son una funcionalidad secundaria. 
De forma genérica se puede decir que las hojas de cálculo están diseñadas para trabajar en filas y celdas individuales mientras que las IDTs operan en grandes rangos de datos a la vez. Estas “super-hojas de cálculo” ofrecen una interfaz integrada y fácil de usar a través de la cual los usuarios finales pueden detectar y corregir errores. -En los últimos años se han desarrollado varias herramientas de propósito general para la transformación interactiva de datos, tales como [*Potter’s Wheel ABC*](https://perma.cc/Q6QD-E64N) y [*Wrangler*](http://vis.stanford.edu/papers/wrangler/) (actualmente [*Trifacta Wrangler*](https://api.trifacta.com/saas-pro/index.html)). Aquí nos centraremos específicamente en [*OpenRefine*](http://openrefine.org/) (anteriormente Freebase Gridworks y Google Refine) pues, en opinión de los autores, es la herramienta más fácil de usar para procesar y limpiar eficientemente grandes cantidades de datos en una interfaz basada en navegador. +En los últimos años se han desarrollado varias herramientas de propósito general para la transformación interactiva de datos, tales como [*Potter’s Wheel ABC*](https://perma.cc/Q6QD-E64N) y [*Wrangler*](https://vis.stanford.edu/papers/wrangler/) (actualmente [*Trifacta Wrangler*](https://api.trifacta.com/saas-pro/index.html)). Aquí nos centraremos específicamente en [*OpenRefine*](https://openrefine.org/) (anteriormente Freebase Gridworks y Google Refine) pues, en opinión de los autores, es la herramienta más fácil de usar para procesar y limpiar eficientemente grandes cantidades de datos en una interfaz basada en navegador. 
-Además del perfilado de datos y las operaciones de limpieza, las extensiones de [*OpenRefine*] permiten a los usuarios identificar conceptos en texto no estructurado, un proceso denominado [reconocimiento de nombres de entidades](https://es.wikipedia.org/wiki/Reconocimiento_de_nombres_de_entidades) ([*named-entity recognition*](http://en.wikipedia.org/wiki/Named-entity_recognition), NER, en inglés), pudiendo también cotejar[^1] sus propios datos con bases de conocimiento existentes. Así, [*OpenRefine*] puede ser una práctica herramienta para vincular datos con conceptos y autoridades que ya han sido publicadas en la Web por instituciones como la [*Biblioteca del Congreso de los EEUU*](http://www.loc.gov/index.html) u [OCLC](http://www.oclc.org/home.en.html). La limpieza de datos es un requisito previo para estos pasos; la tasa de éxito del NER y un proceso de coincidencia fructífera entre tus datos y las autoridades externas depende de tu capacidad para hacer tus datos tan coherentes como sea posible. +Además del perfilado de datos y las operaciones de limpieza, las extensiones de [*OpenRefine*] permiten a los usuarios identificar conceptos en texto no estructurado, un proceso denominado [reconocimiento de nombres de entidades](https://es.wikipedia.org/wiki/Reconocimiento_de_nombres_de_entidades) ([*named-entity recognition*](https://en.wikipedia.org/wiki/Named-entity_recognition), NER, en inglés), pudiendo también cotejar[^1] sus propios datos con bases de conocimiento existentes. Así, [*OpenRefine*] puede ser una práctica herramienta para vincular datos con conceptos y autoridades que ya han sido publicadas en la Web por instituciones como la [*Biblioteca del Congreso de los EEUU*](https://www.loc.gov/index.html) u [OCLC](https://www.oclc.org/home.en.html). 
La limpieza de datos es un requisito previo para estos pasos; la tasa de éxito del NER y un proceso de coincidencia fructífera entre tus datos y las autoridades externas depende de tu capacidad para hacer tus datos tan coherentes como sea posible. ## Descripción del ejercicio Powerhouse Museum diff --git a/es/lecciones/manipular-cadenas-de-caracteres-en-python.md b/es/lecciones/manipular-cadenas-de-caracteres-en-python.md index d2d491e543..67acc4bd1a 100644 --- a/es/lecciones/manipular-cadenas-de-caracteres-en-python.md +++ b/es/lecciones/manipular-cadenas-de-caracteres-en-python.md @@ -19,8 +19,8 @@ translation-reviewer: - Antonio Rojas Castro review-ticket: https://github.com/programminghistorian/ph-submissions/issues/43 layout: lesson -next: de-html-a-lista-de-palabras-1 -previous: trabajar-con-paginas-web +next: /es/lecciones/de-html-a-lista-de-palabras-1 +previous: /es/lecciones/trabajar-con-paginas-web original: manipulating-strings-in-python python_warning: false difficulty: 2 diff --git a/es/lecciones/mineria-de-datos-en-internet-archive.md b/es/lecciones/mineria-de-datos-en-internet-archive.md index 0c1d1eae4e..bf904a66e9 100644 --- a/es/lecciones/mineria-de-datos-en-internet-archive.md +++ b/es/lecciones/mineria-de-datos-en-internet-archive.md @@ -23,7 +23,7 @@ difficulty: 2 activity: acquiring topics: [web-scraping] original: data-mining-the-internet-archive -redirect_from: /es/lessons/data-mining-the-internet-archive +redirect_from: /es/lessons/data-mining-the-internet-archive/ abstract: | Las colecciones del Internet Archive incluyen una gran cantidad de fuentes históricas digitalizadas. Muchas de ellas contienen datos bibliográficos importantes en un formato llamado MARC. En esta lección aprenderás a usar Python para automatizar la descarga de archivos MARC en grandes cantidades desde el Internet Archive, así como el análisis sintáctico de archivos MARC con información específica tal como autores, lugar de publicación y fechas. 
La lección puede aplicarse de una manera general para otros elementos del Internet Archive así como en archivos MARC en cualquier otro repositorio. avatar_alt: Grabado de mineros trabajando en la construcción de un túnel. @@ -400,37 +400,37 @@ Desde luego, para que esta técnica sea útil se requiere hacer algo de [limpiez [^1]: Agradezco a [Shawn Graham](https://hypothes.is/a/AVKeGm0rvTW_3w8Lypo1) por señalar la dependencia de `six` en `pymarc` y brindar una solución. -[Internet Archive]: http://archive.org/ +[Internet Archive]: https://archive.org/ [JSTOR Early Journal Content]: https://archive.org/details/jstor_ejc [biblioteca personal de John Adams]: https://archive.org/details/johnadamsBPL [colección Haití]: https://archive.org/details/jcbhaiti -[Ian Milligan]: http://activehistory.ca/2013/09/the-internet-archive-rocks-or-two-million-plus-free-sources-to-explore/ -[Anti-Slavery Collection]: http://archive.org/details/bplscas +[Ian Milligan]: https://activehistory.ca/2013/09/the-internet-archive-rocks-or-two-million-plus-free-sources-to-explore/ +[Anti-Slavery Collection]: https://archive.org/details/bplscas [internetarchive]: https://pypi.python.org/pypi/internetarchive [pymarc]: https://pypi.python.org/pypi/pymarc/ -[esta carta]: http://archive.org/details/lettertowilliaml00doug -[manuscrito original]: http://archive.org/stream/lettertowilliaml00doug/39999066767938#page/n0/mode/2up -[múltiples archivos]: http://archive.org/download/lettertowilliaml00doug -[Dublin Core]: http://archive.org/download/lettertowilliaml00doug/lettertowilliaml00doug_dc.xml -[MARCXML]: http://archive.org/download/lettertowilliaml00doug/lettertowilliaml00doug_marc.xml -[formato MARC 21 de la Biblioteca del Congreso para datos bibliográficos]: http://www.loc.gov/marc/bibliographic/ -[cientos de cartas, manuscritos y publicaciones antiesclavistas]: http://archive.org/search.php?query=collection%3Abplscas&sort=-publicdate +[esta carta]: https://archive.org/details/lettertowilliaml00doug 
+[manuscrito original]: https://archive.org/stream/lettertowilliaml00doug/39999066767938#page/n0/mode/2up +[múltiples archivos]: https://archive.org/download/lettertowilliaml00doug +[Dublin Core]: https://archive.org/download/lettertowilliaml00doug/lettertowilliaml00doug_dc.xml +[MARCXML]: https://archive.org/download/lettertowilliaml00doug/lettertowilliaml00doug_marc.xml +[formato MARC 21 de la Biblioteca del Congreso para datos bibliográficos]: https://www.loc.gov/marc/bibliographic/ +[cientos de cartas, manuscritos y publicaciones antiesclavistas]: https://archive.org/search.php?query=collection%3Abplscas&sort=-publicdate [eBook and Texts]: https://archive.org/details/texts -[elementos y sus URL están estructurados]: http://blog.archive.org/2011/03/31/how-archive-org-items-are-structured/ +[elementos y sus URL están estructurados]: https://blog.archive.org/2011/03/31/how-archive-org-items-are-structured/ [búsqueda avanzada]: https://archive.org/advancedsearch.php [esta página]: https://archive.org/search.php?query=collection%3A%28bplscas%29 -[buscar en el Internet Archive usando el módulo de Python que instalamos]: http://internetarchive.readthedocs.io/en/latest/quickstart.html#searching -[búsqueda avanzada en una colección]: http://archive.org/search.php?query=collection%3Abplscas -[downloading]: http://internetarchive.readthedocs.io/en/latest/quickstart.html#downloading +[buscar en el Internet Archive usando el módulo de Python que instalamos]: https://archive.org/developers/internetarchive/cli.html#cli-search +[búsqueda avanzada en una colección]: https://archive.org/search.php?query=collection%3Abplscas +[downloading]: https://archive.org/developers/internetarchive/cli.html#cli-download [remember those?]: /lessons/code-reuse-and-modularity [son nombrados de acuerdo a reglas específicas]: https://archive.org/about/faqs.php#140 -[manejo de excepciones]: http://docs.python.org/2/tutorial/errors.html#handling-exceptions -[reglas específicas para el campo 260]: 
http://www.loc.gov/marc/bibliographic/bd260.html -[estándares MARC]: http://www.loc.gov/marc/ +[manejo de excepciones]: https://docs.python.org/2/tutorial/errors.html#handling-exceptions +[reglas específicas para el campo 260]: https://www.loc.gov/marc/bibliographic/bd260.html +[estándares MARC]: https://www.loc.gov/marc/ [1]: https://github.com/edsu/pymarc [algunas funciones que provee para trabajar con archivos MARC XML]: https://github.com/edsu/pymarc/blob/master/pymarc/marcxml.py [Contar frecuencias]: /es/lecciones/contar-frecuencias [lección de introducción a Google Maps]: /lessons/googlemaps-googleearth -[nube de palabras en Wordle]: https://web.archive.org/web/20201202151557/http://www.wordle.net/ +[nube de palabras en Wordle]: https://web.archive.org/web/20201202151557/https://www.wordle.net/ [limpieza de tus datos]: /lessons/cleaning-ocrd-text-with-regular-expressions [Instalar módulos de Python con pip]: /es/lecciones/instalar-modulos-python-pip diff --git a/es/lecciones/normalizar-datos.md b/es/lecciones/normalizar-datos.md index 2590d798b2..d37759ce39 100644 --- a/es/lecciones/normalizar-datos.md +++ b/es/lecciones/normalizar-datos.md @@ -21,8 +21,8 @@ translation-reviewer: - Antonio Rojas Castro review-ticket: https://github.com/programminghistorian/ph-submissions/issues/46 layout: lesson -next: contar-frecuencias -previous: de-html-a-lista-de-palabras-2 +next: /es/lecciones/contar-frecuencias +previous: /es/lecciones/de-html-a-lista-de-palabras-2 original: normalizing-data python_warning: false difficulty: 2 @@ -174,12 +174,12 @@ Para seguir a lo largo de las lecciones futuras es importante que tengas los arc - python-es-lecciones4.zip ([zip sync][]) [De HTML a lista de palabras (parte 2)]: /es/lecciones/de-html-a-lista-de-palabras-2 -[web page]: http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33 +[web page]: https://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33 [De HTML a lista de palabras (parte 1)]: 
/es/lecciones/de-html-a-lista-de-palabras-1 [Manipular cadenas de caracteres en Python]: /es/lecciones/manipular-cadenas-de-caracteres-en-python -[Unicode]: http://unicode.org/ -[soporte de Python]: https://web.archive.org/web/20180502053841/http://www.diveintopython.net/xml_processing/unicode.html -[Dive into Python]: https://web.archive.org/web/20180416143856/http://www.diveintopython.net/regular_expressions/index.html +[Unicode]: https://unicode.org/ +[soporte de Python]: https://web.archive.org/web/20180502053841/https://www.diveintopython.net/xml_processing/unicode.html +[Dive into Python]: https://web.archive.org/web/20180416143856/https://www.diveintopython.net/regular_expressions/index.html [zip]: /assets/python-es-lecciones3.zip [zip sync]: /assets/python-es-lecciones4.zip [página Web]: https://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33 diff --git a/es/lecciones/palabras-clave-en-contexto-n-grams.md b/es/lecciones/palabras-clave-en-contexto-n-grams.md index b3608bee40..52f0e1feee 100644 --- a/es/lecciones/palabras-clave-en-contexto-n-grams.md +++ b/es/lecciones/palabras-clave-en-contexto-n-grams.md @@ -20,8 +20,8 @@ translation-reviewer: - Antonio Rojas Castro review-ticket: https://github.com/programminghistorian/ph-submissions/issues/50 layout: lesson -next: salida-palabras-clave-contexto-ngrams -previous: salida-de-datos-como-archivo-html +next: /es/lecciones/salida-palabras-clave-contexto-ngrams +previous: /es/lecciones/salida-de-datos-como-archivo-html original: keywords-in-context-using-n-grams python_warning: false difficulty: 2 @@ -182,9 +182,9 @@ Para seguir a lo largo de las lecciones futuras es importante que tengas los arc - python-es-lecciones8.zip ([zip sync][]) - [Salida de datos como archivo HTML]: /es/lecciones/salida-de-datos-como-archivo-html - [Contar frecuencias de palabras]: /es/lecciones/contar-frecuencias - [1]: salida-de-datos-como-archivo-html - [archivo zip de las lecciones anteriores]: 
/assets/python-es-lecciones7.zip - [Manipular cadenas de caracteres en Python]: /es/lecciones/manipular-cadenas-de-caracteres-en-python - [zip sync]: /assets/python-es-lecciones8.zip + [Salida de datos como archivo HTML]: /es/lecciones/salida-de-datos-como-archivo-html + [Contar frecuencias de palabras]: /es/lecciones/contar-frecuencias + [1]: /es/lecciones/salida-de-datos-como-archivo-html + [archivo zip de las lecciones anteriores]: /assets/python-es-lecciones7.zip + [Manipular cadenas de caracteres en Python]: /es/lecciones/manipular-cadenas-de-caracteres-en-python + [zip sync]: /assets/python-es-lecciones8.zip diff --git a/es/lecciones/poniendo-omeka-a-funcionar.md b/es/lecciones/poniendo-omeka-a-funcionar.md index 8b73c6aee1..3fdaf1c895 100644 --- a/es/lecciones/poniendo-omeka-a-funcionar.md +++ b/es/lecciones/poniendo-omeka-a-funcionar.md @@ -32,15 +32,15 @@ doi: 10.46430/phes0022 -[Omeka.net](http://www.omeka.net) facilita la creación de sitios web para mostrar colecciones de ítems. -> *Nota de la traductora*: Antes de empezar es importante aclarar las diferencias entre **Omeka.net** y **Omeka.org**. Este tutorial es sobre **Omeka.net**, una plataforma de publicación en línea que permite a cualquier persona con una cuenta de acceso crear o colaborar en un sitio web para exhibir colecciones y construir exposiciones digitales. **Omeka.net** es una extensión de **Omeka.org** que está disponible para bajar e instalar en un servidor de tu propiedad. La traducción al español del tutorial de *The Programming Historian* sobre [cómo instalar Omeka](/lessons/installing-omeka) en un servidor se encuentra en proceso. +[Omeka.net](https://www.omeka.net) facilita la creación de sitios web para mostrar colecciones de ítems. +> *Nota de la traductora*: Antes de empezar es importante aclarar las diferencias entre **Omeka.net** y **Omeka.org**.
Este tutorial es sobre **Omeka.net**, una plataforma de publicación en línea que permite a cualquier persona con una cuenta de acceso crear o colaborar en un sitio web para exhibir colecciones y construir exposiciones digitales. **Omeka.net** es una extensión de **Omeka.org** que está disponible para bajar e instalar en un servidor de tu propiedad. La traducción al español del tutorial de *The Programming Historian* sobre [cómo instalar Omeka](/en/lessons/installing-omeka) en un servidor se encuentra en proceso. al Regístrate para abrir una cuenta en Omeka ---------------------------- {% include figure.html filename="up-and-running-01.png" caption="Regístrate para una cuenta de prueba" %} -Entra a [www.omeka.net](http://www.omeka.net) y haz clic en **Sign Up**. Elige el plan "Omeka trial" (Plan de prueba). Rellena el formulario de registro. Revisa tu correo electrónico y haz clic en el enlace para activar tu cuenta. +Entra a [www.omeka.net](https://www.omeka.net) y haz clic en **Sign Up**. Elige el plan "Omeka trial" (Plan de prueba). Rellena el formulario de registro. Revisa tu correo electrónico y haz clic en el enlace para activar tu cuenta. Crea tu nuevo sitio en Omeka -------------------------- @@ -62,7 +62,7 @@ Un sitio vacío de Omeka ------------------- {% include figure.html filename="up-and-running-04.png" caption="Vista pública" %} -Este es tu sitio vacío de Omeka esperando a ser llenado. Para regresar a tu panel de control (*dashboard*) haz clic en el botón **Back** (Atrás) o escribe **http://www.omeka.net/dashboard**. Esta vez haz clic en **Manage Site** (Administra el sitio). +Este es tu sitio vacío de Omeka esperando a ser llenado. Para regresar a tu panel de control (*dashboard*) haz clic en el botón **Back** (Atrás) o escribe **https://www.omeka.net/dashboard**. Esta vez haz clic en **Manage Site** (Administra el sitio). 
Cambia de plantilla ------------- @@ -183,5 +183,5 @@ Ahora que has añadido algunos ítems y los has agrupado en una colección, tóm Recursos adicionales ----------------------------- -El equipo de Omeka ha compilado un conjunto de muy buenos recursos en las [páginas de ayuda](http://info.omeka.net) del software. +El equipo de Omeka ha compilado un conjunto de muy buenos recursos en las [páginas de ayuda](https://info.omeka.net) del software. [Este manual en español](https://www.rubenalcaraz.es/manual-omeka/) contiene información útil para evaluar las ventajas y desventajas de usar **Omeka.net** u **Omeka.org**, al igual que instrucciones generales sobre cómo instalar Omeka en tu servidor. diff --git a/es/lecciones/preservar-datos-de-investigacion.md b/es/lecciones/preservar-datos-de-investigacion.md index 7d550fd83e..a127755d87 100644 --- a/es/lecciones/preservar-datos-de-investigacion.md +++ b/es/lecciones/preservar-datos-de-investigacion.md @@ -81,7 +81,7 @@ El momento de documentar depende en gran medida de la persona y del ritmo de la Idealmente, los datos de investigación y la documentación deben ser guardados en archivos [independientes de plataforma] como .txt para notas y .csv (valores separados por comas) o .tsv (valores separados por tabuladores) para los datos en tablas. Estos formatos de texto plano son preferibles a los formatos propietarios utilizados por defecto por Microsoft Office o iWork porque pueden abrirse con muchos paquetes de programas y tienen una gran probabilidad de permanecer visibles o editables en el futuro. Muchas *suites* de ofimática (o paquetes de *software* para oficina) incluyen la opción de guardar archivos en formatos .txt, .csv y .tsv, lo que significa que se puede continuar trabajando con programas familiares y aún así tomar las acciones apropiadas para hacer accesible tu trabajo. 
Comparados con .doc o .xls, estos formatos tienen el beneficio adicional, desde una perspectiva de preservación, de contener solamente elementos legibles por la computadora. Mientras que es una práctica común el uso de negritas, itálicas o colores para indicar encabezados o para establecer una conexión visual entre elementos de los datos, estas anotaciones orientadas a la visualización no son legibles por la computadora y, por lo tanto, no puede ser consultadas ni buscadas, ni son adecuadas para grandes cantidades de información. Son preferibles los esquemas simples de anotación como el uso de un doble asterisco o tres signos de almohadilla para representar una característica de los datos; en mis propias notas, por ejemplo, tres signos de interrogación indica algo a lo que necesito dar seguimiento, y lo elegí porque "???" se puede encontrar fácilmente con una búsqueda mediante CTRL + F. -Es probable que en muchas ocasiones estos esquemas de anotación se desprendan de la práctica individual (y en consecuencia deban ser documentados), aunque hay sintaxis disponibles como [Markdown](https://es.wikipedia.org/wiki/Markdown) (los archivos Markdown se guardan como .md). En GitHub se pueden encontrar estos excelentes apuntes para quien quiera seguir -o adaptar- esta sintaxis. Se recomienda el uso de Notepad++ a los usuarios de Windows, aunque de ninguna manera es esencial para trabajar con archivos .md. Los usuarios de Mac o Unix pueden encontrar útil [Komodo Edit] o [Text Wrangler]. +Es probable que en muchas ocasiones estos esquemas de anotación se desprendan de la práctica individual (y en consecuencia deban ser documentados), aunque hay sintaxis disponibles como [Markdown](https://es.wikipedia.org/wiki/Markdown) (los archivos Markdown se guardan como .md). En GitHub se pueden encontrar estos excelentes apuntes para quien quiera seguir -o adaptar- esta sintaxis. 
Se recomienda el uso de Notepad++ a los usuarios de Windows, aunque de ninguna manera es esencial para trabajar con archivos .md. Los usuarios de Mac o Unix pueden encontrar útil [Komodo Edit] o [Text Wrangler]. #### Resumen 1 @@ -107,22 +107,22 @@ Examinar los URLs es una buena forma de pensar una estructura de datos de invest Un buen ejemplo de los primeros son los URLs utilizados por los sitios web de noticias o los servicios de *blogging*. Los URLs de WordPress utilizan el siguiente formato: - *Nombre del sitio web*/*año (4 dígitos)*/*mes (2 dígitos)*/*día (2 dígitos)*/*palabras-del-titulo-separadas-por-guiones* -- +- Un estilo similar es utilizado por las agencias de noticias como el periódico *The Guardian*: - *Nombre del sitio web*/*subdivisión de seccción*/*año (4 dígitos)*/*mes (3 caracteres)*/*día (2 dígitos)*/*palabras-que-describen-contenido-separadas-por-guiones* -- +- En los catálogos de archivo, se suelen utilizar URLs estructurados por un elemento de datos. *The British Cartoon Archive* estructura su archivo en línea utilizando el formato: - *nombre del sitio web*/*registro*/*número de referencia* -- +- Y el sitio Old Bailey Online usa este formato: - *nombre del sitio web*/browse.jsp?ref=*número de referencia* -- +- Lo que aprendemos de estos ejemplos es que la combinación de descripciones semánticas con elementos de datos hacen consistente y predecible la lectura de los datos estructurados tanto por máquinas como por seres humanos. Transferir esto a los datos digitales acumulados durante el curso de la investigación histórica hace que los datos de investigación sean más fácilmente navegables, así como buscar y consultar utilizando las herramientas estándar provistas por nuestros sistemas operativos (y, como veremos en próximas lecciones, por herramientas más avanzadas). @@ -236,11 +236,11 @@ blog (17 October 2013) Hitchcock, Tim, 'Judging a book by its URLs', Historyonics blog (3 January 2014) - + Howard, Sharon, 'Unclean, unclean! 
What historians can do about sharing our messy research data', Early Modern Notes blog (18 May 2013) - + Noble, William Stafford, A Quick Guide to Organizing Computational Biology Projects.PLoSComputBiol 5(7): e1000424 (2009) @@ -253,7 +253,7 @@ Information Management: Organising Humanities Material' (2011) Pennock, Maureen, 'The Twelve Principles of Digital Preservation (and a cartridge in a repository…)', British Library Collection Care blog (3 September 2013) - + Pritchard, Adam, 'Markdown Cheatsheet' (2013) diff --git a/es/lecciones/procesamiento-basico-de-textos-en-r.md b/es/lecciones/procesamiento-basico-de-textos-en-r.md index 8bc1479bb7..eae6d32333 100644 --- a/es/lecciones/procesamiento-basico-de-textos-en-r.md +++ b/es/lecciones/procesamiento-basico-de-textos-en-r.md @@ -626,8 +626,8 @@ Existen muchos tutoriales genéricos para estos tres ejemplos, además de docume [^3]: Hadley Wickham. “tidyverse: Easily Install and Load ‘Tidyverse’ Packages”. R Package, Version 1.1.1. https://cran.r-project.org/web/packages/tidyverse/index.html [^4]: Lincoln Mullen and Dmitriy Selivanov. “tokenizers: A Consistent Interface to Tokenize Natural Language Text Convert”. R Package, Version 0.1.4. https://cran.r-project.org/web/packages/tokenizers/index.html [^5]: Ten en cuenta que los nombres de las funciones como `library` o `install.packages` siempre estarán en inglés. No obstante, se proporciona una traducción de su significado para facilitar la comprensión y se traducen el nombre de las variables.[N. de la T.] -[^6]: Traducción publicada en CNN en español (12 de enero de 2016) http://cnnespanol.cnn.com/2016/01/12/discurso-completo-de-obama-sobre-el-estado-de-la-union/ [N. de la T.] 
-[^7]: Todos los discursos presidenciales del Estado de la Unión fueron descargados de The American Presidency Project at the University of California Santa Barbara (Accedido el 11 de noviembre de 2016) http://www.presidency.ucsb.edu/sou.php +[^6]: Traducción publicada en CNN en español (12 de enero de 2016) https://cnnespanol.cnn.com/2016/01/12/discurso-completo-de-obama-sobre-el-estado-de-la-union/ [N. de la T.] +[^7]: Todos los discursos presidenciales del Estado de la Unión fueron descargados de The American Presidency Project at the University of California Santa Barbara (Accedido el 11 de noviembre de 2016) https://www.presidency.ucsb.edu/sou.php [^8]: Aquí volvemos a la versión del discurso en su original (inglés) por motivos de continuación del análisis y, en particular, el listado de las palabras más frecuentes usadas en inglés. Seguimos traduciendo los nombres de las variables y de las funciones para facilitar la comprensión en español.[N. de la T.] [^9]: Aquí optamos por nombrar a las columnas de la tabla en inglés, como "word" (palabra) y "count" (recuento), para facilitar su interoperabilidad con el conjunto de datos que introducimos más adelante con la función `inner_join` de más adelante. [N. de la T.] [^10]: Peter Norvig. “Google Web Trillion Word Corpus”. (Accedido el 11 de noviembre de 2016) [http://norvig.com/ngrams/](https://web.archive.org/web/20260326183858/http://norvig.com/ngrams/). diff --git a/es/lecciones/publicar-archivos-tei-ceteicean.md b/es/lecciones/publicar-archivos-tei-ceteicean.md index 7808fa40f9..8f21b760b4 100644 --- a/es/lecciones/publicar-archivos-tei-ceteicean.md +++ b/es/lecciones/publicar-archivos-tei-ceteicean.md @@ -34,7 +34,7 @@ Para quienes se inician en el uso de TEI, uno de los escollos más comunes es qu Este tutorial te guiará a través de los pasos necesarios para publicar un archivo TEI en línea utilizando CETEIcean, una librería abierta escrita en el lenguaje de programación JavaScript. 
CETEIcean permite que los documentos TEI se muestren en un navegador web sin transformarlos primero a HTML. CETEIcean carga el archivo TEI dinámicamente en el navegador y cambia el nombre de los elementos de TEI por otros en HTML, de tal forma que estos nos permitan visualizar en el navegador web los fenómenos textuales que marcamos en nuestros archivos usando TEI. -En primer lugar, una aclaración sobre la visualización de tu trabajo: el método por defecto de CETEIcean para mostrar archivos TEI consiste en cargar los archivos desde otra ubicación. Sin embargo, no todos los navegadores te permitirán cargar los archivos si estos se encuentran almacenados en tu computadora. Puedes hacer el intento, pero si eso no funciona, tendrás que generar un servidor local, colocar los archivos en un servidor en línea, o utilizar un editor de código con funciones de previsualización. Para el caso de este tutorial, seguiremos esta última opción, ya que usaremos el editor [Atom](https://atom.io), con el plug-in `atom-html-preview`. No obstante, existen otras opciones libres para editar archivos TEI y generar previsualizaciones de HTML, como [jEdit](http://www.jedit.org/) o [Visual Studio Code](https://code.visualstudio.com/), y versiones propietarias como [Oxygen](https://www.oxygenxml.com/). +En primer lugar, una aclaración sobre la visualización de tu trabajo: el método por defecto de CETEIcean para mostrar archivos TEI consiste en cargar los archivos desde otra ubicación. Sin embargo, no todos los navegadores te permitirán cargar los archivos si estos se encuentran almacenados en tu computadora. Puedes hacer el intento, pero si eso no funciona, tendrás que generar un servidor local, colocar los archivos en un servidor en línea, o utilizar un editor de código con funciones de previsualización. Para el caso de este tutorial, seguiremos esta última opción, ya que usaremos el editor [Atom](https://atom.io), con el plug-in `atom-html-preview`. 
No obstante, existen otras opciones libres para editar archivos TEI y generar previsualizaciones de HTML, como [jEdit](https://www.jedit.org/) o [Visual Studio Code](https://code.visualstudio.com/), y versiones propietarias como [Oxygen](https://www.oxygenxml.com/).
    Actualización de marzo de 2025: No recomendamos usar Atom, ya que el software no ha recibido mantenimiento ni actualizaciones desde su cierre en diciembre de 2022. Puedes usar VSCode de la misma manera, siempre que también instales la extensión HTML Preview del marketplace de extensiones. @@ -44,9 +44,9 @@ Deberás entonces descargar e instalar [Atom](https://atom.io) antes de continua {% include figure.html filename="publicar-archivos-tei-ceteicean1.png" caption="Proceso de instalación del plug-in de Atom para previsualizar archivos en HTML" %} -Usaremos como texto de prueba la crónica conocida como *La Argentina Manuscrita*, del hispano-guaraní [Ruy Díaz de Guzmán](https://es.wikipedia.org/wiki/Ruy_D%C3%ADaz_de_Guzm%C3%A1n). Este texto del siglo XVII hace uso del topónimo Argentina por primera vez, para referirse a los extensos territorios del Cono Sur que componían el Río de la Plata y sus adyacencias, es decir, territorios de la actual Argentina, Paraguay, Uruguay, sur de Brasil y Bolivia. Puedes encontrar una edición digital completa del texto en: [http://hdlab.space/La-Argentina-Manuscrita](http://hdlab.space/La-Argentina-Manuscrita). +Usaremos como texto de prueba la crónica conocida como *La Argentina Manuscrita*, del hispano-guaraní [Ruy Díaz de Guzmán](https://es.wikipedia.org/wiki/Ruy_D%C3%ADaz_de_Guzm%C3%A1n). Este texto del siglo XVII hace uso del topónimo Argentina por primera vez, para referirse a los extensos territorios del Cono Sur que componían el Río de la Plata y sus adyacencias, es decir, territorios de la actual Argentina, Paraguay, Uruguay, sur de Brasil y Bolivia. Puedes encontrar una edición digital completa del texto en: [https://hdlab.space/La-Argentina-Manuscrita](https://hdlab.space/La-Argentina-Manuscrita). 
-Comenzaremos con un archivo simple (aunque un tanto extenso) en formato TEI P5, que queremos hacer visible en un navegador web: [`Ruy_Diaz-La_Argentina_Manuscrita.xml`](http://hdlab.space/La-Argentina-Manuscrita/assets/Ruy_Diaz-La_argentina_manuscrita.tei.xml). Para descargar el archivo haz clic derecho sobre el enlace de descarga y selecciona la opción 'Save Link As...'. +Comenzaremos con un archivo simple (aunque un tanto extenso) en formato TEI P5, que queremos hacer visible en un navegador web: [`Ruy_Diaz-La_Argentina_Manuscrita.xml`](https://hdlab.space/La-Argentina-Manuscrita/assets/Ruy_Diaz-La_argentina_manuscrita.tei.xml). Para descargar el archivo haz clic derecho sobre el enlace de descarga y selecciona la opción 'Save Link As...'. ## Paso 1: Crear una estructura para nuestros archivos Comenzaremos por establecer una estructura para nuestros archivos, es decir, una carpeta contenedora con el nombre 'tutorial_es' con las subcarpetas y archivos que te indicaremos a continuación. Puedes descargar el directorio completo del repositorio [CETEIcean en GitHub](https://github.com/TEIC/CETEIcean) y trabajar en la carpeta 'tutorial_es', o puedes descargar los archivos individualmente, siempre y cuando mantengan la misma estructura que en GitHub, que es la siguiente: @@ -270,7 +270,7 @@ CETEIcean posee una cantidad de comportamientos integrados que puedes reemplazar Si haces esto, puede que desees agregar estilos de CSS o comportamientos para elegir la forma en la que se visualizará el contenido del TEI Header en el navegador. -En este tutorial no agotamos todas las posibilidades para la presentación de nuestro documento fuente. Te invitamos a que continúes experimentando por tu cuenta en las diferentes formas en las que un marcado de TEI puede visualizarse en un navegador usando CETEICean. Puedes encontrar más información en [CETEIcean](http://teic.github.io/CETEIcean/). 
+En este tutorial no agotamos todas las posibilidades para la presentación de nuestro documento fuente. Te invitamos a que continúes experimentando por tu cuenta en las diferentes formas en las que un marcado de TEI puede visualizarse en un navegador usando CETEICean. Puedes encontrar más información en [CETEIcean](https://teic.github.io/CETEIcean/). ## Referencias @@ -280,11 +280,11 @@ Allés Torrent, Susanna. 2019. "Introducción a la Text Encoding Initiative". *T Atom. A hackable text editor for the 21st Century. https://atom.io -Cayless, Hugh y Viglianti, Raffaele. CETEIcean. http://teic.github.io/CETEIcean/ +Cayless, Hugh y Viglianti, Raffaele. CETEIcean. https://teic.github.io/CETEIcean/ -del Rio Riande, Gimena; De León, Romina, y Hernández, Nidia. 2019. *Historia de la conquista del Río de la Plata o La Argentina manuscrita*. http://hdlab.space/La-Argentina-Manuscrita/ +del Rio Riande, Gimena; De León, Romina, y Hernández, Nidia. 2019. *Historia de la conquista del Río de la Plata o La Argentina manuscrita*. https://hdlab.space/La-Argentina-Manuscrita/ -Jedit. Programmer's text editor. Stable Version: 5.6.0. http://www.jedit.org/ +Jedit. Programmer's text editor. Stable Version: 5.6.0. https://www.jedit.org/ Oxygen. XML Editor. 
https://www.oxygenxml.com/ diff --git a/es/lecciones/retirada/introduccion-control-versiones-github-desktop.md b/es/lecciones/retirada/introduccion-control-versiones-github-desktop.md index 11bd1d4d6e..ba0e320413 100644 --- a/es/lecciones/retirada/introduccion-control-versiones-github-desktop.md +++ b/es/lecciones/retirada/introduccion-control-versiones-github-desktop.md @@ -25,8 +25,8 @@ abstract: "En esta lección aprenderás lo básico del control de versiones, com original: getting-started-with-github-desktop avatar_alt: Grabado de una pareja en un escritorio observando a un gato a la luz de una vela redirect_from: - - /es/lessons/getting-started-with-github-desktop - - /es/lecciones/introduccion-control-versiones-github-desktop + - /es/lessons/getting-started-with-github-desktop/ + - /es/lecciones/introduccion-control-versiones-github-desktop/ retired: true retirement-reason: | Esta lección utiliza una versión obsoleta de GitHub Desktop que ya no cuenta con el respaldo de GitHub. La última versión y la documentación pueden encontrarse en . @@ -142,7 +142,7 @@ Esta lección no cubre la sintaxis Markdown por razones de espacio, pero es úti ### Editores de texto -Para escribir un documento de texto plano necesitamos un editor. Hay muchos editores disponibles, algunos gratuitos, otros de pago. Algunos son fáciles de usar mientras que otros tienen una curva de aprendizaje y un potencial que sobrepasa las funciones de un editor de texto. A largo plazo, un editor avanzado como Vim o Emacs puede ahorrarte tiempo pero de momento puedes empezar con un editor más simple. Por ejemplo, [Atom](https://atom.io/) es un buen editor desarrollado por GitHub que destaca la sintaxis Markdown y, además, se integra con la plataforma GitHub. Es gratuito y su código es abierto; además, incluye un [manual](http://flight-manual.atom.io/) de instrucciones muy exhaustivo. +Para escribir un documento de texto plano necesitamos un editor. 
Hay muchos editores disponibles, algunos gratuitos, otros de pago. Algunos son fáciles de usar mientras que otros tienen una curva de aprendizaje y un potencial que sobrepasa las funciones de un editor de texto. A largo plazo, un editor avanzado como Vim o Emacs puede ahorrarte tiempo pero de momento puedes empezar con un editor más simple. Por ejemplo, [Atom](https://atom.io/) es un buen editor desarrollado por GitHub que destaca la sintaxis Markdown y, además, se integra con la plataforma GitHub. Es gratuito y su código es abierto; además, incluye un [manual](https://flight-manual.atom.io/) de instrucciones muy exhaustivo. Si no quieres instalar un programa nuevo, puedes utilizar uno de los editores que incluidos en tu ordenador como TextEdit para Mac. Si decides continuar aprendiendo Markdown en el futuro, te recomendamos utilizar un editor de texto que destaque la sintaxis Markdown, entre otras funcionalidades. @@ -202,7 +202,7 @@ Aunque hay diferencias entre el control de versiones de código y de textos, las Es importante que tus anotaciones y los mensajes asociados que las describen tengan sentido y sean específicos. Escribir buenas descripciones de las anotaciones requiere reflexión. A veces, los mensajes que para ti son claros en el momento de la anotación se vuelven difíciles de comprender en el futuro. Si vas a utilizar el control de versiones con otras personas es importante que tus colaboradores puedan entenderte. El control de versiones para gestionar cambios en documentos funciona mejor cuando nos esforzamos un poco en pensar cómo utilizamos el programa. Por tanto, cuando se lleva a cabo un trabajo colaborativo es importante aclarar estas cuestiones y compartir una misma visión para usar el control de cambios de manera efectiva. -Una manera de enfrentarse a este problema es intentar seguir un 'estilo de anotaciones'. 
Por ejemplo, te recomendamos seguir la influyente [sugerencia de Tim Pope](http://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html) cuando realices anotaciones. La sugerencia de Tim Pope [tiene en cuenta](https://github.com/blog/926-shiny-new-commit-styles), parcialmente, la interfaz de GitHub Desktop para anotar cambios y describirlos pero entender el formato te ayudará a poner en práctica su estrategia. El siguiente mensaje es una adaptación de la propuesta de Tim Pope, que se centra en la anotación de texto (en lugar de código): +Una manera de enfrentarse a este problema es intentar seguir un 'estilo de anotaciones'. Por ejemplo, te recomendamos seguir la influyente [sugerencia de Tim Pope](https://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html) cuando realices anotaciones. La sugerencia de Tim Pope [tiene en cuenta](https://github.com/blog/926-shiny-new-commit-styles), parcialmente, la interfaz de GitHub Desktop para anotar cambios y describirlos pero entender el formato te ayudará a poner en práctica su estrategia. El siguiente mensaje es una adaptación de la propuesta de Tim Pope, que se centra en la anotación de texto (en lugar de código): ``` Breve resumen (50 o menos caracteres) con mayúscula inicial. @@ -319,7 +319,7 @@ Esta forma de resolver conflictos puede parecer más compleja de lo que es, pero Hasta el momento hemos puesto en marcha un control de versiones con un documento muy básico. Si aprendes más acerca de Markdown y la escritura en texto plano, podrás usar el control de versiones de muchas maneras y te será muy útil para llevar a cabo tu investigación. 
Controlar las versiones de un documento Markdown te permitirá profundizar en esta sintaxis; para ello, te recomendamos consultar la lección [Escritura sostenible en texto plano usando Pandoc y Markdown](/es/lecciones/escritura-sostenible-usando-pandoc-y-markdown) escrita por Dennis Tenen y Grant Wythoff; esta lección te ayudará a entender cómo puedes usar el texto plano para escribir con Pandoc y Markdown. Pandoc es muy útil para convertir tus archivos de texto plano escritos en Markdown a otros formatos como HTML, PDF o Word. Si combinas Markdown, Pandoc y el control de versiones, podrás implementar un sistema muy potente y sostenible para escribir tus artículos y trabajos académicos. -Asimismo, el flujo de trabajo presentado en esta lección también puede convertirse en el fundamento para crear webs estáticas alojadas en GitHub. Una vez te sientas cómodo usando GitHub Desktop, puedes seguir con la lección escrita por Amanda Visconti, [Construcción de sitios estáticos usando Jekyll GitHub Pages](/lessons/building-static-sites-with-jekyll-github-pages). +Asimismo, el flujo de trabajo presentado en esta lección también puede convertirse en el fundamento para crear webs estáticas alojadas en GitHub. Una vez te sientas cómodo usando GitHub Desktop, puedes seguir con la lección escrita por Amanda Visconti, [Construcción de sitios estáticos usando Jekyll GitHub Pages](/en/lessons/building-static-sites-with-jekyll-github-pages). ## Más recursos @@ -331,6 +331,6 @@ GitHub Desktop es una forma sencilla de aprender a controlar versiones con GitHu * [Atlassian](https://www.atlassian.com/git/tutorials): contiene tutoriales más avanzados (pero fáciles de entender) de Git. Ponen el acento en las diferencias entre Git y otros controles de versiones; esto puede no ser relevante para ti pero te ayudará a comprender el funcionamiento de Git de manera más detallada. * [Pro Git](https://git-scm.com/book/en/v2): un libro exclusivamente sobre Git. 
Empieza con el funcionamiento básico y luego pasa a tratar asuntos más avanzados de Git. * Para [estudiantes](https://education.github.com/pack) e [investigadores](https://github.com/blog/1840-improving-github-for-science) GitHub ofrece repositorios privados sin pagar por una suscripción. Estos repositorios pueden ser útiles para borradores o notas que no queremos publicar. Nota: no es muy aconsejable guardar contenido delicado incluso en un repositorio privado en GitHub. -* [ProfHacker](https://web.archive.org/web/20170716182645/http://www.chronicle.com/blogs/profhacker/tag/github) tiene varias entradas sobre proyectos que utilizan GitHub en el contexto académico. -* [GitHub, Academia, and Collaborative Writing](https://www.hastac.org/blogs/harrisonm/2013/10/12/github-academia-and-collaborative-writing) reflexioina sobre el uso de GitHub para la escritura colaborativa. -* La lección [Introducción a Bash](/lessons/intro-to-bash) te permitirá aprender más sobre la línea de comandos, muy útil para utilizar GitHub. +* [ProfHacker](https://web.archive.org/web/20170716182645/https://www.chronicle.com/blogs/profhacker/tag/github) tiene varias entradas sobre proyectos que utilizan GitHub en el contexto académico. +* [GitHub, Academia, and Collaborative Writing](https://web.archive.org/web/20131215095438/https://www.hastac.org/blogs/harrisonm/2013/10/12/github-academia-and-collaborative-writing) reflexiona sobre el uso de GitHub para la escritura colaborativa. +* La lección [Introducción a Bash](/en/lessons/intro-to-bash) te permitirá aprender más sobre la línea de comandos, muy útil para utilizar GitHub.
diff --git a/es/lecciones/retirada/sparql-datos-abiertos-enlazados.md b/es/lecciones/retirada/sparql-datos-abiertos-enlazados.md index 80d39121fa..bc9c1f7ca9 100644 --- a/es/lecciones/retirada/sparql-datos-abiertos-enlazados.md +++ b/es/lecciones/retirada/sparql-datos-abiertos-enlazados.md @@ -1,429 +1,429 @@ ---- -title: | - Uso de SPARQL para acceder a datos abiertos enlazados -authors: -- Matthew Lincoln -date: 2015-11-24 -translation_date: 2017-05-20 -editors: -- Fred Gibbs -reviewers: -- Patrick Murray-John -- Jason Heppler -- Will Hanley -- Fred Gibbs -translator: -- Nuria Rodríguez Ortega -translation-editor: -- Antonio Rojas Castro -translation-reviewer: -- Antonio Rojas Castro -- Juan Antonio Pastor Sánchez -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/67 -layout: lesson -original: graph-databases-and-SPARQL -redirect_from: -- /es/lessons/graph-databases-and-SPARQL -- /es/lecciones/sparql-datos-abiertos-enlazados -difficulty: 2 -activity: acquiring -topics: [lod] -abstract: "Esta lección explica por qué numerosas instituciones culturales están adoptando bases de datos orientadas a grafos y cómo los investigadores pueden acceder a estos datos a través de consultas realizadas en el lenguaje llamado SPARQL." -retired: true -retirement-reason: | - El Museo Británico no ha mantenido el acceso a su base de datos de colecciones de una manera consistente. Aunque la sintaxis y los comandos de SPARQL siguen siendo correctos, las URLs a las que intentan conectarse son ahora demasiado inconsistentes para su uso en una lección funcional. -avatar_alt: Grabado con dos peces unidos por una rama en sus bocas. -doi: 10.46430/phes0027 ---- - -Objetivos de la lección ------------------------ - -Esta lección explica por qué numerosas instituciones culturales están adoptando bases de datos orientadas a grafos (*graph databases*) y cómo los investigadores pueden acceder a estos datos a través de consultas realizadas en el lenguaje llamado SPARQL. 
- -{% include toc.html %} - - - - - - -## Bases de datos orientadas a grafo, RDF y datos abiertos enlazados (Linked Open Data, LOD) - -Actualmente, numerosas instituciones culturales están ofreciendo información sobre sus colecciones a través de las denominadas API ([*Application Programming Interfaces*](/lessons/intro-to-the-zotero-api.html)). Estas API son instrumentos muy eficaces para acceder de manera automatizada a registros individuales, sin embargo, no constituyen el procedimiento ideal cuando tratamos con datos culturales debido a que las API están estructuradas para trabajar con un conjunto predeterminado de consultas (*queries*). Por ejemplo, un museo puede tener información sobre donantes, artistas, obras de arte, exposiciones, procedencia de sus obras (*provenance*), etc., pero su API puede ofrecer solo una recuperación orientada a objetos, haciendo difícil o imposible buscar datos relacionados con donantes, artistas, etc. Así pues, esta estructura es interesante si el objetivo es buscar información sobre objetos particulares; sin embargo, puede complicar la operación de agregar información sobre los artistas o donantes que también se encuentran registrados en la base de datos. - -Las bases de datos RDF son muy apropiadas para expresar relaciones complejas entre múltiples entidades, como personas, lugares, eventos y conceptos ligados a objetos individuales. Estas bases de datos se denominan habitualmente bases de datos orientadas a grafos (*graph databases*) porque estructuran la información como un grafo o red, donde un conjunto de recursos o nodos están conectados entre sí mediante aristas (o enlaces) que describen las relaciones establecidas entre dichos recursos y/o nodos. - -Dado que las bases de datos RDF admiten el uso de URL, estas pueden estar accesibles *online* y también pueden enlazarse a otras bases de datos, de ahí el término "datos abiertos enlazados" (*Linked Open Data*, LOD). 
Importantes colecciones artísticas, entre las que se incluyen las del [British Museum](http://collection.britishmuseum.org/), [Europeana](http://labs.europeana.eu/api/linked-open-data-introduction), el [Smithsonian American Art Museum](http://americanart.si.edu/) y el [Yale Center for British Art](http://britishart.yale.edu/collections/using-collections/technology/linked-open-data), han publicado sus colecciones de datos como LOD. El [Getty Vocabulary Program](http://vocab.getty.edu/) también ha publicado sus vocabularios controlados (TGN, ULAN y AAT) como LOD. - -SPARQL es el lenguaje utilizado para interrogar este tipo de bases de datos. Este lenguaje es particularmente potente porque obvia las perspectivas que los usuarios transfieren a los datos. Una consulta sobre objetos y una consulta sobre donantes son prácticamente equivalentes en estas bases de datos. Lamentablemente, numerosos tutoriales sobre SPARQL utilizan modelos de datos tan extremadamente simplificados que no son operativos cuando se trata de utilizar las complejas bases de datos desarrolladas por las instituciones culturales. Este tutorial ofrece un curso intensivo sobre SPARQL utilizando un conjunto de datos (*dataset*) que un humanista podría realmente encontrar en Internet. En concreto, en este tutorial aprenderemos cómo interrogar la colección LOD del British Museum. - -### RDF en pocas palabras - -RDF representa la información en una declaración triple -también llamada tripleta- que sigue la estructura sujeto-predicado-objeto. Por ejemplo: - -``` - . - -``` - -(Observa que, como toda buena oración, estas declaraciones terminan con un punto y final). - -En este ejemplo, el sujeto `` y el objeto `` pueden ser considerados como dos nodos de un grafo, donde el predicado `` define la arista -o relación- entre ellos. (Técnicamente, puede ser tratado en otras consultas como un objeto o un sujeto, pero esta cuestión escapa el alcance de este tutorial). 
- -Una seudobase de datos RDF podría contener declaraciones interrelacionadas entre sí, como las siguientes: - -``` -... - . - <1642>. - <óleo sobre lienzo>. - <1606>. - . - . - . - <óleo sobre lienzo>. -... -``` - -Si visualizásemos estas declaraciones como nodos y aristas de un grafo o red, la representación sería como sigue: - -{% include figure.html caption="Visualización en red del seudoRDF mostrado más arriba. Las flechas indican la 'dirección' del predicado. Por ejemplo, que '*La tasadora de perlas* fue creada por Vermeer' y no al revés. Diagrama reconstruido por Nuria Rodríguez Ortega." filename="sparql-lod-01.png" %} - -Las tradicionales bases de datos relacionales pueden distribuir atributos sobre obras de arte y artistas en tablas separadas. En las bases de datos RDF u orientadas a grafos, todos estos datos pertenencen a un mismo mismo grafo interconectado, lo que permite a los usuarios una mayor flexibilidad a la hora de decidir cómo quieren interrogar estos recursos. - -### Buscando RDF con SPARQL - -SPARQL nos permite traducir datos en grafo, intensamente enlazados, en datos normalizados en formato tabular, esto es, distribuidos en filas y columnas, que se pueden abrir en programas como Excel o importar a programas de visualización, tales como [plot.ly](https://plot.ly/) o [Palladio](http://hdlab.stanford.edu/palladio/). - -Resulta útil pensar las consultas SPARQL como un [Mad Lib](https://en.wikipedia.org/wiki/Mad_Libs) -un conjunto de oraciones con espacios en blanco-. La base de datos tomará esta consulta y encontrará cada conjunto de oraciones que encaje correctamente en estos espacios en blanco, devolviéndonos los valores coincidentes como una tabla. Veamos esta consulta SPARQL: - -``` -SELECT ?pintura -WHERE { - ?pintura <óleo sobre lienzo> . -} -``` - -En este consulta, `?pintura` representa el nodo (o nodos) que la bases de datos nos devolverá. 
Una vez recibida la consulta, la base de datos buscará todos los valores para `?pintura` que adecuadamente complete la declaración RDF ` <óleo sobre lienzo>`. - -{% include figure.html caption="Visualización de lo que nuestra consulta está buscando. Diagrama reconstruido por Nuria Rodríguez Ortega." filename="sparql-lod-02.png" %} - - -Cuando la consulta interroga la base de datos completa, esta busca los sujetos, predicados y objetos que coinciden con esta declaración, exluyendo, al mismo tiempo, el resto de datos. - -{% include figure.html filename="sparql-lod-03.png" caption="Visualización de la consulta SPARQL con los elementos mencionados en naranja y los elementos seleccionados (aquellos que nos serán devueltos en los resultados) en rojo. Diagrama reconstruido por Nuria Rodríguez Ortega." %} - -Nuestros resultados podrían tener este aspecto: - -| **pinturas** | -| --------------------- | -| La ronda de noche | -| La tasadora de perlas | - -Ahora bien, lo que hace a RDF y a SPARQL herramientas tan potentes es su habilidad para crear consultas complejas que referencian múltiples variables al mismo tiempo. Por ejemplo, podríamos buscar en nuestra seudobase de datos RDF pinturas creadas por cualquier artista que fuese holandés: - -``` -SELECT ?artista ?pintura -WHERE { - ?artista . - ?pintura ?artista . - } -``` - -En este ejemplo, hemos introducido una segunda variable: `?artista`. La base de datos RDF devolverá todas las combinaciones conincidentes de `?artista` y `?pintura` que encajen en ambas declaraciones. - -{% include figure.html filename="sparql-lod-04.png" caption="Visualización de la consulta SPARQL con los elementos mencionados en naranja y los elementos seleccionados (aquellos que serán recuperados en los resultados en rojo). Diagrama reconstruido por Nuria Rodríguez Ortega." 
%} - -| artistas | pinturas | -| ------------------ | --------------------- | -| Rembrandt van Rijn | La ronda de noche | -| Johannes Vermeer | La tasadora de perlas | - -### URI y literales - -Hasta ahora, hemos visto una representación facticia de RDF que utiliza un texto fácil de leer. Sin embargo, RDF se almacena principalmente en formato URI (*Uniform Resource Identifiers*), que separa las entidades conceptuales de sus etiquetas lingüísticas. (Ten en cuenta que una URL, o *Uniform Resource Locator*, es una URI accesible desde la web). En RDF real, nuestra declaración original: - -``` - . -``` - -sería más parecido a lo siguiente: - -``` - . -``` - -*N.B. el Rijksmuseum todavía no ha desarrollado su propio sitio LOD, por lo que en esta consulta la URI responde únicamente a objetivos de demostración.* - -A fin de obtener una versión legible desde el punto de vista humano de la información representada por cada una de estas URI, lo que hacemos realmente es recuperar más declaraciones RDF. Incluso el predicado en esta declaración tiene su propia etiqueta literal: - -``` - "La ronda de noche". - "fue creado por". - "Rembrandt van Rijn". -``` - -Como se puede observar, a diferencia de las URI que en esta consulta están enmarcadas por los signos `<>`, los *objetos* son cadenas de texto entrecomilladas. Esto es lo que se conoce como *literales* (*literals*). Los literales representan valores, mientras que las URI representan referencias. Por ejemplo, `` representa una entidad que puede referenciar (y puede ser referenciada por) muchas otras declaraciones (fechas de nacimiento, discípulos, miembros de la familia, etc.), mientras que la cadena de texto `"Rembrandt van Rijn"` solo se representa a sí misma. Otros valores literales en RDF incluyen fechas y números. - -Fijémenos ahora en los predicados de estas declaraciones, con nombres de dominio como `purl.org`, `w3.org` y `xmlns.com`. 
Estos son algunos de los numerosos proveedores de ontologías que ayudan a estandarizar el modo en que describimos relaciones entre bits de información como, "título", "etiqueta", "creador" o "nombre". Cuanto más trabajemos con RDF/LOD, más proveedores de este tipo encontraremos. - -Las URI pueden llegar a ser difíciles de manejar cuando se componen consultas SPARQL. Para simplificar este proceso se utilizan los *prefijos* (*prefixes*). Los prefijos son atajos que nos liberan de tener que escribir toda la larga cadena de caracteres que constituye una URI. Por ejemplo, recordemos el predicado para recuperar el título de *La ronda de noche*, `http://purl.org/dc/terms/title>`. Con los siguientes prefijos, solo necesitamos escribir `dct:title` cuando queramos utilizar un predicado `purl.org`. `dct:` representa la cadena completa `http://purl.org.dc/terms,` y `'title'` simplemente se agrega al final de este enlace. - -Por ejemplo, con el prefijo `PREFIX rkm: que representa la cadena completa `, agregado al inicio de nuestra consulta SPARQL, `http://data.rijksmuseum.nl/item/8909812347 <` se convierte en `rkm:item/8909812347`. - -Debemos ser conscientes de que los prefijos se pueden asignar arbitrariamente a cualquier abreviatura que queramos; así, diferentes puntos de entrada (*endpoints*) pueden utilizar prefijos ligeramente diferentes para el mismo espacio de nombre (*namespace*) (por ejemplo: `dct vs.` `dcterms` para ``). - -### Términos para revisar - -* **SPARQL** - *Protocol and RDF Query Language* - El lenguaje utilizado para interrogar bases de datos RDF u orientadas a grafos. -* **RDF** - *Resource Description Framework* - Un método para estructurar datos en forma de grafo o como una red de declaraciones conectadas más que como una serie de tablas. -* **LOD** - *Linked Open Data* (datos abiertos enlazados) - LOD son datos RDF publicados *online* en formato URI de modo que los desarrolladores pueden referenciarlos de manera fiable y sin ambigüedad. 
-* **declaración** - a veces denominada "tripleta", una declaración RDF es una unidad de conocimiento que comprende sujeto, predicado y objeto. -* **URI** - *Uniform Resource Identifier* - una cadena de caracteres que identifica un recurso. Las declaraciones RDF utilizan URI para enlazar varios recursos. Una URL, o *Uniform Resource Locator*, es un tipo de URI que apunta a un determinado recurso en la web. -* **literal** - En las declaraciones RDF, algunos objetos no referencian recursos con una URI sino que vehiculan un valor, que puede ser un texto (`"Rembrandt van Rijn"`), un número (`5`) o una fecha (`1606-06-15`). Estos objetos se conocen como literales. -* **prefijo** - A fin de simplificar las consultas SPARQL, un usuario puede especificar prefijos que funcionan como abreviaturas de las URI completas. Estas abreviaturas, o **QNAmes**, se utilizan también en los espacios de nombre (*namespaces*) de los documentos XML. - -## Consultas basadas en casos reales - -### Todas las declaraciones para un objeto - -Vamos a empezar nuestra primera consulta utilizando el [punto de entrada SPARQL del British Museum](http://collection.britishmuseum.org/sparql). Un punto de entrada SPARQL es una dirección web que acepta consultas SPARQL y devuelve resultados. El punto de entrada del British Museum funciona como muchos otros: cuando accedemos a él a través de un navegador web, encontramos una caja de texto para componer las consultas. - -{% include figure.html filename="sparql-lod-05.png" caption="Web del punto de entrada SPARQL del British Museum. Para todas las consultas de este tutorial, hay que asegurarse de haber dejado las casillas 'Include inferred' y 'Expand results over equivalent URIs' sin marcar." %} - - -Cuando empezamos a explorar una nueva base de datos RDF, resulta últil examinar, a modo de ejemplo, las relaciones que emanan de un [objeto en concreto](http://collection.britishmuseum.org/resource?uri=http://collection.britishmuseum.org/id/object/PPA82633). 
- -(Para cada una de las siguientes consultas, clica en el enlace "Run query" situado más abajo para ver los resultados. La puedes ejecutar tal y como está o modificarla antes. En este último caso, recuerda que es necesario dejar sin marcar la casilla "Include inferred" antes de ejecutar la consulta). - -``` -SELECT ?p ?o -WHERE { - <http://collection.britishmuseum.org/id/object/PPA82633> ?p ?o . -} -``` - -[Run query](http://collection.britishmuseum.org/sparql?query=SELECT+*%0D%0AWHERE+%7B%0D%0A++%3Chttp%3A%2F%2Fcollection.britishmuseum.org%2Fid%2Fobject%2FPPA82633%3E+%3Fp+%3Fo+.%0D%0A++%7D&_implicit=false&_equivalent=false&_form=%2Fsparql) - -Con la orden `SELECT ?p ?o`, le estamos diciendo a la base de datos que nos devuelva los valores de `?p` y `?o` descritos en el comando `WHERE {}`. Esta consulta devuelve cada declaración para la cual nuestra obra de arte seleccionada, `<http://collection.britishmuseum.org/id/object/PPA82633>`, es el sujeto. `?p` ocupa la posición central en la declaración RDF en el comando `WHERE {}`, por lo que esta devuelve cualquier predicado que coincide con la declaración, mientras que `?o`, en la posición final, devuelve todos los objetos. Aunque yo las he nombrado como `?p` y `?o`, en realidad, tal y como se puede ver en el ejemplo inferior, es posible nombrar estas variables del modo que nosotros queramos. De hecho, será útil darles nombres significativos para las consultas complejas que siguen a continuación. - -{% include figure.html filename="sparql-lod-06.png" caption="Listado inicial de todos los predicados y objetos asociados con una obra de arte en el British Museum." %} - - -El punto de entrada del British Museum formatea la tabla de resultados con enlaces para cada una de las variables, que son, en realidad, nodos RDF, por lo que clicando en cada uno de estos enlaces podemos ver todos los predicados y objetos para cada uno de los nodos seleccionados. 
Advierte que el British Musuem incluye automáticamente un amplio rango de prefijos SPARQL en sus consultas, por lo que encontraremos numerosos enlaces mostrados en su versión abreviada; si pasamos el ratón sobre ellos, podremos ver las URI sin abreviar. - -{% include figure.html filename="sparql-lod-07.png" caption="Visualización del conjunto de nodos recuperados a través de la primera consulta realizada a la base de datos del British Museum. Los elementos de este grafo coloreados en rojo se encuentran también en la tabla de resultados mostrada más arriba. Se han incluido niveles adicionales en la jerarquía para mostrar cómo esta obra en particular se encuentra conectada en el grafo general que constituye la base de datos del BM." %} - - -Veamos ahora cómo se almacena la información de tipo objeto: busca el predicado `` (marcado en la tabla anterior) y clica en el enlace `thes:x8577` para acceder al nodo que describe el tipo de objeto "print" (grabado). - -{% include figure.html filename="sparql-lod-08.png" caption="Página del recurso `thes:x8577` ('print') en el conjunto de datos enlazados del British Museum." %} - -Como se puede observar, este nodo tiene una etiqueta (*label*) en texto plano, así como enlaces a nodos del tipo "objetos artísticos" con los que se relaciona en el conjunto de la base de datos. - -### Consultas complejas - -Para encontrar otros objetos del mismo tipo descritos con la etiqueta "print", podemos invocar esta consulta: - -``` -PREFIX bmo: -PREFIX skos: - -SELECT ?object -WHERE { - - # Busca todos los valores de ?object que tengan un "object type" dado - ?object bmo:PX_object_type ?object_type . - - # El "object type" debería tener la etiqueta "print" - ?object_type skos:prefLabel "print" . 
-} -LIMIT 10 -``` - -[Run query](https://collection.britishmuseum.org/sparql#query=PREFIX+bmo%3A+%3Chttp%3A%2F%2Fwww.researchspace.org%2Fontology%2F%3E%0APREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0A%0ASELECT+%3Fobject%0AWHERE+%7B%0A%0A++%23+Search+for+all+values+of+%3Fobject+that+have+a+given+%22object+type%22%0A++%3Fobject+bmo%3APX_object_type+%3Fobject_type+.%0A%0A++%23+That+object+type+should+have+the+label+%22print%22%0A++%3Fobject_type+skos%3AprefLabel+%22print%22+.%0A%7D%0ALIMIT+10) / [User-generated query](https://hypothes.is/a/AVLH7aAMvTW_3w8Ly19w) - -{% include figure.html filename="sparql-lod-09.png" caption="Tabla resultantes de nuestra consulta para todos los objetos del tipo 'print'." %} - -Recuerda que, dado que `"print"` funciona aquí como un literal, lo escribimos entrecomillado en nuestra consulta. Cuando se incluyen literales en las consultas SPARQL, la base de datos solo devuelve coincidencias exactas para estos valores. - -Advierte también que, dado que `?object_type` no se encuentra presente en el comando `SELECT`, este no se mostrará en la tabla de resultados. Sin embargo, resulta esencial estructurar nuestra consulta, porque es esto lo que permite conectar los puntos desde `?object` con la etiqueta `"print"`. - -### FILTER - -En los ejemplos anteriores, nuestra consulta SPARQL ha buscado una coincidencia exacta para el tipo de objeto con la etiqueta "print". Sin embargo, con frecuencia querremos encontrar valores literales que caen dentro de un determinado rango, como son las fechas. Para ello utilizaremos el comando `FILTER`. - -Para localizar las URI de todos los grabados presentes en la base de datos del British Museum creados entre 1580 y 1600, necesitaremos, en primer lugar, averiguar dónde se almacenan en la base de datos las fechas en relación con los objetos, y entonces añadir referencias a estas fechas en nuestra consulta. 
De manera similar al procedimiento que hemos seguido de un único enlace para determinar un tipo de objeto, debemos ahora movernos a través de diversos nodos para encontrar las fechas de producción asociadas a un objeto dado: - -{% include figure.html filename="sparql-lod-10.png" caption="Visualización de la parte del modelo de datos del British Museum donde las fechas de producción están conectadas a los objetos." %} - -``` -PREFIX bmo: -PREFIX skos: -PREFIX ecrm: -PREFIX xsd: - -# Recupera enlaces de objetos y fechas de creación -SELECT ?object ?date -WHERE { - - # Utilizaremos nuestro comando previo para buscar solo - # objetos del tipo "print" - ?object bmo:PX_object_type ?object_type . - ?object_type skos:prefLabel "print" . - - # Necesitamos enlazar diversos nodos para encontrar la - # fecha de creación asociada con un objeto - ?object ecrm:P108i_was_produced_by ?production . - ?production ecrm:P9_consists_of ?date_node . - ?date_node ecrm:P4_has_time-span ?timespan . - ?timespan ecrm:P82a_begin_of_the_begin ?date . - - # Como se ve, es necesario conectar unos cuantos pocos de puntos - # para llegar al nodo de la fecha. Ahora que lo tehemos, podemos - # filtrar nuestros resultados. Dado que estamos filtrando por fecha, - # debemos agregar la etiqueta ^^xsd:date después de nuestra cadena de fecha. - # Esta etiqueta le dice a la base de datos que interprete la cadena - # "1580-01-01" como la fecha 1 de enero de 1580. 
- - FILTER(?date >= "1580-01-01"^^xsd:date && - ?date <= "1600-01-01"^^xsd:date) -} -``` - -[Run query](https://collection.britishmuseum.org/sparql#query=PREFIX+bmo%3A+%3Chttp%3A%2F%2Fwww.researchspace.org%2Fontology%2F%3E%0APREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0APREFIX+ecrm%3A+%3Chttp%3A%2F%2Fwww.cidoc-crm.org%2Fcidoc-crm%2F%3E%0APREFIX+xsd%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2001%2FXMLSchema%23%3E%0A%0A%23+Return+object+links+and+creation+date%0ASELECT+%3Fobject+%3Fdate%0AWHERE+%7B%0A%0A++%23+We'll+use+our+previous+command+to+search+only+for%0A++%23+objects+of+type+%22print%22%0A++%3Fobject+bmo%3APX_object_type+%3Fobject_type+.%0A++%3Fobject_type+skos%3AprefLabel+%22print%22+.%0A%0A++%23+We+need+to+link+though+several+nodes+to+find+the%0A++%23+creation+date+associated+with+an+object%0A++%3Fobject+ecrm%3AP108i_was_produced_by+%3Fproduction+.%0A++%3Fproduction+ecrm%3AP9_consists_of+%3Fdate_node+.%0A++%3Fdate_node+ecrm%3AP4_has_time-span+%3Ftimespan+.%0A++%3Ftimespan+ecrm%3AP82a_begin_of_the_begin+%3Fdate+.%0A%0A++%23+As+you+can+see%2C+we+need+to+connect+quite+a+few+dots%0A++%23+to+get+to+the+date+node!+Now+that+we+have+it%2C+we+can%0A++%23+filter+our+results.+Because+we+are+filtering+by+date%2C%0A++%23+we+must+attach+the+tag+%5E%5Exsd%3Adate+after+our+date+strings.%0A++%23+This+tag+tells+the+database+to+interpret+the+string%0A++%23+%221580-01-01%22+as+the+date+1+January+1580.%0A%0A++FILTER(%3Fdate+%3E%3D+%221580-01-01%22%5E%5Exsd%3Adate+%26%26%0A+++++++++%3Fdate+%3C%3D+%221600-01-01%22%5E%5Exsd%3Adate)%0A%7D) - -{% include figure.html filename="sparql-lod-11.png" caption="Todos los grabados del British Museum realizados entre 1580-1600." %} - - -### Agregación - -Hasta ahora, solo hemos utilizado el comando `SELECT` para recuperar una tabla de objetos. Sin embargo, SPARQL nos permite realizar análisis muchos más avanzados, como agrupaciones, cálculos y clasificaciones. 
- -Pongamos por caso que estuviésemos interesados en examinar los objetos realizados entre 1580 y 1600, pero que asimismo quisiésemos conocer cuántos objetos de cada tipo tiene el British Museum en su colección. En vez de limitar nuestros resultados a los objetos del tipo "print", en este caso utilizaríamos el operador `COUNT` para sumar los resultados de nuestra búsqueda en función del tipo al que pertenezcan. - -``` -PREFIX bmo: <http://www.researchspace.org/ontology/> -PREFIX skos: <http://www.w3.org/2004/02/skos/core#> -PREFIX ecrm: <http://www.cidoc-crm.org/cidoc-crm/> -PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> - -SELECT ?type (COUNT(?type) as ?n) -WHERE { - # Es necesario que indiquemos la variable ?object_type, - # sin embargo, ahora no es necesario que esta coincida con el valor "print" - - ?object bmo:PX_object_type ?object_type . - ?object_type skos:prefLabel ?type . - - # De nuevo, filtraremos por fecha - ?object ecrm:P108i_was_produced_by ?production . - ?production ecrm:P9_consists_of ?date_node . - ?date_node ecrm:P4_has_time-span ?timespan . - ?timespan ecrm:P82a_begin_of_the_begin ?date . - FILTER(?date >= "1580-01-01"^^xsd:date && - ?date <= "1600-01-01"^^xsd:date) -} -# El comando GROUP BY designa la variable que se sumará, -# y el comando ORDER BY DESC() clasifica los resultados -# en orden descendente. 
- -GROUP BY ?type -ORDER BY DESC(?n) -``` - -[Run query](https://collection.britishmuseum.org/sparql#query=PREFIX+bmo%3A+%3Chttp%3A%2F%2Fwww.researchspace.org%2Fontology%2F%3E%0APREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0APREFIX+ecrm%3A+%3Chttp%3A%2F%2Fwww.cidoc-crm.org%2Fcidoc-crm%2F%3E%0APREFIX+xsd%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2001%2FXMLSchema%23%3E%0A%0ASELECT+%3Ftype+(COUNT(%3Ftype)+as+%3Fn)%0AWHERE+%7B%0A++%23+We+still+need+to+indicate+the+%3Fobject_type+variable%2C%0A++%23+however+we+will+not+require+it+to+match+%22print%22+this+time%0A%0A++%3Fobject+bmo%3APX_object_type+%3Fobject_type+.%0A++%3Fobject_type+skos%3AprefLabel+%3Ftype+.%0A%0A++%23+Once+again%2C+we+will+also+filter+by+date%0A++%3Fobject+ecrm%3AP108i_was_produced_by+%3Fproduction+.%0A++%3Fproduction+ecrm%3AP9_consists_of+%3Fdate_node+.%0A++%3Fdate_node+ecrm%3AP4_has_time-span+%3Ftimespan+.%0A++%3Ftimespan+ecrm%3AP82a_begin_of_the_begin+%3Fdate+.%0A++FILTER(%3Fdate+%3E%3D+%221580-01-01%22%5E%5Exsd%3Adate+%26%26%0A+++++++++%3Fdate+%3C%3D+%221600-01-01%22%5E%5Exsd%3Adate)%0A%7D%0A%23+The+GROUP+BY+command+designates+the+variable+to+tally+by%2C%0A%23+and+the+ORDER+BY+DESC()+command+sorts+the+results+by%0A%23+descending+number.%0AGROUP+BY+%3Ftype%0AORDER+BY+DESC(%3Fn)) - -{% include figure.html filename="sparql-lod-12.png" caption="Recuento de los objetos producidos entre 1580 y 1600 según el tipo al que pertenecen." %} - - -### Enlazando múltiples puntos de entrada SPARQL - -
    2018-06-13: Desafortunadamente, Europeana ha eliminado la opción de enlazar puntos de entrada externos por medio de consultas `SERVICE`, y, en consecuencia, esta sección ha dejado de funcionar. Mantenemos el texto que sigue porque creemos que puede tener valor como referencia y porque esperamos que el servicio de Europeana vuelva a estar operativo en el futuro.
    - -Hasta ahora, hemos construido consultas que buscan patrones en un único conjunto de datos. Sin embargo, el escenario ideal al que aspiran los partidarios de LOD viene dado por la posibilidad de enlazar múltiples bases de datos, lo que permitirá realizar consultas mucho más complejas al estar estas basadas en el conocimiento distribuido que es posible extraer de diversos espacios web. No obstante, esto resulta más fácil de decir que de hacer, y muchos puntos de entrada (incluido el del British Museum) todavía no referencian recursos de autoridad externos. - -Un punto de entrada que sí lo hace es el de [Europeana](http://sparql.europeana.eu/). Europeana ha creado enlaces entre los objetos de sus bases de datos y los registros de personas en [DBPedia](http://wiki.dbpedia.org/) y [VIAF](https://viaf.org/), los registros de lugares en [GeoNames](http://sws.geonames.org/), y los conceptos registrados en el *Tesauro de Arte y Arquitectura* (AAT) del Getty Research Institute. SPARQL nos permite insertar declaraciones `SERVICE` que ordenan a la base de datos "llamar a un amigo" y ejecutar una porción de la consulta en una base de datos externa, utilizando estos resultados para completar la consulta en la base de datos local. Si bien esta lección no se detendrá en los modelos de datos de Europeana y DBPedia en profundidad, la siguiente consulta nos permite ver cómo funciona la declaración `SERVICE`. Cada uno de los lectores puede ejecutarla por sí mismo copiando y pegando el texto de la consulta en el punto de entrada de [Europeana](http://sparql.europeana.eu). (A fin de que la consulta funcione, en el punto de entrada de Europeana se debe configurar el menú "Sponging" para "Retrieve remote RDF data for all missing source graphs"). 
- -``` -PREFIX ore: -PREFIX edm: -PREFIX rdf: -PREFIX dbo: -PREFIX dbr: -PREFIX rdaGr2: - -# Encuentra todos los ?object relacionados por alguna ?property con un ?agent nacido en una -# ?dutch_city -SELECT ?object ?property ?agent ?dutch_city -WHERE { - ?proxy ?property ?agent . - ?proxy ore:proxyFor ?object . - - ?agent rdf:type edm:Agent . - ?agent rdaGr2:placeOfBirth ?dutch_city . - - # En DBPedia, ?dutch_city está definida por pertenecer al país "Netherlands" - # La declaración SERVICE pregunta a - # http://dbpdeia.org/sparql qué ciudades pertenecen al país - # "Netherlands". La respuesta obtenida de esta subconsulta se utilizará para - # completar nuestra consulta originaria sobre los objetos - # presentes en la base de datos de Europeana - - SERVICE { - ?dutch_city dbo:country dbr:Netherlands . - } -} -# Potencialmente, esta consulta puede devolvernos un elevado número de objetos, por lo que vamos -# a solicitar solo los cien primeros a fin de agilizar la búsqueda -LIMIT 100 -``` - -{% include figure.html filename="sparql-lod-13.png" caption="Visualización de la secuencia de la consulta de la solicitud SPARQL definida más arriba." %} - - -Una consulta interconectada como esta significa que podemos interrogar a Europeana sobre los objetos que cuentan con información geográfica (¿cuáles son las ciudades de Holanda?) sin necesidad de que Europeana tenga que almacenar y mantener esta información por sí misma. Es de esperar que, en el futuro, cada vez mayor cantidad de información LOD de carácter cultural esté enlazada con bases de datos autorizadas, como el ULAN (*Union List of Artist Names*) del [Getty Research Institute](http://www.getty.edu/research/). Esto permitirá, por ejemplo, que el British Museum "externalice" la información biográfica acudiendo a los recursos más completos del GRI. - -## Trabajando con resultados SPARQL - -Una vez que hemos construido y ejecutado una consulta, ¿qué hacemos ahora con estos resultados? 
Muchos puntos de entrada, como el del British Museum, ofrecen un navegador web que devuelve resultados legibles para los humanos. Sin embargo, el objetivo de los puntos de entrada SPARQL (y para eso están diseñados) es devolver datos estructurados para ser utilizados por otros programas. - -### Exportar resultados en formato CSV - -En la esquina superior derecha de la página de resultados del punto de entrada del BM, se encuentran enlaces para descargas en formato JSON y XML. Otros puntos de entrada también pueden ofrecer la opción de descargar los resultados en CSV/TSV; sin embargo, esta opción no siempre se encuentra disponible. Las salidas JSON y XML desde un punto de entrada SPARQL contienen no solo los valores devueltos por la declaración `SELECT`, sino también metadatos adicionales sobre tipos de variables e idiomas. - -El procesamiento de la versión XML de los resultados se puede realizar con herramientas tales como Beautiful Soup (véase la lección correspondiente en *[The Programming Historian](/lessons/intro-to-beautiful-soup.html)*) u [OpenRefine](http://openrefine.org/). Para convertir rápidamente los resultados JSON desde un punto de entrada SPARQL en un formato tabular, yo recomiendo la utilidad de la línea de comando gratuita [jq](https://stedolan.github.io/jq/download/). (Para un tutorial sobre cómo utilizar programas de línea de comando, véase ["Introduction to the Bash Command Line"](/lessons/intro-to-bash.html)). La siguiente consulta convertirá el formato especial JSON RDF en un fichero CSV, que podremos cargar en nuestro programa preferido para su posterior análisis y visualización: - -``` -jq -r '.head.vars as $fields | ($fields | @csv), (.results.bindings[] | [.[$fields[]].value] | @csv)' sparql.json > sparql.csv -``` - - -### Exportar resultados a Palladio - -La popular plataforma de análisis de datos [Palladio](http://hdlab.stanford.edu/palladio/) puede cargar directamente datos desde un punto de entrada SPARQL. 
En la parte inferior de la pantalla "Create a new project", el enlace "Load data from a SPARQL endpoint (beta)" nos proporciona un campo para escribir la dirección del punto de entrada y una caja para la consulta propiamente dicha. Dependiendo del punto de entrada, podemos necesitar especificar el tipo de fichero de salida en la dirección del punto de entrada; por ejemplo, para cargar datos desde el punto de entrada del British Museum, debemos utilizar la dirección `http://collection.britishmuseum.org/sparql.json`. Trata de pegar la consulta de agregación que utilizamos más arriba para el recuento de obras de arte según su tipología y clica en "Run query". Palladio debería mostrar una tabla de previsualización como esta: - -{% include figure.html filename="sparql-lod-14.png" caption="Interfaz de Palladio para las consultas SPARQL." %} - - -Después de previsualizar los datos devueltos por el punto de entrada, clica en el botón "Load data" en la parte inferior de la pantalla para empezar a trabajar con ellos. (Véase esta lección de *[Programming Historian](/es/lecciones/creando-diagramas-de-redes-desde-fuentes-historicas)* para un tutorial más detallado sobre Palladio). 
[Por ejemplo, podríamos realizar una consulta que devuelva enlaces a las imágenes de los grabados realizados entre 1580 y 1600](https://collection.britishmuseum.org/sparql?query=%23+Return+object+links+and+creation+date%0D%0APREFIX+bmo%3A+%3Chttp%3A%2F%2Fcollection.britishmuseum.org%2Fid%2Fontology%2F%3E%0D%0APREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0D%0APREFIX+ecrm%3A+%3Chttp%3A%2F%2Ferlangen-crm.org%2Fcurrent%2F%3E%0D%0APREFIX+xsd%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2001%2FXMLSchema%23%3E%0D%0ASELECT+DISTINCT+%3Fobject+%3Fdate+%3Fimage%0D%0AWHERE+%7B%0D%0A%0D%0A++%23+We%27ll+use+our+previous+command+to+search+only+for+objects+of+type+%22print%22%0D%0A++%3Fobject+bmo%3APX_object_type+%3Fobject_type+.%0D%0A++%3Fobject_type+skos%3AprefLabel+%22print%22+.%0D%0A%0D%0A++%23+We+need+to+link+though+several+nodes+to+find+the+creation+date+associated%0D%0A++%23+with+an+object%0D%0A++%3Fobject+ecrm%3AP108i_was_produced_by+%3Fproduction+.%0D%0A++%3Fproduction+ecrm%3AP9_consists_of+%3Fdate_node+.%0D%0A++%3Fdate_node+ecrm%3AP4_has_time-span+%3Ftimespan+.%0D%0A++%3Ftimespan+ecrm%3AP82a_begin_of_the_begin+%3Fdate+.%0D%0A%0D%0A++%23+Yes%2C+we+need+to+connect+quite+a+few+dots+to+get+to+the+date+node%21+Now+that%0D%0A++%23+we+have+it%2C+we+can+filter+our+results.+Because+we+are+filtering+a+date%2C+we%0D%0A++%23+must+attach+the+xsd%3Adate+tag+to+our+date+strings+so+that+SPARQL+knows+how+to%0D%0A++%23+parse+them.%0D%0A%0D%0A++FILTER%28%3Fdate+%3E%3D+%221580-01-01%22%5E%5Exsd%3Adate+%26%26+%3Fdate+%3C%3D+%221600-01-01%22%5E%5Exsd%3Adate%29%0D%0A++%0D%0A++%3Fobject+bmo%3APX_has_main_representation+%3Fimage+.%0D%0A%7D%0D%0ALIMIT+100#query=%23+Return+object+links+and+creation+date%0APREFIX+bmo%3A+%3Chttp%3A%2F%2Fwww.researchspace.org%2Fontology%2F%3E%0APREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0APREFIX+xsd%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2001%2FXMLSchema%23%3E%0APREFIX+ecrm%3A+%3Chttp%3A%2F%2Fwww.cidoc-crm.org%2Fcidoc-crm%2F%3E%
0ASELECT+DISTINCT+%3Fobject+%3Fdate+%3Fimage%0AWHERE+%7B%0A++%0A++%23+We'll+use+our+previous+command+to+search+only+for+objects+of+type+%22print%22%0A++%3Fobject+bmo%3APX_object_type+%3Fobject_type+.%0A++%3Fobject_type+skos%3AprefLabel+%22print%22+.%0A%0A++%23+We+need+to+link+though+several+nodes+to+find+the+creation+date+associated%0A++%23+with+an+object%0A++%3Fobject+ecrm%3AP108i_was_produced_by+%3Fproduction+.%0A++%3Fproduction+ecrm%3AP9_consists_of+%3Fdate_node+.%0A++%3Fdate_node+ecrm%3AP4_has_time-span+%3Ftimespan+.%0A++%3Ftimespan+ecrm%3AP82a_begin_of_the_begin+%3Fdate+.%0A%0A++%0A++%23+Yes%2C+we+need+to+connect+quite+a+few+dots+to+get+to+the+date+node!+Now+that%0A++%23+we+have+it%2C+we+can+filter+our+results.+Because+we+are+filtering+a+date%2C+we%0A++%23+must+attach+the+xsd%3Adate+tag+to+our+date+strings+so+that+SPARQL+knows+how+to%0A++%23+parse+them.%0A%0A++FILTER(%3Fdate+%3E%3D+%221580-01-01%22%5E%5Exsd%3Adate+%26%26+%3Fdate+%3C%3D+%221600-01-01%22%5E%5Exsd%3Adate)%0A++%0A++%3Fobject+bmo%3APX_has_main_representation+%3Fimage+.%0A%7D%0ALIMIT+100), y representar estos datos como una galería de imágenes clasificadas por fecha: - -{% include figure.html filename="sparql-lod-15.png" caption="Galería de imágenes con línea de tiempo de sus fechas de creación generada utilizando Palladio." %} - - -Adviértase que Palladio está diseñado para funcionar con un conjunto relativamente pequeño de datos (del orden de cientos de miles de filas, no decenas de miles), por lo que pudiera ser necesario utilizar el comando LIMIT, que ya empleamos anteriormente en la consulta en el punto de entrada de Europeana, para reducir el número de resultados obtenidos y así evitar que el programa se quede bloqueado. - -## Lecturas adicionales - -En este tutorial hemos examinado la estructura de LOD y hemos realizado un ejemplo real de cómo escribir consultas SPARQL para la base de datos del British Museum. 
También hemos aprendido cómo utilizar comandos de agregación en SPARQL para agrupar, contar y clasificar resultados más allá de la simple operación de listarlos. - -Con todo, existen otras muchas maneras de modificar estas consultas, tales como introducir operadores `OR` y `UNION` (para describir consultas condicionales) y declaraciones `CONSTRUCT` (para inferir nuevos enlaces basados en reglas definidas), búsqueda de texto completo o llevar a cabo otras operaciones matemáticas más complejas que la del recuento. Para un informe más detallado de los comandos disponibles en SPARQL, véanse estos enlaces: - -* [Wikibooks SPARQL tutorial](https://en.wikibooks.org/wiki/XQuery/SPARQL_Tutorial) -* [Full W3C Overview of SPARQL](https://www.w3.org/TR/sparql11-overview/) - -Tanto la web de Europeana como la del Getty Vocabularies ofrecen ejemplos extensos y bastante complejos de consultas que pueden constituir buenos recursos para comprender cómo buscar en sus datos: - -* [Europeana SPARQL how-to](http://labs.europeana.eu/api/linked-open-data-SPARQL-endpoint) -* [Getty Vocabularies Example Queries](http://vocab.getty.edu/queries#Finding_Subjects) +--- +title: | + Uso de SPARQL para acceder a datos abiertos enlazados +authors: +- Matthew Lincoln +date: 2015-11-24 +translation_date: 2017-05-20 +editors: +- Fred Gibbs +reviewers: +- Patrick Murray-John +- Jason Heppler +- Will Hanley +- Fred Gibbs +translator: +- Nuria Rodríguez Ortega +translation-editor: +- Antonio Rojas Castro +translation-reviewer: +- Antonio Rojas Castro +- Juan Antonio Pastor Sánchez +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/67 +layout: lesson +original: graph-databases-and-SPARQL +redirect_from: + - /es/lessons/graph-databases-and-SPARQL/ + - /es/lecciones/sparql-datos-abiertos-enlazados/ +difficulty: 2 +activity: acquiring +topics: [lod] +abstract: "Esta lección explica por qué numerosas instituciones culturales están adoptando bases de datos orientadas a grafos y cómo 
los investigadores pueden acceder a estos datos a través de consultas realizadas en el lenguaje llamado SPARQL." +retired: true +retirement-reason: | + El Museo Británico no ha mantenido el acceso a su base de datos de colecciones de una manera consistente. Aunque la sintaxis y los comandos de SPARQL siguen siendo correctos, las URLs a las que intentan conectarse son ahora demasiado inconsistentes para su uso en una lección funcional. +avatar_alt: Grabado con dos peces unidos por una rama en sus bocas. +doi: 10.46430/phes0027 +--- + +Objetivos de la lección +----------------------- + +Esta lección explica por qué numerosas instituciones culturales están adoptando bases de datos orientadas a grafos (*graph databases*) y cómo los investigadores pueden acceder a estos datos a través de consultas realizadas en el lenguaje llamado SPARQL. + +{% include toc.html %} + + + + + + +## Bases de datos orientadas a grafo, RDF y datos abiertos enlazados (Linked Open Data, LOD) + +Actualmente, numerosas instituciones culturales están ofreciendo información sobre sus colecciones a través de las denominadas API ([*Application Programming Interfaces*](/en/lessons/retired/intro-to-the-zotero-api)). Estas API son instrumentos muy eficaces para acceder de manera automatizada a registros individuales, sin embargo, no constituyen el procedimiento ideal cuando tratamos con datos culturales debido a que las API están estructuradas para trabajar con un conjunto predeterminado de consultas (*queries*). Por ejemplo, un museo puede tener información sobre donantes, artistas, obras de arte, exposiciones, procedencia de sus obras (*provenance*), etc., pero su API puede ofrecer solo una recuperación orientada a objetos, haciendo difícil o imposible buscar datos relacionados con donantes, artistas, etc. 
Así pues, esta estructura es interesante si el objetivo es buscar información sobre objetos particulares; sin embargo, puede complicar la operación de agregar información sobre los artistas o donantes que también se encuentran registrados en la base de datos. + +Las bases de datos RDF son muy apropiadas para expresar relaciones complejas entre múltiples entidades, como personas, lugares, eventos y conceptos ligados a objetos individuales. Estas bases de datos se denominan habitualmente bases de datos orientadas a grafos (*graph databases*) porque estructuran la información como un grafo o red, donde un conjunto de recursos o nodos están conectados entre sí mediante aristas (o enlaces) que describen las relaciones establecidas entre dichos recursos y/o nodos. + +Dado que las bases de datos RDF admiten el uso de URL, estas pueden estar accesibles *online* y también pueden enlazarse a otras bases de datos, de ahí el término "datos abiertos enlazados" (*Linked Open Data*, LOD). Importantes colecciones artísticas, entre las que se incluyen las del [British Museum](https://collection.britishmuseum.org/), [Europeana](https://labs.europeana.eu/api/linked-open-data-introduction), el [Smithsonian American Art Museum](https://americanart.si.edu/) y el [Yale Center for British Art](https://britishart.yale.edu/collections/using-collections/technology/linked-open-data), han publicado sus colecciones de datos como LOD. El [Getty Vocabulary Program](https://vocab.getty.edu/) también ha publicado sus vocabularios controlados (TGN, ULAN y AAT) como LOD. + +SPARQL es el lenguaje utilizado para interrogar este tipo de bases de datos. Este lenguaje es particularmente potente porque obvia las perspectivas que los usuarios transfieren a los datos. Una consulta sobre objetos y una consulta sobre donantes son prácticamente equivalentes en estas bases de datos. 
Lamentablemente, numerosos tutoriales sobre SPARQL utilizan modelos de datos tan extremadamente simplificados que no son operativos cuando se trata de utilizar las complejas bases de datos desarrolladas por las instituciones culturales. Este tutorial ofrece un curso intensivo sobre SPARQL utilizando un conjunto de datos (*dataset*) que un humanista podría realmente encontrar en Internet. En concreto, en este tutorial aprenderemos cómo interrogar la colección LOD del British Museum. + +### RDF en pocas palabras + +RDF representa la información en una declaración triple -también llamada tripleta- que sigue la estructura sujeto-predicado-objeto. Por ejemplo: + +``` + . + +``` + +(Observa que, como toda buena oración, estas declaraciones terminan con un punto y final). + +En este ejemplo, el sujeto `` y el objeto `` pueden ser considerados como dos nodos de un grafo, donde el predicado `` define la arista -o relación- entre ellos. (Técnicamente, puede ser tratado en otras consultas como un objeto o un sujeto, pero esta cuestión escapa el alcance de este tutorial). + +Una seudobase de datos RDF podría contener declaraciones interrelacionadas entre sí, como las siguientes: + +``` +... + . + <1642>. + <óleo sobre lienzo>. + <1606>. + . + . + . + <óleo sobre lienzo>. +... +``` + +Si visualizásemos estas declaraciones como nodos y aristas de un grafo o red, la representación sería como sigue: + +{% include figure.html caption="Visualización en red del seudoRDF mostrado más arriba. Las flechas indican la 'dirección' del predicado. Por ejemplo, que '*La tasadora de perlas* fue creada por Vermeer' y no al revés. Diagrama reconstruido por Nuria Rodríguez Ortega." filename="sparql-lod-01.png" %} + +Las tradicionales bases de datos relacionales pueden distribuir atributos sobre obras de arte y artistas en tablas separadas. 
En las bases de datos RDF u orientadas a grafos, todos estos datos pertenecen a un mismo grafo interconectado, lo que permite a los usuarios una mayor flexibilidad a la hora de decidir cómo quieren interrogar estos recursos. + +### Buscando RDF con SPARQL + +SPARQL nos permite traducir datos en grafo, intensamente enlazados, en datos normalizados en formato tabular, esto es, distribuidos en filas y columnas, que se pueden abrir en programas como Excel o importar a programas de visualización, tales como [plot.ly](https://plot.ly/) o [Palladio](https://hdlab.stanford.edu/palladio/). + +Resulta útil pensar las consultas SPARQL como un [Mad Lib](https://en.wikipedia.org/wiki/Mad_Libs) -un conjunto de oraciones con espacios en blanco-. La base de datos tomará esta consulta y encontrará cada conjunto de oraciones que encaje correctamente en estos espacios en blanco, devolviéndonos los valores coincidentes como una tabla. Veamos esta consulta SPARQL: + +``` +SELECT ?pintura +WHERE { + ?pintura <óleo sobre lienzo> . +} +``` + +En esta consulta, `?pintura` representa el nodo (o nodos) que la base de datos nos devolverá. Una vez recibida la consulta, la base de datos buscará todos los valores para `?pintura` que completen adecuadamente la declaración RDF ` <óleo sobre lienzo>`. + +{% include figure.html caption="Visualización de lo que nuestra consulta está buscando. Diagrama reconstruido por Nuria Rodríguez Ortega." filename="sparql-lod-02.png" %} + + +Cuando la consulta interroga la base de datos completa, esta busca los sujetos, predicados y objetos que coinciden con esta declaración, excluyendo, al mismo tiempo, el resto de datos. + +{% include figure.html filename="sparql-lod-03.png" caption="Visualización de la consulta SPARQL con los elementos mencionados en naranja y los elementos seleccionados (aquellos que nos serán devueltos en los resultados) en rojo. Diagrama reconstruido por Nuria Rodríguez Ortega."
%} + +Nuestros resultados podrían tener este aspecto: + +| **pinturas** | +| --------------------- | +| La ronda de noche | +| La tasadora de perlas | + +Ahora bien, lo que hace a RDF y a SPARQL herramientas tan potentes es su habilidad para crear consultas complejas que referencian múltiples variables al mismo tiempo. Por ejemplo, podríamos buscar en nuestra seudobase de datos RDF pinturas creadas por cualquier artista que fuese holandés: + +``` +SELECT ?artista ?pintura +WHERE { + ?artista . + ?pintura ?artista . + } +``` + +En este ejemplo, hemos introducido una segunda variable: `?artista`. La base de datos RDF devolverá todas las combinaciones coincidentes de `?artista` y `?pintura` que encajen en ambas declaraciones. + +{% include figure.html filename="sparql-lod-04.png" caption="Visualización de la consulta SPARQL con los elementos mencionados en naranja y los elementos seleccionados (aquellos que serán recuperados en los resultados) en rojo. Diagrama reconstruido por Nuria Rodríguez Ortega." %} + +| artistas | pinturas | +| ------------------ | --------------------- | +| Rembrandt van Rijn | La ronda de noche | +| Johannes Vermeer | La tasadora de perlas | + +### URI y literales + +Hasta ahora, hemos visto una representación ficticia de RDF que utiliza un texto fácil de leer. Sin embargo, RDF se almacena principalmente en formato URI (*Uniform Resource Identifiers*), que separa las entidades conceptuales de sus etiquetas lingüísticas. (Ten en cuenta que una URL, o *Uniform Resource Locator*, es una URI accesible desde la web). En RDF real, nuestra declaración original: + +``` + . +``` + +sería más parecido a lo siguiente: + +``` + . +``` + +*N.B.
el Rijksmuseum todavía no ha desarrollado su propio sitio LOD, por lo que en esta consulta la URI responde únicamente a objetivos de demostración.* + +A fin de obtener una versión legible desde el punto de vista humano de la información representada por cada una de estas URI, lo que hacemos realmente es recuperar más declaraciones RDF. Incluso el predicado en esta declaración tiene su propia etiqueta literal: + +``` + "La ronda de noche". + "fue creado por". + "Rembrandt van Rijn". +``` + +Como se puede observar, a diferencia de las URI que en esta consulta están enmarcadas por los signos `<>`, los *objetos* son cadenas de texto entrecomilladas. Esto es lo que se conoce como *literales* (*literals*). Los literales representan valores, mientras que las URI representan referencias. Por ejemplo, `` representa una entidad que puede referenciar (y puede ser referenciada por) muchas otras declaraciones (fechas de nacimiento, discípulos, miembros de la familia, etc.), mientras que la cadena de texto `"Rembrandt van Rijn"` solo se representa a sí misma. Otros valores literales en RDF incluyen fechas y números. + +Fijémonos ahora en los predicados de estas declaraciones, con nombres de dominio como `purl.org`, `w3.org` y `xmlns.com`. Estos son algunos de los numerosos proveedores de ontologías que ayudan a estandarizar el modo en que describimos relaciones entre bits de información como "título", "etiqueta", "creador" o "nombre". Cuanto más trabajemos con RDF/LOD, más proveedores de este tipo encontraremos. + +Las URI pueden llegar a ser difíciles de manejar cuando se componen consultas SPARQL. Para simplificar este proceso se utilizan los *prefijos* (*prefixes*). Los prefijos son atajos que nos liberan de tener que escribir toda la larga cadena de caracteres que constituye una URI. Por ejemplo, recordemos el predicado para recuperar el título de *La ronda de noche*, `<http://purl.org/dc/terms/title>`.
Con los siguientes prefijos, solo necesitamos escribir `dct:title` cuando queramos utilizar un predicado `purl.org`. `dct:` representa la cadena completa `http://purl.org/dc/terms/`, y `title` simplemente se agrega al final de este enlace. + +Por ejemplo, con el prefijo `PREFIX rkm: <http://data.rijksmuseum.nl/>`, en el que `rkm:` representa la cadena completa `http://data.rijksmuseum.nl/`, agregado al inicio de nuestra consulta SPARQL, `<http://data.rijksmuseum.nl/item/8909812347>` se convierte en `rkm:item/8909812347`. + +Debemos ser conscientes de que los prefijos se pueden asignar arbitrariamente a cualquier abreviatura que queramos; así, diferentes puntos de entrada (*endpoints*) pueden utilizar prefijos ligeramente diferentes para el mismo espacio de nombre (*namespace*) (por ejemplo: `dct` vs. `dcterms` para `<http://purl.org/dc/terms/>`). + +### Términos para revisar + +* **SPARQL** - *Protocol and RDF Query Language* - El lenguaje utilizado para interrogar bases de datos RDF u orientadas a grafos. +* **RDF** - *Resource Description Framework* - Un método para estructurar datos en forma de grafo o como una red de declaraciones conectadas más que como una serie de tablas. +* **LOD** - *Linked Open Data* (datos abiertos enlazados) - LOD son datos RDF publicados *online* en formato URI de modo que los desarrolladores pueden referenciarlos de manera fiable y sin ambigüedad. +* **declaración** - a veces denominada "tripleta", una declaración RDF es una unidad de conocimiento que comprende sujeto, predicado y objeto. +* **URI** - *Uniform Resource Identifier* - una cadena de caracteres que identifica un recurso. Las declaraciones RDF utilizan URI para enlazar varios recursos. Una URL, o *Uniform Resource Locator*, es un tipo de URI que apunta a un determinado recurso en la web. +* **literal** - En las declaraciones RDF, algunos objetos no referencian recursos con una URI sino que vehiculan un valor, que puede ser un texto (`"Rembrandt van Rijn"`), un número (`5`) o una fecha (`1606-06-15`). Estos objetos se conocen como literales.
+* **prefijo** - A fin de simplificar las consultas SPARQL, un usuario puede especificar prefijos que funcionan como abreviaturas de las URI completas. Estas abreviaturas, o **QNames**, se utilizan también en los espacios de nombre (*namespaces*) de los documentos XML. + +## Consultas basadas en casos reales + +### Todas las declaraciones para un objeto + +Vamos a empezar nuestra primera consulta utilizando el [punto de entrada SPARQL del British Museum](https://collection.britishmuseum.org/sparql). Un punto de entrada SPARQL es una dirección web que acepta consultas SPARQL y devuelve resultados. El punto de entrada del British Museum funciona como muchos otros: cuando accedemos a él a través de un navegador web, encontramos una caja de texto para componer las consultas. + +{% include figure.html filename="sparql-lod-05.png" caption="Web del punto de entrada SPARQL del British Museum. Para todas las consultas de este tutorial, hay que asegurarse de haber dejado las casillas 'Include inferred' y 'Expand results over equivalent URIs' sin marcar." %} + + +Cuando empezamos a explorar una nueva base de datos RDF, resulta útil examinar, a modo de ejemplo, las relaciones que emanan de un [objeto en concreto](https://collection.britishmuseum.org/resource?uri=https://collection.britishmuseum.org/id/object/PPA82633). + +(Para cada una de las siguientes consultas, clica en el enlace "Run query" situado más abajo para ver los resultados. La puedes ejecutar tal y como está o modificarla antes. En este último caso, recuerda que es necesario dejar sin marcar la casilla "Include inferred" antes de ejecutar la consulta). + +``` +SELECT ?p ?o +WHERE { + <http://collection.britishmuseum.org/id/object/PPA82633> ?p ?o .
+} +``` + +[Run query](https://collection.britishmuseum.org/sparql?query=SELECT+*%0D%0AWHERE+%7B%0D%0A++%3Chttp%3A%2F%2Fcollection.britishmuseum.org%2Fid%2Fobject%2FPPA82633%3E+%3Fp+%3Fo+.%0D%0A++%7D&_implicit=false&_equivalent=false&_form=%2Fsparql) + +Con la orden `SELECT ?p ?o`, le estamos diciendo a la base de datos que nos devuelva los valores de `?p` y `?o` descritos en el comando `WHERE {}`. Esta consulta devuelve cada declaración para la cual nuestra obra de arte seleccionada, `<http://collection.britishmuseum.org/id/object/PPA82633>`, es el sujeto. `?p` ocupa la posición central en la declaración RDF en el comando `WHERE {}`, por lo que esta devuelve cualquier predicado que coincide con la declaración, mientras que `?o`, en la posición final, devuelve todos los objetos. Aunque yo las he nombrado como `?p` y `?o`, en realidad, tal y como se puede ver en el ejemplo inferior, es posible nombrar estas variables del modo que nosotros queramos. De hecho, será útil darles nombres significativos para las consultas complejas que siguen a continuación. + +{% include figure.html filename="sparql-lod-06.png" caption="Listado inicial de todos los predicados y objetos asociados con una obra de arte en el British Museum." %} + + +El punto de entrada del British Museum formatea la tabla de resultados con enlaces para cada una de las variables, que son, en realidad, nodos RDF, por lo que clicando en cada uno de estos enlaces podemos ver todos los predicados y objetos para cada uno de los nodos seleccionados. Advierte que el British Museum incluye automáticamente un amplio rango de prefijos SPARQL en sus consultas, por lo que encontraremos numerosos enlaces mostrados en su versión abreviada; si pasamos el ratón sobre ellos, podremos ver las URI sin abreviar. + +{% include figure.html filename="sparql-lod-07.png" caption="Visualización del conjunto de nodos recuperados a través de la primera consulta realizada a la base de datos del British Museum.
Los elementos de este grafo coloreados en rojo se encuentran también en la tabla de resultados mostrada más arriba. Se han incluido niveles adicionales en la jerarquía para mostrar cómo esta obra en particular se encuentra conectada en el grafo general que constituye la base de datos del BM." %} + + +Veamos ahora cómo se almacena la información de tipo objeto: busca el predicado `<bmo:PX_object_type>` (marcado en la tabla anterior) y clica en el enlace `thes:x8577` para acceder al nodo que describe el tipo de objeto "print" (grabado). + +{% include figure.html filename="sparql-lod-08.png" caption="Página del recurso `thes:x8577` ('print') en el conjunto de datos enlazados del British Museum." %} + +Como se puede observar, este nodo tiene una etiqueta (*label*) en texto plano, así como enlaces a nodos del tipo "objetos artísticos" con los que se relaciona en el conjunto de la base de datos. + +### Consultas complejas + +Para encontrar otros objetos del mismo tipo descritos con la etiqueta "print", podemos invocar esta consulta: + +``` +PREFIX bmo: <http://www.researchspace.org/ontology/> +PREFIX skos: <http://www.w3.org/2004/02/skos/core#> + +SELECT ?object +WHERE { + + # Busca todos los valores de ?object que tengan un "object type" dado + ?object bmo:PX_object_type ?object_type . + + # El "object type" debería tener la etiqueta "print" + ?object_type skos:prefLabel "print" .
+} +LIMIT 10 +``` + +[Run query](https://collection.britishmuseum.org/sparql#query=PREFIX+bmo%3A+%3Chttp%3A%2F%2Fwww.researchspace.org%2Fontology%2F%3E%0APREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0A%0ASELECT+%3Fobject%0AWHERE+%7B%0A%0A++%23+Search+for+all+values+of+%3Fobject+that+have+a+given+%22object+type%22%0A++%3Fobject+bmo%3APX_object_type+%3Fobject_type+.%0A%0A++%23+That+object+type+should+have+the+label+%22print%22%0A++%3Fobject_type+skos%3AprefLabel+%22print%22+.%0A%7D%0ALIMIT+10) / [User-generated query](https://hypothes.is/a/AVLH7aAMvTW_3w8Ly19w) + +{% include figure.html filename="sparql-lod-09.png" caption="Tabla resultantes de nuestra consulta para todos los objetos del tipo 'print'." %} + +Recuerda que, dado que `"print"` funciona aquí como un literal, lo escribimos entrecomillado en nuestra consulta. Cuando se incluyen literales en las consultas SPARQL, la base de datos solo devuelve coincidencias exactas para estos valores. + +Advierte también que, dado que `?object_type` no se encuentra presente en el comando `SELECT`, este no se mostrará en la tabla de resultados. Sin embargo, resulta esencial estructurar nuestra consulta, porque es esto lo que permite conectar los puntos desde `?object` con la etiqueta `"print"`. + +### FILTER + +En los ejemplos anteriores, nuestra consulta SPARQL ha buscado una coincidencia exacta para el tipo de objeto con la etiqueta "print". Sin embargo, con frecuencia querremos encontrar valores literales que caen dentro de un determinado rango, como son las fechas. Para ello utilizaremos el comando `FILTER`. + +Para localizar las URI de todos los grabados presentes en la base de datos del British Museum creados entre 1580 y 1600, necesitaremos, en primer lugar, averiguar dónde se almacenan en la base de datos las fechas en relación con los objetos, y entonces añadir referencias a estas fechas en nuestra consulta. 
A diferencia del caso anterior, en el que bastaba con seguir un único enlace para determinar el tipo de objeto, ahora debemos movernos a través de diversos nodos para encontrar las fechas de producción asociadas a un objeto dado: + +{% include figure.html filename="sparql-lod-10.png" caption="Visualización de la parte del modelo de datos del British Museum donde las fechas de producción están conectadas a los objetos." %} + +``` +PREFIX bmo: <http://www.researchspace.org/ontology/> +PREFIX skos: <http://www.w3.org/2004/02/skos/core#> +PREFIX ecrm: <http://www.cidoc-crm.org/cidoc-crm/> +PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> + +# Recupera enlaces de objetos y fechas de creación +SELECT ?object ?date +WHERE { + + # Utilizaremos nuestro comando previo para buscar solo + # objetos del tipo "print" + ?object bmo:PX_object_type ?object_type . + ?object_type skos:prefLabel "print" . + + # Necesitamos enlazar diversos nodos para encontrar la + # fecha de creación asociada con un objeto + ?object ecrm:P108i_was_produced_by ?production . + ?production ecrm:P9_consists_of ?date_node . + ?date_node ecrm:P4_has_time-span ?timespan . + ?timespan ecrm:P82a_begin_of_the_begin ?date . + + # Como se ve, es necesario conectar unos cuantos puntos + # para llegar al nodo de la fecha. Ahora que lo tenemos, podemos + # filtrar nuestros resultados. Dado que estamos filtrando por fecha, + # debemos agregar la etiqueta ^^xsd:date después de nuestra cadena de fecha. + # Esta etiqueta le dice a la base de datos que interprete la cadena + # "1580-01-01" como la fecha 1 de enero de 1580.
+ + FILTER(?date >= "1580-01-01"^^xsd:date && + ?date <= "1600-01-01"^^xsd:date) +} +``` + +[Run query](https://collection.britishmuseum.org/sparql#query=PREFIX+bmo%3A+%3Chttp%3A%2F%2Fwww.researchspace.org%2Fontology%2F%3E%0APREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0APREFIX+ecrm%3A+%3Chttp%3A%2F%2Fwww.cidoc-crm.org%2Fcidoc-crm%2F%3E%0APREFIX+xsd%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2001%2FXMLSchema%23%3E%0A%0A%23+Return+object+links+and+creation+date%0ASELECT+%3Fobject+%3Fdate%0AWHERE+%7B%0A%0A++%23+We'll+use+our+previous+command+to+search+only+for%0A++%23+objects+of+type+%22print%22%0A++%3Fobject+bmo%3APX_object_type+%3Fobject_type+.%0A++%3Fobject_type+skos%3AprefLabel+%22print%22+.%0A%0A++%23+We+need+to+link+though+several+nodes+to+find+the%0A++%23+creation+date+associated+with+an+object%0A++%3Fobject+ecrm%3AP108i_was_produced_by+%3Fproduction+.%0A++%3Fproduction+ecrm%3AP9_consists_of+%3Fdate_node+.%0A++%3Fdate_node+ecrm%3AP4_has_time-span+%3Ftimespan+.%0A++%3Ftimespan+ecrm%3AP82a_begin_of_the_begin+%3Fdate+.%0A%0A++%23+As+you+can+see%2C+we+need+to+connect+quite+a+few+dots%0A++%23+to+get+to+the+date+node!+Now+that+we+have+it%2C+we+can%0A++%23+filter+our+results.+Because+we+are+filtering+by+date%2C%0A++%23+we+must+attach+the+tag+%5E%5Exsd%3Adate+after+our+date+strings.%0A++%23+This+tag+tells+the+database+to+interpret+the+string%0A++%23+%221580-01-01%22+as+the+date+1+January+1580.%0A%0A++FILTER(%3Fdate+%3E%3D+%221580-01-01%22%5E%5Exsd%3Adate+%26%26%0A+++++++++%3Fdate+%3C%3D+%221600-01-01%22%5E%5Exsd%3Adate)%0A%7D) + +{% include figure.html filename="sparql-lod-11.png" caption="Todos los grabados del British Museum realizados entre 1580-1600." %} + + +### Agregación + +Hasta ahora, solo hemos utilizado el comando `SELECT` para recuperar una tabla de objetos. Sin embargo, SPARQL nos permite realizar análisis muchos más avanzados, como agrupaciones, cálculos y clasificaciones. 
+ +Pongamos por caso que estuviésemos interesados en examinar los objetos realizados entre 1580 y 1600, pero que asimismo quisiésemos conocer cuántos objetos de cada tipo tiene el British Museum en su colección. En vez de limitar nuestros resultados a los objetos del tipo "print", en este caso utilizaríamos el operador `COUNT` para sumar los resultados de nuestra búsqueda en función del tipo al que pertenezcan. + +``` +PREFIX bmo: <http://www.researchspace.org/ontology/> +PREFIX skos: <http://www.w3.org/2004/02/skos/core#> +PREFIX ecrm: <http://www.cidoc-crm.org/cidoc-crm/> +PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> + +SELECT ?type (COUNT(?type) as ?n) +WHERE { + # Es necesario que indiquemos la variable ?object_type, + # sin embargo, ahora no es necesario que esta coincida con el valor "print" + + ?object bmo:PX_object_type ?object_type . + ?object_type skos:prefLabel ?type . + + # De nuevo, filtraremos por fecha + ?object ecrm:P108i_was_produced_by ?production . + ?production ecrm:P9_consists_of ?date_node . + ?date_node ecrm:P4_has_time-span ?timespan . + ?timespan ecrm:P82a_begin_of_the_begin ?date . + FILTER(?date >= "1580-01-01"^^xsd:date && + ?date <= "1600-01-01"^^xsd:date) +} +# El comando GROUP BY designa la variable por la que se agrupa el recuento, +# y el comando ORDER BY DESC() clasifica los resultados +# en orden descendente.
+ +GROUP BY ?type +ORDER BY DESC(?n) +``` + +[Run query](https://collection.britishmuseum.org/sparql#query=PREFIX+bmo%3A+%3Chttp%3A%2F%2Fwww.researchspace.org%2Fontology%2F%3E%0APREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0APREFIX+ecrm%3A+%3Chttp%3A%2F%2Fwww.cidoc-crm.org%2Fcidoc-crm%2F%3E%0APREFIX+xsd%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2001%2FXMLSchema%23%3E%0A%0ASELECT+%3Ftype+(COUNT(%3Ftype)+as+%3Fn)%0AWHERE+%7B%0A++%23+We+still+need+to+indicate+the+%3Fobject_type+variable%2C%0A++%23+however+we+will+not+require+it+to+match+%22print%22+this+time%0A%0A++%3Fobject+bmo%3APX_object_type+%3Fobject_type+.%0A++%3Fobject_type+skos%3AprefLabel+%3Ftype+.%0A%0A++%23+Once+again%2C+we+will+also+filter+by+date%0A++%3Fobject+ecrm%3AP108i_was_produced_by+%3Fproduction+.%0A++%3Fproduction+ecrm%3AP9_consists_of+%3Fdate_node+.%0A++%3Fdate_node+ecrm%3AP4_has_time-span+%3Ftimespan+.%0A++%3Ftimespan+ecrm%3AP82a_begin_of_the_begin+%3Fdate+.%0A++FILTER(%3Fdate+%3E%3D+%221580-01-01%22%5E%5Exsd%3Adate+%26%26%0A+++++++++%3Fdate+%3C%3D+%221600-01-01%22%5E%5Exsd%3Adate)%0A%7D%0A%23+The+GROUP+BY+command+designates+the+variable+to+tally+by%2C%0A%23+and+the+ORDER+BY+DESC()+command+sorts+the+results+by%0A%23+descending+number.%0AGROUP+BY+%3Ftype%0AORDER+BY+DESC(%3Fn)) + +{% include figure.html filename="sparql-lod-12.png" caption="Recuento de los objetos producidos entre 1580 y 1600 según el tipo al que pertenecen." %} + + +### Enlazando múltiples puntos de entrada SPARQL + +
    2018-06-13: Desafortunadamente, Europeana ha eliminado la opción de enlazar puntos de entrada externos por medio de consultas `SERVICE`, y, en consecuencia, esta sección ha dejado de funcionar. Mantenemos el texto que sigue porque creemos que puede tener valor como referencia y porque esperamos que el servicio de Europeana vuelva a estar operativo en el futuro.
    + +Hasta ahora, hemos construido consultas que buscan patrones en un único conjunto de datos. Sin embargo, el escenario ideal al que aspiran los partidarios de LOD viene dado por la posibilidad de enlazar múltiples bases de datos, lo que permitirá realizar consultas mucho más complejas al estar estas basadas en el conocimiento distribuido que es posible extraer de diversos espacios web. No obstante, esto resulta más fácil de decir que de hacer, y muchos puntos de entrada (incluido el del British Museum) todavía no referencian recursos de autoridad externos. + +Un punto de entrada que sí lo hace es el de [Europeana](https://sparql.europeana.eu/). Europeana ha creado enlaces entre los objetos de sus bases de datos y los registros de personas en [DBPedia](https://wiki.dbpedia.org/) y [VIAF](https://www.oclc.org/es/viaf.html), los registros de lugares en [GeoNames](https://sws.geonames.org/), y los conceptos registrados en el *Tesauro de Arte y Arquitectura* (AAT) del Getty Research Institute. SPARQL nos permite insertar declaraciones `SERVICE` que ordenan a la base de datos "llamar a un amigo" y ejecutar una porción de la consulta en una base de datos externa, utilizando estos resultados para completar la consulta en la base de datos local. Si bien esta lección no se detendrá en los modelos de datos de Europeana y DBPedia en profundidad, la siguiente consulta nos permite ver cómo funciona la declaración `SERVICE`. Cada uno de los lectores puede ejecutarla por sí mismo copiando y pegando el texto de la consulta en el punto de entrada de [Europeana](https://sparql.europeana.eu). (A fin de que la consulta funcione, en el punto de entrada de Europeana se debe configurar el menú "Sponging" para "Retrieve remote RDF data for all missing source graphs").
+ +``` +PREFIX ore: <http://www.openarchives.org/ore/terms/> +PREFIX edm: <http://www.europeana.eu/schemas/edm/> +PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> +PREFIX dbo: <http://dbpedia.org/ontology/> +PREFIX dbr: <http://dbpedia.org/resource/> +PREFIX rdaGr2: <http://rdvocab.info/ElementsGr2/> + +# Encuentra todos los ?object relacionados por alguna ?property con un ?agent nacido en una +# ?dutch_city +SELECT ?object ?property ?agent ?dutch_city +WHERE { + ?proxy ?property ?agent . + ?proxy ore:proxyFor ?object . + + ?agent rdf:type edm:Agent . + ?agent rdaGr2:placeOfBirth ?dutch_city . + + # En DBPedia, ?dutch_city está definida por pertenecer al país "Netherlands" + # La declaración SERVICE pregunta a + # http://dbpedia.org/sparql qué ciudades pertenecen al país + # "Netherlands". La respuesta obtenida de esta subconsulta se utilizará para + # completar nuestra consulta originaria sobre los objetos + # presentes en la base de datos de Europeana + + SERVICE <http://dbpedia.org/sparql> { + ?dutch_city dbo:country dbr:Netherlands . + } +} +# Potencialmente, esta consulta puede devolvernos un elevado número de objetos, por lo que vamos +# a solicitar solo los cien primeros a fin de agilizar la búsqueda +LIMIT 100 +``` + +{% include figure.html filename="sparql-lod-13.png" caption="Visualización de la secuencia de la consulta de la solicitud SPARQL definida más arriba." %} + + +Una consulta interconectada como esta significa que podemos interrogar a Europeana sobre los objetos que cuentan con información geográfica (¿cuáles son las ciudades de Holanda?) sin necesidad de que Europeana tenga que almacenar y mantener esta información por sí misma. Es de esperar que, en el futuro, cada vez mayor cantidad de información LOD de carácter cultural esté enlazada con bases de datos autorizadas, como el ULAN (*Union List of Artist Names*) del [Getty Research Institute](https://www.getty.edu/research/). Esto permitirá, por ejemplo, que el British Museum "externalice" la información biográfica acudiendo a los recursos más completos del GRI. + +## Trabajando con resultados SPARQL + +Una vez que hemos construido y ejecutado una consulta, ¿qué hacemos ahora con estos resultados?
Muchos puntos de entrada, como el del British Museum, ofrecen un navegador web que devuelve resultados legibles para los humanos. Sin embargo, el objetivo de los puntos de entrada SPARQL (y para eso están diseñados) es devolver datos estructurados para ser utilizados por otros programas. + +### Exportar resultados en formato CSV + +En la esquina superior derecha de la página de resultados del punto de entrada del BM, se encuentran enlaces para descargas en formato JSON y XML. Otros puntos de entrada también pueden ofrecer la opción de descargar los resultados en CSV/TSV; sin embargo, esta opción no siempre se encuentra disponible. Las salidas JSON y XML desde un punto de entrada SPARQL contienen no solo los valores devueltos por la declaración `SELECT`, sino también metadatos adicionales sobre tipos de variables e idiomas. + +El procesamiento de la versión XML de los resultados se puede realizar con herramientas tales como Beautiful Soup (véase la lección correspondiente en *[The Programming Historian](/en/lessons/intro-to-beautiful-soup)*) u [OpenRefine](https://openrefine.org/). Para convertir rápidamente los resultados JSON desde un punto de entrada SPARQL en un formato tabular, yo recomiendo la utilidad de la línea de comando gratuita [jq](https://stedolan.github.io/jq/download/). (Para un tutorial sobre cómo utilizar programas de línea de comando, véase ["Introduction to the Bash Command Line"](/en/lessons/intro-to-bash)). La siguiente consulta convertirá el formato especial JSON RDF en un fichero CSV, que podremos cargar en nuestro programa preferido para su posterior análisis y visualización: + +``` +jq -r '.head.vars as $fields | ($fields | @csv), (.results.bindings[] | [.[$fields[]].value] | @csv)' sparql.json > sparql.csv +``` + + +### Exportar resultados a Palladio + +La popular plataforma de análisis de datos [Palladio](https://hdlab.stanford.edu/palladio/) puede cargar directamente datos desde un punto de entrada SPARQL. 
En la parte inferior de la pantalla "Create a new project", el enlace "Load data from a SPARQL endpoint (beta)" nos proporciona un campo para escribir la dirección del punto de entrada y una caja para la consulta propiamente dicha. Dependiendo del punto de entrada, podemos necesitar especificar el tipo de fichero de salida en la dirección del punto de entrada; por ejemplo, para cargar datos desde el punto de entrada del British Museum, debemos utilizar la dirección `http://collection.britishmuseum.org/sparql.json`. Trata de pegar la consulta de agregación que utilizamos más arriba para el recuento de obras de arte según su tipología y clica en "Run query". Palladio debería mostrar una tabla de previsualización como esta: + +{% include figure.html filename="sparql-lod-14.png" caption="Interfaz de Palladio para las consultas SPARQL." %} + + +Después de previsualizar los datos devueltos por el punto de entrada, clica en el botón "Load data" en la parte inferior de la pantalla para empezar a trabajar con ellos. (Véase esta lección de *[Programming Historian](/es/lecciones/creando-diagramas-de-redes-desde-fuentes-historicas)* para un tutorial más detallado sobre Palladio). 
[Por ejemplo, podríamos realizar una consulta que devuelva enlaces a las imágenes de los grabados realizados entre 1580 y 1600](https://collection.britishmuseum.org/sparql?query=%23+Return+object+links+and+creation+date%0D%0APREFIX+bmo%3A+%3Chttp%3A%2F%2Fcollection.britishmuseum.org%2Fid%2Fontology%2F%3E%0D%0APREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0D%0APREFIX+ecrm%3A+%3Chttp%3A%2F%2Ferlangen-crm.org%2Fcurrent%2F%3E%0D%0APREFIX+xsd%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2001%2FXMLSchema%23%3E%0D%0ASELECT+DISTINCT+%3Fobject+%3Fdate+%3Fimage%0D%0AWHERE+%7B%0D%0A%0D%0A++%23+We%27ll+use+our+previous+command+to+search+only+for+objects+of+type+%22print%22%0D%0A++%3Fobject+bmo%3APX_object_type+%3Fobject_type+.%0D%0A++%3Fobject_type+skos%3AprefLabel+%22print%22+.%0D%0A%0D%0A++%23+We+need+to+link+though+several+nodes+to+find+the+creation+date+associated%0D%0A++%23+with+an+object%0D%0A++%3Fobject+ecrm%3AP108i_was_produced_by+%3Fproduction+.%0D%0A++%3Fproduction+ecrm%3AP9_consists_of+%3Fdate_node+.%0D%0A++%3Fdate_node+ecrm%3AP4_has_time-span+%3Ftimespan+.%0D%0A++%3Ftimespan+ecrm%3AP82a_begin_of_the_begin+%3Fdate+.%0D%0A%0D%0A++%23+Yes%2C+we+need+to+connect+quite+a+few+dots+to+get+to+the+date+node%21+Now+that%0D%0A++%23+we+have+it%2C+we+can+filter+our+results.+Because+we+are+filtering+a+date%2C+we%0D%0A++%23+must+attach+the+xsd%3Adate+tag+to+our+date+strings+so+that+SPARQL+knows+how+to%0D%0A++%23+parse+them.%0D%0A%0D%0A++FILTER%28%3Fdate+%3E%3D+%221580-01-01%22%5E%5Exsd%3Adate+%26%26+%3Fdate+%3C%3D+%221600-01-01%22%5E%5Exsd%3Adate%29%0D%0A++%0D%0A++%3Fobject+bmo%3APX_has_main_representation+%3Fimage+.%0D%0A%7D%0D%0ALIMIT+100#query=%23+Return+object+links+and+creation+date%0APREFIX+bmo%3A+%3Chttp%3A%2F%2Fwww.researchspace.org%2Fontology%2F%3E%0APREFIX+skos%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2004%2F02%2Fskos%2Fcore%23%3E%0APREFIX+xsd%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2001%2FXMLSchema%23%3E%0APREFIX+ecrm%3A+%3Chttp%3A%2F%2Fwww.cidoc-crm.org%2Fcidoc-crm%2F%3E%
0ASELECT+DISTINCT+%3Fobject+%3Fdate+%3Fimage%0AWHERE+%7B%0A++%0A++%23+We'll+use+our+previous+command+to+search+only+for+objects+of+type+%22print%22%0A++%3Fobject+bmo%3APX_object_type+%3Fobject_type+.%0A++%3Fobject_type+skos%3AprefLabel+%22print%22+.%0A%0A++%23+We+need+to+link+though+several+nodes+to+find+the+creation+date+associated%0A++%23+with+an+object%0A++%3Fobject+ecrm%3AP108i_was_produced_by+%3Fproduction+.%0A++%3Fproduction+ecrm%3AP9_consists_of+%3Fdate_node+.%0A++%3Fdate_node+ecrm%3AP4_has_time-span+%3Ftimespan+.%0A++%3Ftimespan+ecrm%3AP82a_begin_of_the_begin+%3Fdate+.%0A%0A++%0A++%23+Yes%2C+we+need+to+connect+quite+a+few+dots+to+get+to+the+date+node!+Now+that%0A++%23+we+have+it%2C+we+can+filter+our+results.+Because+we+are+filtering+a+date%2C+we%0A++%23+must+attach+the+xsd%3Adate+tag+to+our+date+strings+so+that+SPARQL+knows+how+to%0A++%23+parse+them.%0A%0A++FILTER(%3Fdate+%3E%3D+%221580-01-01%22%5E%5Exsd%3Adate+%26%26+%3Fdate+%3C%3D+%221600-01-01%22%5E%5Exsd%3Adate)%0A++%0A++%3Fobject+bmo%3APX_has_main_representation+%3Fimage+.%0A%7D%0ALIMIT+100), y representar estos datos como una galería de imágenes clasificadas por fecha: + +{% include figure.html filename="sparql-lod-15.png" caption="Galería de imágenes con línea de tiempo de sus fechas de creación generada utilizando Palladio." %} + + +Adviértase que Palladio está diseñado para funcionar con un conjunto relativamente pequeño de datos (del orden de cientos o miles de filas, no decenas de miles), por lo que pudiera ser necesario utilizar el comando LIMIT, que ya empleamos anteriormente en la consulta en el punto de entrada de Europeana, para reducir el número de resultados obtenidos y así evitar que el programa se quede bloqueado. + +## Lecturas adicionales + +En este tutorial hemos examinado la estructura de LOD y hemos realizado un ejemplo real de cómo escribir consultas SPARQL para la base de datos del British Museum. 
También hemos aprendido cómo utilizar comandos de agregación en SPARQL para agrupar, contar y clasificar resultados más allá de la simple operación de listarlos. + +Con todo, existen otras muchas maneras de modificar estas consultas, tales como introducir operadores `OR` y `UNION` (para describir consultas condicionales) y declaraciones `CONSTRUCT` (para inferir nuevos enlaces basados en reglas definidas), búsqueda de texto completo o llevar a cabo otras operaciones matemáticas más complejas que la del recuento. Para un informe más detallado de los comandos disponibles en SPARQL, véanse estos enlaces: + +* [Wikibooks SPARQL tutorial](https://en.wikibooks.org/wiki/XQuery/SPARQL_Tutorial) +* [Full W3C Overview of SPARQL](https://www.w3.org/TR/sparql11-overview/) + +Tanto la web de Europeana como la del Getty Vocabularies ofrecen ejemplos extensos y bastante complejos de consultas que pueden constituir buenos recursos para comprender cómo buscar en sus datos: + +* [Europeana SPARQL how-to](https://labs.europeana.eu/api/linked-open-data-SPARQL-endpoint) +* [Getty Vocabularies Example Queries](https://vocab.getty.edu/queries#Finding_Subjects) diff --git a/es/lecciones/reutilizacion-de-codigo-y-modularidad.md b/es/lecciones/reutilizacion-de-codigo-y-modularidad.md index 04d875752f..d3d1b1131b 100644 --- a/es/lecciones/reutilizacion-de-codigo-y-modularidad.md +++ b/es/lecciones/reutilizacion-de-codigo-y-modularidad.md @@ -20,8 +20,8 @@ translation-reviewer: review-ticket: https://github.com/programminghistorian/ph-submissions/issues/41 layout: lesson categories: [lessons, original-ph, python] -next: trabajar-con-paginas-web -previous: trabajar-con-archivos-de-texto +next: /es/lecciones/trabajar-con-paginas-web +previous: /es/lecciones/trabajar-con-archivos-de-texto original: code-reuse-and-modularity difficulty: 2 activity: transforming diff --git a/es/lecciones/reutilizando-colecciones-digitales-glam-labs.md b/es/lecciones/reutilizando-colecciones-digitales-glam-labs.md 
index 10e650ffbc..a5e5152667 100644 --- a/es/lecciones/reutilizando-colecciones-digitales-glam-labs.md +++ b/es/lecciones/reutilizando-colecciones-digitales-glam-labs.md @@ -43,11 +43,11 @@ Tradicionalmente las instituciones de patrimonio cultural conocidas como [GLAM]( El avance de las tecnologías ha favorecido un nuevo contexto en el que las colecciones digitales pueden ser utilizadas en investigación por medio de diferentes métodos, como visión por computador o técnicas de aprendizaje automático. Actualmente, las instituciones GLAM promueven e incentivan la reutilización de sus colecciones digitales a través de programas de colaboración directa con investigadores pero también con empresas e instituciones académicas. Las instituciones de patrimonio cultural han comenzado a experimentar de forma creativa e innovadora con las colecciones digitales, que tradicionalmente han puesto a disposición del público, lo que ha favorecido la creación de nuevos espacios en el seno de las instituciones, conocidos como "Labs". -Uno de los primeros, líder en este ámbito, y que ha establecido las bases para el resto, es el de la [Biblioteca Británica](http://labs.bl.uk), financiado por la [Mellon Foundation](https://mellon.org/). Como resultado de dos encuentros de carácter internacional en la sede de la Biblioteca Británica y en la [Biblioteca Real de Dinamarca](https://www.kb.dk/en), en Copenhague, se creó la [Comunidad Internacional GLAM Labs](https://glamlabs.io) compuesta por numerosas instituciones, que se muestran en la Figura 1. +Uno de los primeros, líder en este ámbito, y que ha establecido las bases para el resto, es el de la [Biblioteca Británica](https://labs.bl.uk), financiado por la [Mellon Foundation](https://mellon.org/). 
Como resultado de dos encuentros de carácter internacional en la sede de la Biblioteca Británica y en la [Biblioteca Real de Dinamarca](https://www.kb.dk/en), en Copenhague, se creó la [Comunidad Internacional GLAM Labs](https://glamlabs.io) compuesta por numerosas instituciones, que se muestran en la Figura 1. {% include figure.html filename="reutilizando-colecciones-digitales-glam-labs1.png" caption="Mapa que representa las instituciones de la Comunidad Internacional GLAM Labs" %} -En septiembre de 2019, dieciséis personas pertenecientes a dicha comunidad se reunieron en Doha, Catar, para escribir, a partir de la metodología [Book Sprint](https://www.booksprints.net/book/book-sprint-open-a-glam-lab/), el libro [Open a GLAM Lab](https://www.glamlabs.io/publications/open-a-glam-lab) que actualmente ha sido traducido a diversos idiomas, entre ellos [español](http://rua.ua.es/dspace/handle/10045/110281) y [árabe](https://qspace.qu.edu.qa/handle/10576/13484). +En septiembre de 2019, dieciséis personas pertenecientes a dicha comunidad se reunieron en Doha, Catar, para escribir, a partir de la metodología [Book Sprint](https://www.booksprints.net/book/book-sprint-open-a-glam-lab/), el libro [Open a GLAM Lab](https://www.glamlabs.io/publications/open-a-glam-lab) que actualmente ha sido traducido a diversos idiomas, entre ellos [español](https://rua.ua.es/dspace/handle/10045/110281) y [árabe](https://qspace.qu.edu.qa/handle/10576/13484). Una colección digital publicada por una institución GLAM puede estar formada por cualquier tipo de contenido incluyendo metadatos, textos, imágenes, mapas, videos o audios. En este sentido, reutilizar una colección digital consiste en analizar el contenido para adquirir nuevo conocimiento. El análisis puede constar de fases tales como extracción, transformación y enriquecimiento. 
Como resultado podemos obtener una nueva colección descrita con otro vocabulario más expresivo y rico, una visualización que facilite el descubrimiento de conocimiento, o una agregación de diferentes colecciones digitales basadas en un tema específico. @@ -55,7 +55,7 @@ A la hora de reutilizar una colección digital existen diferentes aspectos que d Recientemente se publicó el estudio *[Collections as data](https://collectionsasdata.github.io/)*, que proporciona un nuevo enfoque para publicar las colecciones digitales que facilitan el procesamiento por parte de las computadoras. Por ejemplo, es posible utilizar un corpus de miles de textos para identificar personas o lugares de forma automática. Las computadoras permiten la aplicación de métodos de investigación en Humanidades Digitales como [minería de textos](https://es.wikipedia.org/wiki/Miner%C3%ADa_de_textos), [visualización de datos](https://es.wikipedia.org/wiki/Visualizaci%C3%B3n_de_datos) o el uso de [Sistemas de Información Geográfica (SIG)](https://es.wikipedia.org/wiki/Sistema_de_informaci%C3%B3n_geogr%C3%A1fica), como también [procesamiento de lenguaje natural](https://es.wikipedia.org/wiki/Procesamiento_de_lenguajes_naturales), [inteligencia artificial](https://es.wikipedia.org/wiki/Inteligencia_artificial) y [visión por computador](https://es.wikipedia.org/wiki/Visi%C3%B3n_artificial). -La combinación de las colecciones digitales proporcionadas por las instituciones GLAM, junto a código y narrativa, proporcionan el marco ideal para la reproducción de los resultados de investigación. En este sentido, los Jupyter Notebooks permiten integrar estos tres elementos y se han convertido en un recurso muy popular tanto en la comunidad investigadora como en la educativa. Numerosos proyectos se centran en la publicación de colecciones de notebooks, como por ejemplo [GLAM Workbench](https://glam-workbench.github.io/) o [GLAM Jupyter Notebooks](http://data.cervantesvirtual.com/blog/notebooks/). 
Los Labs favorecen un espacio para poner de manifiesto estas nuevas tendencias para mejorar y mantener la relevancia de las instituciones de patrimonio cultural. +La combinación de las colecciones digitales proporcionadas por las instituciones GLAM, junto a código y narrativa, proporcionan el marco ideal para la reproducción de los resultados de investigación. En este sentido, los Jupyter Notebooks permiten integrar estos tres elementos y se han convertido en un recurso muy popular tanto en la comunidad investigadora como en la educativa. Numerosos proyectos se centran en la publicación de colecciones de notebooks, como por ejemplo [GLAM Workbench](https://glam-workbench.github.io/) o [GLAM Jupyter Notebooks](https://data.cervantesvirtual.com/blog/notebooks/). Los Labs favorecen un espacio para poner de manifiesto estas nuevas tendencias para mejorar y mantener la relevancia de las instituciones de patrimonio cultural. En esta lección se incluyen varias opciones para localizar colecciones digitales publicadas por instituciones GLAM para su reutilización. A continuación, se introducen dos ejemplos implementados como Jupyter Notebooks que muestran de forma reproducible cómo reutilizar las colecciones digitales a través de diferentes técnicas que se encuentran disponibles en [Zenodo](https://zenodo.org/record/5340157)[^1]. El último apartado corresponde a las conclusiones. 
@@ -65,10 +65,10 @@ Actualmente existen numerosos sitios web donde es posible localizar colecciones | Institución | Colección | URL | | ------------- | ------------- | ------------- | -| Bibliotèque Nationale de France | BnF API et jeux de données | [http://api.bnf.fr/](http://api.bnf.fr/) | +| Bibliotèque Nationale de France | BnF API et jeux de données | [https://api.bnf.fr/](https://api.bnf.fr/) | | Bibliothèque Nationale du Luxembourg | BnL Open Data | [https://data.bnl.lu/](https://data.bnl.lu/) | | British Library | BL Labs | [temporalmente no disponible] | -| Biblioteca Virtual Miguel de Cervantes | BVMC Labs | [http://data.cervantesvirtual.com/blog/labs](http://data.cervantesvirtual.com/blog/labs) | +| Biblioteca Virtual Miguel de Cervantes | BVMC Labs | [https://data.cervantesvirtual.com/blog/labs](https://data.cervantesvirtual.com/blog/labs) | | Det Kgl. Bibliotek | KB Labs | [https://labs.kb.dk/](https://labs.kb.dk/) | | Europeana | Europeana IIIF APIs | [https://pro.europeana.eu/page/iiif](https://pro.europeana.eu/page/iiif) | | History Trust of South Australia | Learn section | [https://history.sa.gov.au/](https://history.sa.gov.au/) | @@ -79,7 +79,7 @@ Actualmente existen numerosos sitios web donde es posible localizar colecciones | Staatsbibliothek zu Berlin | SBB Labs | [https://lab.sbb.berlin/?lang=en](https://lab.sbb.berlin/?lang=en)| | State Library New South Wales | DX Lab | [https://dxlab.sl.nsw.gov.au](https://dxlab.sl.nsw.gov.au)| -Las instituciones GLAM publican colecciones digitales en diferentes formatos. Tradicionalmente han publicado diversos tipos de materiales como imágenes, textos y mapas. Recientemente, han aparecido nuevas formas de publicación que utilizan tecnologías basadas en la [Web Semántica](https://es.wikipedia.org/wiki/Web_sem%C3%A1ntica). Estas técnicas permiten el enriquecimiento con repositorios externos a partir de la creación de enlaces. 
[Wikidata](https://www.wikidata.org) se ha convertido en un repositorio muy popular en el ámbito de las instituciones GLAM y muchas de ellas ya disponen de propiedades específicas para enlazar sus recursos como autores y obras. Por ejemplo, la [Biblioteca Virtual Miguel de Cervantes](http://www.cervantesvirtual.com/) dispone de la propiedad [P2799](https://www.wikidata.org/wiki/Property:P2799) para enlazar autores desde su repositorio de datos abiertos hacia Wikidata. +Las instituciones GLAM publican colecciones digitales en diferentes formatos. Tradicionalmente han publicado diversos tipos de materiales como imágenes, textos y mapas. Recientemente, han aparecido nuevas formas de publicación que utilizan tecnologías basadas en la [Web Semántica](https://es.wikipedia.org/wiki/Web_sem%C3%A1ntica). Estas técnicas permiten el enriquecimiento con repositorios externos a partir de la creación de enlaces. [Wikidata](https://www.wikidata.org) se ha convertido en un repositorio muy popular en el ámbito de las instituciones GLAM y muchas de ellas ya disponen de propiedades específicas para enlazar sus recursos como autores y obras. Por ejemplo, la [Biblioteca Virtual Miguel de Cervantes](https://www.cervantesvirtual.com/) dispone de la propiedad [P2799](https://www.wikidata.org/wiki/Property:P2799) para enlazar autores desde su repositorio de datos abiertos hacia Wikidata. @@ -233,7 +233,7 @@ Este ejemplo se basa en la recuperación de localizaciones geográficas relacion En este sentido, este ejemplo pretende introducir los pasos necesarios para reutilizar una colección digital publicada, siguiendo los principios de Linked Open Data que facilitan el establecimiento de enlaces a repositorios externos. Los repositorios semánticos publicados por instituciones GLAM son una fuente de información de gran valor que se encuentran a disposición de los investigadores sin ningún tipo de restricción para su uso. 
Sin embargo, su reutilización no es sencilla ya que requiere conocimientos avanzados en tecnologías como [RDF](https://es.wikipedia.org/wiki/Resource_Description_Framework) (del inglés Resource Description Framework) o SPARQL para poder realizar las consultas. -Este ejemplo utiliza los metadatos del repositorio que indican localizaciones, como por ejemplo las propiedades `blt:publication` y `blt:projectedPublication` que indican lugares de publicación. Gracias a que los registros están enlazados a GeoNames, vamos a poder acceder a Wikidata para recuperar las coordenadas geográficas de las localizaciones y mostrar los beneficios de Linked Open Data. El vocabulario utilizado por BNB Linked Data es [Bibliographic Ontology (BIBO)](http://bibliontology.com/) que es un vocabulario sencillo que permite describir los metadatos de un repositorio bibliográfico. +Este ejemplo utiliza los metadatos del repositorio que indican localizaciones, como por ejemplo las propiedades `blt:publication` y `blt:projectedPublication` que indican lugares de publicación. Gracias a que los registros están enlazados a GeoNames, vamos a poder acceder a Wikidata para recuperar las coordenadas geográficas de las localizaciones y mostrar los beneficios de Linked Open Data. El vocabulario utilizado por BNB Linked Data es [Bibliographic Ontology (BIBO)](https://bibliontology.com/) que es un vocabulario sencillo que permite describir los metadatos de un repositorio bibliográfico. En primer lugar, importamos las librerías necesarias para procesar esta colección: [folium](https://pypi.org/project/folium/0.1.4/)[^4] para visualizar información geográfica en un mapa; csv y json para el procesamiento de los formatos de entrada y salida; request para la realización de peticiones HTTP; pandas para la gestión de datos tabulares con columnas de tipo heterogéneo y [matplotlib](https://matplotlib.org/)[^5] para la creación de gráficas. 
@@ -455,7 +455,7 @@ De forma similar a como se ha creado el mapa en el ejemplo de Miguel de Cervante En el primer ejemplo se han reutilizado dos colecciones digitales descritas con MARCXML. Aunque la mayoría del código es reutilizable para ambos casos, los campos utilizados para describir los metadatos en cada colección son diferentes y por lo tanto es necesario un análisis previo. -En el caso de la BNB, y teniendo en cuenta la forma de representar los distintos roles que se pueden dar en un repositorio bibliográfico, la elección del vocabulario a utilizar puede ser crucial a la hora de dotar de suficiente expresividad a los metadatos. En este sentido, vocabularios ricos en términos semánticos como [Resource Description and Access (RDA)](http://www.rdaregistry.info) proporcionan un listado de elementos para representar numerosos roles con el objetivo de relacionar las obras con los autores como por ejemplo director, ilustrador, impresor o narrador. Además, es relevante resaltar que tan solo alrededor de un 50% de las obras se encuentran enlazadas a GeoNames y que el mapa que obtenemos como resultado no incluye el total de ubicaciones del repositorio. +En el caso de la BNB, y teniendo en cuenta la forma de representar los distintos roles que se pueden dar en un repositorio bibliográfico, la elección del vocabulario a utilizar puede ser crucial a la hora de dotar de suficiente expresividad a los metadatos. En este sentido, vocabularios ricos en términos semánticos como [Resource Description and Access (RDA)](https://www.rdaregistry.info) proporcionan un listado de elementos para representar numerosos roles con el objetivo de relacionar las obras con los autores como por ejemplo director, ilustrador, impresor o narrador. Además, es relevante resaltar que tan solo alrededor de un 50% de las obras se encuentran enlazadas a GeoNames y que el mapa que obtenemos como resultado no incluye el total de ubicaciones del repositorio. 
## Conclusiones diff --git a/es/lecciones/salida-de-datos-como-archivo-html.md b/es/lecciones/salida-de-datos-como-archivo-html.md index 109144725a..11c0630e23 100644 --- a/es/lecciones/salida-de-datos-como-archivo-html.md +++ b/es/lecciones/salida-de-datos-como-archivo-html.md @@ -20,8 +20,8 @@ translation-reviewer: - Antonio Rojas Castro review-ticket: https://github.com/programminghistorian/ph-submissions/issues/49 layout: lesson -next: palabras-clave-en-contexto-n-grams -previous: crear-y-ver-archivos-html-con-python +next: /es/lecciones/palabras-clave-en-contexto-n-grams +previous: /es/lecciones/crear-y-ver-archivos-html-con-python original: output-data-as-html-file python_warning: false difficulty: 2 diff --git a/es/lecciones/salida-palabras-clave-contexto-ngrams.md b/es/lecciones/salida-palabras-clave-contexto-ngrams.md index 1c5a2aee91..381bad1db7 100644 --- a/es/lecciones/salida-palabras-clave-contexto-ngrams.md +++ b/es/lecciones/salida-palabras-clave-contexto-ngrams.md @@ -20,8 +20,8 @@ translation-reviewer: - Antonio Rojas Castro review-ticket: https://github.com/programminghistorian/ph-submissions/issues/51 layout: lesson -previous: palabras-clave-en-contexto-n-grams -next: descarga-multiples-registros-usando-cadenas-de-consulta +previous: /es/lecciones/palabras-clave-en-contexto-n-grams +next: /es/lecciones/descarga-multiples-registros-usando-cadenas-de-consulta original: output-keywords-in-context-in-html-file python_warning: false difficulty: 2 @@ -265,9 +265,9 @@ Esta lección marca el final de la serie de lecciones originales sobre Python. 
E - python-es-lecciones9.zip [zip sync][] -*Nota:* Ahora puedes ir a la siguiente lección (en inglés) para aprender a [Descargar registros múltiples](/lessons/downloading-multiple-records-using-query-strings) +*Nota:* Ahora puedes ir a la siguiente lección (en inglés) para aprender a [Descargar registros múltiples](/en/lessons/downloading-multiple-records-using-query-strings) -[Palabras clave en contexto (usando n-grams)]: /es/lecciones/palabras-clave-en-contexto-n-grams -[archivo zip de las lecciones anteriores]: /assets/python-es-lecciones8.zip -[Salida de datos como archivo HTML]: /es/lecciones/salida-de-datos-como-archivo-html -[zip sync]: /assets/python-es-lecciones9.zip +- [Palabras clave en contexto (usando n-grams)](/es/lecciones/palabras-clave-en-contexto-n-grams) +- [archivo zip de las lecciones anteriores](/assets/python-es-lecciones8.zip) +- [Salida de datos como archivo HTML](/es/lecciones/salida-de-datos-como-archivo-html) +- [zip sync](/assets/python-es-lecciones9.zip) diff --git a/es/lecciones/sitios-estaticos-con-jekyll-y-github-pages.md b/es/lecciones/sitios-estaticos-con-jekyll-y-github-pages.md index 66010e4a58..86ea33baf6 100644 --- a/es/lecciones/sitios-estaticos-con-jekyll-y-github-pages.md +++ b/es/lecciones/sitios-estaticos-con-jekyll-y-github-pages.md @@ -52,11 +52,11 @@ doi: 10.46430/phes0050 * TOC {:toc} -## ¿Qué son los sitios estáticos, Jekyll, etc. y por qué deberían importarme? +## ¿Qué son los sitios estáticos, Jekyll, etc. y por qué deberían importarme? -Este tutorial se basa en la [Documentación oficial de Jekyll](https://jekyllrb.com/docs/home/) escrita por la comunidad de Jekyll. Revisa la sección ["Leer más"](#section9-3) al final de la lección si deseas profundizar más sobre estos temas. +Este tutorial se basa en la [Documentación oficial de Jekyll](https://jekyllrb.com/docs/home/) escrita por la comunidad de Jekyll. Revisa la sección ["Lecturas"](#lecturas) al final de la lección si deseas profundizar más sobre estos temas. 
-### Sitios dinámicos, sitios estáticos y Jekyll +### Sitios dinámicos, sitios estáticos y Jekyll Tanto los *sitios web dinámicos*, como los que son creados y administrados por gestores de contenidos tales como [Drupal](https://www.drupal.com/), [WordPress](https://wordpress.org/) y [Omeka](https://omeka.org/) extraen información de una base de datos para completar el contenido de una página web. Cuando buscamos un libro en Amazon.com, por ejemplo, la página de resultados de búsqueda no existe como una página HTML completa; en cambio, Amazon.com tiene una plantilla para la página de resultados de búsqueda, que incluye elementos que comparten todas las páginas de resultados (como el menú principal y el logotipo de Amazon) y consulta una base de datos para insertar en esa plantilla los resultados de la búsqueda que nosotros realizamos. @@ -72,15 +72,15 @@ Hay que tener en cuenta que cuando alguien se refiere a un "sitio web de Jekyll" Dado que los sitios estáticos no son más que archivos de texto (sin una base de datos que complique las cosas), es posible *versionarlos* fácilmente, es decir, usar una herramienta para llevar un registro de las diferentes versiones del sitio a lo largo del tiempo rastreando los cambios en los archivos de texto que lo componen. El control de versiones es muy útil cuando deseamos combinar ambas versiones (por ejemplo, dos estudiantes escriben una publicación de blog juntos y deseamos combinar sus dos versiones) o cuando queremos comparar archivos para buscar diferencias entre ellos (por ejemplo, "¿Cómo se describía el proyecto en la página 'Acerca de' original?"). El control de versiones es muy útil cuando se trabaja en equipo (por ejemplo, permite combinar y rastrear el trabajo de diferentes personas), pero también es útil al crear o ejecutar un sitio web por nuestra propia cuenta. 
-Puedes leer más acerca de [Jekyll](http://jekyllrb.com/docs/home/) o [generadores de sitios estáticos](https://davidwalsh.name/introduction-static-site-generators) (en inglés). +Puedes leer más acerca de [Jekyll](https://jekyllrb.com/docs/home/) o [generadores de sitios estáticos](https://davidwalsh.name/introduction-static-site-generators) (en inglés). -### GitHub & GitHub Pages +### GitHub & GitHub Pages *[GitHub Pages](https://pages.github.com/)* es un espacio gratuito para almacenar los archivos que ejecutan un sitio web y alojar ese sitio para que las personas lo visiten (solo funciona para tipos particulares de sitios web, como sitios HTML básicos o sitios Jekyll; no aloja bases de datos). -*[GitHub](https://github.com/)* es una plataforma visual para utilizar *[git](https://git-scm.com/doc)*, un sistema de *versionado* o, en otras palabras, de registro de cambios realizados en los archivos (código y documentos de texto, entre otros) a través del tiempo (como ya explicamos [más arriba](#section0-1)). Si tienes curiosidad, puedes explorar este [minitutorial de GitHub](https://guides.github.com/activities/hello-world/) (en inglés). +*[GitHub](https://github.com/)* es una plataforma visual para utilizar *[git](https://git-scm.com/doc)*, un sistema de *versionado* o, en otras palabras, de registro de cambios realizados en los archivos (código y documentos de texto, entre otros) a través del tiempo (como ya explicamos [más arriba](#sitios-dinámicos-sitios-estáticos-y-jekyll)). Si tienes curiosidad, puedes explorar este [minitutorial de GitHub](https://guides.github.com/activities/hello-world/) (en inglés). -### ¿Por qué usar sitios estáticos? +### ¿Por qué usar sitios estáticos? 
Opciones como [Drupal](https://www.drupal.com/), [WordPress](https://wordpress.org/) y [Omeka](https://omeka.org/) son últiles para la creación de sitios web complejos e interactivos como Amazon o una edición digital interactiva de una novela, pero para muchos blogs, sitios web de proyectos y portafolios en línea, un sitio web estático (como un sitio web creado con Jekyll) puede hacer todo lo que se necesita al mismo tiempo que proporciona algunas ventajas: @@ -94,7 +94,7 @@ Opciones como [Drupal](https://www.drupal.com/), [WordPress](https://wordpress.o - **Alojamiento gratuito:** Si bien muchas herramientas de sitios web como Drupal, WordPress y Omeka son gratuitas, alojarlas (pagar a alguien para que muestre los archivos de tu sitio web a los visitantes del sitio) puede costar dinero. -- **Control de versiones:** Hospedar en GitHub Pages significa que tu sitio está vinculado a la interfaz visual de GitHub para el control de versiones de git, por lo que puede realizarse un seguimiento de los cambios en tu sitio y, si fuera necesario, volver al estado anterior de cualquier publicación de blog, o página. Esto incluye archivos cargados que tal vez desees almacenar en el sitio, como programas de estudio y publicaciones antiguas (el control de versiones se explicó [con más detalle anteriormente](#section0-1)). +- **Control de versiones:** Hospedar en GitHub Pages significa que tu sitio está vinculado a la interfaz visual de GitHub para el control de versiones de git, por lo que puede realizarse un seguimiento de los cambios en tu sitio y, si fuera necesario, volver al estado anterior de cualquier publicación de blog, o página. Esto incluye archivos cargados que tal vez desees almacenar en el sitio, como programas de estudio y publicaciones antiguas (el control de versiones se explicó [con más detalle anteriormente](#sitios-dinámicos-sitios-estáticos-y-jekyll)). - **Seguridad:** No hay una base de datos a la que haya que proteger de posibles ataques maliciosos. 
@@ -108,31 +108,31 @@ La creación de un sitio web estático con Jekyll ofrece aún más ventajas, sin - **La plantilla automatiza las tareas repetitivas:** Jekyll facilita la automatización de las tareas repetitivas del sitio web a través de su sistema de "plantillas": puedes crear contenido que, por ejemplo, debe aparecer en el encabezado y pie de cada página (por ejemplo, el logotipo o el menú principal), o repetir información en cada publicación de blog (por ejemplo, nombre del autor y fecha de publicación). Esta información de la plantilla se repetirá automáticamente en las páginas web que desees, en lugar de obligarte a reescribir manualmente esa información. Esto no solo ahorra mucho tiempo de copiar y pegar si alguna vez deseas cambiar algo que aparece en cada página de tu sitio web (por ejemplo, un nuevo logotipo o un nuevo elemento en el menú principal), ya que si lo cambias una vez en una plantilla, lo cambiarás en cada lugar que aparece en tu sitio web. -## Antes de la instalación -¡Estamos listos, manos a la obra! En el resto de esta lección, vamos a instalar algunos programas en nuestras computadoras, usar la línea de comandos para instalar algunas cosas que solo se pueden instalar de esa manera, ver y personalizar una versión privada de tu sitio web y finalmente hacer que tu sitio web sea accesible públicamente en la web. Si tienes problemas en algún momento de esta lección, consulta la [sección de ayuda sobre cómo hacer preguntas o informar problemas](#section1-9) +## Antes de la instalación +¡Estamos listos, manos a la obra! En el resto de esta lección, vamos a instalar algunos programas en nuestras computadoras, usar la línea de comandos para instalar algunas cosas que solo se pueden instalar de esa manera, ver y personalizar una versión privada de tu sitio web y finalmente hacer que tu sitio web sea accesible públicamente en la web. 
Si tienes problemas en algún momento de esta lección, consulta la [sección de ayuda sobre cómo hacer preguntas o informar problemas](#ayuda-créditos-y-lecturas) En esta sección vamos a asegurarnos de tener todo lo necesario para crear un sitio web estático con Jekyll y GitHub Pages. Para eso, vamos a abordar: -- [qué sistema operativo es posible usar (es decir, Mac / Windows / Linux)](#section1-0) -- [crear una cuenta de GitHub](#section1-1) -- [por qué es necesario usar un "editor de texto" para trabajar en nuestro sitio web](#section1-3) -- [cómo usar la línea de comandos](#section1-4) +- qué sistema operativo es posible usar (es decir, Mac / Windows / Linux) +- crear una cuenta de GitHub +- por qué es necesario usar un "editor de texto" para trabajar en nuestro sitio web +- cómo usar la línea de comandos Todos los elementos que vamos a instalar son herramientas de desarrollo web estándar. Se trata de herramientas confiables, por lo que no es indispensable saber exactamente qué hace cada una de ellas. Brindaremos una breve explicación de los elementos que hay que comprender en profundidad y dejaremos enlaces en caso de desear saber más sobre lo que se está instalando. -### Sistemas operativos +### Sistemas operativos Este tutorial está destinado a usuarios de Windows y Mac. Jekyll también funciona en Linux; sin embargo, para fines pedagógicos, este tutorial utiliza el software GitHub Desktop (disponible para Windows y Mac únicamente); los usuarios de Linux tienen que usar [git](https://git-scm.com/docs/gittutorial) para ello ((algo que este tutorial no aborda)). -Jekyll no es oficialmente compatible con Windows, lo que significa que la documentación oficial de Jekyll (las páginas que guían a través de la configuración y que explican su funcionamiento) no aborda el uso de Windows. 
Este tutorial se basa en [las instrucciones de Windows de David Burela](https://davidburela.wordpress.com/2015/11/28/easily-install-jekyll-on-windows-with-3-command-prompt-entries-and-chocolatey/) para las partes de la sección [Instalación de dependencias](#section2) en las que los usuarios de Windows deben hacer algo diferente; sin embargo, como parte de esta traducción al español hemos revisado el proceso de instalación en Windows. +Jekyll no es oficialmente compatible con Windows, lo que significa que la documentación oficial de Jekyll (las páginas que guían a través de la configuración y que explican su funcionamiento) no aborda el uso de Windows. Este tutorial se basa en [las instrucciones de Windows de David Burela](https://davidburela.wordpress.com/2015/11/28/easily-install-jekyll-on-windows-with-3-command-prompt-entries-and-chocolatey/) para las partes de la sección [Instalación de dependencias](#instalación-de-dependencias) en las que los usuarios de Windows deben hacer algo diferente; sin embargo, como parte de esta traducción al español hemos revisado el proceso de instalación en Windows. -### Cuenta de usuario de GitHub +### Cuenta de usuario de GitHub La cuenta de usuario de GitHub nos permite alojar nuestro sitio web (ponerlo a disposición para que otros lo visiten) de forma gratuita en esa plataforma. Como beneficio adicional, también nos permite llevar un registro de las versiones de nuestro sitio y tu escritura a medida que crece o cambia con el tiempo. 1\. Visita [GitHub.com](https://github.com/) y haz clic en el botón verde "Sign up" (Registrarse). -2\. En la página siguiente, ingresa el nombre de usuario deseado. El nombre de usuario es visible para otros usuarios, nos identifica en GitHub y también es parte de la URL de nuestro sitio. Por ejemplo, si el nombre de usuario de GitHub es *hdcaicyt*, la URL del sitio será http://hdcaicyt.github.io/. 
(Ten en cuenta que uno también puede comprar su propio nombre de dominio y usarlo para este sitio, pero eso no se tratará en este tutorial). Escribe una dirección de correo electrónico de uso habitual y añade una contraseña que contenga al menos un número y una letra minúscula. +2\. En la página siguiente, ingresa el nombre de usuario deseado. El nombre de usuario es visible para otros usuarios, nos identifica en GitHub y también es parte de la URL de nuestro sitio. Por ejemplo, si el nombre de usuario de GitHub es *hdcaicyt*, la URL del sitio será https://hdcaicyt.github.io/. (Ten en cuenta que uno también puede comprar su propio nombre de dominio y usarlo para este sitio, pero eso no se tratará en este tutorial). Escribe una dirección de correo electrónico de uso habitual y añade una contraseña que contenga al menos un número y una letra minúscula. 3\. En el recuadro "Verify your account", presiona el botón "Verify" (Verificar). Usa las flechas para poner la imagen en el sentido correcto. Finalmente, haz clic en "Select a plan" (Seleccionar un plan). @@ -144,7 +144,7 @@ La cuenta de usuario de GitHub nos permite alojar nuestro sitio web (ponerlo a d 7\. *Opcional*: puedes visitar https://github.com/settings/profile para agregar un nombre completo (puede ser tu nombre real, nombre de usuario de GitHub u otra cosa) y más información de perfil público, si lo deseas. -### Aplicación GitHub Desktop +### Aplicación GitHub Desktop La aplicación GitHub Desktop facilita la actualización del sitio web luego de haberlo configurado. En lugar de usar la línea de comandos cada vez que queramos actualizar nuestro sitio, es posible usar este herramienta visual. @@ -162,15 +162,15 @@ La aplicación GitHub Desktop facilita la actualización del sitio web luego de 7\. *Opcional:* puedes hacer el tutorial de uso de GitHub Desktop si lo deseas, pero en esta lección cubriremos todo lo que necesitas saber sobre GitHub). 
-### Editor de texto +### Editor de texto -Es necesario descargar e instalar un editor de texto para realizar pequeñas personalizaciones al código de nuestro sitio Jekyll. Algunas buenas opciones gratuitas incluyen [jEdit](https://www.jedit.org), [Atom](https://atom.io/), [SublimeText](https://www.sublimetext.com/3), [Notepad ++](https://notepad-plus-plus.org/) para Windows o [BBedit](http://www.barebones.com/products/bbedit) para Mac. Los procesadores de texto, como Microsoft Word o WordPad, no son una buena opción porque es fácil olvidar cómo formatear y guardar el archivo; es posible agregar accidentalmente formatos y caracteres extra y/o invisibles que pueden generar problemas en el sitio. Por eso es mejor usar programas que puedan guardar lo que escribimos como texto plano (por ejemplo, HTML o Markdown). +Es necesario descargar e instalar un editor de texto para realizar pequeñas personalizaciones al código de nuestro sitio Jekyll. Algunas buenas opciones gratuitas incluyen [jEdit](https://www.jedit.org), [Atom](https://atom.io/), [SublimeText](https://www.sublimetext.com/3), [Notepad ++](https://notepad-plus-plus.org/) para Windows o [BBedit](https://www.barebones.com/products/bbedit) para Mac. Los procesadores de texto, como Microsoft Word o WordPad, no son una buena opción porque es fácil olvidar cómo formatear y guardar el archivo; es posible agregar accidentalmente formatos y caracteres extra y/o invisibles que pueden generar problemas en el sitio. Por eso es mejor usar programas que puedan guardar lo que escribimos como texto plano (por ejemplo, HTML o Markdown). -*Opcional:* Consulta la sección ["Creación en Markdown"](#section5-2) más abajo, para más información sobre un programa de edición específico en Markdown, que también puedes instalar cuando ya estemos en la etapa de crear páginas web y/o publicaciones (posts) de blog. 
+*Opcional:* Consulta la sección ["Escritura en Markdown"](#escritura-en-markdown) más abajo, para más información sobre un programa de edición específico en Markdown, que también puedes instalar cuando ya estemos en la etapa de crear páginas web y/o publicaciones (posts) de blog. -### Línea de comandos +### Línea de comandos -La línea de comandos puede ser definida como una forma de interactuar con la computadora mediante texto: permite escribir comandos para llevar a cabo acciones sencillas (como "mostrar una lista de los archivos en este directorio" o "cambiar quién tiene permiso para acceder a este archivo"), así como para realizar acciones más complejas. No obstante, existen buenas alternativas visuales para efectuar acciones en la computadora (por ejemplo, la aplicación GitHub Desktop [que instalamos arriba](#section1-2)) y otras veces tendremos que usar la línea de comandos para indicarle qué hacer a la computadora. Si deseas más información de la que se proporciona en este tutorial, [The Programming Historian](/es/) tiene una [lección que explora en profundidad la línea de comandos](/es/lecciones/introduccion-a-bash), pero aquí cubriremos todo lo necesario para completar la creación de nuestro sitio web y solo usaremos la línea de comandos cuando sea necesario o más sencillo que una interfaz visual. +La línea de comandos puede ser definida como una forma de interactuar con la computadora mediante texto: permite escribir comandos para llevar a cabo acciones sencillas (como "mostrar una lista de los archivos en este directorio" o "cambiar quién tiene permiso para acceder a este archivo"), así como para realizar acciones más complejas. No obstante, existen buenas alternativas visuales para efectuar acciones en la computadora (por ejemplo, la aplicación GitHub Desktop [que instalamos arriba](#aplicación-github-desktop)) y otras veces tendremos que usar la línea de comandos para indicarle qué hacer a la computadora.
Si deseas más información de la que se proporciona en este tutorial, [The Programming Historian](/es/) tiene una [lección que explora en profundidad la línea de comandos](/es/lecciones/introduccion-a-bash), pero aquí cubriremos todo lo necesario para completar la creación de nuestro sitio web y solo usaremos la línea de comandos cuando sea necesario o más sencillo que una interfaz visual. Mientras que la línea de comandos usa comandos de texto, la mayoría de los usuarios utilizan una "interfaz gráfica de usuario" (también conocida como GUI, "graphical user interface"). Cualquier programa en el que las interacciones usuario-computadora se dan a través de una interfaz visual que contiene íconos, imágenes, funciones de clic con el mouse, etc. es una GUI. ¿Por qué usaríamos la línea de comandos si existen las GUI? Muchas veces es más simple y rápido escribir (o cortar y pegar de un tutorial) una serie de comandos en la línea de comandos que hacer lo mismo usando una GUI. Otras veces, hay cosas para las cuales nadie ha creado una GUI y solo es posible hacerlas a través de la línea de comandos. @@ -198,11 +198,11 @@ Siempre que en este tutorial pidamos abrir una ventana de línea de comandos e i 3\. Algo muy útil cuando escribimos los mismos comandos muchas veces o queremos recordar algo que escribimos antes: podemos presionar **↑** (flecha hacia arriba) en la línea de comandos para desplazarnos por los comandos recientemente escritos y presionar "Enter" después de que aparezca el que deseamos usar. -## Instalación de dependencias +## Instalación de dependencias -A continuación, vamos a instalar algunas dependencias de software (es decir, programas de los que depende Jekyll para poder trabajar) usando la línea de comandos ya que no hay una interfaz visual para hacerlo. 
Esta sección se divide en instrucciones para Mac e instrucciones para Windows, así que puedes ir a la sección de [instalación de dependencias en Mac](#sectionMac) si estás usando Mac, o a la sección de [instalación de dependencias en Windows](#sectionwindows) si estás usando Windows. +A continuación, vamos a instalar algunas dependencias de software (es decir, programas de los que depende Jekyll para poder trabajar) usando la línea de comandos ya que no hay una interfaz visual para hacerlo. Esta sección se divide en instrucciones para Mac e instrucciones para Windows, así que puedes ir a la sección de [instalación de dependencias en Mac](#en-mac) si estás usando Mac, o a la sección de [instalación de dependencias en Windows](#en-windows) si estás usando Windows. -### En Mac +### En Mac Si estás utilizando una computadora Mac, sigue las instrucciones que se encuentran a continuación. @@ -210,11 +210,11 @@ Si estás utilizando una computadora Mac, sigue las instrucciones que se encuent Ten en cuenta que si estás utilizando una Mac con un chip de la serie M puede que encuentres problemas de compatibilidad al instalar Jekyll o sus dependencias. Si ese es el caso, te recomendamos que sigas las instrucciones de instalación adicionales para el paquete correspondiente, tanto de la documentación original como de foros online.
    -Abre una ventana de línea de comandos (*Aplicaciones > Utilidades > Terminal*) e ingresa el código que se muestra en los pasos a continuación (`el código es el texto que aparece formateado así`) siguiendo [las sugerencias de uso de la línea de comandos detalladas más arriba](#section1-4). +Abre una ventana de línea de comandos (*Aplicaciones > Utilidades > Terminal*) e ingresa el código que se muestra en los pasos a continuación (`el código es el texto que aparece formateado así`) siguiendo [las sugerencias de uso de la línea de comandos detalladas más arriba](#línea-de-comandos). -### Herramientas de línea de comandos +### Herramientas de línea de comandos -Primero vamos a instalar las "herramientas de línea de comandos" de Mac para poder usar [Homebrew](http://brew.sh/) (que instalaremos a continuación). Homebrew permite descargar e instalar desde la línea de comandos software de código abierto (es un "administrador de paquetes"), lo que facilitará la instalación de Ruby (el lenguaje en el que se basa Jekyll). +Primero vamos a instalar las "herramientas de línea de comandos" de Mac para poder usar [Homebrew](https://brew.sh/) (que instalaremos a continuación). Homebrew permite descargar e instalar desde la línea de comandos software de código abierto (es un "administrador de paquetes"), lo que facilitará la instalación de Ruby (el lenguaje en el que se basa Jekyll). En el Terminal, pega el siguiente código y presiona Enter: @@ -233,9 +233,9 @@ Una vez que termine la instalación, va a aparecer un mensaje de instalación ex {% include figure.html filename="building-static-sites-with-jekyll-github-pages-2.5.png" caption="Captura de pantalla de la ventana emergente luego de la instalación exitosa." 
%} -### Homebrew +### Homebrew -Al terminar la instalación de las herramientas de la línea de comandos, regresa a la ventana de la línea de comandos y copia el siguiente texto para instalar [Homebrew](http://brew.sh/): +Al terminar la instalación de las herramientas de la línea de comandos, regresa a la ventana de la línea de comandos y copia el siguiente texto para instalar [Homebrew](https://brew.sh/): ``` /usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" @@ -245,7 +245,7 @@ Presiona "Enter" cuando sea necesario, e ingresa la contraseña de tu computador {% include figure.html filename="building-static-sites-with-jekyll-github-pages-4.png" caption="Captura de pantalla del proceso de instalación de Homebrew." %} -### Ruby y Ruby Gems +### Ruby y Ruby Gems Jekyll está construido a partir del [lenguaje de programación Ruby](https://es.wikipedia.org/wiki/Ruby). [Ruby Gems](https://rubygems.org/) es un administrador de paquetes que facilita la configuración de programas Ruby tales como Jekyll (Ruby Gems agrega algunas cosas para simplificar las instalaciones de Ruby). @@ -268,7 +268,7 @@ Algunas personas que utilizan macOS Catalina y macOS Big Sur han reportado dific
    -### NodeJS +### NodeJS [NodeJS](https://nodejs.org/en/) (o Node.js) es una plataforma de desarrollo (específicamente, es un "entorno de ejecución") que, por ejemplo, ayuda a que Javascript se ejecute más rápido. @@ -276,9 +276,9 @@ En la línea de comandos, ingresa: `brew install node` -### Jekyll +### Jekyll -[Jekyll](https://jekyllrb.com/) es el programa que crea nuestro sitio web, simplificando ciertas tareas comunes, como usar la misma plantilla (el mismo logotipo, menú, información del autor, etc.) en todas las páginas de nuestro blog. Puedes ver más información sobre Jekyll en [Sitios dinámicos, sitios estáticos & Jekyll](#section0-1) y en [¿Por qué usar sitios estáticos?](#section0-3), más arriba. +[Jekyll](https://jekyllrb.com/) es el programa que crea nuestro sitio web, simplificando ciertas tareas comunes, como usar la misma plantilla (el mismo logotipo, menú, información del autor, etc.) en todas las páginas de nuestro blog. Puedes ver más información sobre Jekyll en [Sitios dinámicos, sitios estáticos & Jekyll](#sitios-dinámicos-sitios-estáticos-y-jekyll) y en [¿Por qué usar sitios estáticos?](#por-qué-usar-sitios-estáticos), más arriba. En la línea de comandos, ingresa: @@ -286,7 +286,7 @@ En la línea de comandos, ingresa: **¡Felicitaciones, hemos terminado de instalar todo lo necesario para crear nuestro sitio web! Omite los siguientes pasos (que son solo para usuarios de Windows).** -### En Windows +### En Windows En esta sección, las instrucciones para usuarios de Windows difieren de las de los usuarios de Mac. Debes hacer estos pasos únicamente si estás utilizando Windows. @@ -301,7 +301,7 @@ En esta sección, las instrucciones para usuarios de Windows difieren de las de `Install MSYS2 and MINGW development toolchain succeeded` o `Se han instalado con éxito las herramientas de desarrollo MSYS2 y MINGW` -5\. Cierra esta ventana y abre una **nueva** para instalar Jekyll. 
[Jekyll](https://jekyllrb.com/) es el código que crea o genera tu página web (por ejemplo, "generación de página), haciendo más fácil las tareas comunes como usar la misma plantilla (mismo logo, menú, información de autora...) en todas las páginas de entradas de blog. Hay más información sobre [qué es Jekyll y qué son las páginas estáticas](#section0-1) y [razones por las que puedes usar Jekyll para crear una página web estática](#section0-3), arriba. Ahora instalaremos Jekyll (si la Seguridad de Windows te muestra un aviso, ignóralo): +5\. Cierra esta ventana y abre una **nueva** para instalar Jekyll. [Jekyll](https://jekyllrb.com/) es el código que crea o genera tu página web (por ejemplo, "generación de página"), haciendo más fáciles las tareas comunes como usar la misma plantilla (mismo logo, menú, información de autora...) en todas las páginas de entradas de blog. Hay más información sobre [qué es Jekyll y qué son las páginas estáticas](#sitios-dinámicos-sitios-estáticos-y-jekyll) y [razones por las que puedes usar Jekyll para crear una página web estática](#por-qué-usar-sitios-estáticos), arriba. Ahora instalaremos Jekyll (si la Seguridad de Windows te muestra un aviso, ignóralo): `gem install jekyll bundler` @@ -310,7 +310,7 @@ En esta sección, las instrucciones para usuarios de Windows difieren de las de **¡Felicitaciones, hemos terminado de instalar todo lo necesario para crear nuestro sitio web! De aquí en adelante, las instrucciones son iguales para Windows y Mac.** -## Configuración de Jekyll +## Configuración de Jekyll Ya hemos instalado todo lo necesario para crear un sitio web. En esta sección utilizaremos Jekyll para generar una nueva carpeta con los archivos que conforman el sitio web. También ubicaremos esta carpeta en un lugar accesible para la aplicación GitHub Desktop para que estén en el lugar correcto cuando deseemos publicarlos como un sitio web público más adelante en la lección.
@@ -340,7 +340,7 @@ Haz clic derecho en la carpeta "GitHub" y elige "Copiar GitHub". La ruta de la c Es necesario esperar a que vuelva a aparecer el prompt para continuar con el siguiente paso. -4\. La URL pública de tu sitio tendrá la siguiente forma: [http://amandavisconti.github.io/JekyllDemo/](http://amandavisconti.github.io/JekyllDemo/) (*amandavisconti* es el usuario de GitHub de la autora y *JekyllDemo* el nombre del sitio que ingresamos en este paso (*es posible pagar y usar tu propia [URL personalizada](#section7-2), pero no lo cubriremos en este tutorial*). **Los sitios en mayúsculas y minúsculas *no* dirigen al mismo sitio web**, así que a diferencia del ejemplo **JekyllDemo** es recomendable elegir un nombre todo en minúsculas para asegurarse de que la gente lo escriba correctamente. +4\. La URL pública de tu sitio tendrá la siguiente forma: [https://amandavisconti.github.io/JekyllDemo/](https://amandavisconti.github.io/JekyllDemo/) (*amandavisconti* es el usuario de GitHub de la autora y *JekyllDemo* el nombre del sitio que ingresamos en este paso; *es posible pagar y usar tu propia [URL personalizada](#funcionalidad), pero no lo cubriremos en este tutorial*). **Los sitios en mayúsculas y minúsculas *no* dirigen al mismo sitio web**, así que a diferencia del ejemplo **JekyllDemo** es recomendable elegir un nombre todo en minúsculas para asegurarse de que la gente lo escriba correctamente. En la línea de comandos, escribe lo siguiente (reemplaza *JekyllDemo* con el nombre que desees para tu sitio): @@ -352,11 +352,11 @@ Haz clic derecho en la carpeta "GitHub" y elige "Copiar GitHub".
La ruta de la c `cd JekyllDemo` - Si miras en la carpeta *GitHub > JekyllDemo* en el explorador de archivos, verás una serie de archivos nuevos -los archivos que ejecutarán tu sitio web- que han sido instalados ([más abajo](#section4-2) explicaremos qué hace cada uno): + Si miras en la carpeta *GitHub > JekyllDemo* en el explorador de archivos, verás una serie de archivos nuevos -los archivos que ejecutarán tu sitio web- que han sido instalados ([más abajo](#dónde-está-y-qué-es-cada-cosa) explicaremos qué hace cada uno): {% include figure.html filename="building-static-sites-with-jekyll-github-pages-9.png" caption="Captura de pantalla de la carpeta creada." %} -## Ejecutar un sitio web localmente +## Ejecutar un sitio web localmente Esta sección describe cómo ejecutar un sitio web **localmente**. Esto significa que podrás ver cómo se ve tu sitio web en un navegador, pero únicamente en tu computadora (a eso se refiere lo de "localmente"). Trabajar en una versión local de un sitio web quiere decir que el sitio es privado, nadie puede verlo todavía (el sitio no es público, nadie puede escribir la URL y verlo en su computadora). @@ -372,7 +372,7 @@ Esto te permite experimentar todo lo que desees y publicar el sitio al mundo cua *--watch* precedido de *bundle exec* le indica a Jekyll que busque cambios en los archivos del sitio web (por ejemplo, nuevos posts o páginas) y que los muestre al actualizar el navegador. **Una excepción** es el archivo `_config.yml`, que será explicado en detalle en la próxima sección (los cambios realizados en este archivo solo se muestran luego de detener y reiniciar Jekyll). -2\. Luego de escribir el comando previo, aparecerá en el terminal un proceso que no se detiene. ¿Recuerdas que te contamos que si escribías algo en la línea de comandos mientras este todavía está ejecutando el comando previo se pueden ocasionar problemas? 
Ahora Jekyll está corriendo en esta línea de comandos, de manera que si deseas ejecutar comandos mientras visualizas tu sitio local, deberás abrir una nueva ventana de línea de comandos (ver la sección acerca del uso de la [línea de comandos](#section1-4)) +2\. Luego de escribir el comando previo, aparecerá en el terminal un proceso que no se detiene. ¿Recuerdas que te contamos que si escribías algo en la línea de comandos mientras este todavía está ejecutando el comando previo se pueden ocasionar problemas? Ahora Jekyll está corriendo en esta línea de comandos, de manera que si deseas ejecutar comandos mientras visualizas tu sitio local, deberás abrir una nueva ventana de línea de comandos (ver la sección acerca del uso de la [línea de comandos](#línea-de-comandos)) {% include figure.html filename="building-static-sites-with-jekyll-github-pages-10.png" caption="Captura de pantalla de el terminal ejecutando localmente el sitio." %} @@ -384,7 +384,7 @@ Esto te permite experimentar todo lo que desees y publicar el sitio al mundo cua {% include figure.html filename="building-static-sites-with-jekyll-github-pages-11.png" caption="Vista del sitio web en el navegador" %} -### Mini ayudamemoria +### Mini ayudamemoria - Escribe `bundle exec jekyll serve --watch` en la línea de comandos para ejecutar el sitio web localmente. Visita **localhost:4000** en un navegador para visualizar el sitio localmente. En la próxima sección haremos modificaciones que nos obligarán a visitar **localhost:4000/JekyllDemo/** para poder visualizar el sitio (ingresando el nombre de la carpeta de tu sitio web en lugar de *JekyllDemo* y asegurándote de incluir la barra final **/**). @@ -394,13 +394,13 @@ Esto te permite experimentar todo lo que desees y publicar el sitio al mundo cua - ¿Escribes, copias o pegas mucho `bundle exec jekyll serve --watch`? Puedes presionar la tecla **↑** (flecha hacia arriba) en la línea de comandos para hacer desfilar los comandos ingresados recientemente. 
Presiona "Enter" cuando aparezca el comando que deseas ejecutar. -## Modificar la configuración del sitio +## Modificar la configuración del sitio Ya tenemos un sitio web básico privado, accesible únicamente en nuestra computadora. En esta sección, vamos a personalizar el sitio cambiando el título y el autor. También vamos a dar un panorama de lo que hacen los diferentes archivos del sitio web. -### Configuración básica del sitio con _config.yml +### Configuración básica del sitio con _config.yml -1\. Abre la carpeta de tu sitio web en el explorador de archivos. El sitio de la autora del tutorial se encuentra en `/Users/DrJekyll/GitHub/JekyllDemo` (*DrJekyll* es el nombre de usuario de la autora y *JekyllDemo* es el nombre de la carpeta del sitio web de este tutorial). Visita la [sección "Configuración de Jekyll"](#section3) si necesitas ayuda para encontrar la carpeta de tu sitio web. +1\. Abre la carpeta de tu sitio web en el explorador de archivos. El sitio de la autora del tutorial se encuentra en `/Users/DrJekyll/GitHub/JekyllDemo` (*DrJekyll* es el nombre de usuario de la autora y *JekyllDemo* es el nombre de la carpeta del sitio web de este tutorial). Visita la [sección "Configuración de Jekyll"](#configuración-de-jekyll) si necesitas ayuda para encontrar la carpeta de tu sitio web. {% include figure.html filename="building-static-sites-with-jekyll-github-pages-18.png" caption="Captura de pantalla de la carpeta que contiene los archivos del sitio web." %} @@ -428,7 +428,7 @@ Ya tenemos un sitio web básico privado, accesible únicamente en nuestra comput - **email**: tu dirección de email. - **description**: la descripción del sitio web que será usada por los motores de búsqueda y que será utilizada por RSS. - **baseurl**: completa entre las comillas con una barra oblicua **/** seguida del nombre de la carpeta de tu sitio web (por ej., "/JekyllDemo") para que el sitio tome la URL correcta. 
Asegúrate de que tu carpeta está en el mismo repositorio de GitHub con el mismo nombre y termina con la barra oblicua ("/"). Esto se requiere para publicarlo en GitHub Pages. - - **url**: reemplaza "http://yourdomain.com" por "localhost:4000" para que el navegador tome la versión local de tu sitio en la URL correcta. + - **url**: reemplaza "https://yourdomain.com" por "localhost:4000" para que el navegador tome la versión local de tu sitio en la URL correcta. - **twitter_username**: tu nombre de usuario de Twitter (no incluir @). - **github_username**: tu nombre de usuario de GitHub. @@ -443,44 +443,44 @@ Ya tenemos un sitio web básico privado, accesible únicamente en nuestra comput {% include figure.html filename="building-static-sites-with-jekyll-github-pages-17.png" caption="Ejecución local del sitio web" %} -### ¿Dónde está (y qué es) cada cosa? +### ¿Dónde está (y qué es) cada cosa? -Para tener una idea de cómo funciona el sitio y con qué archivos se puede experimentar para hacer cosas más avanzadas, aquí hay algunas notas sobre lo que hace cada carpeta o archivo de tu sitio web. Recuerda siempre abrir y editar cualquier archivo con un editor de texto (por ejemplo, Notepad++) y no con un procesador de textos (no utilices Microsoft Word ni nada que permita agregar formato como cursiva y negrita). Es muy importante no usar Word o procesadores de texto porque estos programas agregan caracteres invisibles que si se guardan en los archivos de nuestro sitio web pueden dañarlo. Si ya deseas comenzar a agregar contenido a tu sitio y hacerlo público, puedes [saltar a la siguiente sección](#section5). +Para tener una idea de cómo funciona el sitio y con qué archivos se puede experimentar para hacer cosas más avanzadas, aquí hay algunas notas sobre lo que hace cada carpeta o archivo de tu sitio web. 
Recuerda siempre abrir y editar cualquier archivo con un editor de texto (por ejemplo, Notepad++) y no con un procesador de textos (no utilices Microsoft Word ni nada que permita agregar formato como cursiva y negrita). Es muy importante no usar Word o procesadores de texto porque estos programas agregan caracteres invisibles que si se guardan en los archivos de nuestro sitio web pueden dañarlo. Si ya deseas comenzar a agregar contenido a tu sitio y hacerlo público, puedes [saltar a la siguiente sección](#redacción-de-páginas-y-entradas-de-blog). {% include figure.html filename="building-static-sites-with-jekyll-github-pages-18.png" caption="Carpeta con los archivos de nuestro sitio" %} -- **_config.yml** fue explicado [más arriba](#section4-1); contiene información básica de la configuración del sitio, como el título y otras posibilidades que no abordaremos aquí (por ej., cómo estructurar los links) +- **_config.yml** fue explicado [más arriba](#configuración-básica-del-sitio-con-_configyml); contiene información básica de la configuración del sitio, como el título y otras posibilidades que no abordaremos aquí (por ej., cómo estructurar los links) - la carpeta **_includes** contiene archivos que son incluidos en todas o varias páginas (por ej., el código para que el encabezado del sitio tenga el título y el menú principal en todas las páginas del sitio) - la carpeta **_layouts** contiene código que controla cómo se ven las páginas de nuestro sitio web (default.html), así como también modificaciones de ese código para darle un estilo más específico a las entradas (post.html) y las páginas (page.html) -- la carpeta **_posts** contiene los archivos que representan cada una de las entradas de nuestro sitio web. Si creamos un nuevo archivo en esta carpeta aparecerá una nueva entrada de blog en el sitio web en orden cronológico inverso (de la más reciente a la más vieja). 
Detallaremos cómo crear entradas de blog en la [próxima sección](#section5-2) +- la carpeta **_posts** contiene los archivos que representan cada una de las entradas de nuestro sitio web. Si creamos un nuevo archivo en esta carpeta aparecerá una nueva entrada de blog en el sitio web en orden cronológico inverso (de la más reciente a la más vieja). Detallaremos cómo crear entradas de blog en la [próxima sección](#creación-de-entradas) - la carpeta **_sass** contiene archivos SCSS que controlan el diseño visual del sitio web - la carpeta **_site** almacena las páginas HTML que aparecen en Internet (por ej., nuestras entradas de blog serán escritas como archivos Markdown pero Jekyll las convertirá a HTML para mostrarlas en Internet) -- **about.md** es un ejemplo de *página de Jekyll*. Ya se encuentra linkeada en el encabezado de nuestro sitio web y podemos cambiar el contenido de esta página abriendo el archivo about.md y modificando el texto. Detallaremos cómo crear nuevas páginas en la [próxima sección](#section5-3) +- **about.md** es un ejemplo de *página de Jekyll*. Ya se encuentra linkeada en el encabezado de nuestro sitio web y podemos cambiar el contenido de esta página abriendo el archivo about.md y modificando el texto. Detallaremos cómo crear nuevas páginas en la [próxima sección](#creación-de-páginas) - la carpeta **css** contiene CSS obtenido a partir del SCSS que controla el diseño visual del sitio - **feed.xml** permite que el público siga el feed RSS de las entradas de nuestro blog - **index.html** controla la estructura de la página de inicio del sitio -## Redacción de páginas y entradas de blog +## Redacción de páginas y entradas de blog Esta sección describirá cómo crear páginas o entradas de blog en tu sitio web. **Páginas** y **entradas de blog** son dos tipos de contenido escrito, pero con estilos diferentes. 
Las páginas (como "Acerca de") no están organizadas ni se muestran cronológicamente; sin embargo, pueden ser incluidas en el menú principal de tu sitio web. Las entradas de blog están pensadas para ser utilizadas como contenido organizado por fecha de publicación. Las URLs para páginas y entradas también son diferentes por defecto (pero tú puedes cambiar eso): las URLs de página se ven como `MySite.com/about/`, mientras que las URLs de entradas se ven como `MySite.com/2016/02/29/my-post-title.html`. -#### Escritura en Markdown +#### Escritura en Markdown Markdown es un lenguaje de marcado para dar formato a tus escritos para que puedan ser leídos en la web: es un conjunto de símbolos, fáciles de recordar, que muestran dónde debe añadirse el formato del texto (por ejemplo, un # delante del texto significa que se le da formato como encabezado, mientras que un * significa que tendrá formato como elemento de lista con viñetas). Para Jekyll en particular, Markdown permite escribir páginas web y entradas de blog de una manera cómoda para los autores (por ejemplo, no es necesario buscar/añadir etiquetas HTML mientras se intenta escribir un ensayo), y que el escrito aparezca con un buen formato en la web (es decir, convertido de texto a HTML). -En esta lección no cubriremos Markdown; si no estás familiarizado con él, puedes crear entradas y páginas sin formato (es decir, sin negrita / cursiva, encabezados, listas enumeradas o viñetas). Pero es sencillo aprender a agregarlos: aquí hay una guía de [referencias](http://kramdown.gettalong.org/quickref.html) de markdown en inglés, también puedes consultar esta guía en [español](https://docs.github.com/es/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax), así como la lección en [Programming Historian de Sarah Simpkin sobre el cómo y porque escribir con Markdown](/es/lecciones/introduccion-a-markdown). 
Consulta estos enlaces si quieres dar formato al texto (cursiva, negrita, encabezados, listas enumeradas o viñetas), añadir hipervínculos, incrustar imágenes u otros archivos. +En esta lección no cubriremos Markdown; si no estás familiarizado con él, puedes crear entradas y páginas sin formato (es decir, sin negrita / cursiva, encabezados, listas enumeradas o viñetas). Pero es sencillo aprender a agregarlos: aquí hay una guía de [referencias](https://kramdown.gettalong.org/quickref.html) de markdown en inglés, también puedes consultar esta guía en [español](https://docs.github.com/es/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax), así como la lección en [Programming Historian de Sarah Simpkin sobre el cómo y por qué escribir con Markdown](/es/lecciones/introduccion-a-markdown). Consulta estos enlaces si quieres dar formato al texto (cursiva, negrita, encabezados, listas enumeradas o viñetas), añadir hipervínculos, incrustar imágenes u otros archivos. -Asegúrate que la guía de referencias de Markdown que consultes sea similar a "[kramdown](http://kramdown.gettalong.org/quickref.html)", porque es lo que admite GitHub Pages (donde alojaremos nuestro sitio web). (Hay [varios "tipos" de Markdown](https://github.com/jgm/CommonMark/wiki/Markdown-Flavors) con sutiles diferencias en lo que respecta a símbolos, pero en su mayoría los que se usan más frecuentemente, como los que crean el formato de encabezados, son iguales. Por lo tanto, puedes utilizar una hoja de referencia Markdown que no especifique que se trate de kramdown, pero si recibes errores en tu sitio web usando símbolos que no están incluidos en kramdown, este podría ser el motivo). +Asegúrate que la guía de referencias de Markdown que consultes sea similar a "[kramdown](https://kramdown.gettalong.org/quickref.html)", porque es lo que admite GitHub Pages (donde alojaremos nuestro sitio web).
(Hay [varios "tipos" de Markdown](https://github.com/jgm/CommonMark/wiki/Markdown-Flavors) con sutiles diferencias en lo que respecta a símbolos, pero en su mayoría los que se usan más frecuentemente, como los que crean el formato de encabezados, son iguales. Por lo tanto, puedes utilizar una hoja de referencia Markdown que no especifique que se trate de kramdown, pero si recibes errores en tu sitio web usando símbolos que no están incluidos en kramdown, este podría ser el motivo). -Si te interesa un editor de Markdown, puedes utilizar uno como [Typora](http://www.typora.io/) (OS X y Windows; de descarga gratuita), que te permitirá utilizar atajos de teclado (por ejemplo, resaltar texto y presionar cmd-B o Ctrl-B para ponerlo en negrita) y/o hacer que se muestre tal y cómo se verá en la web (ver los encabezados con el estilo de los encabezados, en lugar del texto normal con un # delante de ellos). +Si te interesa un editor de Markdown, puedes utilizar uno como [Typora](https://www.typora.io/) (OS X y Windows; de descarga gratuita), que te permitirá utilizar atajos de teclado (por ejemplo, resaltar texto y presionar cmd-B o Ctrl-B para ponerlo en negrita) y/o hacer que se muestre tal y cómo se verá en la web (ver los encabezados con el estilo de los encabezados, en lugar del texto normal con un # delante de ellos). -### Creación de páginas +### Creación de páginas -1\. Para ver una página existente en tu sitio web (creada por defecto en tu sitio web de Jekyll [con el resto de los archivos](#section3)), navega hasta la carpeta de tu sitio web y abre el archivo `about.md` en un editor de texto (por ej. TextWrangler) o en un editor de Markdown (p. ej. Typora). Allí verás el archivo creado como "Acerca de (About)". Puedes hacer clic en el enlace "Acerca de", situado en la parte superior derecha de la página web, y podrás observar cómo aparece la página web que crea el archivo en un navegador. +1\. 
Para ver una página existente en tu sitio web (creada por defecto en tu sitio web de Jekyll [con el resto de los archivos](#configuración-de-jekyll)), navega hasta la carpeta de tu sitio web y abre el archivo `about.md` en un editor de texto (por ej. TextWrangler) o en un editor de Markdown (p. ej. Typora). Allí verás el archivo creado como "Acerca de (About)". Puedes hacer clic en el enlace "Acerca de", situado en la parte superior derecha de la página web, y podrás observar cómo aparece la página web que crea el archivo en un navegador. 2\. El material entre guiones \--- se llama "front matter" (*al abrir el archivo en un editor de Markdown este puede aparecer sobre un fondo gris en lugar de entre guiones*). Este apartado le dice a tu sitio si el contenido posterior debe formatearse como página o entrada de blog, el título de la entrada, la fecha y la hora en que fue publicada, y cualquier categoría que quieras que aparezca en la entrada o la página. @@ -490,16 +490,16 @@ Si te interesa un editor de Markdown, puedes utilizar uno como [Typora](http://w - **title:** Cámbialo al título deseado (a diferencia de las entradas, no hay comillas alrededor del título). En la siguiente captura de pantalla, se ha agregado una página con el título "Resume". - **permalink:** Cambia el texto entre las dos barras diagonales por la palabra (o frase, ¡pero necesitarás usar guiones y no espacios!) que desees que continúe la URL principal de tu sitio para llegar a la página. Por ejemplo, **enlace permanente:/about/** ubica la página en `localhost:4000/yourwebsitefoldername/about/`. -3\. El espacio debajo del segundo guión del texto preliminar (o debajo del recuadro gris si usa un editor Markdown) es donde debes escribir el contenido de tu página, usando [el formato Markdown descrito anteriormente](#section5-1) +3\. 
El espacio debajo del segundo guión del texto preliminar (o debajo del recuadro gris si usa un editor Markdown) es donde debes escribir el contenido de tu página, usando [el formato Markdown descrito anteriormente](#escritura-en-markdown) 4\. Para crear una nueva página además de la existente "Acerca de (About)" (que puede ser personalizada o eliminada), crea una copia del archivo `about.md` en la misma carpeta (la principal del sitio web) y cambia el nombre al título que desees, utilizando guiones en lugar de espacios (por ejemplo, `resume.md` o `contact-me.md`). También cambia el título, el enlace permanente en el texto preliminar del archivo y el contenido. La nueva página debe aparecer automáticamente en el menú principal en el encabezado del sitio: {% include figure.html filename="building-static-sites-with-jekyll-github-pages-22.png" caption="La nueva página en nuestro sitio aparece en el menú" %} -Como referencia, puedes consultar [un ejemplo de página](http://amandavisconti.github.io/JekyllDemo/resume/) en mi sitio de demostración, o ver [el archivo que está detrás de esa página](https://raw.githubusercontent.com/amandavisconti/JekyllDemo/gh-pages/resume.md). +Como referencia, puedes consultar [un ejemplo de página](https://amandavisconti.github.io/JekyllDemo/resume/) en mi sitio de demostración, o ver [el archivo que está detrás de esa página](https://raw.githubusercontent.com/amandavisconti/JekyllDemo/gh-pages/resume.md). -### Creación de entradas +### Creación de entradas 1\. En Finder (en macOS, en Windows en *Explorador de archivos*), navega hasta la carpeta de tu sitio web (por ejemplo, *JekyllDemo*) y luego dentro de ella, ingresa a la carpeta `_posts`. Abre el archivo que se encuentra allí con un editor de texto (p. ej. TextWrangler) o un editor de Markdown (por ej. Typora). El archivo se llamará algo así como `2016-02-28-welcome-to-jekyll.markdown` (la fecha coincidirá con la de la creación del sitio de Jekyll). 
@@ -517,7 +517,7 @@ Como referencia, puedes consultar [un ejemplo de página](http://amandavisconti. - **title:** Cambia "Welcome to Jekyll!" a cualquier título que desees para tu nueva entrada (manteniendo las comillas alrededor del título). La norma es hacer que el título sea igual que las palabras en el nombre del archivo (excepto con espacios añadidos y mayúsculas). Así es como aparecerá el título en la página web de la publicación. - **date:** Cambia esto cuando desees que la publicación muestre fecha y hora de publicación, asegurándote que coincida con la fecha que forma parte del nombre del archivo. (La fecha *y* hora deben ser pasadas, para que tu publicación aparezca). - **categories:** Elimina, por ahora, las palabras "jekyll update (actualización de jekyll)", y no agregues aquí nada más, ya que el tema actual no las utiliza y desordena las URL de las publicaciones. (*Otros temas pueden usar este campo para ordenar las publicaciones de blog por categorías*.) - - **El espacio debajo del segundo \--- (o debajo del recuadro gris si usa un editor Markdown)** es donde debes escribir el contenido de tu post, usando [el formato Markdown descrito anteriormente](#section5-1) + - **El espacio debajo del segundo \--- (o debajo del recuadro gris si usa un editor Markdown)** es donde debes escribir el contenido de tu post, usando [el formato Markdown descrito anteriormente](#escritura-en-markdown) Después de guardar el archivo, deberías poder ver tu segunda entrada en la página principal de tu sitio, y al hacer clic en el enlace, debería ir a la página de la entrada: @@ -531,13 +531,13 @@ Ten en cuenta que **la URL de la publicación** es la URL de tu sitio web local **Para crear nuevos posts**, duplica un archivo existente. Recuerda cambiar el texto preliminar, el contenido dentro de la entrada, así como el nombre del archivo (fecha y título). 
-Como referencia, puedes consultar [el siguiente ejemplo de entrada](https://amandavisconti.github.io/JekyllDemo/2016/11/12/a-post-about-my-research.html) en mi sitio de demostración, o acceder al [código que ejecuta esa entrada](http://raw.githubusercontent.com/amandavisconti/JekyllDemo/gh-pages/_posts/2016-02-29-a-post-about-my-research.markdown). +Como referencia, puedes consultar [el siguiente ejemplo de entrada](https://amandavisconti.github.io/JekyllDemo/2016/11/12/a-post-about-my-research.html) en mi sitio de demostración, o acceder al [código que ejecuta esa entrada](https://raw.githubusercontent.com/amandavisconti/JekyllDemo/gh-pages/_posts/2016-02-29-a-post-about-my-research.markdown). -## "Hosting" en GitHub Pages +## "Hosting" en GitHub Pages Ahora que ya sabes cómo añadir páginas y publicaciones, en esta sección moveremos tu sitio local a la web, para que otros puedan visitarlo.* **En este punto, estaremos haciendo una versión pública de tu sitio** *(tanto para motores de búsqueda como para cualquiera que conozca o encuentre casualmente el enlace). -[Anteriormente en esta lección,](#section1-2) instalamos la aplicación GitHub Desktop. Ahora la utilizaremos para mover los archivos de tu sitio a un servidor que los presentará como páginas web (GitHub Pages), que el público podrá visitar en línea. Esta será la primera vez en la que subiremos todos los archivos de tu sitio a la web. En el futuro, utilizarás esta aplicación siempre que hayas realizado cambios en los archivos de tu sitio local y desees que esos cambios se vean reflejados en la versión pública del sitio (al final de esta sección encontrarás una [guía](#section8) con información útil para realizar esta tarea). +[Anteriormente en esta lección,](#aplicación-github-desktop) instalamos la aplicación GitHub Desktop. Ahora la utilizaremos para mover los archivos de tu sitio a un servidor que los presentará como páginas web (GitHub Pages), que el público podrá visitar en línea. 
Esta será la primera vez en la que subiremos todos los archivos de tu sitio a la web. En el futuro, utilizarás esta aplicación siempre que hayas realizado cambios en los archivos de tu sitio local y desees que esos cambios se vean reflejados en la versión pública del sitio (al final de esta sección encontrarás una [guía](#guía) con información útil para realizar esta tarea). 1\. Abre la aplicación GitHub Desktop y haz clic en el signo “+” (Mac) o en la pestaña “File” (Windows) que se encuentra en la esquina superior izquierda. Después, haz clic en la opción “Add” (o “Add local repository…”) que aparece en el menú desplegable. @@ -598,11 +598,11 @@ Ahora que ya sabes cómo añadir páginas y publicaciones, en esta sección move 10\. Ahora ya puedes visitar tu sitio web (¡y compartir el enlace para que otros lo exploren!). La URL sigue la estructura de tu nombre de usuario de GitHub PUNTO github.io BARRA nombre de tu sitio web BARRA. (por ejemplo, la URL del sitio de ejemplo de la autora es [amandavisconti.github.io/JekyllDemo/](https://amandavisconti.github.io/JekyllDemo/)). -## Poniéndonos elegantes +## Poniéndonos elegantes Esta lección no cubre el trabajo avanzado para personalizar la apariencia de tu sitio web ni la adición de nuevas funcionalidades; sin embargo, aquí compartimos algo de información para que puedas comenzar a investigar por tu cuenta. -### Diseño visual +### Diseño visual El diseño visual de un sitio web es referido usualmente como el *tema* (aunque propiamente, un tema es el conjunto de código y archivos de imagen que generan un cambio importante en la apariencia de un sitio web). 
@@ -610,18 +610,18 @@ Puedes personalizar el tema de tu sitio realizando cambios en los archivos que s - Tema ["Ed" para ediciones digitales mínimas](https://github.com/minicomp/ed/), de Alex Gil (gratis) - Tema ["Digital Edition"](https://github.com/emory-libraries-ecds/digitaledition-jekylltheme), de Rebecca Sutton Koese (gratis) -- El directorio de [Jekyll Themes](http://jekyllthemes.org/) (gratis) -- [JekyllThemes.io](http://jekyllthemes.io/) (gratis y pago) +- El directorio de [Jekyll Themes](https://jekyllrb.com/docs/themes/) (gratis) +- [JekyllThemes.io](https://jekyllthemes.io/) (gratis y pago) -### Funcionalidad +### Funcionalidad -- Los [plugins de Jekyll](http://jekyllrb.com/docs/plugins/) te permiten añadir pequeños segmentos de código que permiten sumar funcionalidades a tu sitio, tales como [realizar búsquedas de texto](https://github.com/PascalW/jekyll_indextank), [permitir el uso de emojis](https://github.com/yihangho/emoji-for-jekyll), o [crear nubes de palabras](https://gist.github.com/ilkka/710577). +- Los [plugins de Jekyll](https://jekyllrb.com/docs/plugins/) te permiten añadir pequeños segmentos de código que permiten sumar funcionalidades a tu sitio, tales como [realizar búsquedas de texto](https://github.com/PascalW/jekyll_indextank), [permitir el uso de emojis](https://github.com/yihangho/emoji-for-jekyll), o [crear nubes de palabras](https://gist.github.com/ilkka/710577). - Si deseas alojar tu sitio en GitHub Pages, como lo hicimos en esta lección, solo podrás utilizar los plugins de Jekyll que ya están incluidos en las _gems_ de GitHub Pages que instalamos (aquí tienes una [lista completa de lo que hemos instalado](https://pages.github.com/versions/), cuando añadimos la _gem_ de GitHub Pages a nuestro Gemfile). 
-- Si decides alojar tu sitio de Jekyll en otro servidor que no sea GitHub Pages, puedes utilizar cualquier plugin de Jekyll (las instrucciones para alojar tu sitio varían entre diferentes proveedores de hosting web y no las desarrollaremos en esta lección, pero [aquí](http://jekyllrb.com/docs/plugins/) tienes una página que explica cómo instalar plugins, una vez que poseas tu sitio con hosting propio). Puedes realizar una búsqueda utilizando “Jekyll plugin” y añadir la funcionalidad que necesites para explorar si hay una herramienta apropiada disponible, o revisar la [documentación sobre plugins](http://jekyllrb.com/docs/plugins/) en el sitio oficial de Jekyll. +- Si decides alojar tu sitio de Jekyll en otro servidor que no sea GitHub Pages, puedes utilizar cualquier plugin de Jekyll (las instrucciones para alojar tu sitio varían entre diferentes proveedores de hosting web y no las desarrollaremos en esta lección, pero [aquí](https://jekyllrb.com/docs/plugins/) tienes una página que explica cómo instalar plugins, una vez que poseas tu sitio con hosting propio). Puedes realizar una búsqueda utilizando “Jekyll plugin” y añadir la funcionalidad que necesites para explorar si hay una herramienta apropiada disponible, o revisar la [documentación sobre plugins](https://jekyllrb.com/docs/plugins/) en el sitio oficial de Jekyll. -- También puedes mantener GitHub Pages como hosting gratuito para tu sitio, pero darle un **nombre de dominio personalizado** (los dominios pueden ser adquiridos por un costo razonable -que suele rondar los 10 dólares anuales- a través de un registrador de dominios como [NearlyFreeSpeech.net](https://www.nearlyfreespeech.net/services/domains)). 
Por ejemplo, el blog de la autora de este tutorial, [LiteratureGeek.com](http://literaturegeek.com/), fue hecho con Jekyll y está alojado en GitHub Pages, al igual que el sitio que creaste en esta lección, pero utiliza un dominio personalizado que la autora compró y configuró para que condujera a su sitio web. Las instrucciones para establecer un dominio personalizado pueden ser encontradas [aquí](https://help.github.com/articles/using-a-custom-domain-with-github-pages/). +- También puedes mantener GitHub Pages como hosting gratuito para tu sitio, pero darle un **nombre de dominio personalizado** (los dominios pueden ser adquiridos por un costo razonable -que suele rondar los 10 dólares anuales- a través de un registrador de dominios como [NearlyFreeSpeech.net](https://www.nearlyfreespeech.net/services/domains)). Por ejemplo, el blog de la autora de este tutorial, [LiteratureGeek.com](https://literaturegeek.com/), fue hecho con Jekyll y está alojado en GitHub Pages, al igual que el sitio que creaste en esta lección, pero utiliza un dominio personalizado que la autora compró y configuró para que condujera a su sitio web. Las instrucciones para establecer un dominio personalizado pueden ser encontradas [aquí](https://help.github.com/articles/using-a-custom-domain-with-github-pages/).
    Si configuras un dominio personalizado para tu sitio web alojado en GitHub Pages, asegúrate de leer y seguir los pasos de la documentación de GitHub para verificar tu dominio y evitar el uso de registros DNS comodín, para prevenir así un fallo de seguridad conocido. @@ -629,7 +629,7 @@ Si configuras un dominio personalizado para tu sitio web alojado en GitHub Pages - Además, puedes **migrar un blog existente** desde otras plataformas, incluyendo WordPress, Blogger, Drupal y Tumblr, para lo cual debes seguir el enlace que se encuentra en el sector derecho de [esta página](https://import.jekyllrb.com/docs/home/). Cuando migres un sitio, asegúrate de tener una copia de seguridad de tu sitio original, en caso de que necesites realizar más de un intento para que las publicaciones del sitio queden en la misma URL que antes (y que de esta forma el sitio se mantenga en los resultados de los buscadores y en los marcadores). -## Guía +## Guía **Para realizar pruebas en el sitio de forma local** (nuevos plugins, temas, o explorar cómo luce una nueva publicación): @@ -645,22 +645,22 @@ Si configuras un dominio personalizado para tu sitio web alojado en GitHub Pages * Una vez que el commit haya finalizado, haz clic en el botón “Sync” en la sección superior derecha de la pantalla (Mac) o en el botón “Push origin” que aparece destacado en azul (Windows). * Espera un poco a que GitHub reciba los cambios (usualmente entre 10 a 90 segundos) y refresca tu sitio online para ver los cambios allí reflejados. -## Ayuda, créditos y lecturas +## Ayuda, créditos y lecturas -### Ayuda +### Ayuda -Si encuentras algún problema, [Jekyll tiene una página para problemas, conocidos como troubleshooting](https://jekyllrb.com/docs/troubleshooting/), que te puede ayudar. Si estás trabajando en la línea de comandos y recibes un mensaje de error, no te olvides de buscar más acerca del error en la web. 
Más allá de los motores de búsqueda tradicionales, [el sitio StackExchange](http://stackexchange.com/) es un buen lugar para encontrar preguntas y respuestas de gente que tuvo este tipo de problemas. +Si encuentras algún problema, [Jekyll tiene una página para problemas, conocidos como troubleshooting](https://jekyllrb.com/docs/troubleshooting/), que te puede ayudar. Si estás trabajando en la línea de comandos y recibes un mensaje de error, no te olvides de buscar más acerca del error en la web. Más allá de los motores de búsqueda tradicionales, [el sitio StackExchange](https://stackexchange.com/) es un buen lugar para encontrar preguntas y respuestas de gente que tuvo este tipo de problemas. -### Creditos +### Créditos Gracias a Fred Gibbs, editor del *Programming Historian* por editar, debatir y revisar la lección original. A Paige Morgan por revisar la lección; a Scott Weingart y sus estudiantes por poner en práctica y testear esta lección en Windows; a Tod Robbins y Matthew Lincoln por sugerencias en [DH Slack](https://digitalhumanities.slack.com) sobre lo que debería enseñar esta lección. Asimismo, agradecemos a Marc Bria por su revisión y sugerencias con respecto a la traducción de esta lección al español.
-### Lecturas +### Lecturas Puedes visitar estos sitios para más documentación, inspiración y para aprender más sobre Jekyll: * [Documentación oficial de Jekyll](https://jekyllrb.com/docs/home/) -* Jekyll tiene links a recursos "no oficiales" sobre su funcionamiento en Windows: [https://jekyll-windows.juthilo.com/](http://jekyll-windows.juthilo.com/) y [https://davidburela.wordpress.com/2015/11/28/easily-install-jekyll-on-windows-with-3-command-prompt-entries-and-chocolatey/](https://davidburela.wordpress.com/2015/11/28/easily-install-jekyll-on-windows-with-3-command-prompt-entries-and-chocolatey/) +* Jekyll tiene links a recursos "no oficiales" sobre su funcionamiento en Windows: [https://github.com/juthilo/run-jekyll-on-windows](https://github.com/juthilo/run-jekyll-on-windows) y [https://davidburela.wordpress.com/2015/11/28/easily-install-jekyll-on-windows-with-3-command-prompt-entries-and-chocolatey/](https://davidburela.wordpress.com/2015/11/28/easily-install-jekyll-on-windows-with-3-command-prompt-entries-and-chocolatey/) * [https://help.github.com/articles/using-jekyll-with-pages/](https://help.github.com/articles/using-jekyll-with-pages/) * Amanda Visconti, ["Introducing Static Sites for Digital Humanities Projects (why & what are Jekyll, GitHub, etc.?)"](https://literaturegeek.com/2015/12/08/WhyJekyllGitHub) * Alex Gil, ["How (and Why) to Generate a Static Website Using Jekyll, Part 1"](https://chronicle.com/blogs/profhacker/jekyll1/60913) diff --git a/es/lecciones/topic-modeling-y-mallet.md b/es/lecciones/topic-modeling-y-mallet.md index 80d55c7497..57e411ae95 100644 --- a/es/lecciones/topic-modeling-y-mallet.md +++ b/es/lecciones/topic-modeling-y-mallet.md @@ -35,7 +35,7 @@ doi: 10.46430/phes0040 Nota del editor --------------- -En esta lección es necesario utilizar la línea de comandos. Si no tienes experiencia previa utilizándola, consulta la lección [Introducción a la línea de comandos en Bash](introduccion-a-bash) de *Programming Historian*. 
+En esta lección es necesario utilizar la línea de comandos. Si no tienes experiencia previa utilizándola, consulta la lección [Introducción a la línea de comandos en Bash](/es/lecciones/introduccion-a-bash) de *Programming Historian*. Objetivos de la lección ----------------------- @@ -44,16 +44,16 @@ En esta lección, primero aprenderás qué es *topic modeling*[^1] y por qué po Aplicaremos el modelador de tópicos a algunos archivos de ejemplo y veremos los tipos de *output* que genera MALLET. Esto nos dará una buena idea de cómo se puede aplicar *topic modeling* a un corpus de textos para identificar tópicos o temas que se encuentran en los documentos, sin tener que leerlos individualmente. -Por favor, remítete a la [lista de discusión](http://mallet.cs.umass.edu/mailinglist.php) de los usuarios de MALLET para aprender más sobre todo lo que se pueda hacer con este programa. +Por favor, remítete a la [lista de discusión](https://mallet.cs.umass.edu/mailinglist.php) de los usuarios de MALLET para aprender más sobre todo lo que se pueda hacer con este programa. (Queremos agradecer a Robert Nelson y Elijah Meeks por consejos y sugerencias sobre cómo empezar a utilizar MALLET por primera vez y por sus ejemplos de lo que se puede hacer con esta herramienta.) ¿Qué es *Topic Modeling* y para quién es útil? ---------------------------------------------- -Una herramienta de *topic modeling* toma un texto individual (o un corpus) y busca patrones en el uso de las palabras; es un intento de encontrar significado semántico en el vocabulario de ese texto (o corpus). Antes de empezar con *topic modeling* deberías preguntarte si es o no útil para tu proyecto. 
Para empezar a entender en qué circunstancias una técnica como esta es la más efectiva, te recomendamos *[Distant Reading](http://www.cs.umbc.edu/~hillol/NGDM07/abstracts/talks/MKirschenbaum.pdf)* de Matthew Kirschenbaum (una charla dada en el simposio de la Fundación Nacional de Ciencias de los Estados Unidos en 2009, sobre la próxima generación de extracción de datos y descubrimiento cibernético para la inovación) y *[Reading Machines](http://www.worldcat.org/title/reading-machines-toward-an-algorithmic-criticism/oclc/708761605&referer=brief_results)* de Stephen Ramsay. +Una herramienta de *topic modeling* toma un texto individual (o un corpus) y busca patrones en el uso de las palabras; es un intento de encontrar significado semántico en el vocabulario de ese texto (o corpus). Antes de empezar con *topic modeling* deberías preguntarte si es o no útil para tu proyecto. Para empezar a entender en qué circunstancias una técnica como esta es la más efectiva, te recomendamos *[Distant Reading](https://www.cs.umbc.edu/~hillol/NGDM07/abstracts/talks/MKirschenbaum.pdf)* de Matthew Kirschenbaum (una charla dada en el simposio de la Fundación Nacional de Ciencias de los Estados Unidos en 2009, sobre la próxima generación de extracción de datos y descubrimiento cibernético para la innovación) y *[Reading Machines](https://www.worldcat.org/title/reading-machines-toward-an-algorithmic-criticism/oclc/708761605&referer=brief_results)* de Stephen Ramsay. -Como toda herramienta, el hecho de que se pueda utilizar no significa que deberías hacerlo. Si trabajas con pocos documentos (o incluso con un solo documento) puede ser que cálculos de frecuencia sean suficientes, en cuyo caso algo como las [herramientas Voyant](http://voyant-tools.org/) quizá serían convenientes. Si, en cambio, tienes cientos de documentos procedentes de un archivo y quieres comprender qué contiene el archivo, pero sin necesariamente leer cada documento, entonces *topic modeling* podría ser una buena opción.
+Como toda herramienta, el hecho de que se pueda utilizar no significa que deberías hacerlo. Si trabajas con pocos documentos (o incluso con un solo documento) puede ser que cálculos de frecuencia sean suficientes, en cuyo caso algo como las [herramientas Voyant](https://voyant-tools.org/) quizá serían convenientes. Si, en cambio, tienes cientos de documentos procedentes de un archivo y quieres comprender qué contiene el archivo, pero sin necesariamente leer cada documento, entonces *topic modeling* podría ser una buena opción. Los modelos de tópicos son una familia de programas informáticos que extraen *tópicos* de *textos*. Para la computadora, un *tópico* es una lista de palabras que se presenta de manera que sea estadísticamente significativa. Un *texto* puede ser un email, una entrada de blog, un capítulo de libro, un artículo periodístico, una entrada de diario – es decir, todo tipo de texto no estructurado. No estructurado quiere decir que no haya anotaciones legibles por la computadora que indiquen el significado semántico de las palabras del texto. @@ -69,20 +69,20 @@ Hay muchos programas diferentes para *topic modeling*; esta lección utiliza uno Examinando las palabras clave podemos ver que el político que dio los discursos se refirió a la economía, los empleos, el Medio Oriente, las próximas elecciones, etc. -Como advierte Scott Weingart, quienes utilizan *topic modeling* sin entenderlo completamente enfrentan muchos [peligros](https://web.archive.org/web/20240602215348/https://www.scottbot.net/HIAL/index.html@p=16713.html). Por ejemplo, podría interesarnos el uso de las palabras como un indicador para la ubicación en un espectro político. *Topic modeling* sin duda podría ayudar con eso, pero hay que recordar que el indicador no es en sí lo que queremos comprender - como lo muestra Andrew Gelman en su [estudio de maqueta sobre zombis, utilizando Google Trends](http://arxiv.org/abs/1003.6087/). 
Ted Underwood y Lisa Rhody (véase Lecturas adicionales) sostienen que para nosotros como historiadores sería mejor considerar estas categorías como discursos; sin embargo, para nuestros objetivos, continuaremos utilizando la palabra: tópico. +Como advierte Scott Weingart, quienes utilizan *topic modeling* sin entenderlo completamente enfrentan muchos [peligros](https://web.archive.org/web/20240602215348/https://www.scottbot.net/HIAL/index.html@p=16713.html). Por ejemplo, podría interesarnos el uso de las palabras como un indicador para la ubicación en un espectro político. *Topic modeling* sin duda podría ayudar con eso, pero hay que recordar que el indicador no es en sí lo que queremos comprender - como lo muestra Andrew Gelman en su [estudio de maqueta sobre zombis, utilizando Google Trends](https://arxiv.org/abs/1003.6087/). Ted Underwood y Lisa Rhody (véase Lecturas adicionales) sostienen que para nosotros como historiadores sería mejor considerar estas categorías como discursos; sin embargo, para nuestros objetivos, continuaremos utilizando la palabra: tópico. Nota: En la bibliografía sobre *topic modeling*, a veces encontrarás el término "*LDA*". Muchas veces, LDA y *topic modeling* se usan como sinónimos, pero la técnica LDA es, en realidad, un caso especial de *topic modeling* desarrollado por [David Blei y amigos](https://es.wikipedia.org/wiki/Latent_Dirichlet_Allocation) en 2002. No fue la primera técnica considerada como *topic modeling* pero es la más popular. Las innumerables variaciones de *topic modeling* han resultado en una sopa de letras de técnicas y programas para implementarlas, lo cual puede ser desconcertante o agobiante para los no iniciados en la materia y por esto no nos detendremos en ellos por ahora. Todos los algoritmos trabajan casi del mismo modo y MALLET en particular utiliza LDA. 
### Ejemplos de modelos de tópicos usados por historiadores: -- Rob Nelson, *[Mining the Dispatch](http://dsl.richmond.edu/dispatch/)* +- Rob Nelson, *[Mining the Dispatch](https://dsl.richmond.edu/dispatch/)* - Cameron Blevins, "[Topic Modeling Martha Ballard's Diary](https://perma.cc/39CG-MNLH)" *Historying*, April 1, 2010. - David J Newman y Sharon Block, "Probabilistic topic decomposition of an eighteenth century American newspaper," *Journal of the American Society for Information Science and Technology* vol. 57, no. 6 (April 1, 2006): 753-767.[^2] Instalar MALLET --------------- -Hay muchas herramientas que se podrían utilizar para crear modelos de tópicos, pero al momento de escribir estas líneas (en el verano de 2007) la herramienta más sencilla es MALLET.[^3] [MALLET](http://mallet.cs.umass.edu/index.php) utiliza una implementación del [*Muestreo de Gibbs*](https://es.wikipedia.org/wiki/Muestreo_de_Gibbs), una técnica estadística destinada a construir rápidamente una distribución de muestras, para luego crear los modelos de tópicos correspondientes. Para utilizar MALLET es necesario trabajar en la línea de comandos – hablaremos más de esto en un instante. Lo bueno es que normalmente los mismos comandos se usan repetidamente. +Hay muchas herramientas que se podrían utilizar para crear modelos de tópicos, pero al momento de escribir estas líneas (en el verano de 2007) la herramienta más sencilla es MALLET.[^3] [MALLET](https://mallet.cs.umass.edu/index.php) utiliza una implementación del [*Muestreo de Gibbs*](https://es.wikipedia.org/wiki/Muestreo_de_Gibbs), una técnica estadística destinada a construir rápidamente una distribución de muestras, para luego crear los modelos de tópicos correspondientes. Para utilizar MALLET es necesario trabajar en la línea de comandos – hablaremos más de esto en un instante. Lo bueno es que normalmente los mismos comandos se usan repetidamente. Las instrucciones de instalación son diferentes para Windows y Mac. 
Sigue las instrucciones apropiadas para ti: @@ -91,8 +91,8 @@ Las instrucciones de instalación son diferentes para Windows y Mac. Sigue las i ### Instrucciones para Windows -1. Ve a la página del proyecto [MALLET](http://mallet.cs.umass.edu/index.php). Puedes [descargar MALLET aquí](http://mallet.cs.umass.edu/download.php). -2. También necesitarás el [Kit de desarrollo de Java (JDK)](http://www.oracle.com/technetwork/java/javase/downloads/index.html) - esto es, no el Java normal que se encuentra en cada computadora sino el que permite programar cosas. Instala este en tu computadora. +1. Ve a la página del proyecto [MALLET](https://mallet.cs.umass.edu/index.php). Puedes [descargar MALLET aquí](https://mallet.cs.umass.edu/download.php). +2. También necesitarás el [Kit de desarrollo de Java (JDK)](https://www.oracle.com/technetwork/java/javase/downloads/index.html) - esto es, no el Java normal que se encuentra en cada computadora sino el que permite programar cosas. Instala este en tu computadora. 3. Descomprime MALLET en tu directorio `C:`. Esto es importante: no puede ser en ningún otro lugar. Tendrás un directorio llamado `C:\mallet-2.0.8` o parecido. Para simplificar, cambia el nombre simplemente a `mallet`. 4. MALLET utiliza una *variable de entorno* para indicar a la computadora donde encontrar todos los componentes necesarios para sus procesos en el momento de ejecutarse. Es como un atajo para el programa. Un(a) programador(a) no puede saber exactamente donde cada usuario instala un programa. Por eso, él o ella crea una variable en el código que representa el lugar de instalación en cada momento. Por medio de la variable de entorno indicamos a la computadora donde se encuentra ese lugar. Si mueves el programa a otro lugar tendrás que cambiar esa variable. @@ -130,8 +130,8 @@ Ahora estás preparado para avanzar a la próxima sección. Muchas de las instrucciones para la instalación en OS X se parecen a las instrucciones para Windows, con pocas excepciones. 
En realidad, es un poco más fácil ejecutar comandos de MALLET en Mac. -1. Descarga e [instala MALLET](http://mallet.cs.umass.edu/download.php). -2. Descarga el [Kit de desarrollo de Java (JDK)](http://www.oracle.com/technetwork/java/javase/downloads/index.html). +1. Descarga e [instala MALLET](https://mallet.cs.umass.edu/download.php). +2. Descarga el [Kit de desarrollo de Java (JDK)](https://www.oracle.com/technetwork/java/javase/downloads/index.html). Descomprime MALLET en un directorio en tu sistema (para seguir esta lección con facilidad, escoge tu directorio `/User/`, aunque otro lugar funcionará igualmente). Cuando esté descomprimido, abre tu ventana Terminal (dentro del directorio `Aplicaciones` en tu Finder). Usando la Terminal, navega al directorio donde descomprimiste MALLET (será `mallet-2.0.8` o `mallet` si cambiaste el nombre de la carpeta para simplificarlo. Si descomprimiste MALLET en tu directorio `/User/` como se sugiere en esta lección, puedes navegar al directorio correcto tecleando `cd mallet-2.0.8` o bien `cd mallet`). `cd` es la abreviatura para "cambiar directorio" cuando se trabaja en la Terminal. @@ -198,7 +198,7 @@ El parámetro `--output` junto con una ruta de fichero indica donde se guarda el También podrías utilizar tus propios datos. Cambia `C:\Users\User\Desktop\ensayos-jose-marti` a un directorio que contenga tus propios archivos de investigación. ¡Buena suerte! -Si no estás seguro de cómo funcionan los directorios, te recomendamos la lección [Introducción a la línea de comandos en Bash](introduccion-a-bash) de *Programming Historian*. +Si no estás seguro de cómo funcionan los directorios, te recomendamos la lección [Introducción a la línea de comandos en Bash](/es/lecciones/introduccion-a-bash) de *Programming Historian*. ### Para Mac Las instrucciones para Mac son parecidas a las de Windows, con algunas diferencias que puedes notar en el siguiente ejemplo: @@ -297,32 +297,32 @@ Puede resultar difícil leer estos datos. 
Los tópicos comienzan en la tercera c A partir de esto, se puede ver que en el documento número 0 (es decir, el primer documento cargado en MALLET), `a-aprender-en-las-haciendas.txt`, el tópico 0 tiene un porcentaje de 0.33% (columna C). Si buscamos el valor más alto en esta fila, podemos ver que el tópico 3 es el más importante en este documento, con un porcentaje de 69.24%. Dada la naturaleza de MALLET, tus propios tópicos pueden tener valores diferentes. -Si tienes un corpus de archivos de texto que están organizados en orden cronológico (por ejemplo que `1.txt` sea anterior a `2.txt`), podrías generar un gráfico en tu programa de hoja de cálculo y empezar a ver cambios con el tiempo, tal como lo hizo Robert Nelson en [Mining the Dispatch](http://dsl.richmond.edu/dispatch/). +Si tienes un corpus de archivos de texto que están organizados en orden cronológico (por ejemplo que `1.txt` sea anterior a `2.txt`), podrías generar un gráfico en tu programa de hoja de cálculo y empezar a ver cambios con el tiempo, tal como lo hizo Robert Nelson en [Mining the Dispatch](https://dsl.richmond.edu/dispatch/). ¿Cómo puedes saber cuál es la cantidad adecuada de tópicos? ¿Hay una cantidad *natural* de tópicos? Hemos descubierto que hay que ejecutar `train-topics` varias veces con distintas cantidades de tópicos para ver cómo la distribución de los tópicos en los documentos cambia. Si encontramos que la mayoría de los textos están dominados por muy pocos tópicos, lo interpretamos como una señal de necesitar aumentar la cantidad de tópicos; las preferencias fueron demasiado amplias. Hay maneras de buscar la mejor configuración automáticamente, por ejemplo mediante el comando `hlda` de MALLET, pero para los lectores de esta lección probablemente es más rápido realizar algunas iteraciones (para más información consulta Griffiths, T. L., & Steyvers, M. (2004). *Finding scientific topics.* Proceedings of the National Academy of Science, 101, 5228-5235). 
### Analizar tus propios textos con MALLET -La carpeta `sample data` en el directorio de MALLET (`C:\mallet\sample-data`) te puede servir como guía para saber cómo organizar tus textos. Pon todo lo que deseas en una sola carpeta, por ejemplo `C:\mis-datos`. Tus archivos deben contener texto llano y estar en el formato `.txt` (puedes crearlos en un procesador de textos como Notepad, [Sublime Text](https://www.sublimetext.com/) o [Atom](https://atom.io/), por ejemplo, y guardarlos como `Texto (*.txt)` o `Texto sin formato`). Tienes que tomar algunas decisiones. ¿Quieres explorar los tópicos a nivel de párrafos? Entonces cada archivo `.txt` debería contener solo un párrafo. En los nombres de los archivos puedes agregar información como el número de la página u otros identificadores, por ejemplo: `pag32_parr1.txt`. Si trabajas con un diario, cada archivo de texto podría ser una entrada de diario, por ejemplo: `abril_25_1887.txt`. (Nota que es importante no dejar espacios en los nombres de carpetas y archivos). Si los textos que te interesan están en la red, podrías [automatizar](http://electricarchaeology.ca/2012/07/09/mining-a-day-of-archaeology/) este proceso. +La carpeta `sample data` en el directorio de MALLET (`C:\mallet\sample-data`) te puede servir como guía para saber cómo organizar tus textos. Pon todo lo que deseas en una sola carpeta, por ejemplo `C:\mis-datos`. Tus archivos deben contener texto llano y estar en el formato `.txt` (puedes crearlos en un procesador de textos como Notepad, [Sublime Text](https://www.sublimetext.com/) o [Atom](https://atom.io/), por ejemplo, y guardarlos como `Texto (*.txt)` o `Texto sin formato`). Tienes que tomar algunas decisiones. ¿Quieres explorar los tópicos a nivel de párrafos? Entonces cada archivo `.txt` debería contener solo un párrafo. En los nombres de los archivos puedes agregar información como el número de la página u otros identificadores, por ejemplo: `pag32_parr1.txt`. 
Si trabajas con un diario, cada archivo de texto podría ser una entrada de diario, por ejemplo: `abril_25_1887.txt`. (Nota que es importante no dejar espacios en los nombres de carpetas y archivos). Si los textos que te interesan están en la red, podrías [automatizar](https://electricarchaeology.ca/2012/07/09/mining-a-day-of-archaeology/) este proceso. ### Lecturas adicionales sobre *Topic Modeling* Para ver un ejemplo desarrollado de *topic modeling* basado en materiales obtenidos de páginas web, véase [Mining the Open Web with Looted -Heritage Draft](http://electricarchaeology.ca/2012/06/08/mining-the-open-web-with-looted-heritage-draft/). +Heritage Draft](https://electricarchaeology.ca/2012/06/08/mining-the-open-web-with-looted-heritage-draft/). Puedes reutilizar los datos tomándolos de [Figshare.com](https://ndownloader.figshare.com/files/90972) donde están incluidos algunos archivos `.txt`. Cada uno de los ficheros `.txt` contiene una noticia individual. - Para amplia información adicional y una bibliografía sobre *topic modeling* podrías empezar con el [Guided Tour to Topic Modeling](https://web.archive.org/web/20240520155820/https://www.scottbot.net/HIAL/index.html@p=19113.html) de Scott Weingart. -- Una discusión importante sobre la interpretación del significado de los tópicos es '[Topic modeling made just simple enough](http://tedunderwood.wordpress.com/2012/04/07/topic-modeling-made-just-simple-enough/)' de Ted Underwood. -- El artículo de blog '[Some Assembly Required](http://web.archive.org/web/20160704150726/http://www.lisarhody.com:80/some-assembly-required/)' *Lisa @ Work* 22 de agosto de 2012 escrito por Lisa Rhody también es muy revelador. -- Clay Templeton, '[Topic Modeling in the Humanities: An Overview](https://web.archive.org/web/20130116223500/http://mith.umd.edu/topic-modeling-in-the-humanities-an-overview/)', Maryland Institute for Technology in the Humanities, n.d. 
-- David Blei, Andrew Ng, and Michael Jordan, '[Latent dirichlet allocation](http://dl.acm.org/citation.cfm?id=944937)', The Journal of Machine Learning Research 3 (2003). -- Finalmente, te recomendamos que consultes la [bibliografía de artículos sobre *topic modeling*](http://mimno.infosci.cornell.edu/topics.html) de David Mimno. Están clasificados por temas para facilitar encontrar el artículo más adecuado para una aplicación determinada. También puedes echar un vistazo a su reciente artículo sobre [Historiografía Computacional](http://www.perseus.tufts.edu/publications/02-jocch-mimno.pdf) en la revista *ACM Transactions on Computational Logic* en el que analiza revistas científicas de los Clásicos a lo largo de cien años para aprender algo sobre este campo. Mientras el artículo debe leerse como un buen ejemplo de *topic modeling*, su sección sobre 'métodos' es especialmente relevante porque incluye una discusión sobre cómo preparar los textos para un análisis de ese tipo.[^13] +- Una discusión importante sobre la interpretación del significado de los tópicos es '[Topic modeling made just simple enough](https://tedunderwood.wordpress.com/2012/04/07/topic-modeling-made-just-simple-enough/)' de Ted Underwood. +- El artículo de blog '[Some Assembly Required](https://web.archive.org/web/20160704150726/https://www.lisarhody.com:80/some-assembly-required/)' *Lisa @ Work* 22 de agosto de 2012 escrito por Lisa Rhody también es muy revelador. +- Clay Templeton, '[Topic Modeling in the Humanities: An Overview](https://web.archive.org/web/20130116223500/https://mith.umd.edu/topic-modeling-in-the-humanities-an-overview/)', Maryland Institute for Technology in the Humanities, n.d. +- David Blei, Andrew Ng, and Michael Jordan, '[Latent dirichlet allocation](https://dl.acm.org/citation.cfm?id=944937)', The Journal of Machine Learning Research 3 (2003). 
+- Finalmente, te recomendamos que consultes la [bibliografía de artículos sobre *topic modeling*](https://mimno.infosci.cornell.edu/topics.html) de David Mimno. Están clasificados por temas para facilitar encontrar el artículo más adecuado para una aplicación determinada. También puedes echar un vistazo a su reciente artículo sobre [Historiografía Computacional](https://www.perseus.tufts.edu/publications/02-jocch-mimno.pdf) en la revista *ACM Transactions on Computational Logic* en el que analiza revistas científicas de los Clásicos a lo largo de cien años para aprender algo sobre este campo. Mientras el artículo debe leerse como un buen ejemplo de *topic modeling*, su sección sobre 'métodos' es especialmente relevante porque incluye una discusión sobre cómo preparar los textos para un análisis de ese tipo.[^13] ## Notas de traducción [^1]: En esta traducción se utiliza la expresión *topic modeling* en inglés porque en la literatura publicada sobre el tema en español es lo más común. Por supuesto sería posible traducir *topic modeling* por modelaje de tópicos o algo parecido, pero hasta ahora no es habitual. Por otro lado, se ha optado por traducir todas las demás palabras relacionadas al método para estimular su uso en español, por ejemplo *topic* por tópico o *topic model* por modelo de tópicos. -[^2]: También hay algunos ejemplos de modelos de tópicos creados a partir de textos (literarios) en español. 
Por ejemplo: Borja Navarro-Colorado, *[On Poetic Topic Modeling: Extracting Themes and Motifs From a Corpus of Spanish Poetry](https://www.frontiersin.org/articles/10.3389/fdigh.2018.00015/full)*, frontiers in Digital Humanities, 20 de junio de 2018, [https://doi.org/10.3389/fdigh.2018.00015](https://doi.org/10.3389/fdigh.2018.00015); Borja Navarro-Colorado y David Tomás, *[A fully unsupervised Topic Modeling approach to metaphor identification / Una aproximación no supervisada a la detección de metáforas basada en Topic Modeling](https://www.dlsi.ua.es//~borja/NavarroTomas_PosterSEPLN2015.pdf)*, Actas del XXXI Congreso de la Sociedad Española para el Procesamiento del Lenguaje Natural, 2015; Christof Schöch, Ulrike Henny, José Calvo Tello, Daniel Schlör, Stefanie Popp, *[Topic, Genre, Text. Topics im Textverlauf von Untergattungen des spanischen und hispanoamerikanischen Romans (1880-1930)](https://web.archive.org/web/20180828160609/http://www.dhd2016.de/abstracts/vortr%C3%A4ge-055.html)*, DHd 2016. Modellierung, Vernetzung, Visualisierung. Die Digital Humanities als fächerübergreifendes Forschungsparadigma. Universität Leipzig, 7.-12. März 2016. +[^2]: También hay algunos ejemplos de modelos de tópicos creados a partir de textos (literarios) en español. 
Por ejemplo: Borja Navarro-Colorado, *[On Poetic Topic Modeling: Extracting Themes and Motifs From a Corpus of Spanish Poetry](https://www.frontiersin.org/articles/10.3389/fdigh.2018.00015/full)*, frontiers in Digital Humanities, 20 de junio de 2018, [https://doi.org/10.3389/fdigh.2018.00015](https://doi.org/10.3389/fdigh.2018.00015); Borja Navarro-Colorado y David Tomás, *[A fully unsupervised Topic Modeling approach to metaphor identification / Una aproximación no supervisada a la detección de metáforas basada en Topic Modeling](https://www.dlsi.ua.es//~borja/NavarroTomas_PosterSEPLN2015.pdf)*, Actas del XXXI Congreso de la Sociedad Española para el Procesamiento del Lenguaje Natural, 2015; Christof Schöch, Ulrike Henny, José Calvo Tello, Daniel Schlör, Stefanie Popp, *[Topic, Genre, Text. Topics im Textverlauf von Untergattungen des spanischen und hispanoamerikanischen Romans (1880-1930)](https://web.archive.org/web/20180828160609/https://www.dhd2016.de/abstracts/vortr%C3%A4ge-055.html)*, DHd 2016. Modellierung, Vernetzung, Visualisierung. Die Digital Humanities als fächerübergreifendes Forschungsparadigma. Universität Leipzig, 7.-12. März 2016. [^3]: En esta traducción, las instrucciones para la instalación de MALLET fueron actualizadas para ajustarse a Windows 10. En el original inglés las instrucciones se refieren a Windows 7. Las capturas de pantalla fueron sustituidas para que el idioma de la pantalla sea español. [^4]: En todos los ejemplos de esta lección en los que aparece la palabra `User`, deberás sustituirla con tu propio nombre de usuario. [^5]: Al final de un comando escrito en la línea de comandos siempre se teclea Entrar para confirmar el comando y ejecutarlo. En adelante no lo mencionaremos más. @@ -333,4 +333,4 @@ Puedes reutilizar los datos tomándolos de [Figshare.com](https://ndownloader.fi [^10]: Nótese que MALLET no reconoce palabras compuestas como `New York` y las trata como dos palabras separadas. 
Para evitar eso, sería necesario preprocesar el texto y conectar las varias partes de la palabra compuesta con un símbolo, por ejemplo una barra baja (`New_York`) para que MALLET las reconozca como tales. [^11]: Si comparas los tópicos en la figura 10 con los de la figura 9, puedes ver el efecto del elemento aleatorio del *topic modeling*. Esas dos listas de tópicos son los resultados de dos pasadas diferentes y aunque los tópicos se parezcan no son exactamente iguales. [^12]: Como en la línea de comandos, también en el programa de hoja de cálculo puede ser necesario cambiar la codificación de caracteres a UTF-8 para que las letras con acento o ñ salgan correctamente. Esto se puede hacer durante el proceso de importar los datos o ajustando las preferencias del programa. -[^13]: Para introducciones a *topic modeling* escritas en español, véanse la entrada de blog de José Calvo Tello *[Topic modeling: ¿qué, cómo, cuándo?](http://www.morethanbooks.eu/topic-modeling-introduccion/)* y la presentación *[Text Mining con Topic Modeling](https://web.archive.org/web/20180831094856/http://www.dlsi.ua.es/~borja/riilua/6.TopicModeling_v02.pdf)* de Borja Navarro-Colorado. +[^13]: Para introducciones a *topic modeling* escritas en español, véanse la entrada de blog de José Calvo Tello *[Topic modeling: ¿qué, cómo, cuándo?](https://www.morethanbooks.eu/topic-modeling-introduccion/)* y la presentación *[Text Mining con Topic Modeling](https://web.archive.org/web/20180831094856/https://www.dlsi.ua.es/~borja/riilua/6.TopicModeling_v02.pdf)* de Borja Navarro-Colorado. 
diff --git a/es/lecciones/trabajar-con-archivos-de-texto.md b/es/lecciones/trabajar-con-archivos-de-texto.md index 7ed26ea182..1f7f28bd63 100644 --- a/es/lecciones/trabajar-con-archivos-de-texto.md +++ b/es/lecciones/trabajar-con-archivos-de-texto.md @@ -19,8 +19,8 @@ translation-reviewer: - Antonio Rojas Castro review-ticket: https://github.com/programminghistorian/ph-submissions/issues/40 layout: lesson -next: reutilizacion-de-codigo-y-modularidad -previous: ver-archivos-html +next: /es/lecciones/reutilizacion-de-codigo-y-modularidad +previous: /es/lecciones/ver-archivos-html original: working-with-text-files python_warning: false difficulty: 2 @@ -179,4 +179,4 @@ Lecturas recomendadas [print]: https://docs.python.org/2/reference/simple_stmts.html#the-print-statement [palabra reservada]: https://docs.python.org/release/2.5.4/ref/keywords.html [file objects]: https://docs.python.org/2/library/stdtypes.html#bltin-file-objects -[Non-Programmer’s Tutorial for Python 2.6/Hello, World]: http://en.wikibooks.org/wiki/Non-Programmer%27s_Tutorial_for_Python_2.6/Hello,_World +[Non-Programmer’s Tutorial for Python 2.6/Hello, World]: https://en.wikibooks.org/wiki/Non-Programmer%27s_Tutorial_for_Python_2.6/Hello,_World diff --git a/es/lecciones/trabajar-con-paginas-web.md b/es/lecciones/trabajar-con-paginas-web.md index 558e81c019..b1112a79bf 100644 --- a/es/lecciones/trabajar-con-paginas-web.md +++ b/es/lecciones/trabajar-con-paginas-web.md @@ -26,8 +26,8 @@ difficulty: 2 activity: acquiring topics: [python] abstract: "Esta lección muestra qué es un Localizador de recursos uniforme (Uniform Resource Locator = URL) y explica cómo utilizar Python para descargar y guardar los contenidos de una página web en tu disco duro." 
-next: manipular-cadenas-de-caracteres-en-python -previous: reutilizacion-de-codigo-y-modularidad +next: /es/lecciones/manipular-cadenas-de-caracteres-en-python +previous: /es/lecciones/reutilizacion-de-codigo-y-modularidad original: working-with-web-pages python_warning: false avatar_alt: Grabado de una mujer y un hombre de pies. @@ -162,7 +162,7 @@ f.write(contenidoWeb) f.close ``` -Ahora, si tú puedes guardar un solo archivo así de fácil, ¿es posible escribir un programa que te permita descargar un puñado de archivos? ¿Es posible incrementar la cantidad de IDs de los juicios y obtener copias de todos ellos en una sola descarga? Efectivamente. Puedes aprender cómo hacerlo en la lección [Downloading Multiple Files using Query Strings](/lessons/downloading-multiple-records-using-query-strings), que te recomendamos ver una vez completadas las lecciones introductorias de esta serie. +Ahora, si tú puedes guardar un solo archivo así de fácil, ¿es posible escribir un programa que te permita descargar un puñado de archivos? ¿Es posible incrementar la cantidad de IDs de los juicios y obtener copias de todos ellos en una sola descarga? Efectivamente. Puedes aprender cómo hacerlo en la lección [Downloading Multiple Files using Query Strings](/en/lessons/downloading-multiple-records-using-query-strings), que te recomendamos ver una vez completadas las lecciones introductorias de esta serie. 
Lecturas sugeridas ------------------ @@ -177,12 +177,12 @@ Para seguir a lo largo de las lecciones futuras es importante que tengas los arc - python-es-lecciones1.zip ([zip][]) -[The Old Bailey Online]: http://oldbaileyonline.org +[The Old Bailey Online]: https://oldbaileyonline.org [*cadena de consulta*]: https://es.wikipedia.org/wiki/Query_string [Descarga de registros múltiples usando cadenas de consulta]: /lessons/downloading-multiple-records-using-query-strings -[revueltas de Gordon]: http://en.wikipedia.org/wiki/Gordon_Riots -[View as XML]: http://www.oldbaileyonline.org/browse.jsp?foo=bar&path=sessionsPapers/17800628.xml&div=t17800628-33&xml=yes +[revueltas de Gordon]: https://en.wikipedia.org/wiki/Gordon_Riots +[View as XML]: https://www.oldbaileyonline.org/browse.jsp?foo=bar&path=sessionsPapers/17800628.xml&div=t17800628-33&xml=yes [XML]: https://es.wikipedia.org/wiki/Extensible_Markup_Language -[documento original]: http://www.oldbaileyonline.org/images.jsp?doc=178006280084 +[documento original]: https://www.oldbaileyonline.org/images.jsp?doc=178006280084 [urllib]: https://docs.python.org/3/library/urllib.html [zip]: /assets/python-es-lecciones1.zip diff --git a/es/lecciones/transformacion-datos-xml-xsl.md b/es/lecciones/transformacion-datos-xml-xsl.md index 1ad9abcf35..4b0c77d42a 100644 --- a/es/lecciones/transformacion-datos-xml-xsl.md +++ b/es/lecciones/transformacion-datos-xml-xsl.md @@ -256,7 +256,7 @@ Los ejemplos de código de línea de comandos que mostraremos aquí presupondrá Para empezar a transformar un documento XML, primero es necesario obtener un archivo *bien formado*.[^2] Muchas bases de datos históricas disponibles en línea están modeladas en XML y, a veces, ofrecen sus datos en abierto. -Para realizar este tutorial utilizaremos la base de datos [Scissors and Paste](http://scissors-and-paste.net). +Para realizar este tutorial utilizaremos la base de datos [Scissors and Paste](https://scissors-and-paste.net). 
La base de datos *Scissors and Paste* es una colección colaborativa, en continuo crecimiento, que contiene noticias procedentes de periódicos británicos e imperiales de los siglos XVIII y XIX. Los dos objetivos originales del proyecto eran facilitar la comparación de reediciones aparecidas en distintos periódicos y detectar temas similares en distintas publicaciones inglesas. Como muchas bases de datos XML, *Scissors and Paste* contiene datos (el texto), información sobre el formato (como las cursivas o la justificación de los párrafos) y metadatos.[^3] Los metadatos recogen la paginación de la noticia, la fecha de impresión, algunos detalles adicionales sobre el periódico, los temas principales y una lista con las personas y lugares mencionados. @@ -336,7 +336,7 @@ Las primeras tres líneas de tu archivo XSL serán las siguientes: La primera línea declara que este es un documento XML versión 1.0, codificado como UTF-8. (¡Nota que un documento XSL es en últimas un tipo especial de documento XML!) -La segunda línea declara que se trata de la versión 1.0 de XSL y que el uso del [espacio de nombres](https://es.wikipedia.org/wiki/Espacio_de_nombres_XML) (*namespace*, en inglés) es el estándar establecido por el [Consorcio World Wide Web](http://www.w3.org/), cuya URI (*Uniform Resource Identifier*) figura en la instrucción. +La segunda línea declara que se trata de la versión 1.0 de XSL y que el uso del [espacio de nombres](https://es.wikipedia.org/wiki/Espacio_de_nombres_XML) (*namespace*, en inglés) es el estándar establecido por el [Consorcio World Wide Web](https://www.w3.org/), cuya URI (*Uniform Resource Identifier*) figura en la instrucción. Finalmente, la tercera línea le indica al procesador XSL que queremos generar un archivo de texto plano. (También podrías haber puesto `xml` o `html`, en lugar de `text`, para generar un documento XML o uno HTML, respectivamente.) 
diff --git a/es/lecciones/uso-las-colecciones-hathitrust-mineria-textual-R.md b/es/lecciones/uso-las-colecciones-hathitrust-mineria-textual-R.md index f7ceaf13c1..ca84efcfac 100644 --- a/es/lecciones/uso-las-colecciones-hathitrust-mineria-textual-R.md +++ b/es/lecciones/uso-las-colecciones-hathitrust-mineria-textual-R.md @@ -78,7 +78,7 @@ library(stringr) ## Obtener los atributos extraídos de un volumen -Cada libro o volumen en HathiTrust posee un número de identificación único (o el "htid"), el cual permite que obtengamos datos sobre el volumen. Cuando el libro no está limitado por los derechos de autor, puedes verlo añadiendo su número de identificación a un URL de la siguiente manera: `http://hdl.handle.net/2027/{número de identificación}`. Por ejemplo, el número que identifica una de las primeras ediciones de la clásica novela colombiana, *María* de Jorge Isaacs, es `uc1.31175010656638` y al visitar el enlace [http://hdl.handle.net/2027/uc1.31175010656638](http://hdl.handle.net/2027/uc1.31175010656638) accedemos a una copia de la obra. +Cada libro o volumen en HathiTrust posee un número de identificación único (o el "htid"), el cual permite que obtengamos datos sobre el volumen. Cuando el libro no está limitado por los derechos de autor, puedes verlo añadiendo su número de identificación a un URL de la siguiente manera: `http://hdl.handle.net/2027/{número de identificación}`. Por ejemplo, el número que identifica una de las primeras ediciones de la clásica novela colombiana, *María* de Jorge Isaacs, es `uc1.31175010656638` y al visitar el enlace [https://hdl.handle.net/2027/uc1.31175010656638](https://hdl.handle.net/2027/uc1.31175010656638) accedemos a una copia de la obra. {% include figure.html filename="es-or-uso-las-colecciones-hathitrust-mineria-textual-R-01.png" alt="Ejemplar de la novela María en el sitio HathiTrust" caption="Figura 1. 
'María' de Jorge Isaacs" %} diff --git a/es/lecciones/ver-archivos-html.md b/es/lecciones/ver-archivos-html.md index 0ae4594a40..07128b5c11 100644 --- a/es/lecciones/ver-archivos-html.md +++ b/es/lecciones/ver-archivos-html.md @@ -20,8 +20,8 @@ translation-reviewer: - Antonio Rojas Castro review-ticket: https://github.com/programminghistorian/ph-submissions/issues/39 layout: lesson -next: trabajar-con-archivos-de-texto -previous: introduccion-e-instalacion +next: /es/lecciones/trabajar-con-archivos-de-texto +previous: /es/lecciones/introduccion-e-instalacion original: viewing-html-files difficulty: 2 activity: presenting @@ -112,7 +112,7 @@ Sugerencia de lecturas para aprender HTML: - [W3Schools HTML Tutorial] - [W3Schools HTML5 Tutorial] -[la anterior de la serie]: http://es.programminghistorian.org/lecciones/introduccion-e-instalacion/ -[tutorial de HTML ofrecido por W3Schools]: http://www.w3schools.com/html/default.asp -[W3Schools HTML Tutorial]: http://www.w3schools.com/html/default.asp -[W3Schools HTML5 Tutorial]: http://www.w3schools.com/html/html5_intro.asp +[la anterior de la serie]: https://es.programminghistorian.org/lecciones/introduccion-e-instalacion/ +[tutorial de HTML ofrecido por W3Schools]: https://www.w3schools.com/html/default.asp +[W3Schools HTML Tutorial]: https://www.w3schools.com/html/default.asp +[W3Schools HTML5 Tutorial]: https://www.w3schools.com/html/html5_intro.asp diff --git a/es/pia.md b/es/pia.md index 47df0afff8..b1c69f990d 100644 --- a/es/pia.md +++ b/es/pia.md @@ -2,7 +2,7 @@ layout: blank title: Programa de Instituciones Asociadas redirect_from: -- /pia +- /pia/ original: ipp --- @@ -51,7 +51,7 @@ Al unirte al Programa de Instituciones Asociadas recibirás los siguientes benef
    - + diff --git a/es/politica-retirada-lecciones.md b/es/politica-retirada-lecciones.md index a95460ab69..fc3fa4da28 100644 --- a/es/politica-retirada-lecciones.md +++ b/es/politica-retirada-lecciones.md @@ -29,11 +29,11 @@ Con independencia de que se cree o no un nuevo derivado, si el tutorial ya ha si ## Más sobre sostenibilidad -[Guía para autores y traductores - Escribe de manera sostenible](/es/guia-para-autores#escribe-de-manera-sostenible) +[Guía para autores y traductores - Escribe de manera sostenible](/es/guia-para-autores#escritura-sostenible) [Guía para revisores - Sostenibilidad](/es/guia-para-revisores#sostenibilidad) -[Guía para editores - Revisar la sostenibilidad](/es/guia-editor#c-revisar-la-sostenibilidad) +[Guía para editores - Revisar la sostenibilidad](/es/guia-editor#c-revisar-la-sostenibilidad-e-internacionalización) ## Lecciones retiradas diff --git a/es/retroalimentacion.md b/es/retroalimentacion.md index 5146dbe8d1..28c20bacf1 100644 --- a/es/retroalimentacion.md +++ b/es/retroalimentacion.md @@ -13,7 +13,7 @@ original: feedback Definimos errores o *bugs* como: "Un error en un programa informático que produce un resultado inesperado o que se comporta de manera diferente a las instrucciones de una lección". Ten en cuenta que no te podemos ayudar con los errores causados si editas el código o cambias los materiales de la lección (conjuntos de datos, archivos, etc.). -Antes de nada, te pedimos que consultes si alguien ya ha reportado el problema en [nuestro repositorio](https://github.com/orgs/programminghistorian/projects/6) y, en tal caso, que dejes un comentario. Si no se ha informado del problema, sigue una de estas opciones: +Antes de nada, te pedimos que consultes si alguien ya ha reportado el problema en [nuestro repositorio](https://github.com/programminghistorian/jekyll/issues) y, en tal caso, que dejes un comentario. Si no se ha informado del problema, sigue una de estas opciones:
    Por favor, no abras un "Pull Request" con la corrección. diff --git a/feed.xml b/feed.xml index 1f92cafbc3..83a2d6e9ff 100644 --- a/feed.xml +++ b/feed.xml @@ -1,5 +1,6 @@ --- skip_concordance: true +permalink: /feed.xml --- diff --git a/fr/README.md b/fr/README.md index d6351b7a22..3f933212e0 100644 --- a/fr/README.md +++ b/fr/README.md @@ -1,10 +1,10 @@ -Ceci est le dépôt principal pour le _Programming Historian en français_ () où nous stockons les fichiers du site web. +Ceci est le dépôt principal pour le _Programming Historian en français_ () où nous stockons les fichiers du site web. Pour les tutoriels soumis, voir: [_Programming Historian Submissions_](https://github.com/programminghistorian/ph-submissions). SI vous avez des suggestions à faire sur le site ou le projet, merci de cliquer sur [Issues](https://github.com/programminghistorian/jekyll/issues) en haut et créer un nouveau ticket en cliquant sur [New Issue](https://github.com/programminghistorian/jekyll/issues/new) pour décrire votre idée. Merci de noter que le ticket sera public. Si vous souhaitez échanger avec nous en privé, merci de contacter [Jessica Parr](mailto:jarr1129@gmail.com). -Si vous souhaitez contribuer au projet, vous pouvez trouver des consignes précis aux auteur(e)s, aux évaluateurs et évaluatrices et aux rédacteurs et rédactrices sur la [page contributions](http://programminghistorian.org/fr/contribuer) du site web. +Si vous souhaitez contribuer au projet, vous pouvez trouver des consignes précises aux auteur(e)s, aux évaluateurs et évaluatrices et aux rédacteurs et rédactrices sur la [page contributions](https://programminghistorian.org/fr/contribuer) du site web. Pour apporter des contributions techniques au projet ou pour vous renseigner sur notre politique concernant de nouvelles sous-équipes linguistiques qui s'intéressent à créer une version traduite, merci de lire le [wiki du projet](https://github.com/programminghistorian/jekyll/wiki). 
diff --git a/fr/apropos.md b/fr/apropos.md index dcd8355b25..ca5320a31a 100644 --- a/fr/apropos.md +++ b/fr/apropos.md @@ -14,7 +14,7 @@ Notre procédure d'évaluation par les pairs est un peu différente de celle qui ## Open Source (code source ouvert) -Le _Programming Historian en français_ adhère aux principes de l'open source. Toutes les leçons utilisent dans la mesure du possible des langages de programmation et des logiciels ouverts. Cette politique vise à minimiser les coûts pour toutes les parties concernées et à encourager la plus large participation possible. C'est notre conviction que tout un chacun devrait avoir la possibilité de profiter de ces tutoriels, non pas seulement ceux et celles disposant de budgets de recherche considérables qui permettent l'acquisition de logiciels propriétaires coûteux. Depuis 2016, une version citable du projet _Programming Historian_ est déposée sur [Zenodo](https://zenodo.org/). Le dépôt de l'année 2022 est accessible sur [doi.org/10.5281/zenodo.7313045](https://doi.org/10.5281/zenodo.7313045). Depuis 2018, le robot d’indexation du [UK Web Archive](https://www.webarchive.org.uk/) parcourt régulièrement les pages du _Programming Historian_. Celles-ci sont archivées et mises à la disposition du public [via leur site web](https://www.webarchive.org.uk/wayback/en/archive/*/http://programminghistorian.org/). +Le _Programming Historian en français_ adhère aux principes de l'open source. Toutes les leçons utilisent dans la mesure du possible des langages de programmation et des logiciels ouverts. Cette politique vise à minimiser les coûts pour toutes les parties concernées et à encourager la plus large participation possible. C'est notre conviction que tout un chacun devrait avoir la possibilité de profiter de ces tutoriels, non pas seulement ceux et celles disposant de budgets de recherche considérables qui permettent l'acquisition de logiciels propriétaires coûteux. 
Depuis 2016, une version citable du projet _Programming Historian_ est déposée sur [Zenodo](https://zenodo.org/). Le dépôt de l'année 2022 est accessible sur [doi.org/10.5281/zenodo.7313045](https://doi.org/10.5281/zenodo.7313045). Depuis 2018, le robot d’indexation du [UK Web Archive](https://www.webarchive.org.uk/) parcourt régulièrement les pages du _Programming Historian_. Celles-ci sont archivées et mises à la disposition du public [via leur site web](https://www.webarchive.org.uk/wayback/en/archive/*/https://programminghistorian.org/). ## Voie diamant (Diamond Open Access) @@ -26,7 +26,7 @@ Le _Programming Historian_ (ISSN 2397-2068) est recensé dans le répertoire de ## Récompenses -Le _Programming Historian_ a gagné plusieurs prix qui reconnaissent et célèbrent nos réussites dans les domaines de la publication en libre accès et des humanités numériques. En 2016, la version anglaise de la revue fut la grande gagnante du [Digital Humanities Awards](http://dhawards.org/dhawards2016/results/) dans la catégorie des Meilleures Collections d'Articles, puis l'année suivante, en 2017, _Programming Historian en espagnol_ [remporta la même distinction](http://dhawards.org/dhawards2017/results/). En 2018, _Programming Historian en espagnol_ était le vainqueur de 'Mejor iniciativa formativa desarrollada durante el año 2018', [Humanidades Digitales Hispánicas Association](http://humanidadesdigitaleshispanicas.es/). Nous avons remporté le [Canadian Social Knowledge Institute's Open Scholarship Award](https://etcl.uvic.ca/events-activities/open-scholarship-awards/) en 2020 et en 2021 nous avons été récompensés d'un [Coko Foundation's Open Publishing Award](https://web.archive.org/web/20220408041024/https://openpublishingawards.org/results/2021/index.html) dans la catégorie Contenu Ouvert. En 2022, nous avons remporté la catégorie "Meilleur support de formation DH" des [Digital Humanities Awards](http://dhawards.org/dhawards2022/results/). 
+Le _Programming Historian_ a gagné plusieurs prix qui reconnaissent et célèbrent nos réussites dans les domaines de la publication en libre accès et des humanités numériques. En 2016, la version anglaise de la revue fut la grande gagnante du [Digital Humanities Awards](https://dhawards.org/dhawards2016/results/) dans la catégorie des Meilleures Collections d'Articles, puis l'année suivante, en 2017, _Programming Historian en espagnol_ [remporta la même distinction](https://dhawards.org/dhawards2017/results/). En 2018, _Programming Historian en espagnol_ était le vainqueur de 'Mejor iniciativa formativa desarrollada durante el año 2018', [Humanidades Digitales Hispánicas Association](https://humanidadesdigitaleshispanicas.es/). Nous avons remporté le [Canadian Social Knowledge Institute's Open Scholarship Award](https://etcl.uvic.ca/events-activities/open-scholarship-awards/) en 2020 et en 2021 nous avons été récompensés d'un [Coko Foundation's Open Publishing Award](https://web.archive.org/web/20220408041024/https://openpublishingawards.org/results/2021/index.html) dans la catégorie Contenu Ouvert. En 2022, nous avons remporté la catégorie "Meilleur support de formation DH" des [Digital Humanities Awards](https://dhawards.org/dhawards2022/results/). ## Politique de diversité @@ -36,8 +36,8 @@ Le _Programming Historian en français_ s'attache au principe de la diversité. ## Financement & proprieté Le _Programming Historian_ est un projet international mené par des volontaires. Ses activités financières sont administrées par ProgHist Limited, une organisation caritative enregistrée en Angleterre et au pays de Galles sous le numéro [1195875](https://register-of-charities.charitycommission.gov.uk/charity-search/-/charity-details/5181272/charity-overview) et incorporée en tant que compagnie à responsabilité limitée par garanties en Angleterre et au pays de Galles sous le numéro [12192946](https://beta.companieshouse.gov.uk/company/12192946). 
Le projet est publié par le conseil éditorial du _Programming Historian_. -Une liste de nos donateurs et des soutiens reçus est consultable sur la page qui expose comment vous pouvez ['nous soutenir']({{site.baseurl}}/fr/nous-soutenir). +Une liste de nos donateurs et des soutiens reçus est consultable sur la page qui expose comment vous pouvez ['nous soutenir']({{site.baseurl}}/fr/dons/). ## Histoire du projet -Le *Programming Historian* a été fondé en 2008 par William J. Turkel et Alan MacEachern. À l'époque, Turkel avait publié un [billet de blog](http://digitalhistoryhacks.blogspot.com/2008/01/programming-historian.html), exposant leurs intentions pour le projet. Il s'est focalisé principalement sur Python et il a été publié en libre accès, en tant que projet d'infrastructure numérique de *NiCHE* (*Network in Canadian History & Environment* / *Nouvelle initiative canadienne en histoire de l’environnement*). En 2012, *The Programming Historian* a élargi son équipe éditoriale pour être lancé en tant que revue académique évaluée par les pairs, en libre accès, spécialisée aux questions méthodologiques pour des historiens et historiennes numériques. En 2016, nous avons ajouté une publication hispanophone à la publication anglophone initiale puis, en 2017, nous avons commencé à publier des leçons traduites sous le titre *[Programming Historian en español]({{site.baseurl}}/es)*. En 2018, [nous avons organisé notre premier atelier d'écriture hispanophone](/posts/bogota-workshop-report) et lancé un appel à contributions pour [de nouvelles leçons en espagnol](/posts/convocatoria-de-tutoriales). Dans la même année, nous avons ajouté une publication francophone et le *Programming Historian en français* a été lancé en 2019. Un an plus tard, nous avons été rejoints par une équipe lusophone et avons lancé *[Programming Historian em português]({{site.baseurl}}/pt)* début 2021. +Le *Programming Historian* a été fondé en 2008 par William J. Turkel et Alan MacEachern. 
À l'époque, Turkel avait publié un [billet de blog](https://digitalhistoryhacks.blogspot.com/2008/01/programming-historian.html), exposant leurs intentions pour le projet. Il s'est focalisé principalement sur Python et il a été publié en libre accès, en tant que projet d'infrastructure numérique de *NiCHE* (*Network in Canadian History & Environment* / *Nouvelle initiative canadienne en histoire de l’environnement*). En 2012, *The Programming Historian* a élargi son équipe éditoriale pour être lancé en tant que revue académique évaluée par les pairs, en libre accès, spécialisée aux questions méthodologiques pour des historiens et historiennes numériques. En 2016, nous avons ajouté une publication hispanophone à la publication anglophone initiale puis, en 2017, nous avons commencé à publier des leçons traduites sous le titre *[Programming Historian en español]({{site.baseurl}}/es)*. En 2018, [nous avons organisé notre premier atelier d'écriture hispanophone](/posts/bogota-workshop-report) et lancé un appel à contributions pour [de nouvelles leçons en espagnol](/posts/convocatoria-de-tutoriales). Dans la même année, nous avons ajouté une publication francophone et le *Programming Historian en français* a été lancé en 2019. Un an plus tard, nous avons été rejoints par une équipe lusophone et avons lancé *[Programming Historian em português]({{site.baseurl}}/pt)* début 2021. 
diff --git a/fr/consignes-redacteurs.md b/fr/consignes-redacteurs.md index 9e92ec11b0..5f3023640a 100644 --- a/fr/consignes-redacteurs.md +++ b/fr/consignes-redacteurs.md @@ -279,7 +279,7 @@ Ci-dessous quelques sites pour chercher des images: - The [Virtual Manuscript Library of Switzerland](https://www.e-codices.unifr.ch/fr) - The [British Library](https://www.flickr.com/photos/britishlibrary) - The [Internet Archive Book Images](https://archive.org/details/bookimages) - - The [Library of Congress Maps](http://www.loc.gov/maps/collections) + - The [Library of Congress Maps](https://www.loc.gov/maps/collections) Merci de prendre soin de trouver une image dont le style est proche de celles déjà utilisées, donc pas de photographie, mais plutôt une image d'illustration de livre, de taille minimale de 200x200 pixels, sans restriction de droits de copyright. Merci de faire attention à ce que les images ne heurtent pas les sensibilités et, en conformité à notre [engagement en faveur de la diversité](/posts/PH-commitment-to-diversity), qu'elles ne reproduisent pas de stéréotypes sexistes ou raciaux. @@ -341,7 +341,7 @@ Il existe différentes manières de faire un "pull request" pour publier les fic * A) Suivre nos [consignes pour faire des contributions techniques](https://github.com/programminghistorian/jekyll/wiki/Making-Technical-Contributions) via l'interface GUI de Github. -* B) Utiliser `git` depuis la ligne de commande. Les instructions suivantes supposent que vous avez déjà cloné les dépôts `jekyll` and `ph-submissions` en local sur votre machine. Notre [leçon sur l'utilisation de l'application GitHub Desktop](/lessons/getting-started-with-github-desktop) peut vous être utile, si cela est tout nouveau pour vous. Si vous n'êtes pas confiant(e) sur comment procéder ou si vous avez des questions, merci de contacter l'équipe technique pour obtenir de l'aide. +* B) Utiliser `git` depuis la ligne de commande. 
Les instructions suivantes supposent que vous avez déjà cloné les dépôts `jekyll` et `ph-submissions` en local sur votre machine. Notre [leçon sur l'utilisation de l'application GitHub Desktop](/en/lessons/getting-started-with-github-desktop) peut vous être utile, si cela est tout nouveau pour vous. Si vous n'êtes pas confiant(e) sur comment procéder ou si vous avez des questions, merci de contacter l'équipe technique pour obtenir de l'aide. 1. Aller au répertoire de votre dépôt local `ph-submissions`. diff --git a/fr/contribuer.md b/fr/contribuer.md index b546a5f0f5..64d4b3c03a 100644 --- a/fr/contribuer.md +++ b/fr/contribuer.md @@ -38,13 +38,13 @@ Il nous arrive de temps en temps de faire des appels pour recruter des membres p Si vous parlez couramment plus d'une des langues dans lesquelles le _Programming Historian_ paraît (français, anglais, espagnol, portugais), nous vous invitons à prendre contact avec nous pour traduire une des leçons publiées d'une langue à une autre. Cela nous aidera à être partie prenante dans les communautés des humanités numériques hispanophone et francophone et vous permettra de renforcer vos compétences linguistiques, méthodologiques et techniques. -Nous recherchons des traductions rigoureuses et lisibles qui tiennent compte des contextes de recherche hispanophone, francophone et lusophone, et des ressources disponibles dans nos communautés respectives. Si cela vous intéresse de collaborer avec nous, merci de consulter nos [consignes aux auteur(e)s et traducteurs/traductrices](/fr/consignes-auteurs.html). +Nous recherchons des traductions rigoureuses et lisibles qui tiennent compte des contextes de recherche hispanophone, francophone et lusophone, et des ressources disponibles dans nos communautés respectives. Si cela vous intéresse de collaborer avec nous, merci de consulter nos [consignes aux auteur(e)s et traducteurs/traductrices](/fr/consignes-auteurs). 
## Faire un retour ou signaler un problème {{ site.data.snippets.feedback-image-alt[page.lang] }} -Nous vous invitons à nous faire des [retours d'expérience](/fr/reaction.html) sur tout aspect du _Programming Historian en français_. D'avance merci de nous aider à améliorer le projet. +Nous vous invitons à nous faire des [retours d'expérience](/fr/reaction) sur tout aspect du _Programming Historian en français_. D'avance merci de nous aider à améliorer le projet. Nous apprécions tout particulièrement les informations reçues sur les liens cassés. Au fur et à mesure que les URLs changent et que les versions des logiciels et des plateformes évoluent, les leçons peuvent présenter des défaillances. Merci de nous aider à garder le _Programming Historian en français_ à jour en nous tenant au courant de tout problème rencontré pendant vos lectures. @@ -56,9 +56,9 @@ Nous apprécions tout particulièrement les informations reçues sur les liens c Ce projet est notre démonstration de ce que l'édition scientifique en accès libre peut et doit être. Merci de nous aider à disséminer le message et à fournir le plus large accès possible à cette ressource en demandant à votre bibliothèque d'enregister le projet dans son catalogue. -Le _Programming Historian_ a sa notice dans WorldCat [en français](https://uva.worldcat.org/title/programming-historian-en-franais/oclc/1104391842), [en anglais](http://www.worldcat.org/title/programming-historian/oclc/951537099), [en espagnol](https://www.worldcat.org/title/programming-historian-en-espanol/oclc/1061292935&referer=brief_results) et [en portugais](https://search.worldcat.org/title/1332987197). 
+Le _Programming Historian_ a sa notice dans WorldCat [en français](https://uva.worldcat.org/title/programming-historian-en-franais/oclc/1104391842), [en anglais](https://www.worldcat.org/title/programming-historian/oclc/951537099), [en espagnol](https://www.worldcat.org/title/programming-historian-en-espanol/oclc/1061292935&referer=brief_results) et [en portugais](https://search.worldcat.org/title/1332987197). -Toute notre gratitude à la [bibliothèque de l'Université de Purdue](http://purdue-primo-prod.hosted.exlibrisgroup.com/primo_library/libweb/action/dlDisplay.do?vid=PURDUE&search_scope=everything&docId=PURDUE_ALMA51671812890001081&fn=permalink), à Amanda Visconti, et à la [bibliothèque de l'Université de Virginia]). +Toute notre gratitude à la [bibliothèque de l'Université de Purdue](https://purdue-primo-prod.hosted.exlibrisgroup.com/primo_library/libweb/action/dlDisplay.do?vid=PURDUE&search_scope=everything&docId=PURDUE_ALMA51671812890001081&fn=permalink), à Amanda Visconti, et à la [bibliothèque de l'Université de Virginia]). L'édition anglais est aussi indexé dans le [Directory of Open Access Journals](https://doaj.org/toc/2397-2068). diff --git a/fr/dons.md b/fr/dons.md index 092f938123..7af10f35f9 100644 --- a/fr/dons.md +++ b/fr/dons.md @@ -19,7 +19,7 @@ Votre soutien contribue directement à l'infrastructure qui assure la cohésion
    - + diff --git a/fr/index.md b/fr/index.md index fa172424a5..55fe21adf3 100644 --- a/fr/index.md +++ b/fr/index.md @@ -1,6 +1,7 @@ --- layout: base title: The Programming Historian en français +permalink: /fr/ ---
    @@ -23,20 +24,20 @@ title: The Programming Historian en français

    Enseigner

    -

    Vous pouvez utiliser le Programming Historian en français dans le cadre de vos cours ou de vos ateliers! Si nous pouvons améliorer nos leçons pour qu'elles répondent mieux à vos besoins ou si vous rencontrez des difficultés, merci de nous le faire savoir.

    +

    Vous pouvez utiliser le Programming Historian en français dans le cadre de vos cours ou de vos ateliers! Si nous pouvons améliorer nos leçons pour qu'elles répondent mieux à vos besoins ou si vous rencontrez des difficultés, merci de nous le faire savoir.

    Contribuer

    -

    Que vous souhaitiez écrire une leçon, rejoindre notre équipe de rédacteurs et rédactrices, nous faire un retour, nous sommes constamment à l'écoute de notre lectorat!

    +

    Que vous souhaitiez écrire une leçon, rejoindre notre équipe de rédacteurs et rédactrices, nous faire un retour, nous sommes constamment à l'écoute de notre lectorat!

    Notre équipe

    -

    Notre projet est animé par des volontaires et par une communauté, c'est donc avec fierté que nous rendons hommage à tous les gens remarquables qui ont consacré leur temps et leur énergie au Programming Historian.

    +

    Notre projet est animé par des volontaires et par une communauté, c'est donc avec fierté que nous rendons hommage à tous les gens remarquables qui ont consacré leur temps et leur énergie au Programming Historian.

    diff --git a/fr/lecons/analyse-corpus-antconc.md b/fr/lecons/analyse-corpus-antconc.md index 9384593d97..1b6ba018e5 100644 --- a/fr/lecons/analyse-corpus-antconc.md +++ b/fr/lecons/analyse-corpus-antconc.md @@ -55,12 +55,12 @@ Vous avez déjà fait ce genre de choses auparavant, si vous avez déjà... * utilisé [Voyant Tools][48] pour analyser des schémas dans un texte * suivi les tutoriels d'introduction à Python du [Programming Historian][51] -En quelque sorte, [Voyant Tools](http://voyant-tools.org/) est une passerelle vers la réalisation d'analyses plus sophistiquées et reproductibles, car l'esthétique de bricolage des scripts Python ou R peut ne pas convenir à tout le monde. [AntConc](http://www.laurenceanthony.net/software/antconc/) comble ce vide en étant un logiciel autonome d'analyse linguistique de textes, disponible gratuitement pour Windows, Mac OS et Linux. Par ailleurs, il est régulièrement mis à jour par son créateur, [Laurence Anthony](http://www.laurenceanthony.net/). Il existe d'autres logiciels de concordance, mais AntConc est librement disponible sur toutes les plateformes et très bien maintenu. Voir la [bibliographie][56] pour d'autres ressources. +En quelque sorte, [Voyant Tools](https://voyant-tools.org/) est une passerelle vers la réalisation d'analyses plus sophistiquées et reproductibles, car l'esthétique de bricolage des scripts Python ou R peut ne pas convenir à tout le monde. [AntConc](https://www.laurenceanthony.net/software/antconc/) comble ce vide en étant un logiciel autonome d'analyse linguistique de textes, disponible gratuitement pour Windows, Mac OS et Linux. Par ailleurs, il est régulièrement mis à jour par son créateur, [Laurence Anthony](https://www.laurenceanthony.net/). Il existe d'autres logiciels de concordance, mais AntConc est librement disponible sur toutes les plateformes et très bien maintenu. Voir la [bibliographie][56] pour d'autres ressources. Ce tutoriel explore différentes façons d'aborder un corpus de textes. 
Il est important de noter que les approches issues de la linguistique de corpus sont rarement, voire jamais, l'unique possibilité. Ainsi, à chaque étape, il vaut la peine de réfléchir à ce que vous faites et comment cela peut vous aider à répondre à une question spécifique avec vos données. Bien que je présente dans ce tutoriel une approche modulaire qui explique 'comment faire ceci puis cela pour atteindre x', il n'est pas toujours nécessaire de suivre l'ordre exact décrit ici. Cette leçon donne un aperçu de certaines des méthodes disponibles, plutôt qu'une recette du succès. ### Téléchargements nécessaires -1. Logiciel : [AntConc](http://www.laurenceanthony.net/software/antconc/). +1. Logiciel : [AntConc](https://www.laurenceanthony.net/software/antconc/). Dézippez le fichier si nécessaire, et lancez l'application. Les captures d'écran ci-dessous peuvent varier légèrement par rapport à la version que vous avez (et selon le système d'exploitation, bien sûr), mais les procédures sont plus ou moins les mêmes sur les plateformes et les versions récentes d'AntConc. Ce tutoriel a recours à une version plus ancienne d'AntConc, car je la trouve plus facile à utiliser dans un contexte d'introduction. Vous pouvez utiliser la version la plus récente, mais si vous souhaitez suivre avec les captures d'écran fournies, vous pouvez télécharger la version utilisée ici, [version 3.2.4](https://www.laurenceanthony.net/software/antconc/releases/AntConc324/). 2. Corpus test : Téléchargez un [fichier zip de critiques de films](/assets/corpus-analysis-with-antconc/antconc_corpus_files.zip). @@ -94,10 +94,10 @@ Allez sur votre bureau et vérifiez que vous pouvez trouver votre fichier texte. Répétez la procédure plusieurs fois et c'est ainsi que vous construirez un corpus de fichiers texte brut. Ce processus s'appelle la construction de corpus, ce qui implique très souvent d'aborder des questions d'échantillonnage, de représentativité et d'organisation. 
Rappelez-vous, *chaque fichier que vous voulez utiliser dans votre corpus __doit__ être un fichier texte brut pour qu'AntConc puisse l'utiliser.* Il est d'usage de nommer les fichiers avec le suffixe .txt pour que vous sachiez de quel type de fichier il s'agit. Comme vous pouvez l'imaginer, il peut être assez fastidieux de constituer un corpus substantiel un fichier à la fois, surtout si vous avez l'intention de traiter un ensemble important de documents. Il est donc très courant d'utiliser l'extraction de contenus (_webscraping_) (en utilisant un petit programme pour télécharger automatiquement les fichiers sur le web pour vous) pour construire votre corpus. Pour en savoir plus sur les concepts et les techniques d'extraction, consultez les tutoriels du _Programming Historian_ sur l'[extraction avec Beautiful Soup][50] et le [téléchargement automatique avec wget][51] (en anglais). -Plutôt que de construire un corpus un document à la fois, nous allons utiliser un corpus préparé de critiques de films positives et négatives, emprunté au [_Natural Language Processing Toolkit_](http://www.nltk.org/). Le corpus des critiques de films de la NLTK compte 2000 critiques, organisées par résultats positifs et négatifs ; aujourd'hui, nous allons aborder un petit sous-ensemble d'entre eux (200 positifs, 200 négatifs). +Plutôt que de construire un corpus un document à la fois, nous allons utiliser un corpus préparé de critiques de films positives et négatives, emprunté au [_Natural Language Processing Toolkit_](https://www.nltk.org/). Le corpus des critiques de films de la NLTK compte 2000 critiques, organisées par résultats positifs et négatifs ; aujourd'hui, nous allons aborder un petit sous-ensemble d'entre eux (200 positifs, 200 négatifs). -La construction de corpus est un sous-domaine à part entière. 
Voir "[_Representativeness in Corpus Design_](https://academic.oup.com/dsh/article-abstract/8/4/243/928942)", _Literary and Linguistic Computing_, 8 (4) : 243-257 et [_Developing Linguistic Corpora : a Guide to Good Practice_](http://www.amazon.com/Developing-Linguistic-Corpora-Practice-Guides/dp/1842172050/ref=sr_1_1_1) pour plus d'informations. +La construction de corpus est un sous-domaine à part entière. Voir "[_Representativeness in Corpus Design_](https://academic.oup.com/dsh/article-abstract/8/4/243/928942)", _Literary and Linguistic Computing_, 8 (4) : 243-257 et [_Developing Linguistic Corpora : a Guide to Good Practice_](https://www.amazon.com/Developing-Linguistic-Corpora-Practice-Guides/dp/1842172050/ref=sr_1_1_1) pour plus d'informations. @@ -280,6 +280,7 @@ Vous pouvez également opter pour l'échange de corpus de référence et de fich Dans _Keyword List_ (Liste des mots-clés), appuyez simplement sur "_Start_" (Démarrer) (sans rien taper dans le champ de recherche). Si vous venez de changer le corpus de référence et les fichiers cibles, il se peut qu'on vous demande de créer une nouvelle liste de mots avant qu'AntConc ne calcule les mots-clés. Nous voyons une liste de mots-clés qui ont des mots qui sont beaucoup plus "inhabituels" - plus statistiquement inattendus - dans le corpus que nous regardons en comparaison avec le corpus de référence. > Keyness (spécificité) : c'est la fréquence d'un mot dans le texte par rapport à sa fréquence dans un corpus de référence, "telle que la probabilité statistique calculée par une procédure appropriée soit inférieure ou égale à une valeur p indiquée par l'utilisateur" (tiré d'[ici][41]). Pour ceux et celles qui s'intéressent aux détails statistiques, voir la section sur la spécificité (Keyness) à la page 7 du [fichier read me](https://www.laurenceanthony.net/software/antconc/releases/AntConc335/help.pdf) de Laurence Anthony. 
+> Keyness (spécificité) : c'est la fréquence d'un mot dans le texte par rapport à sa fréquence dans un corpus de référence, "telle que la probabilité statistique calculée par une procédure appropriée soit inférieure ou égale à une valeur p indiquée par l'utilisateur" (tiré d'[ici][41]). Pour ceux et celles qui s'intéressent aux détails statistiques, voir la section sur la spécificité (Keyness) à la page 7 du [fichier read me](https://www.laurenceanthony.net/software/antconc/releases/AntConc335/help.pdf) de Laurence Anthony. @@ -327,29 +328,29 @@ En résumé, il vaut la peine de réfléchir : ## Ressources supplémentaires pour ce tutoriel #### En anglais [Une courte bibliographie sur la linguistique des corpus][43]. -[Une version plus détaillée de ce tutoriel, en supposant que vous n'avez aucune connaissance en informatique.](http://hfroehli.ch/workshops/getting-started-with-antconc/) +[Une version plus détaillée de ce tutoriel, en supposant que vous n'avez aucune connaissance en informatique.](https://hfroehli.ch/workshops/getting-started-with-antconc/) #### En français (notes de la version traduite) -[Page AntConc de EduTech Wiki de l'UNIGE](http://edutechwiki.unige.ch/fr/AntConc#) -[Page AntConc sur le site Exploration de corpus : outils et pratiques](http://explorationdecorpus.corpusecrits.huma-num.fr/antconc/) -[Tutoriel AntConc du CID-ENS Lyon](http://cid.ens-lyon.fr/ac_article.asp?fic=antconc.asp) +[Page AntConc de EduTech Wiki de l'UNIGE](https://edutechwiki.unige.ch/fr/AntConc#) +[Page AntConc sur le site Exploration de corpus : outils et pratiques](https://explorationdecorpus.corpusecrits.huma-num.fr/antconc/) +[Tutoriel AntConc du CID-ENS Lyon](https://cid.ens-lyon.fr/ac_article.asp?fic=antconc.asp) -En France, des outils similaires à AntConc ont été dévéloppés dans le cadre de la textométrie, de la lexicométrie, et de la logométrie, souvent par des historien(ne)s. 
On peut nommer notamment [Hyperbase](http://ancilla.unice.fr/), [Iramuteq](http://iramuteq.org/), [Lexico](http://www.lexi-co.com/) ou [TXM](http://textometrie.ens-lyon.fr/?lang=fr). Merci de consulter également: Bénédicte Pincemin, ["Sept logiciels de textométrie"](https://halshs.archives-ouvertes.fr/halshs-01843695/document), 2018. +En France, des outils similaires à AntConc ont été développés dans le cadre de la textométrie, de la lexicométrie, et de la logométrie, souvent par des historien(ne)s. On peut nommer notamment [Hyperbase](https://ancilla.unice.fr/), [Iramuteq](https://iramuteq.org/), [Lexico](https://www.lexi-co.com/) ou [TXM](https://textometrie.ens-lyon.fr/?lang=fr). Merci de consulter également: Bénédicte Pincemin, ["Sept logiciels de textométrie"](https://halshs.archives-ouvertes.fr/halshs-01843695/document), 2018. #### Bibliographie non-exhaustive -Ludovic Lebart et André Salem, [*Statistique textuelle*](http://lexicometrica.univ-paris3.fr/livre/st94/st94-tdm.html), 1994. +Ludovic Lebart et André Salem, [*Statistique textuelle*](https://lexicometrica.univ-paris3.fr/livre/st94/st94-tdm.html), 1994. Damon Mayaffre, ["L’entrelacement lexical des textes. Cooccurrences et lexicométrie"](https://hal.archives-ouvertes.fr/hal-00553808), _Journées de linguistique de corpus_, 2008, p. 91-102. [La cooccurrence, du fait statistique au fait textuel](https://journals.openedition.org/corpus/2183), _Corpus_, 11, 2012, numéro coordonné par Damon Mayaffre et Jean-Marie Viprey. 
-[41]: http://www.lexically.net/downloads/version6/HTML/index.html?keyness_definition.htm -[43]: http://hfroehlich.wordpress.com/2014/05/11/intro-bibliography-corpus-linguistics/ -[47]: http://hfroehli.ch/workshops/getting-started-with-antconc/ -[48]: http://voyant-tools.org/ +[41]: https://www.lexically.net/downloads/version6/HTML/index.html?keyness_definition.htm +[43]: https://hfroehlich.wordpress.com/2014/05/11/intro-bibliography-corpus-linguistics/ +[47]: https://hfroehli.ch/workshops/getting-started-with-antconc/ +[48]: https://voyant-tools.org/ [50]: /en/lessons/intro-to-beautiful-soup [51]: /en/lessons/automated-downloading-with-wget -[52]: http://www.antlab.sci.waseda.ac.jp/ -[53]: http://notepad-plus-plus.org/ -[54]: http://www.barebones.com/products/textwrangler/ -[55]: http://www.wordfrequency.info/free.asp -[56]: http://hfroehli.ch/2014/05/11/intro-bibliography-corpus-linguistics/ +[52]: https://www.antlab.sci.waseda.ac.jp/ +[53]: https://notepad-plus-plus.org/ +[54]: https://www.barebones.com/products/textwrangler/ +[55]: https://www.wordfrequency.info/free.asp +[56]: https://hfroehli.ch/2014/05/11/intro-bibliography-corpus-linguistics/ diff --git a/fr/lecons/analyse-de-documents-avec-tfidf.md b/fr/lecons/analyse-de-documents-avec-tfidf.md index 3f0f196297..67355b4fba 100644 --- a/fr/lecons/analyse-de-documents-avec-tfidf.md +++ b/fr/lecons/analyse-de-documents-avec-tfidf.md @@ -42,7 +42,7 @@ En étudiant **tf-idf**, vous découvrirez une méthode d'analyse textuelle que ## Connaissances préalables recommandées -- Être familiarisé(e) avec Python ou un langage de programmation similaire. Le code de cette leçon a été programmé en Python 3.6, mais vous pouvez exécuter **tf-idf** dans toutes les versions courantes de Python, en utilisant l'un des divers modules appropriés, ainsi que dans plusieurs autres langages de programmation. 
Le niveau de compétence en programmation requis est difficile à évaluer, mais vous devrez au moins être à l'aise avec les types de données et les opérations élémentaires. Pour tirer profit de cette leçon, il serait aussi souhaitable de suivre un cours comme celui proposé par Antoine Rozo sur [zestedesavoir.com](https://perma.cc/7WJ4-WD3P) ou d'avoir suivi certaines des [leçons d'introduction à la programmation en Python](/fr/lecons/introduction-et-installation) du _Programming Historian_. Si vous avez accès à une bibliothèque, n'hésitez pas à consulter le livre d'Émilien Schultz et de Matthias Bussonnier [*Python pour les sciences humaines et sociales*](http://www.worldcat.org/oclc/1232233436). +- Être familiarisé(e) avec Python ou un langage de programmation similaire. Le code de cette leçon a été programmé en Python 3.6, mais vous pouvez exécuter **tf-idf** dans toutes les versions courantes de Python, en utilisant l'un des divers modules appropriés, ainsi que dans plusieurs autres langages de programmation. Le niveau de compétence en programmation requis est difficile à évaluer, mais vous devrez au moins être à l'aise avec les types de données et les opérations élémentaires. Pour tirer profit de cette leçon, il serait aussi souhaitable de suivre un cours comme celui proposé par Antoine Rozo sur [zestedesavoir.com](https://perma.cc/7WJ4-WD3P) ou d'avoir suivi certaines des [leçons d'introduction à la programmation en Python](/fr/lecons/introduction-et-installation) du _Programming Historian_. Si vous avez accès à une bibliothèque, n'hésitez pas à consulter le livre d'Émilien Schultz et de Matthias Bussonnier [*Python pour les sciences humaines et sociales*](https://www.worldcat.org/oclc/1232233436). 
- À défaut de pouvoir suivre la recommandation précédente, vous pourriez [réviser les bases de Python](https://perma.cc/YDT4-9JJ6), dont les types de données élémentaires (chaînes de caractères, nombres entiers, nombres réels, tuples, listes et dictionnaires), les variables, les boucles, les classes d'objets et leurs instances. - La maîtrise des bases d'Excel ou d'un autre tableur pourrait être utile si vous souhaitez examiner les feuilles de calcul au format CSV liées à cette leçon de plus près. Vous pouvez aussi employer le module Pandas du langage Python pour lire ces fichiers CSV. @@ -438,19 +438,19 @@ Le résumé automatique est une autre manière d'explorer un corpus. Rada Mihalc - C.D. Manning, P. Raghavan et H. Schütze, _Introduction to Information Retrieval_. Cambridge: Cambridge University Press, 2008. -- Rada Mihalcea et Paul Tarau. « Textrank: Bringing order into text », _Proceedings of the 2004 Conference on Empirical Methods in Natural Language Processing_, Barcelone, Espagne, 2004. [http://www.aclweb.org/anthology/W04-3252](https://perma.cc/SMV5-7MYY) +- Rada Mihalcea et Paul Tarau. « Textrank: Bringing order into text », _Proceedings of the 2004 Conference on Empirical Methods in Natural Language Processing_, Barcelone, Espagne, 2004. [https://www.aclweb.org/anthology/W04-3252](https://perma.cc/SMV5-7MYY) - « Nellie Bly, Journalist, Dies of Pneumonia », [_The New York Times_, 28 janvier 1922](https://perma.cc/LA5B-65HL). - G. Salton et M.J. McGill, _Introduction to Modern Information Retrieval_. New York: McGraw-Hill, 1983. -- Ben Schmidt, « Do Digital Humanists Need to Understand Algorithms? », _Debates in the Digital Humanities 2016_. Édition en ligne. Minneapois: University of Minnesota Press. [http://dhdebates.gc.cuny.edu/debates/text/99](https://perma.cc/95WD-SDM5). +- Ben Schmidt, « Do Digital Humanists Need to Understand Algorithms? », _Debates in the Digital Humanities 2016_. Édition en ligne. Minneapolis: University of Minnesota Press. 
[https://dhdebates.gc.cuny.edu/debates/text/99](https://perma.cc/95WD-SDM5). -- Ben Schmidt, « Words Alone: Dismantling Topic Models in the Humanities », _Journal of Digital Humanities_. Vol. 2, No. 1 (2012): n.p. [http://journalofdigitalhumanities.org/2-1/words-alone-by-benjamin-m-schmidt/](https://perma.cc/LT4N-X4MZ). +- Ben Schmidt, « Words Alone: Dismantling Topic Models in the Humanities », _Journal of Digital Humanities_. Vol. 2, No. 1 (2012): n.p. [https://journalofdigitalhumanities.org/2-1/words-alone-by-benjamin-m-schmidt/](https://perma.cc/LT4N-X4MZ). - Karen Spärck Jones, « A Statistical Interpretation of Term Specificity and Its Application in Retrieval. », _Journal of Documentation_ 28, no. 1 (1972): 11–21. -- Jonathan Stray et Julian Burgess. « A Full-text Visualization of the Iraq War Logs », 10 décembre 2010 (dernière mise à jour en avril 2012), [http://jonathanstray.com/a-full-text-visualization-of-the-iraq-war-logs](https://perma.cc/QBZ4-DKTE). +- Jonathan Stray et Julian Burgess. « A Full-text Visualization of the Iraq War Logs », 10 décembre 2010 (dernière mise à jour en avril 2012), [https://jonathanstray.com/a-full-text-visualization-of-the-iraq-war-logs](https://perma.cc/QBZ4-DKTE). - Ted Underwood, « Identifying diction that characterizes an author or genre: why Dunning's may not be the best method », _The Stone and the Shell_, 9 novembre 2011, [https://tedunderwood.com/2011/11/09/identifying-the-terms-that-characterize-an-author-or-genre-why-dunnings-may-not-be-the-best-method/](https://perma.cc/SY25-UXK3). @@ -470,7 +470,7 @@ Si vous n'utilisez pas Anaconda, il faudra vous assurer de disposer des outils p 1. Une installation de Python 3 (préférablement Python 3.6 ou une version plus récente) 2. Idéalement, un environnement virtuel dans lequel installer et exécuter le Python -3. Le module Scikit-Learn et ses dépendances (voir [http://scikit-learn.org/stable/install.html](http://scikit-learn.org/stable/install.html)) +3. 
Le module Scikit-Learn et ses dépendances (voir [https://scikit-learn.org/stable/install.html](https://scikit-learn.org/stable/install.html)) 4. Jupyter Notebook et ses dépendances # Notes @@ -487,13 +487,13 @@ Si vous n'utilisez pas Anaconda, il faudra vous assurer de disposer des outils p [^6]: Scikit-Learn Developers, « TfidfVectorizer » (en anglais), consulté le 9 juin 2022, [https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html](https://perma.cc/JUN8-39Z6). -[^7]: Ben Schmidt, « Do Digital Humanists Need to Understand Algorithms? », _Debates in the Digital Humanities 2016_. Édition en ligne. Minneapolis: University of Minnesota Press. [http://dhdebates.gc.cuny.edu/debates/text/99](https://perma.cc/95WD-SDM5). +[^7]: Ben Schmidt, « Do Digital Humanists Need to Understand Algorithms? », _Debates in the Digital Humanities 2016_. Édition en ligne. Minneapolis: University of Minnesota Press. [https://dhdebates.gc.cuny.edu/debates/text/99](https://perma.cc/95WD-SDM5). [^8]: Guido van Rossum, Barry Warsaw et Nick Coghlan. « PEP 8 - Style Guide for Python Code », 5 juillet 2001 (mise à jour août 2013), [https://www.python.org/dev/peps/pep-0008/](https://perma.cc/P2ZM-VPQM). [^9]: « Ida M. Tarbell, 86, Dies in Bridgeport », [_The New York Times_, 17 janvier 1944](https://perma.cc/NBV6-S2XM); « W. E. B. DuBois Dies in Ghana; Negro Leader and Author, 95 », [_The New York Times_, 28 août 1963](https://perma.cc/W5NX-XZRV); Alden Whitman, « Upton Sinclair, Author, Dead; Crusader for Social Justice, 90 », [_The New York Times_, 26 novembre 1968](https://perma.cc/E4N7-2KD6); « Willa Cather Dies; Noted Novelist, 70 », [_The New York Times_, 25 avril 1947](https://perma.cc/2L7H-WGKN). -[^10]: Jonathan Stray et Julian Burgess. 
« A Full-text Visualization of the Iraq War Logs », 10 décembre 2010 (dernière mise à jour en avril 2012), [http://jonathanstray.com/a-full-text-visualization-of-the-iraq-war-logs](https://perma.cc/QBZ4-DKTE). +[^10]: Jonathan Stray et Julian Burgess. « A Full-text Visualization of the Iraq War Logs », 10 décembre 2010 (dernière mise à jour en avril 2012), [https://jonathanstray.com/a-full-text-visualization-of-the-iraq-war-logs](https://perma.cc/QBZ4-DKTE). [^11]: C.D. Manning, P. Raghavan et H. Schütze, _Introduction to Information Retrieval_ (Cambridge: Cambridge University Press, 2008), 118-120. @@ -503,6 +503,6 @@ Si vous n'utilisez pas Anaconda, il faudra vous assurer de disposer des outils p [^14]: Il n'est habituellement pas recommandé d'appliquer **tf-idf** comme prétraitement avant de produire un modèle thématique. Voir : [https://datascience.stackexchange.com/questions/21950/why-we-should-not-feed-lda-with-tfidf](https://perma.cc/N5W9-TYX7). -[^15]: Ben Schmidt, « Words Alone: Dismantling Topic Models in the Humanities », _Journal of Digital Humanities_. Vol. 2, No. 1 (2012): n.p., [http://journalofdigitalhumanities.org/2-1/words-alone-by-benjamin-m-schmidt/](https://perma.cc/LT4N-X4MZ). +[^15]: Ben Schmidt, « Words Alone: Dismantling Topic Models in the Humanities », _Journal of Digital Humanities_. Vol. 2, No. 1 (2012): n.p., [https://journalofdigitalhumanities.org/2-1/words-alone-by-benjamin-m-schmidt/](https://perma.cc/LT4N-X4MZ). -[^16]: Rada Mihalcea et Paul Tarau. « Textrank: Bringing order into text », _Proceedings of the 2004 Conference on Empirical Methods in Natural Language Processing_, Barcelone, Espagne, 2004, [http://www.aclweb.org/anthology/W04-3252](https://perma.cc/SMV5-7MYY). +[^16]: Rada Mihalcea et Paul Tarau. 
« Textrank: Bringing order into text », _Proceedings of the 2004 Conference on Empirical Methods in Natural Language Processing_, Barcelone, Espagne, 2004, [https://www.aclweb.org/anthology/W04-3252](https://perma.cc/SMV5-7MYY). diff --git a/fr/lecons/analyse-donnees-tabulaires-R.md b/fr/lecons/analyse-donnees-tabulaires-R.md index 4a59c7fb0d..16eb0a0a05 100644 --- a/fr/lecons/analyse-donnees-tabulaires-R.md +++ b/fr/lecons/analyse-donnees-tabulaires-R.md @@ -578,7 +578,7 @@ Pour en savoir plus sur R, consultez le [manuel de R](https://cran.r-project.org Vous trouverez en ligne de nombreux tutoriels sur R. Nous vous conseillons : -* [R: A self-learn tutorial](http://web.archive.org/web/20191015004305/https://www.nceas.ucsb.edu/files/scicomp/Dloads/RProgramming/BestFirstRTutorial.pdf) (en anglais). Ce tutoriel passe en revue plusieurs fonctions et propose des exercices pour s’entrainer. +* [R: A self-learn tutorial](https://web.archive.org/web/20191015004305/https://www.nceas.ucsb.edu/files/scicomp/Dloads/RProgramming/BestFirstRTutorial.pdf) (en anglais). Ce tutoriel passe en revue plusieurs fonctions et propose des exercices pour s’entrainer. * [Introduction à R](https://www.datacamp.com/courses/introduction-a-r). Cours proposé par le site Datacamp qui vous permet de vous entrainer en ligne (gratuit, mais il faut s’inscrire pour y accéder). Les exercices interactifs permettent d’identifier vos erreurs et d’apprendre à écrire du code plus efficacement. * [R pour les débutants](https://r.developpez.com/tutoriels/r/debutants/#Lno-I). Écrit par Emmanuel Paradis, il s’agit d’un des premiers manuels francophones d’introduction à R. * L’ouvrage [Computational Historical Thinking](https://dh-r.lincolnmullen.com/). Écrit par Lincoln A. Mullen, c’est une ressource précieuse pour les historiennes et historiens qui souhaitent utiliser R pour faire leurs travaux de recherche. 
diff --git a/fr/lecons/analyse-reseau-python.md b/fr/lecons/analyse-reseau-python.md index f64922e95b..0dcfccd9b2 100644 --- a/fr/lecons/analyse-reseau-python.md +++ b/fr/lecons/analyse-reseau-python.md @@ -73,7 +73,7 @@ Cette leçon peut vous aider à répondre à des questions telles que : Bien avant qu'il n'y ait des ami·es sur Facebook, il y avait la «  Société des Amis  », connue sous le nom de quakers. Fondés en Angleterre au milieu du XVIIe siècle, les quakers étaient des chrétien·nes protestantes qui s'opposaient à l'Église officielle d'Angleterre et prônaient une large tolérance religieuse, préférant la « lumière intérieure » et la conscience des chrétien·nes à l'orthodoxie imposée par l'État. Le nombre de quakers a augmenté rapidement entre le milieu et la fin du XVIIe siècle et leurs membres se sont répandus dans les iles britanniques, en Europe et dans les colonies du Nouveau Monde - en particulier en Pennsylvanie, colonie fondée par le leader quaker William Penn et où vivent les quatre auteurs et autrices de cette leçon. -Les universitaires ayant depuis longtemps lié la croissance des effectifs et la pérennité des quakers à l'efficacité de leurs réseaux, les données utilisées dans cette leçon sont une liste de noms et de relations parmi les premiers quakers du XVIIe siècle. Ce jeu de données est issu du *[Oxford Dictionary of National Biography](http://www.oxforddnb.com)* et du projet *[Six Degrees of Francis Bacon](https://perma.cc/Q63S-UZTU)* qui reconstruit les réseaux sociaux du début de la Grande-Bretagne moderne (1500-1700). +Les universitaires ayant depuis longtemps lié la croissance des effectifs et la pérennité des quakers à l'efficacité de leurs réseaux, les données utilisées dans cette leçon sont une liste de noms et de relations parmi les premiers quakers du XVIIe siècle. 
Ce jeu de données est issu du *[Oxford Dictionary of National Biography](https://www.oxforddnb.com)* et du projet *[Six Degrees of Francis Bacon](https://perma.cc/Q63S-UZTU)* qui reconstruit les réseaux sociaux du début de la Grande-Bretagne moderne (1500-1700). ## Préparation des données et installation de NetworkX @@ -535,7 +535,7 @@ Travailler avec NetworkX permet d'en apprendre beaucoup sur les classes de modul ## Exporter les données -NetworkX prend en charge un très grand nombre de formats de fichiers pour [exporter les données](https://perma.cc/Z7H3-UMKD). Si vous voulez exporter une liste de liens en format texte à charger dans Palladio, il existe un [outil adapté](https://perma.cc/DWK2-J389). Fréquemment, dans le projet *Six Degrees of Francis Bacon*, nous exportons les données NetworkX en [format JSON d3](https://perma.cc/2STT-F466) pour les visualiser dans un navigateur. Vous pouvez aussi [exporter](https://perma.cc/7UCP-YBX4) votre graphe en tant que [tableau de données Pandas](http://pandas.pydata.org/) si vous souhaitez effectuer des manipulations statistiques plus avancées. Il existe de nombreuses options et, si vous avez ajouté toutes vos mesures dans votre objet `Graph` en tant qu’attributs, toutes vos données seront exportées simultanément. +NetworkX prend en charge un très grand nombre de formats de fichiers pour [exporter les données](https://perma.cc/Z7H3-UMKD). Si vous voulez exporter une liste de liens en format texte à charger dans Palladio, il existe un [outil adapté](https://perma.cc/DWK2-J389). Fréquemment, dans le projet *Six Degrees of Francis Bacon*, nous exportons les données NetworkX en [format JSON d3](https://perma.cc/2STT-F466) pour les visualiser dans un navigateur. Vous pouvez aussi [exporter](https://perma.cc/7UCP-YBX4) votre graphe en tant que [tableau de données Pandas](https://pandas.pydata.org/) si vous souhaitez effectuer des manipulations statistiques plus avancées. 
Il existe de nombreuses options et, si vous avez ajouté toutes vos mesures dans votre objet `Graph` en tant qu’attributs, toutes vos données seront exportées simultanément. La plupart des options d’exportation fonctionnent à peu près de la même manière. Dans cette leçon, vous apprendrez comment exporter vos données au format GEXF de Gephi. Une fois le fichier exporté, vous pouvez le charger [directement dans Gephi](https://gephi.org/quickstart/) pour le visualiser. diff --git a/fr/lecons/calibration-radiocarbone-avec-r.md b/fr/lecons/calibration-radiocarbone-avec-r.md index 3b5f9d230b..0a0ab87f3f 100644 --- a/fr/lecons/calibration-radiocarbone-avec-r.md +++ b/fr/lecons/calibration-radiocarbone-avec-r.md @@ -127,7 +127,7 @@ On comprend ainsi que ces particularités, si elles sont mal comprises, peuvent ## Applications avec R -De nombreux outils sont aujourd'hui disponibles pour calibrer des âges radiocarbone. [OxCal](https://c14.arch.ox.ac.uk/oxcal/), [CALIB](http://calib.org) et [ChronoModel](https://chronomodel.com) offrent cette possibilité, mais sont plutôt destinés à traiter des problèmes de [modélisation bayésienne](https://fr.wikipedia.org/wiki/Statistique_bay%C3%A9sienne) de séquences chronologiques. Le langage R offre une alternative intéressante. Distribué sous licence libre, il favorise la reproductibilité et permet d'intégrer le traitement d'âges radiocarbone à des études plus larges (analyse spatiale etc.). +De nombreux outils sont aujourd'hui disponibles pour calibrer des âges radiocarbone. [OxCal](https://c14.arch.ox.ac.uk/oxcal/), [CALIB](https://calib.org) et [ChronoModel](https://chronomodel.com) offrent cette possibilité, mais sont plutôt destinés à traiter des problèmes de [modélisation bayésienne](https://fr.wikipedia.org/wiki/Statistique_bay%C3%A9sienne) de séquences chronologiques. Le langage R offre une alternative intéressante. 
Distribué sous licence libre, il favorise la reproductibilité et permet d'intégrer le traitement d'âges radiocarbone à des études plus larges (analyse spatiale etc.). Plusieurs packages R permettent de réaliser des calibrations d'âges radiocarbone ([Bchron](https://cran.r-project.org/package=Bchron), [oxcAAR](https://cran.r-project.org/package=oxcAAR)...) et sont souvent orientés vers la modélisation (construction de chronologies, modèles âges-profondeur, etc.). La solution retenue ici est [rcarbon](https://cran.r-project.org/package=rcarbon) (Bevan et Crema 2020). Ce package permet de calibrer simplement et d'analyser des âges radiocarbone. @@ -501,7 +501,7 @@ Dean, J. S. "Independent Dating in Archaeological Analysis". In *Advances in Arc Hyndman, R. J. 1996. "Computing and Graphing Highest Density Regions." *The American Statistician* 50 (2): 120-26. https://doi.org/10.2307/2684423. -Libby, W. F. "Radiocarbon Dating". *Nobel Lecture*. Stockholm, 12 décembre 1960. http://www.nobelprize.org/nobel_prizes/chemistry/laureates/1960/libby-lecture.html. +Libby, W. F. "Radiocarbon Dating". *Nobel Lecture*. Stockholm, 12 décembre 1960. https://www.nobelprize.org/nobel_prizes/chemistry/laureates/1960/libby-lecture.html. Millard, A. R. 2014. "Conventions for Reporting Radiocarbon Determinations." *Radiocarbon* 56 (2): 555-59. https://doi.org/10.2458/56.17455. diff --git a/fr/lecons/comprendre-les-expressions-regulieres.md b/fr/lecons/comprendre-les-expressions-regulieres.md index fb20b2367a..ea6f02fdd6 100644 --- a/fr/lecons/comprendre-les-expressions-regulieres.md +++ b/fr/lecons/comprendre-les-expressions-regulieres.md @@ -78,7 +78,7 @@ n'importe quelle année entre 1850 et 1899. Pour cet exercice, nous utilisons LibreOffice Writer et LibreOffice Calc, des logiciels de bureautique libres, utilisés respectivement pour le traitement de texte et les feuilles de calcul. Les paquets d'installation pour Linux, Mac ou -Windows peuvent être téléchargés depuis . 
+Windows peuvent être téléchargés depuis . D'autre logiciels de traitement de texte et même des langages de programmation ont des fonctionalités similaires de recherche de motifs. Comme sa distribution est libre et comme sa syntaxe pour les expressions régulières est proche de ce @@ -110,7 +110,7 @@ différents types de ressources textuelles utilisées pour toutes sortes de recherche en histoire. Pour notre exercice, nous allons utiliser un rapport de cinq pages contenant des statistiques mensuelles sur la morbidité et la mortalité dans les États et les villes des États-Unis, publié en février 1908. -Il est disponible ici : . +Il est disponible ici : . Prenez un moment pour parcourir brièvement les pages du document grâce au [lien pour lire en ligne][], afin de vous familiariser avec lui. Ce document @@ -725,10 +725,10 @@ Pittsburgh, a quant à lui de bons cas de figures sur la manière de travailler [expressions régulières et des outils pour XML][], dans le but de baliser des fichiers de texte brut pour en faire des fichiers XML. 
- [lien pour lire en ligne]: http://archive.org/stream/jstor-4560629/4560629#page/n0/mode/2up - [Texte intégral]: http://archive.org/stream/jstor-4560629/4560629_djvu.txt + [lien pour lire en ligne]: https://archive.org/stream/jstor-4560629/4560629#page/n0/mode/2up + [Texte intégral]: https://archive.org/stream/jstor-4560629/4560629_djvu.txt [Liste des expressions régulières]: https://help.libreoffice.org/6.3/fr/text/shared/01/02100001.html?DbPAR=SHARED#bm_id3146765 [expressions régulières]: https://fr.wikipedia.org/wiki/Expression_r%C3%A9guli%C3%A8re - [Rubular]: http://rubular.com/ - [expressions régulières et des outils pour XML]: http://dh.obdurodon.org/regex.html + [Rubular]: https://rubular.com/ + [expressions régulières et des outils pour XML]: https://dh.obdurodon.org/regex.html diff --git a/fr/lecons/comprendre-les-pages-web.md b/fr/lecons/comprendre-les-pages-web.md index d4038e5a62..d8549641c4 100644 --- a/fr/lecons/comprendre-les-pages-web.md +++ b/fr/lecons/comprendre-les-pages-web.md @@ -137,5 +137,5 @@ et l'éditeur de texte qui ne l'interprète pas. - [tutoriels W3 Schools HTML][] - [tutoriels W3 Schools HTML5][] - [tutoriels W3 Schools HTML]: http://www.w3schools.com/html/default.asp - [tutoriels W3 Schools HTML5]: http://www.w3schools.com/html/html5_intro.asp + [tutoriels W3 Schools HTML]: https://www.w3schools.com/html/default.asp + [tutoriels W3 Schools HTML5]: https://www.w3schools.com/html/html5_intro.asp diff --git a/fr/lecons/compter-exploiter-donnees-unix.md b/fr/lecons/compter-exploiter-donnees-unix.md index 7501b7c723..1cc8eb0295 100644 --- a/fr/lecons/compter-exploiter-donnees-unix.md +++ b/fr/lecons/compter-exploiter-donnees-unix.md @@ -52,7 +52,7 @@ Cette leçon a été écrite en 2014, pour Git Bash 1.9.0 et le système d' Vous trouverez les fichiers utilisés dans cette leçon sur [Figshare](https://doi.org/10.6084/m9.figshare.1172094). 
Les données servant de support à la leçon comprennent les métadonnées d'articles de revues se trouvant dans la catégorie « Histoire » de la base de données ESTAR de la British Library. Ces données sont diffusées sous la licence libre de droits CC0. -Téléchargez et enregistrez l'archive ZIP sur votre ordinateur, puis décompressez-la. Si vous n'avez pas de logiciel par défaut prenant en charge le format ZIP, nous vous conseillons d'utiliser [7-zip](http://www.7-zip.org/). Sur Windows, nous recommandons de décompresser le dossier fourni à l'intérieur de votre répertoire Utilisateur, de sorte que vos fichiers se trouvent dans `c/Users/NOM_UTILISATEUR/proghist/`. Bien sûr, n'importe quel emplacement fera tout aussi bien l'affaire, cependant vous devrez probablement adapter vos commandes par rapport à celles présentées tout au long de la leçon si vous choisissez un autre répertoire. De même, nous conseillons de décompresser les fichiers dans le répertoire Utilisateur, de sorte à les retrouver dans `/user/NOM_UTILISATEUR/proghist/` sur macOS ou `/home/NOM_UTILISATEUR/proghist/` sur Linux. Dans tous les cas, l'objectif est qu'en ouvrant une nouvelle fenêtre terminal, nous n'ayez qu'à taper `cd proghist` pour vous déplacer directement dans le bon dossier. +Téléchargez et enregistrez l'archive ZIP sur votre ordinateur, puis décompressez-la. Si vous n'avez pas de logiciel par défaut prenant en charge le format ZIP, nous vous conseillons d'utiliser [7-zip](https://7-zip.org/). Sur Windows, nous recommandons de décompresser le dossier fourni à l'intérieur de votre répertoire Utilisateur, de sorte que vos fichiers se trouvent dans `c/Users/NOM_UTILISATEUR/proghist/`. Bien sûr, n'importe quel emplacement fera tout aussi bien l'affaire, cependant vous devrez probablement adapter vos commandes par rapport à celles présentées tout au long de la leçon si vous choisissez un autre répertoire. 
De même, nous conseillons de décompresser les fichiers dans le répertoire Utilisateur, de sorte à les retrouver dans `/user/NOM_UTILISATEUR/proghist/` sur macOS ou `/home/NOM_UTILISATEUR/proghist/` sur Linux. Dans tous les cas, l'objectif est qu'en ouvrant une nouvelle fenêtre terminal, vous n'ayez qu'à taper `cd proghist` pour vous déplacer directement dans le bon dossier.
    Remarque (NDLT) : les noms de l'archive ZIP peuvent varier en fonction de la méthode de décompression que vous avez utilisée, et le nom du dossier de travail peut ressembler plutôt à ProgHistcountingminingunixdata Dans ce cas, vous avez deux possibilités : soit renommer le dossier proghist et taper cd proghist dans le terminal ; soit taper cd ProgHistcoutingminingunixdata (ou le nom de votre archive ZIP) dans le terminal pour vous trouver dans le bon répertoire, à chaque fois que cette leçon vous demandera de taper cd proghist. Notez également que le terminal tient compte des majuscules et des minuscules. diff --git a/fr/lecons/concevoir-base-donnees-nodegoat.md b/fr/lecons/concevoir-base-donnees-nodegoat.md index 453c6090ba..1a3755afa6 100644 --- a/fr/lecons/concevoir-base-donnees-nodegoat.md +++ b/fr/lecons/concevoir-base-donnees-nodegoat.md @@ -161,7 +161,7 @@ Ceci correspond maintenant à ce que l’on appelle généralement un «&nb Nous avons maintenant organisé ces informations en trois tables qui représentent notre jeu de données. Afin de pouvoir naviguer entre ces tables, en suivant les relations que nous avons établies dans le schéma du modèle de données, il faut maintenant les relier. Pour ce faire, on définit les possibilités et les restrictions qui se manifestent dans leurs relations - nous appelons cela la [cardinalité](https://perma.cc/M4M5-DCES)[^6]. -Quand on construit une base de données relationnelle, on doit toujours s’interroger sur les relations qui existent entre les tables : chaque élément d’une table se rapporte-t-il exclusivement à un élément individuel d’une autre table, ou entretient-il des relations multiples et croisées ? Dans le cas exemple des relations entre auteurs et ouvrages : chaque ouvrage a-t-il seulement un auteur (cardinalité 1,1) ? Ou peut-il en avoir deux ou plus, comme *The Intellectuals on the Road to Class Power* (cardinalité 1,N) ? 
À l’inverse, chaque auteur d’un livre avec plusieurs auteurs ne pourrait-il pas être l’auteur de plusieurs ouvrages (cardinalité N,N) ? Ces questions se posent certainement au moment de constituer notre base de données[^7]. Les réponses dépendront de notre jeu de données. Dans [la partie suivante](#créer-et-paramétrer-une-base-dans-nodegoat), nous verrons comment mettre tout ceci en pratique. +Quand on construit une base de données relationnelle, on doit toujours s’interroger sur les relations qui existent entre les tables : chaque élément d’une table se rapporte-t-il exclusivement à un élément individuel d’une autre table, ou entretient-il des relations multiples et croisées ? Dans le cas exemple des relations entre auteurs et ouvrages : chaque ouvrage a-t-il seulement un auteur (cardinalité 1,1) ? Ou peut-il en avoir deux ou plus, comme *The Intellectuals on the Road to Class Power* (cardinalité 1,N) ? À l’inverse, chaque auteur d’un livre avec plusieurs auteurs ne pourrait-il pas être l’auteur de plusieurs ouvrages (cardinalité N,N) ? Ces questions se posent certainement au moment de constituer notre base de données[^7]. Les réponses dépendront de notre jeu de données. Dans [la partie suivante](#paramétrer-la-base-de-données), nous verrons comment mettre tout ceci en pratique. ## Construire une base de données avec nodegoat @@ -172,7 +172,7 @@ nodegoat est un logiciel en ligne qui permet aux utilisateurs et aux utilisatric nodegoat est un logiciel libre et il est aussi possible de [l’installer localement](https://perma.cc/5PLH-YJQG), mais cela exige néanmoins des compétences informatiques poussées. Dans les deux cas (installation locale ou application en ligne), l’utilisation de fonctionnalités plus avancées, notamment pour travailler sur plusieurs projets ou de manière collaborative, requiert de souscrire à l’un des abonnements payants proposés par la société qui le développe. 
-Les instructions qui suivent visent à guider les lectrices et les lecteurs dans la création d’une base de données sur nodegoat, selon les principes expliqués [dans la première partie de la leçon](#la-logique-de-notre-recherche--entre-le-modèle-de-données-et-la-base-de-données). L’approche que prend le logiciel ressemble fortement à celle qu’on a décrite plus haut pour conceptualiser notre recherche : essentiellement, elle traite les personnes, les groupes et les choses comme des objets, connectés par des relations diverses.[^8] nodegoat offre aussi des outils d’analyse relationnelle et de production de visualisations telles que cartes ou réseaux. Surtout, le logiciel accepte de consigner des informations incertaines ou ambigües, courantes en sciences humaines. Par exemple, il peut suggérer d’utiliser un intervalle de temps si on ne dispose pas de dates exactes, ou de dessiner un polygone si on ne dispose pas de coordonnées géographiques exactes. +Les instructions qui suivent visent à guider les lectrices et les lecteurs dans la création d’une base de données sur nodegoat, selon les principes expliqués [dans la première partie de la leçon](#de-la-saisie-à-la-modélisation-des-données). L’approche que prend le logiciel ressemble fortement à celle qu’on a décrite plus haut pour conceptualiser notre recherche : essentiellement, elle traite les personnes, les groupes et les choses comme des objets, connectés par des relations diverses.[^8] nodegoat offre aussi des outils d’analyse relationnelle et de production de visualisations telles que cartes ou réseaux. Surtout, le logiciel accepte de consigner des informations incertaines ou ambigües, courantes en sciences humaines. Par exemple, il peut suggérer d’utiliser un intervalle de temps si on ne dispose pas de dates exactes, ou de dessiner un polygone si on ne dispose pas de coordonnées géographiques exactes. 
Certes, l’objet de cette leçon n’est pas l’utilisation de nodegoat en soi - vous pouvez tout à fait utiliser [d’autres logiciels de gestion de bases de données spécifiquement conçus pour la recherche en sciences humaines et sociales](/fr/lecons/introduction-a-heurist). Malgré tout, en combinant tous ces outils dans un même environnement, nodegoat facilite considérablement l’exercice de concevoir sa recherche en données. Son avantage majeur pour nous, dans cette leçon, est qu’il facilite particulièrement la définition et l’exécution du modèle que nous avons décrit de façon abstraite ci-dessus. @@ -282,6 +282,6 @@ Pour approfondir votre utilisation de nodegoat et explorer tout son potentiel, n [^6]: Voir cette notice de Wikipédia pour plus d’éléments sur la notion de « cardinalité » : « Modèle relationnel », [https://fr.wikipedia.org/wiki/Mod%C3%A8le_relationnel#Relation_1:N](https://perma.cc/KSA4-Y4WL). Voir aussi Gardarin, ouvrage cité, 412-413. -[^7]: Bree, P. van, Kessels, G., (2013). nodegoat: a web-based data management, network analysis & visualisation environment, http://nodegoat.net from [LAB1100](https://perma.cc/LAT9-M6UW) +[^7]: Bree, P. van, Kessels, G., (2013). nodegoat: a web-based data management, network analysis & visualisation environment, https://nodegoat.net from [LAB1100](https://perma.cc/LAT9-M6UW) [^8]: Les créateurs de nodegoat décrivent l’approche relationnelle du logiciel comme « orienté-objet ». Ce concept étant le plus souvent utilisé pour décrire un paradigme de programmation informatique, nous préférons éviter l’emploi de ce terme afin d’éviter des confusions. 
diff --git a/fr/lecons/debuter-avec-markdown.md b/fr/lecons/debuter-avec-markdown.md index f3a7d18384..7b6fb0e46f 100644 --- a/fr/lecons/debuter-avec-markdown.md +++ b/fr/lecons/debuter-avec-markdown.md @@ -42,11 +42,11 @@ Cette leçon sert d’initiation à Markdown, qui est une syntaxe en texte brut Puisque les tutoriels de ce site sont soumis sous forme de fichiers Markdown, je mobilise des exemples maison chaque fois que cela est possible. J'espère que ce guide vous sera particulièrement utile si vous envisagez de rédiger un tutoriel en tant qu'auteur(e) pour le *Programming Historian*, même s'il reste d'une portée plus générale. ## Qu'est-ce que le Markdown? -Développé en 2004 par [John Gruber](http://daringfireball.net/projects/markdown/ "Markdown on Daring Fireball"), Markdown est à la fois un langage de balisage de fichiers textes et une fonctionnalité du langage [Perl](https://fr.wikipedia.org/wiki/Perl_(langage)) permettant de convertir des fichiers Markdown en HTML. Notre leçon traite davantage du premier aspect, puisque nous apprendrons à utiliser la syntaxe Markdown pour préparer des fichiers. +Développé en 2004 par [John Gruber](https://daringfireball.net/projects/markdown/ "Markdown on Daring Fireball"), Markdown est à la fois un langage de balisage de fichiers textes et une fonctionnalité du langage [Perl](https://fr.wikipedia.org/wiki/Perl_(langage)) permettant de convertir des fichiers Markdown en HTML. Notre leçon traite davantage du premier aspect, puisque nous apprendrons à utiliser la syntaxe Markdown pour préparer des fichiers. Les fichiers texte brut présentent plusieurs avantages comparés aux autres formats. Non seulement ils sont compatibles avec tout type d'appareil et de système d'exploitation, mais ils s'avèrent aussi plus pérennes. Si jamais vous avez tenté d'ouvrir un document sauvegardé dans une version antérieure d'un logiciel de traitement de texte, vous pouvez comprendre facilement les problèmes de compatibilité qui sont en jeu. 
-L'utilisation de la syntaxe Markdown vous permettra de produire des fichiers à la fois lisibles en texte brut et prêts à recevoir davantage de traitement sur une autre plateforme. Plusieurs systèmes de gestion de blogs, des générateurs de sites web statiques ou encore des plateformes comme [GitHub](http://github.com "GitHub") prennent en charge des fichiers Markdown pour les convertir en [HTML](https://fr.wikipedia.org/wiki/Hypertext_Markup_Language) et les publier sur le web. De plus, des outils comme Pandoc peuvent convertir des fichiers depuis et vers Markdown. Pour apprendre plus sur Pandoc, vous pouvez faire un tour sur [cette leçon](/fr/lecons/redaction-durable-avec-pandoc-et-markdown) de Dennis Tenen et Grant Wythoff. +L'utilisation de la syntaxe Markdown vous permettra de produire des fichiers à la fois lisibles en texte brut et prêts à recevoir davantage de traitement sur une autre plateforme. Plusieurs systèmes de gestion de blogs, des générateurs de sites web statiques ou encore des plateformes comme [GitHub](https://github.com "GitHub") prennent en charge des fichiers Markdown pour les convertir en [HTML](https://fr.wikipedia.org/wiki/Hypertext_Markup_Language) et les publier sur le web. De plus, des outils comme Pandoc peuvent convertir des fichiers depuis et vers Markdown. Pour apprendre plus sur Pandoc, vous pouvez faire un tour sur [cette leçon](/fr/lecons/redaction-durable-avec-pandoc-et-markdown) de Dennis Tenen et Grant Wythoff. ## La syntaxe Markdown Les fichiers Markdown portent l'extension `.md`. Il est possible de les ouvrir avec un éditeur de texte comme TextEdit, Notepad++, Sublime Text ou Vim. Plusieurs sites web et des plateformes de publication proposent des éditeurs de texte en ligne et/ou des extensions pour insérer du texte avec la syntaxe Markdown. 
@@ -239,13 +239,13 @@ Les liens de références sont très pratiques pour créer des notes de bas de p Puis vous pouvez ajouter l'URL dans une autre partie du document: -`[1]: http://programminghistorian.org/ "The Programming Historian"` +`[1]: https://programminghistorian.org/ "The Programming Historian"` **Ceci s'affiche comme suit:** Le site web du [Programming Historian][1] fournit un exemple. -[1]: http://programminghistorian.org/ "The Programming Historian" +[1]: https://programminghistorian.org/ "The Programming Historian" ### Images @@ -312,7 +312,7 @@ Pour régler l'alignement de chaque colonne, les deux points `:` peuvent être a ## Les limites de Markdown -Même si Markdown devient de plus en plus populaire, notamment pour formatter des documents exposés sur le web, beaucoup de gens et d'éditeurs sollicitent des documents traditionnels en Word, PDF et d'autres formats de fichiers. Certains outils de conversion exécutables en ligne de commande, comme [Pandoc](https://pandoc.org/), offrent une solution, sans toutefois offrir toutes les fonctionnalités des logiciels de traitement de texte, notamment le versionnage. Pour en savoir plus sur Pandoc, merci de consulter la leçon du *Programming Historian* intitulée ["Sustainable authorship in plain text using Pandoc and Markdown"](/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown) (en anglais). +Même si Markdown devient de plus en plus populaire, notamment pour formatter des documents exposés sur le web, beaucoup de gens et d'éditeurs sollicitent des documents traditionnels en Word, PDF et d'autres formats de fichiers. Certains outils de conversion exécutables en ligne de commande, comme [Pandoc](https://pandoc.org/), offrent une solution, sans toutefois offrir toutes les fonctionnalités des logiciels de traitement de texte, notamment le versionnage. 
Pour en savoir plus sur Pandoc, merci de consulter la leçon du *Programming Historian* intitulée ["Sustainable authorship in plain text using Pandoc and Markdown"](/en/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown) (en anglais). ## Conclusion diff --git a/fr/lecons/decomptes-de-frequences-de-mots-en-python.md b/fr/lecons/decomptes-de-frequences-de-mots-en-python.md index 4c5294782c..ab1622d847 100644 --- a/fr/lecons/decomptes-de-frequences-de-mots-en-python.md +++ b/fr/lecons/decomptes-de-frequences-de-mots-en-python.md @@ -362,7 +362,7 @@ Lutz, _Learning Python_ Pilgrim, _Diving into Python_ -- Ch. 7: [Regular Expressions](https://web.archive.org/web/20180416143856/http://www.diveintopython.net/regular_expressions/index.html) +- Ch. 7: [Regular Expressions](https://web.archive.org/web/20180416143856/https://www.diveintopython.net/regular_expressions/index.html) ### Synchronisation du code diff --git a/fr/lecons/detecter-la-reutilisation-de-texte-avec-passim.md b/fr/lecons/detecter-la-reutilisation-de-texte-avec-passim.md index 463621df19..482921825b 100644 --- a/fr/lecons/detecter-la-reutilisation-de-texte-avec-passim.md +++ b/fr/lecons/detecter-la-reutilisation-de-texte-avec-passim.md @@ -48,7 +48,7 @@ La liste ci-dessous présente une partie des outils qui permettent de détecter - [Basic Local Alignment Search Tool (BLAST)](https://blast.ncbi.nlm.nih.gov/Blast.cgi) - [Tesserae](https://github.com/tesserae/tesserae) (PHP, Perl) - [TextPAIR (Pairwise Alignment for Intertextual Relations)](https://github.com/ARTFL-Project/text-pair) -- [Passim](https://github.com/dasmiq/passim) (Scala) développé par [David Smith](http://www.ccs.neu.edu/home/dasmith/ +- [Passim](https://github.com/dasmiq/passim) (Scala) développé par [David Smith](https://www.ccs.neu.edu/home/dasmith/ ) (Université Northeastern) Pour ce tutoriel, nous avons choisi de nous concentrer sur la bibliothèque Passim et cela pour trois raisons principales. 
Premièrement, car celle-ci peut être adaptée à une grande variété d'utilisation, puisqu'elle fonctionne autant sur une petite collection de texte que sur un corpus de grande échelle. Deuxièmement, parce que, bien que la documentation au sujet de Passim soit exhaustive, du fait que ses utilisateurs soient relativement avancés, un guide « pas-à-pas » de la détection de la réutilisation de texte avec Passim plus axé sur l'utilisateur serait bénéfique pour l'ensemble de la communauté. Enfin, les exemples suivants illustrent la variété de scénarios dans lesquels la réutilisation de texte est une méthodologie utile : @@ -212,7 +212,7 @@ export PATH="/home/simon/Passim/bin:$PATH" ### Installation de Spark -1. Accédez à la [section de téléchargement](http://spark.apache.org/downloads) du site Web de Spark et sélectionnez la version publiée de Spark 3.x.x (où *x.x* indique les éditions de la version 3.) ainsi que le type de paquetage « Pre-built for Apache Hadoop 2.7 » dans les menus déroulants. +1. Accédez à la [section de téléchargement](https://spark.apache.org/downloads) du site Web de Spark et sélectionnez la version publiée de Spark 3.x.x (où *x.x* indique les éditions de la version 3.) ainsi que le type de paquetage « Pre-built for Apache Hadoop 2.7 » dans les menus déroulants. 2. Extrayez les données binaires compressées dans le répertoire de votre choix (par exemple `/Applications`) : ```bash @@ -413,7 +413,7 @@ En fin de compte, ce qui compose un document et comment ces documents devraient ## Format JSON de base -Le format d'entrée de Passim consiste en des documents JSON qui sont au format [JSON lines](http://jsonlines.org/) (c'est-à-dire que chaque ligne de texte contient un seul document JSON). +Le format d'entrée de Passim consiste en des documents JSON qui sont au format [JSON lines](https://jsonlines.org/) (c'est-à-dire que chaque ligne de texte contient un seul document JSON). 
Le contenu suivant d'un fichier nommé `test.json` illustre le format minimal d'entrée requis pour Passim : @@ -715,7 +715,7 @@ Vous êtes maintenant prêt(e)s à vous lancer dans votre premier projet de réu Pour l'instant, ne vous souciez pas des arguments supplémentaires `SPARK_SUBMIT_ARGS='--master local[12] --driver-memory 8G --executor-memory 4G'`; dans la section [Etude de Cas 2](#etude-de-cas-2--réutilisation-de-textes-dans-un-grand-corpus-de-journaux-historiques) nous les expliquerons en détail. -Ce cas de test prend approximativement huit minutes sur un ordinateur portable récent avec huit threads. Vous pouvez également suivre la progression de la détection sur http://localhost:4040 — un tableau de bord interactif créé par Spark. Notez que le tableau de bord se fermera dès que Passim aura terminé son exécution. +Ce cas de test prend approximativement huit minutes sur un ordinateur portable récent avec huit threads. Vous pouvez également suivre la progression de la détection sur http://localhost:4040 — un tableau de bord interactif créé par Spark. Notez que le tableau de bord se fermera dès que Passim aura terminé son exécution. ## Etude de cas 2 : Réutilisation de textes dans un grand corpus de journaux historiques @@ -903,15 +903,15 @@ Matteo Romanello remercie le Fonds national suisse de la recherche scientifique # Bibliographie -1. Franzini, Greta, Maria Moritz, Marco Büchler et Marco Passarotti. « Using and evaluating TRACER for an Index fontium computatus of the Summa contra Gentiles of Thomas Aquinas ». *Proceedings of the Fifth Italian Conference on Computational Linguistics (CLiC-it 2018)* (2018). [Lien](http://ceur-ws.org/Vol-2253/paper22.pdf) -2. Smith, David A., Ryan Cordell et Abby Mullen. « Computational Methods for Uncovering Reprinted Texts in Antebellum Newspapers ». *American Literary History* **27** (2015). [Lien](http://dx.doi.org/10.1093/alh/ajv029) -3. Cordell, Ryan. 
« Reprinting Circulation, and the Network Author in Antebellum Newspapers ». *American Literary History* **27** (2015): 417–445. [Lien](http://dx.doi.org/10.1093/alh/ajv028) -4. Vogler, Daniel, Linards Udris et Mark Eisenegger. « Measuring Media Content Concentration at a Large Scale Using Automated Text Comparisons ». *Journalism Studies* 21, no.11 (2020). [Lien](http://dx.doi.org/10.1080/1461670x.2020.1761865) +1. Franzini, Greta, Maria Moritz, Marco Büchler et Marco Passarotti. « Using and evaluating TRACER for an Index fontium computatus of the Summa contra Gentiles of Thomas Aquinas ». *Proceedings of the Fifth Italian Conference on Computational Linguistics (CLiC-it 2018)* (2018). [Lien](https://ceur-ws.org/Vol-2253/paper22.pdf) +2. Smith, David A., Ryan Cordell et Abby Mullen. « Computational Methods for Uncovering Reprinted Texts in Antebellum Newspapers ». *American Literary History* **27** (2015). [Lien](https://dx.doi.org/10.1093/alh/ajv029) +3. Cordell, Ryan. « Reprinting Circulation, and the Network Author in Antebellum Newspapers ». *American Literary History* **27** (2015): 417–445. [Lien](https://dx.doi.org/10.1093/alh/ajv028) +4. Vogler, Daniel, Linards Udris et Mark Eisenegger. « Measuring Media Content Concentration at a Large Scale Using Automated Text Comparisons ». *Journalism Studies* 21, no.11 (2020). [Lien](https://dx.doi.org/10.1080/1461670x.2020.1761865) 5. Mullen, Lincoln. *textreuse: Detect Text Reuse and Document Similarity*. Github. (2016). [Lien](https://github.com/ropensci/textreuse) -6. Büchler, Marco, Philip R. Burns, Martin Müller, Emily Franzini et Greta Franzini. « Towards a Historical Text Re-use Detection ». In *Text Mining: From Ontology Learning to Automated Text Processing Applications* dir. Chris Biemann et Alexander Mehler, 221–238. Springer International Publishing, 2014. [Lien](http://dx.doi.org/10.1007/978-3-319-12655-5_11) -7. Vierthaler, Paul et Meet Gelein. 
« A BLAST-based, Language-agnostic Text Reuse Algorithm with a MARKUS Implementation and Sequence Alignment Optimized for Large Chinese Corpora ». *Journal of Cultural Analytics* 4, vol.2 (2019). [Lien](http://dx.doi.org/10.22148/16.034) +6. Büchler, Marco, Philip R. Burns, Martin Müller, Emily Franzini et Greta Franzini. « Towards a Historical Text Re-use Detection ». In *Text Mining: From Ontology Learning to Automated Text Processing Applications* dir. Chris Biemann et Alexander Mehler, 221–238. Springer International Publishing, 2014. [Lien](https://dx.doi.org/10.1007/978-3-319-12655-5_11) +7. Vierthaler, Paul et Meet Gelein. « A BLAST-based, Language-agnostic Text Reuse Algorithm with a MARKUS Implementation and Sequence Alignment Optimized for Large Chinese Corpora ». *Journal of Cultural Analytics* 4, vol.2 (2019). [Lien](https://dx.doi.org/10.22148/16.034) 8. Vesanto, Aleksi, Asko Nivala, Heli Rantala, Tapio Salakoski, Hannu Salmi et Filip Ginter. « Applying BLAST to Text Reuse Detection in Finnish Newspapers and Journals, 1771-1910 ». *Proceedings of the NoDaLiDa 2017 Workshop on Processing Historical Language* (2017): 54–58. [Lien](https://aclanthology.org/W17-0510.pdf) 9. Salmi, Hannu, Heli Rantala, Aleksi Vesanto et Filip Ginter. « The long-term reuse of text in the Finnish press, 1771–1920 ». *CEUR Workshop Proceedings* 2364 (2019): 394–544. -10. Soto, Axel J, Abidalrahman Mohammad, Andrew Albert, Aminul Islam, Evangelos Milios, Michael Doyle, Rosane Minghim et Maria Cristina de Oliveira. « Similarity-Based Support for Text Reuse in Technical Writing ». *Proceedings of the 2015 ACM Symposium on Document Engineering* (2015): 97–106. [Lien](http://dx.doi.org/10.1145/2682571.2797068) +10. Soto, Axel J, Abidalrahman Mohammad, Andrew Albert, Aminul Islam, Evangelos Milios, Michael Doyle, Rosane Minghim et Maria Cristina de Oliveira. « Similarity-Based Support for Text Reuse in Technical Writing ». 
*Proceedings of the 2015 ACM Symposium on Document Engineering* (2015): 97–106. [Lien](https://dx.doi.org/10.1145/2682571.2797068) 11. Schofield, Alexandra, Laure Thompson et David Mimno. « Quantifying the Effects of Text Duplication on Semantic Models ». *Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing* (2017): 2737–2747. [https://doi.org/10.18653/v1/D17-1290](https://perma.cc/KSK6-5TXP) 12. Romanello, Matteo, Aurélien Berra et Alexandra Trachsel. « Rethinking Text Reuse as Digital Classicists ». *Digital Humanities conference* (2014). [Lien](https://web.archive.org/web/20140829121705/https://wiki.digitalclassicist.org/Text_Reuse) diff --git a/fr/lecons/du-html-a-une-liste-de-mots-1.md b/fr/lecons/du-html-a-une-liste-de-mots-1.md index 52bd91526c..6f2cdc825b 100644 --- a/fr/lecons/du-html-a-une-liste-de-mots-1.md +++ b/fr/lecons/du-html-a-une-liste-de-mots-1.md @@ -23,7 +23,7 @@ translation-reviewer: - Marie Flesch difficulty: 2 review-ticket: https://github.com/programminghistorian/ph-submissions/issues/560 -next: du-html-a-une-liste-de-mots-2 +next: /fr/lecons/du-html-a-une-liste-de-mots-2 series_total: 2 lessons sequence: 1 activity: transforming diff --git a/fr/lecons/du-html-a-une-liste-de-mots-2.md b/fr/lecons/du-html-a-une-liste-de-mots-2.md index b3918c4dcb..6eef93030d 100644 --- a/fr/lecons/du-html-a-une-liste-de-mots-2.md +++ b/fr/lecons/du-html-a-une-liste-de-mots-2.md @@ -23,7 +23,7 @@ translation-reviewer: - Florian Barras difficulty: 2 review-ticket: https://github.com/programminghistorian/ph-submissions/issues/584 -previous: du-html-a-une-liste-de-mots-1 +previous: /fr/lecons/du-html-a-une-liste-de-mots-1 series_total: 2 lessons sequence: 2 activity: transforming diff --git a/fr/lecons/enrichir-donnees-reconciliation-openrefine.md b/fr/lecons/enrichir-donnees-reconciliation-openrefine.md index 7ad9a4982c..d69a98d771 100644 --- a/fr/lecons/enrichir-donnees-reconciliation-openrefine.md +++ 
b/fr/lecons/enrichir-donnees-reconciliation-openrefine.md @@ -207,7 +207,7 @@ Avant l’exportation finale des données enrichies, je recommande d’ajouter u Pour cela, avec notre exemple ci-dessus, cliquez sur les options de la colonne **Titre_RECON** > **Réconcilier** > **Ajouter une colonne d’identifiants d’entités...** Puis, indiquez le titre de la colonne, **Titre_QID**. -### Comment améliorer la réconciliation ? +### Comment améliorer la réconciliation ? Vous remarquerez probablement que la réconciliation est parfois peu satisfaisante. Voici quelques techniques pour réduire les réconciliations manuelles après une réconciliation automatique insatisfaisante. @@ -233,7 +233,7 @@ Vous remarquerez probablement que la réconciliation est parfois peu satisfaisan - `Hist Sci Med`-> `Histoire des sciences médicales`  - `AHA` -> `American Historical Association` -### À quoi correspond « Créer un nouvel élément » ? +### À quoi correspond « Créer un nouvel élément »? Cela permet de marquer un enregistrement pour créer un élément dans Wikidata depuis OpenRefine. Je n’utilise pas cette option, car je préfère créer de nouveaux éléments directement dans Wikidata (manuellement ou avec QuickStatements). Cela garantit une meilleure indexation et plus de contrôle sur le processus de création ou d’enrichissement.  diff --git a/fr/lecons/generer-jeu-donnees-texte-ocr.md b/fr/lecons/generer-jeu-donnees-texte-ocr.md index 1b26d3ed43..87b58e7dfc 100644 --- a/fr/lecons/generer-jeu-donnees-texte-ocr.md +++ b/fr/lecons/generer-jeu-donnees-texte-ocr.md @@ -47,7 +47,7 @@ Que se passerait-il si, par exemple, votre OCR interprétait les chaînes de car Bien souvent, les documents que les historien(ne)s souhaitent numériser sont déjà des structures ordonnées de données : une collection ordonnée de documents issus d'une source primaire, un code juridique ou encore un cartulaire. 
Mais la structure éditoriale imposée à ces ressources est généralement conçue pour un type particulier de technologie d’extraction de données, c'est-à-dire un codex, un livre. Pour un texte numérisé, la structure utilisée sera différente. Si vous pouvez vous débarrasser de l’infrastructure liée au livre et réorganiser le texte selon les sections et les divisions qui vous intéressent, vous vous retrouverez avec des données sur lesquelles il sera beaucoup plus facile d'effectuer des recherches et des opérations de remplacement, et en bonus, votre texte deviendra immédiatement exploitable dans une multitude d’autres contextes. -C'est là qu'un langage de script comme Python devient très utile. Pour notre projet nous avons voulu préparer certains des documents d’une [collection du XIIe siècle d’*imbreviatura*](http://www.worldcat.org/oclc/17591390) du scribe italien connu sous le nom de Giovanni Scriba (vous pouvez [accéder au PDF ici](https://notariorumitinera.eu/Docs/Biblioteca_Digitale/SB/3a47488c28eef2aedfea52ebbde2c634/dd361cb1479ab2309f5ceef1f875c2a5.pdf)) afin qu’ils puissent être traités par des historien(ne)s à des fins d’analyse TAL ou autres. Les pages de l'édition de 1935 ressemblent à cela : +C'est là qu'un langage de script comme Python devient très utile. Pour notre projet nous avons voulu préparer certains des documents d’une [collection du XIIe siècle d’*imbreviatura*](https://www.worldcat.org/oclc/17591390) du scribe italien connu sous le nom de Giovanni Scriba (vous pouvez [accéder au PDF ici](https://notariorumitinera.eu/Docs/Biblioteca_Digitale/SB/3a47488c28eef2aedfea52ebbde2c634/dd361cb1479ab2309f5ceef1f875c2a5.pdf)) afin qu’ils puissent être traités par des historien(ne)s à des fins d’analyse TAL ou autres. 
Les pages de l'édition de 1935 ressemblent à cela : {% include figure.html filename="gs_pg110.png" caption="GS page 110" %} @@ -246,12 +246,12 @@ Si vous avez besoin d'importer des modules faisant partie de la bibliothèque st >Quand elles se retrouvent confrontées à un problème, certaines personnes se disent : « Je n'ai qu'à utiliser les expressions régulières ! » Elles se retrouvent alors avec deux problèmes. - (Je vous recommande une nouvelle fois de jeter un coup d’œil à la présentation de L.T. O’Hara ici sur le site du Programming Historian [Cleaning OCR’d text with Regular Expressions](/en/lessons/cleaning-ocrd-text-with-regular-expressions)) + (Je vous recommande une nouvelle fois de jeter un coup d’œil à la présentation de L.T. O’Hara ici sur le site du _Programming Historian_ [Cleaning OCR’d text with Regular Expressions](/en/lessons/cleaning-ocrd-text-with-regular-expressions), en anglais.) 2. Vous devrez réaliser l'import d'une bibliothèque Python qui nous sera utile : `from pprint import pprint`. Il s'agit d'un outil de formatage pour les objets Python comme les listes et les dictionnaires. Vous en aurez besoin parce que les dictionnaires Python sont beaucoup plus faciles à lire s’ils sont formatés. -3. L'import `from collections import Counter` nous sera utile pour la section [Identifier les notes de bas de page à l'aide d'une expression régulière](/en/lessons/generer-jeu-donnees-texte-ocr#identifier-notes-bas-de-page) que nous aborderons juste après. Ce n’est pas vraiment nécessaire, mais nous allons faire des opérations de comptage qui exigeraient beaucoup de lignes de code et cela nous épargnera du temps. Le module des collections a beaucoup d'utilité et vaut la peine qu'on se familiarise avec. Encore une fois, voir la présentation Pymotw de Doug Hellmann concernant le module des [collections](https://docs.python.org/fr/3/library/collections.html). 
Je souligne également que son livre [The Python Standard Library By Example](https://doughellmann.com/books/the-python-3-standard-library-by-example/) vaut le coût. +3. L'import `from collections import Counter` nous sera utile pour la section [Identifier les notes de bas de page à l'aide d'une expression régulière](/fr/lecons/generer-jeu-donnees-texte-ocr#identifier-les-notes-de-bas-de-page-à-laide-dune-expression-régulière) que nous aborderons juste après. Ce n’est pas vraiment nécessaire, mais nous allons faire des opérations de comptage qui exigeraient beaucoup de lignes de code et cela nous épargnera du temps. Le module des collections a beaucoup d'utilité et vaut la peine qu'on se familiarise avec. Encore une fois, voir la présentation Pymotw de Doug Hellmann concernant le module des [collections](https://docs.python.org/fr/3/library/collections.html). Je souligne également que son livre [The Python Standard Library By Example](https://doughellmann.com/books/the-python-3-standard-library-by-example/) vaut le coût. ## Un petit aperçu des expressions régulières telles qu'elles sont implémentées en Python @@ -1064,7 +1064,7 @@ Ouvrez le fichier résultant avec un navigateur web et vous obtenez une édition Ainsi, notre problème de départ, le nettoyage OCR, est maintenant beaucoup plus gérable parce que nous pouvons cibler des expressions régulières pour les types spécifiques de métadonnées que nous avons : erreurs dans le résumé en italien ou dans le texte latin ? Ou nous pourrions concevoir des routines de recherche et de remplacement uniquement pour des chartes spécifiques ou des groupes de chartes. -Au-delà de cela, il y a beaucoup de choses que vous pouvez faire avec un ensemble de données ordonnnées, y compris l'alimenter grâce à un outil de balisage comme [le « brat »](http://brat.nlplab.org/) dont nous nous sommes servis pour le projet ChartEx. 
Des spécialistes peuvent alors commencer à ajouter des couches de balisage sémantique, même si vous ne faites plus de correction d’erreur OCR. En outre, avec un ensemble de données ordonnnées, nous pouvons obtenir toutes sortes de sorties : TEI (Text Encoding Initiative) ou EAD (Encoded Archival Description). Ou encore vous pouvez lire votre ensemble de données directement dans une base de données relationnelle ou un répertoire de stockage qui associe une clé et une valeur. Toutes ces choses sont tout bonnement impossibles, si vous travaillez seulement avec un simple fichier texte. +Au-delà de cela, il y a beaucoup de choses que vous pouvez faire avec un ensemble de données ordonnées, y compris l'alimenter grâce à un outil de balisage comme [le « brat »](https://brat.nlplab.org/) dont nous nous sommes servis pour le projet ChartEx. Des spécialistes peuvent alors commencer à ajouter des couches de balisage sémantique, même si vous ne faites plus de correction d’erreur OCR. En outre, avec un ensemble de données ordonnées, nous pouvons obtenir toutes sortes de sorties : TEI (Text Encoding Initiative) ou EAD (Encoded Archival Description). Ou encore vous pouvez lire votre ensemble de données directement dans une base de données relationnelle ou un répertoire de stockage qui associe une clé et une valeur. Toutes ces choses sont tout bonnement impossibles, si vous travaillez seulement avec un simple fichier texte. Les morceaux de code ci-dessus ne sont en aucun cas une solution clé en main pour nettoyer une sortie OCR lambda. Il n'existe pas de telle baguette magique. L’approche de Google pour scanner le contenu des bibliothèques de recherche menace de nous noyer dans un océan de mauvaises données. Pire encore, elle élude un fait fondamental du savoir numérique : les sources numériques sont difficiles à obtenir. Des textes numériques fiables, flexibles et utiles nécessitent une rédaction soignée et une conservation pérenne. 
Google, Amazon, Facebook et d'autres géants du Web n’ont pas à se soucier de la qualité de leurs données, juste de leur quantité. Les historien(ne)s, par contre, doivent d’abord se soucier de l’intégrité de leurs sources. diff --git a/fr/lecons/gestion-manipulation-donnees-r.md b/fr/lecons/gestion-manipulation-donnees-r.md index 8c8dd47ba7..54ff2f4d17 100644 --- a/fr/lecons/gestion-manipulation-donnees-r.md +++ b/fr/lecons/gestion-manipulation-donnees-r.md @@ -62,7 +62,7 @@ Remplir ces critères nous permet de juger si la donnée est organisée ou pas. 4. Plusieurs unités d'observation sont présentes dans une même table. 5. Une même unité d'observation est présente dans plusieurs tables. -Un avantage peut-être encore plus important est de garder nos données dans ce format propre, qui nous permet d'utiliser une galerie de paquets dans le [« tidyverse »](http://tidyverse.org/), spécifiquement conçus pour fonctionner avec des données bien structurées. En nous assurant que nos données en entrée et en sortie sont bien structurées, nous n'aurons qu'un nombre limité d'outils à utiliser pour répondre à un grand nombre de questions. De plus, nous pourrons combiner, manipuler et séparer des jeux de données comme bon nous semble. +Un avantage peut-être encore plus important est de garder nos données dans ce format propre, qui nous permet d'utiliser une galerie de paquets dans le [« tidyverse »](https://tidyverse.org/), spécifiquement conçus pour fonctionner avec des données bien structurées. En nous assurant que nos données en entrée et en sortie sont bien structurées, nous n'aurons qu'un nombre limité d'outils à utiliser pour répondre à un grand nombre de questions. De plus, nous pourrons combiner, manipuler et séparer des jeux de données comme bon nous semble. Dans cette leçon, nous nous intéresserons particulièrement au paquet [dplyr](https://cran.r-project.org/web/packages/dplyr/index.html) du tidyverse. 
Mais cela vaut la peine de mentionner brièvement quelques autres paquets que nous utiliserons : diff --git a/fr/lecons/installation-windows-py.md b/fr/lecons/installation-windows-py.md index e1cadfb2ae..11807c3ea8 100644 --- a/fr/lecons/installation-windows-py.md +++ b/fr/lecons/installation-windows-py.md @@ -1,152 +1,152 @@ ---- -title: Installer un environnement de développement intégré pour Python (Windows) -layout: lesson -slug: installation-windows-py -date: 2012-07-17 -authors: -- William J. Turkel -- Adam Crymble -reviewers: -- Jim Clifford -- Amanda Morton -editors: -- Miriam Posner -translation_date: 2020-07-13 -translator: -- Sofia Papastamkou -translation-editor: -- François Dominic Laramée -translation-reviewer: -- Marie-Christine Boucher -difficulty: 1 -exclude_from_check: - - review-ticket -activity: transforming -topics: [get-ready, python] -abstract: "Cette leçon vous montrera comment installer un environnement de développement pour Python sur un ordinateur exécutant le système d'exploitation Windows." -original: windows-installation -doi: 10.46430/phfr0011 -avatar_alt: Un groupe de trois musiciens ---- - -{% include toc.html %} - - - - - -## Sauvegarder son disque dur - -Faites en sorte de toujours disposer de sauvegardes régulières et récentes du contenu de votre disque dur. L'importance de cette pratique dépasse largement le cadre de vos activités de programmation, et il serait avisé d’en faire une habitude. - -## Installer Python (v.3) - -Rendez-vous sur le [site web de Python][], téléchargez la dernière version stable du langage de programmation Python (version 3.8 au mois d'avril 2020) et procédez à l'installation selon les instructions fournies sur le site. *N.D.L.R. Notez que les versions les plus récentes de Python, à partir de la v. 
3.5, ne sont pas compatibles avec Windows XP ni avec les versions antérieures de Windows.* - -## Créer un répertoire dédié - -Pour mieux organiser votre travail, il est recommandé de créer un répertoire (dossier) dédié sur votre disque dur, à l'emplacement de votre choix, pour y ranger vos fichiers de programmation (par exemple, `programming-historian`). - -## Installer Komodo Edit - -Komodo Edit est un éditeur de texte au code source ouvert et gratuit, dévelopé par [ActiveState](https://www.activestate.com/). Pour installer Komodo Edit, vous pouvez télécharger le fichier [Komodo-Edit-12.0.1-18441.msi](https://downloads.activestate.com/Komodo/releases/12.0.1/Komodo-Edit-12.0.1-18441.msi) depuis leur liste de [Komodo Edit releases](https://downloads.activestate.com/Komodo/releases/12.0.1/). Il existe néanmoins [un vaste choix d'éditeurs de texte][], si vous souhaitez utiliser un autre programme. - -
    -ActiveState a retiré Komodo Edit en décembre 2022. Comme il est écrit dans cet article de blog (en anglais), "Le retrait de Komodo signifie qu'ActiveState cessera de déveloper du code ou de créer des versions pour [...] Komodo Edit. ActiveState ne fournira plus de nouvelles caractéristiques/fonctionnalités, et ne réparera plus les bugs ou les problèmes de sécurité" [notre traduction]. Vous pouvez toujours télécharger et utiliser Komodo Edit, mais il est probablement préférable de choisir un autre programme. -
    - -## Démarrer Komodo Edit - -Ouvrez Komodo Edit; vous devriez obtenir quelque chose qui ressemble à ceci: - -{% include figure.html filename="komodo-edit11-windows-main.png" caption="Komodo Edit sur Windows" %} - -Si vous ne voyez pas le volet `Toolbox` (*Boîte à outils*) en haut à droite, vous pouvez y accéder via le menu `View -> Tabs & Sidebars -> Toolbox` (*Vue -> Onglets & Volets latéraux -> Boîte à outils*). Peu importe pour le moment si le volet du projet est ouvert ou non. Prenez le temps d'explorer l'interface et de vous familiariser avec son agencement. Si besoin, le menu d'aide `Help` offre une documentation détaillée. - -### Configurer Komodo Edit - -Vous devez maintenant configurer l'éditeur pour pouvoir exécuter vos programmes en Python. - -Sélectionnez d'abord `Edit -> Preferences -> Languages -> Python 3` (*Modifier -> Préférences -> Langages -> Python 3*) puis `Browse` (*Parcourir*). Puis, sélectionnez le chemin du répertoire d'installation de Python, qui ressemble à ceci: `C:\Utilisateurs\VotreNomUtilisateur\AppData\Local\Programs\Python\Python38-32`). Lorsque vous avez trouvé le bon chemin, cliquez sur `OK`: - -{% include figure.html caption="Définissez l'interpréteur Python par défaut" filename="komodo-edit11-windows-interpreter.png" %} - -*(N.D.L.R. En effectuant la manipulation décrite plus haut, après avoir cliqué sur `Browse` pour parcourir votre disque dur et afficher la boîte de dialogue `Open Executable File`, il se peut que vous n'arriviez pas à localiser le dossier AppData pour récupérer le chemin et définir l'interpréteur par défaut. Dans ce cas, entrez `%AppData%` dans la barre de recherche du menu `Démarrer` de Windows, puis cliquez sur l'emplacement pour l'ouvrir. Localisez le chemin spécifié ci-haut (`\AppData\Local\Programs\Python\Python38-32`) puis retournez à la boîte de dialogue `Open Executable File` (à l'intérieur de Komodo Edit) et copiez-le dans la barre `Nom du fichier`. 
Une fois le répertoire ouvert, sélectionnez `python.exe` (type de fichier: application) et cliquez sur `Ouvrir`.* - -Ensuite, depuis le menu `Preferences` (*Préférences*) à gauche sélectionnez `Internationalization`. Maintenant, allez à la section `Language-specific Default Encoding` (*Encodage par défaut selon le langage de programmation*) et, dans le menu déroulant de `Language-specific`, sélectionnez `Python`. Vérifiez que l'encodage [UTF-8][] est sélectionné en tant qu'encodage par défaut. - -{% include figure.html caption="Paramètre d'encodage du texte en format UTF-8" filename="komodo-edit11-windows-utf-set.png" %} - -Ensuite sélectionnez `Toolbox->Add->New Command` (*Boite à outils->Ajouter->Nouvelle commande*). Vous ouvrirez ainsi une nouvelle fenêtre de dialogue. Renommez votre commande `‘Run Python’` (*Exécuter Python*). Dans la barre `‘Command’` (*Commande*), tapez: - -``` python -%(python3) %f -``` - -Si vous oubliez d'exécuter cette commande, Python ne saura pas coment interpréter les instructions envoyées. - -Dans la barre `‘Start in’`, tapez: - -`%D` - -Si vous obtenez ceci, cliquez sur `OK`: - -{% include figure.html filename="komodo-edit11-windows-python-command.png" caption="Commande « Exécuter Python3 »" %} -{% include figure.html filename="komodo-edit11-windows-python-start.png" caption="Configuration de la commande « Run Python3 Start » ." %} - -Votre nouvelle commande devrait apparaître dans le panneau de la boite à outils `Toolbox`. Après avoir complété cette étape, vous devrez peut-être redémarrer votre ordinateur avant d’être en mesure de travailler avec Python dans Komodo Edit. - -Étape 2 – 'Hello World' en Python --------------------------------- - -Il est de coutume d'inaugurer l'utilisation d'un nouveau langage de programmation avec un script qui dit tout simplement *"hello world"* soit "bonjour le monde". Nous allons voir ensemble comment faire cela en Python et en HTML. 
- -Python est un langage de très haut niveau, ce qui en fait un choix fréquemment recommandé pour les personnes qui débutent en programmation. Autrement dit: il est possible d'écrire en Python de courts programmes qui sont très performants. Plus un programme est court, plus il est susceptible de tenir sur la taille d'un écran et donc plus il a des chances de rester gravé dans votre mémoire. - -Python est un langage de programmation interprété. Cela signifie qu'il existe un programme informatique spécifique, appelé interpréteur, qui sait reconnaître les instructions écrites dans ce langage. Une manière d'utiliser un interpréteur consiste à stocker toutes vos instructions Python dans un fichier puis à soumettre ce fichier à l'interpréteur. Un fichier contenant des instructions écrites avec un langage de programmation s'appelle un programme (informatique). L'interpréteur exécute chacune des instructions contenues dans le programme, puis il s'arrête. Voyons les étapes à suivre pour y parvenir. - -Dans votre éditeur de texte, créez un nouveau fichier, entrez ce petit programme de deux lignes, puis sauvegardez-le dans votre répertoire `programming-historian` sous le nom -`hello-world.py` - -``` python -# hello-world.py -print('hello world') -``` - -L'éditeur de texte de votre choix doit avoir un bouton de menu “`Run`” qui vous permet d'exécuter votre programme. Si tout s'est bien passé, vous devriez obtenir un résultat semblable à celui présenté dans la figure ci-dessous, que nous avons obtenue avec Komodo Edit: - -{% include figure.html filename="komodo-edit11-windows-hello.png" caption="'Hello World'" %} - -## Interagir via une console Python (shell) - -Une autre manière d'interagir avec un interpréteur est d'utiliser ce que nous appelons une console. Dans ce cas, il suffit de taper une instruction au clavier et d'appuyer sur la touche Entrée pour que la console exécute votre commande. 
La console est un moyen parfait pour tester votre code et avoir la certitude que vous allez obtenir le résultat que vous recherchez. - -Vous pouvez exécuter une console Python en double-cliquant sur le fichier `python.exe`. Si vous avez installé la version 3.8 (la plus récente au moment de cette traduction en avril 2020), ce fichier se trouve fort probablement dans votre répertoire `C:\Utilisateurs\VotreNomUtilisateur\AppData\Local\Programs\Python\Python38-32`. Lorsque la fenêtre de la console s'affiche sur votre écran, tapez: - -``` python -print('hello world') -``` - -puis appuyez sur la touche Entrée. Votre ordinateur va vous répondre: - -``` python -hello world -``` - -Pour représenter une interaction via la console, nous utilisons -\> pour indiquer la réponse reçue dans celle-ci, comme suit: - -``` python -print('hello world') --> hello world -``` -Sur votre écran, l'affichage ressemble plutôt à cela: - -{% include figure.html caption="La console Python sous Windows" filename="windows-python3-cmd.png" %} - -Maintenant, votre ordinateur est prêt et vous êtes en mesure d'exécuter des tâches plus intéressantes. Si vous travaillez avec nos tutoriels sur Python dans l'ordre, nous vous recommandons de consulter par la suite la leçon « [Comprendre les pages web et le HTML][] » . - - [site web de Python]: http://www.python.org/ - [un vaste choix d'éditeurs de texte]: https://wiki.python.org/python/PythonEditors - [UTF-8]: http://en.wikipedia.org/wiki/UTF-8 - [Comprendre les pages web et le HTML]: /fr/lecons/comprendre-les-pages-web - +--- +title: Installer un environnement de développement intégré pour Python (Windows) +layout: lesson +slug: installation-windows-py +date: 2012-07-17 +authors: +- William J. 
Turkel +- Adam Crymble +reviewers: +- Jim Clifford +- Amanda Morton +editors: +- Miriam Posner +translation_date: 2020-07-13 +translator: +- Sofia Papastamkou +translation-editor: +- François Dominic Laramée +translation-reviewer: +- Marie-Christine Boucher +difficulty: 1 +exclude_from_check: + - review-ticket +activity: transforming +topics: [get-ready, python] +abstract: "Cette leçon vous montrera comment installer un environnement de développement pour Python sur un ordinateur exécutant le système d'exploitation Windows." +original: windows-installation +doi: 10.46430/phfr0011 +avatar_alt: Un groupe de trois musiciens +--- + +{% include toc.html %} + + + + + +## Sauvegarder son disque dur + +Faites en sorte de toujours disposer de sauvegardes régulières et récentes du contenu de votre disque dur. L'importance de cette pratique dépasse largement le cadre de vos activités de programmation, et il serait avisé d’en faire une habitude. + +## Installer Python (v.3) + +Rendez-vous sur le [site web de Python][], téléchargez la dernière version stable du langage de programmation Python (version 3.8 au mois d'avril 2020) et procédez à l'installation selon les instructions fournies sur le site. *N.D.L.R. Notez que les versions les plus récentes de Python, à partir de la v. 3.5, ne sont pas compatibles avec Windows XP ni avec les versions antérieures de Windows.* + +## Créer un répertoire dédié + +Pour mieux organiser votre travail, il est recommandé de créer un répertoire (dossier) dédié sur votre disque dur, à l'emplacement de votre choix, pour y ranger vos fichiers de programmation (par exemple, `programming-historian`). + +## Installer Komodo Edit + +Komodo Edit est un éditeur de texte au code source ouvert et gratuit, développé par [ActiveState](https://www.activestate.com/). 
Pour installer Komodo Edit, vous pouvez télécharger le fichier [Komodo-Edit-12.0.1-18441.msi](https://downloads.activestate.com/Komodo/releases/12.0.1/Komodo-Edit-12.0.1-18441.msi) depuis leur liste de [Komodo Edit releases](https://downloads.activestate.com/Komodo/releases/12.0.1/). Il existe néanmoins [un vaste choix d'éditeurs de texte][], si vous souhaitez utiliser un autre programme. + +
    +ActiveState a retiré Komodo Edit en décembre 2022. Comme il est écrit dans cet article de blog (en anglais), "Le retrait de Komodo signifie qu'ActiveState cessera de développer du code ou de créer des versions pour [...] Komodo Edit. ActiveState ne fournira plus de nouvelles caractéristiques/fonctionnalités, et ne réparera plus les bugs ou les problèmes de sécurité" [notre traduction]. Vous pouvez toujours télécharger et utiliser Komodo Edit, mais il est probablement préférable de choisir un autre programme. +
    + +## Démarrer Komodo Edit + +Ouvrez Komodo Edit; vous devriez obtenir quelque chose qui ressemble à ceci: + +{% include figure.html filename="komodo-edit11-windows-main.png" caption="Komodo Edit sur Windows" %} + +Si vous ne voyez pas le volet `Toolbox` (*Boîte à outils*) en haut à droite, vous pouvez y accéder via le menu `View -> Tabs & Sidebars -> Toolbox` (*Vue -> Onglets & Volets latéraux -> Boîte à outils*). Peu importe pour le moment si le volet du projet est ouvert ou non. Prenez le temps d'explorer l'interface et de vous familiariser avec son agencement. Si besoin, le menu d'aide `Help` offre une documentation détaillée. + +### Configurer Komodo Edit + +Vous devez maintenant configurer l'éditeur pour pouvoir exécuter vos programmes en Python. + +Sélectionnez d'abord `Edit -> Preferences -> Languages -> Python 3` (*Modifier -> Préférences -> Langages -> Python 3*) puis `Browse` (*Parcourir*). Puis, sélectionnez le chemin du répertoire d'installation de Python, qui ressemble à ceci: `C:\Utilisateurs\VotreNomUtilisateur\AppData\Local\Programs\Python\Python38-32`). Lorsque vous avez trouvé le bon chemin, cliquez sur `OK`: + +{% include figure.html caption="Définissez l'interpréteur Python par défaut" filename="komodo-edit11-windows-interpreter.png" %} + +*(N.D.L.R. En effectuant la manipulation décrite plus haut, après avoir cliqué sur `Browse` pour parcourir votre disque dur et afficher la boîte de dialogue `Open Executable File`, il se peut que vous n'arriviez pas à localiser le dossier AppData pour récupérer le chemin et définir l'interpréteur par défaut. Dans ce cas, entrez `%AppData%` dans la barre de recherche du menu `Démarrer` de Windows, puis cliquez sur l'emplacement pour l'ouvrir. Localisez le chemin spécifié ci-haut (`\AppData\Local\Programs\Python\Python38-32`) puis retournez à la boîte de dialogue `Open Executable File` (à l'intérieur de Komodo Edit) et copiez-le dans la barre `Nom du fichier`. 
Une fois le répertoire ouvert, sélectionnez `python.exe` (type de fichier: application) et cliquez sur `Ouvrir`.* + +Ensuite, depuis le menu `Preferences` (*Préférences*) à gauche sélectionnez `Internationalization`. Maintenant, allez à la section `Language-specific Default Encoding` (*Encodage par défaut selon le langage de programmation*) et, dans le menu déroulant de `Language-specific`, sélectionnez `Python`. Vérifiez que l'encodage [UTF-8][] est sélectionné en tant qu'encodage par défaut. + +{% include figure.html caption="Paramètre d'encodage du texte en format UTF-8" filename="komodo-edit11-windows-utf-set.png" %} + +Ensuite sélectionnez `Toolbox->Add->New Command` (*Boite à outils->Ajouter->Nouvelle commande*). Vous ouvrirez ainsi une nouvelle fenêtre de dialogue. Renommez votre commande `‘Run Python’` (*Exécuter Python*). Dans la barre `‘Command’` (*Commande*), tapez: + +``` python +%(python3) %f +``` + +Si vous oubliez d'exécuter cette commande, Python ne saura pas comment interpréter les instructions envoyées. + +Dans la barre `‘Start in’`, tapez: + +`%D` + +Si vous obtenez ceci, cliquez sur `OK`: + +{% include figure.html filename="komodo-edit11-windows-python-command.png" caption="Commande « Exécuter Python3 »" %} +{% include figure.html filename="komodo-edit11-windows-python-start.png" caption="Configuration de la commande « Run Python3 Start » ." %} + +Votre nouvelle commande devrait apparaître dans le panneau de la boite à outils `Toolbox`. Après avoir complété cette étape, vous devrez peut-être redémarrer votre ordinateur avant d’être en mesure de travailler avec Python dans Komodo Edit. + +Étape 2 – 'Hello World' en Python +-------------------------------- + +Il est de coutume d'inaugurer l'utilisation d'un nouveau langage de programmation avec un script qui dit tout simplement *"hello world"* soit "bonjour le monde". Nous allons voir ensemble comment faire cela en Python et en HTML. 
+ +Python est un langage de très haut niveau, ce qui en fait un choix fréquemment recommandé pour les personnes qui débutent en programmation. Autrement dit: il est possible d'écrire en Python de courts programmes qui sont très performants. Plus un programme est court, plus il est susceptible de tenir sur la taille d'un écran et donc plus il a des chances de rester gravé dans votre mémoire. + +Python est un langage de programmation interprété. Cela signifie qu'il existe un programme informatique spécifique, appelé interpréteur, qui sait reconnaître les instructions écrites dans ce langage. Une manière d'utiliser un interpréteur consiste à stocker toutes vos instructions Python dans un fichier puis à soumettre ce fichier à l'interpréteur. Un fichier contenant des instructions écrites avec un langage de programmation s'appelle un programme (informatique). L'interpréteur exécute chacune des instructions contenues dans le programme, puis il s'arrête. Voyons les étapes à suivre pour y parvenir. + +Dans votre éditeur de texte, créez un nouveau fichier, entrez ce petit programme de deux lignes, puis sauvegardez-le dans votre répertoire `programming-historian` sous le nom +`hello-world.py` + +``` python +# hello-world.py +print('hello world') +``` + +L'éditeur de texte de votre choix doit avoir un bouton de menu “`Run`” qui vous permet d'exécuter votre programme. Si tout s'est bien passé, vous devriez obtenir un résultat semblable à celui présenté dans la figure ci-dessous, que nous avons obtenue avec Komodo Edit: + +{% include figure.html filename="komodo-edit11-windows-hello.png" caption="'Hello World'" %} + +## Interagir via une console Python (shell) + +Une autre manière d'interagir avec un interpréteur est d'utiliser ce que nous appelons une console. Dans ce cas, il suffit de taper une instruction au clavier et d'appuyer sur la touche Entrée pour que la console exécute votre commande. 
La console est un moyen parfait pour tester votre code et avoir la certitude que vous allez obtenir le résultat que vous recherchez. + +Vous pouvez exécuter une console Python en double-cliquant sur le fichier `python.exe`. Si vous avez installé la version 3.8 (la plus récente au moment de cette traduction en avril 2020), ce fichier se trouve fort probablement dans votre répertoire `C:\Utilisateurs\VotreNomUtilisateur\AppData\Local\Programs\Python\Python38-32`. Lorsque la fenêtre de la console s'affiche sur votre écran, tapez: + +``` python +print('hello world') +``` + +puis appuyez sur la touche Entrée. Votre ordinateur va vous répondre: + +``` python +hello world +``` + +Pour représenter une interaction via la console, nous utilisons -\> pour indiquer la réponse reçue dans celle-ci, comme suit: + +``` python +print('hello world') +-> hello world +``` +Sur votre écran, l'affichage ressemble plutôt à cela: + +{% include figure.html caption="La console Python sous Windows" filename="windows-python3-cmd.png" %} + +Maintenant, votre ordinateur est prêt et vous êtes en mesure d'exécuter des tâches plus intéressantes. Si vous travaillez avec nos tutoriels sur Python dans l'ordre, nous vous recommandons de consulter par la suite la leçon « [Comprendre les pages web et le HTML][] » . + + [site web de Python]: https://www.python.org/ + [un vaste choix d'éditeurs de texte]: https://wiki.python.org/python/PythonEditors + [UTF-8]: https://en.wikipedia.org/wiki/UTF-8 + [Comprendre les pages web et le HTML]: /fr/lecons/comprendre-les-pages-web + diff --git a/fr/lecons/intro-a-bash-et-zsh.md b/fr/lecons/intro-a-bash-et-zsh.md index ba0532138f..aea2f13d29 100644 --- a/fr/lecons/intro-a-bash-et-zsh.md +++ b/fr/lecons/intro-a-bash-et-zsh.md @@ -241,7 +241,7 @@ Enfoncer la touche tabulation (TAB) à n'importe quel moment dans le shel Sous Windows, les extensions de fichier sont invisibles par défaut. 
Si vous souhaitez manipuler des fichiers sous Windows, nous vous recommandons d'activer l'affichage des extensions de fichier. Pour faire cela, ouvrez votre explorateur de fichiers et sous **Affichage**, dans le groupe **Afficher/masquer**, cochez la case **Extensions de nom de fichier**. Pour plus d'informations, vous pouvez vous référer à [cet article](https://perma.cc/5ZWL-XRFF) du support Windows. -Nous avons désormais besoin d'un fichier texte pour nos futures commandes. Nous pouvons utiliser un livre réputé pour être long, l'épique *Guerre et Paix* de Léon Tolstoï. Le fichier est disponible, en anglais, grâce au [Projet Gutenberg](http://www.gutenberg.org/ebooks/2600). Si vous avez déjà installé [wget](/en/lessons/applied-archival-downloading-with-wget), vous pouvez simplement taper : +Nous avons désormais besoin d'un fichier texte pour nos futures commandes. Nous pouvons utiliser un livre réputé pour être long, l'épique *Guerre et Paix* de Léon Tolstoï. Le fichier est disponible, en anglais, grâce au [Projet Gutenberg](https://www.gutenberg.org/ebooks/2600). Si vous avez déjà installé [wget](/en/lessons/applied-archival-downloading-with-wget), vous pouvez simplement taper : ```bash wget http://www.gutenberg.org/files/2600/2600-0.txt diff --git a/fr/lecons/intro-aux-bots-twitter.md b/fr/lecons/intro-aux-bots-twitter.md index 3fefee7512..d4958ab3d2 100644 --- a/fr/lecons/intro-aux-bots-twitter.md +++ b/fr/lecons/intro-aux-bots-twitter.md @@ -39,10 +39,10 @@ L'accès à l'API de Twitter a récemment changé. Le niveau gratuit ne permet p # Une introduction aux bots Twitter avec Tracery -Cette leçon explique comment créer des bots basiques sur Twitter à l’aide de la [grammaire générative Tracery](http://tracery.io) et du service [Cheap Bots Done Quick](http://cheapbotsdonequick.com/). Tracery est interopérable avec plusieurs langages de programmation et peut être intégrée dans des sites web, des jeux ou des bots. 
Vous pouvez en faire une copie (fork) sur github [ici](https://github.com/galaxykate/tracery/tree/tracery2). +Cette leçon explique comment créer des bots basiques sur Twitter à l’aide de la [grammaire générative Tracery](https://tracery.io) et du service [Cheap Bots Done Quick](https://cheapbotsdonequick.com/). Tracery est interopérable avec plusieurs langages de programmation et peut être intégrée dans des sites web, des jeux ou des bots. Vous pouvez en faire une copie (fork) sur github [ici](https://github.com/galaxykate/tracery/tree/tracery2). ## Pourquoi des bots? -Pour être exact, un bot Twitter est un logiciel permettant de contrôler automatiquement un compte Twitter. Lorsque des centaines de bots sont créés et tweetent plus ou moins le même message, ils peuvent façonner le discours sur Twitter, ce qui influence ensuite le discours d’autres médias. Des bots de ce type [peuvent même être perçus comme des sources crédibles d’information](http://www.sciencedirect.com/science/article/pii/S0747563213003129). Des projets tels que [Documenting the Now](https://web.archive.org/web/20260316082621/https://www.docnow.io/) mettent au point des outils qui permettent aux chercheur(e)s de créer et d’interroger des archives de réseaux sociaux en ligne à propos d’événements récents qui comprennent très probablement un bon nombre de messages générés par des bots. Dans ce tutoriel, je veux montrer comment construire un bot Twitter basique afin que des historiens et des historiennes, ayant connaissance de leur fonctionnement, puissent plus facilement les repérer dans des archives et, peut-être, même les neutraliser grâce à leurs propres bots. +Pour être exact, un bot Twitter est un logiciel permettant de contrôler automatiquement un compte Twitter. Lorsque des centaines de bots sont créés et tweetent plus ou moins le même message, ils peuvent façonner le discours sur Twitter, ce qui influence ensuite le discours d’autres médias. 
Des bots de ce type [peuvent même être perçus comme des sources crédibles d’information](https://www.sciencedirect.com/science/article/pii/S0747563213003129). Des projets tels que [Documenting the Now](https://web.archive.org/web/20260316082621/https://www.docnow.io/) mettent au point des outils qui permettent aux chercheur(e)s de créer et d’interroger des archives de réseaux sociaux en ligne à propos d’événements récents qui comprennent très probablement un bon nombre de messages générés par des bots. Dans ce tutoriel, je veux montrer comment construire un bot Twitter basique afin que des historiens et des historiennes, ayant connaissance de leur fonctionnement, puissent plus facilement les repérer dans des archives et, peut-être, même les neutraliser grâce à leurs propres bots. Mais je crois aussi qu’il y a de la place en histoire et dans les humanités numériques de façon plus large pour un travail créatif, expressif, voire artistique. Les historiens et les historiennes qui connaissent la programmation peuvent profiter des possibilités offertes par les médias numériques pour monter des créations, autrement impossibles à réaliser pour nous émouvoir, nous inspirer, nous interpeller. Il y a de la place pour de la satire, il y a de la place pour commenter. Comme Mark Sample, je crois qu’il y a besoin de « [bots de conviction](https://medium.com/@samplereality/a-protest-bot-is-a-bot-so-specific-you-cant-mistake-it-for-bullshit-90fe10b7fbaa)». Ce sont des bots de contestation, des bots si pointus et pertinents, qu’il devient impossible de les prendre pour autre chose par erreur. Selon Sample, il faudrait que de tels bots soient: @@ -83,11 +83,11 @@ Pour entamer la réflexion, voici quelques suggestions de personnes qui m’ont > - un bot qui imaginerait la réaction d’Afghans, d’Irakiens, de Syriens, de Yéménites lorsque des membres de leur famille sont tués dans des attaques de drones. 
— Cory Taylor (@CoryTaylor_) 22 avril 2017 -Dans la mesure où beaucoup de données historiques en ligne sont disponibles en format [JSON](http://json.org/), en cherchant un peu, vous devriez en trouver à utiliser avec votre bot. +Dans la mesure où beaucoup de données historiques en ligne sont disponibles en format [JSON](https://json.org/), en cherchant un peu, vous devriez en trouver à utiliser avec votre bot. -Ma méthode est celle du bricoleur qui adapte et assemble des morceaux de code trouvés ici et là. En vérité, la programmation fonctionne en grande partie comme ça. Il existe beaucoup de logiciels pour interagir avec l’API (*Application Programming Interface* soit l'interface de programmation d'application) de Twitter. Dans cette leçon, il y aura peu de « programmation »: les bots ne seront pas écrits en Python, par exemple. Dans cette leçon d’introduction, je vais vous montrer comment construire un bot qui raconte des histoires, qui compose de la poésie, qui fait des choses merveilleuses à l’aide de [Tracery.io](http://tracery.io/) comme _grammaire générative_ et du service Cheap Bots Done Quick comme hébergeur du bot. Pour davantage de tutoriels pour apprendre à construire et héberger des bots Twitter sur d’autres services, voir [la liste de tutoriels de Botwiki](https://botwiki.org/tutorials/twitterbots/) (en anglais). +Ma méthode est celle du bricoleur qui adapte et assemble des morceaux de code trouvés ici et là. En vérité, la programmation fonctionne en grande partie comme ça. Il existe beaucoup de logiciels pour interagir avec l’API (*Application Programming Interface* soit l'interface de programmation d'application) de Twitter. Dans cette leçon, il y aura peu de « programmation »: les bots ne seront pas écrits en Python, par exemple. 
Dans cette leçon d’introduction, je vais vous montrer comment construire un bot qui raconte des histoires, qui compose de la poésie, qui fait des choses merveilleuses à l’aide de [Tracery.io](https://tracery.io/) comme _grammaire générative_ et du service Cheap Bots Done Quick comme hébergeur du bot. Pour davantage de tutoriels pour apprendre à construire et héberger des bots Twitter sur d’autres services, voir [la liste de tutoriels de Botwiki](https://botwiki.org/tutorials/twitterbots/) (en anglais). -Celui de mes bots qui a connu le plus de succès est [@tinyarchae](http://twitter.com/tinyarchae), un bot qui tweete des scènes de dysfonctionnements au sein d’un horrible projet d’excavation archéologique. Tout projet archéologique est confronté à des problèmes de sexisme, d’insultes, de mauvaise foi. Ainsi, @tinyarchae prend tout ce qui se murmure dans les colloques et le pousse à l’extrême. C’est, en réalité, une caricature qui comporte une part de vérité embarrassante. D’autres bots que j’ai construits détournent de la [photographie archéologique](https://twitter.com/archaeoglitch); l’un est même utile puisqu’il [annonce la sortie de nouveaux articles de revues en archéologie](https://twitter.com/botarchaeo) et fait donc office d’assistant de recherche. Pour plus de réflexions sur le rôle joué par les bots en archéologie publique, voir ce [discours inaugural](https://electricarchaeology.ca/2017/04/27/bots-of-archaeology-machines-writing-public-archaeology/) tiré du [colloque Twitter sur l’archéologie publique](http://web.archive.org/web/20180131161516/https://publicarchaeologyconference.wordpress.com/)). +Celui de mes bots qui a connu le plus de succès est [@tinyarchae](https://twitter.com/tinyarchae), un bot qui tweete des scènes de dysfonctionnements au sein d’un horrible projet d’excavation archéologique. Tout projet archéologique est confronté à des problèmes de sexisme, d’insultes, de mauvaise foi. 
Ainsi, @tinyarchae prend tout ce qui se murmure dans les colloques et le pousse à l’extrême. C’est, en réalité, une caricature qui comporte une part de vérité embarrassante. D’autres bots que j’ai construits détournent de la [photographie archéologique](https://twitter.com/archaeoglitch); l’un est même utile puisqu’il [annonce la sortie de nouveaux articles de revues en archéologie](https://twitter.com/botarchaeo) et fait donc office d’assistant de recherche. Pour plus de réflexions sur le rôle joué par les bots en archéologie publique, voir ce [discours inaugural](https://electricarchaeology.ca/2017/04/27/bots-of-archaeology-machines-writing-public-archaeology/) tiré du [colloque Twitter sur l’archéologie publique](https://web.archive.org/web/20180131161516/https://publicarchaeologyconference.wordpress.com/)). # Préparation : que fera votre bot ? @@ -97,7 +97,7 @@ Commençons avec un bloc-notes et du papier. À l'école primaire, une activité et les élèves remplissaient les blancs comme demandé. C'était un peu bête et, surtout, c'était amusant. Les Twitterbots sont à ce type d'improvisation ce que les voitures de sport sont aux attelages de chevaux. Les blancs à remplir pourraient, par exemple, être des valeurs dans des graphiques vectoriels svg. Il pourrait s'agir de nombres dans des noms de fichiers numériques (et donc de liens aléatoires vers une base de données ouverte, par exemple). Cela pourrait même être des noms et des adverbes. Comme les bots Twitter vivent sur le web, les blocs de construction à assembler peuvent être autre chose que du texte, même si, pour l'instant, le texte est le plus facile à utiliser. -Nous allons commencer par esquisser une *grammaire de remplacement*. Cette grammaire s’appelle [Tracery.io](http://tracery.io) et ses conventions ont été développées par Kate Compton ([@galaxykate](https://twitter.com/galaxykate) sur Twitter). 
Elle s’utilise comme une bibliothèque [javascript](https://fr.wikipedia.org/wiki/JavaScript) dans des pages web, des jeux, et des bots. Une grammaire de remplacement fonctionne en grande partie comme les improvisations ci-dessus. +Nous allons commencer par esquisser une *grammaire de remplacement*. Cette grammaire s’appelle [Tracery.io](https://tracery.io) et ses conventions ont été développées par Kate Compton ([@galaxykate](https://twitter.com/galaxykate) sur Twitter). Elle s’utilise comme une bibliothèque [javascript](https://fr.wikipedia.org/wiki/JavaScript) dans des pages web, des jeux, et des bots. Une grammaire de remplacement fonctionne en grande partie comme les improvisations ci-dessus. Afin de clarifier d'abord ce que fait la _grammaire_, nous n'allons _pas_ créer un bot en histoire pour l'instant. Νous allons plutôt construire quelque chose de surréaliste pour montrer comment cette grammaire fonctionne. Imaginons que vous souhaitiez créer un bot qui parle avec la voix d'une plante en pot. Que pourrait-il bien dire ce bot que nous appelerons tout simplement _PlanteEnPot_? Notez quelques idées. @@ -219,7 +219,7 @@ Vous pouvez certes associer un bot à votre propre compte Twitter. Toutefois, si Normalement, quand on construit un bot Twitter, il faut créer [une application sur Twitter en tant que développeur ou développeuse](https://developer.twitter.com/)), obtenir les clés d'accès d'utilisateur/utilisatrice de l'API (Application Programming Interface, il s'agit de l'interface de programmation applicative), ainsi que le *token* (jeton). Ensuite, il faudrait programmer l'authentification pour que Twitter sache que le programme essayant d'accéder à la plate-forme est autorisé. 
-Heureusement, nous n'avons pas à faire tout cela, puisque George Buckenham a créé le site d'hébergement de bot [Cheap Bots Done Quick](http://cheapbotsdonequick.com/) (ce site web montre également la grammaire source en JSON pour un certain nombre de bots différents, ce qui peut vous donner des idées). Une fois que vous avez créé le compte Twitter de votre bot et que vous y êtes connecté, allez sur Cheap Bots Done Quick et cliquez sur le bouton `Sign in with Twitter`(*Connexion avec Twitter*). Le site vous redirigera vers Twitter pour approuver l'autorisation, puis vous ramènera à Cheap Bots Done Quick. +Heureusement, nous n'avons pas à faire tout cela, puisque George Buckenham a créé le site d'hébergement de bot [Cheap Bots Done Quick](https://cheapbotsdonequick.com/) (ce site web montre également la grammaire source en JSON pour un certain nombre de bots différents, ce qui peut vous donner des idées). Une fois que vous avez créé le compte Twitter de votre bot et que vous y êtes connecté, allez sur Cheap Bots Done Quick et cliquez sur le bouton `Sign in with Twitter`(*Connexion avec Twitter*). Le site vous redirigera vers Twitter pour approuver l'autorisation, puis vous ramènera à Cheap Bots Done Quick. Le JSON qui décrit votre bot peut être rédigé ou collé dans la case blanche principale qui se trouve en bas. Copiez le script que vous avez préparé dans Tracery depuis votre éditeur de texte et collez-le dans la case blanche principale. S'il y a des erreurs dans votre JSON, la fenêtre de résultat en bas deviendra rouge et le site essaiera de vous indiquer ce qui pose problème. Dans la plupart des cas, ce sera à cause d'une virgule ou d'un guillemet erronés ou égarés. Si vous cliquez sur le bouton d'actualisation à droite de la fenêtre de résultat (attention, il n'est PAS question ici du bouton d'actualisation de votre navigateur!), le site va générer un nouveau texte à partir de votre grammaire. 
@@ -239,7 +239,7 @@ Cheap Bots Done Quick est un service fourni par George Buckenham dans un esprit > Si vous créez un bot que je juge spammeux, injurieux ou désagréable d'une manière ou d'une autre (par exemple, en @mentionnant des personnes qui n'ont pas donné leur consentement, en publiant des insultes ou en proférant des calomnies), je le retirerai. -Darius Kazemi, l'un des grands artistes du bot, fournit davantage de conseils en matière de bonnes manières concernant les bots [ici](http://tinysubversions.com/2013/03/basic-twitter-bot-etiquette/). +Darius Kazemi, l'un des grands artistes du bot, fournit davantage de conseils en matière de bonnes manières concernant les bots [ici](https://tinysubversions.com/2013/03/basic-twitter-bot-etiquette/). # Aller plus loin avec Tracery Ce que nous avons décrit ici est suffisant pour vous permettre de vous lancer. Toutefois, beaucoup de bots sont plus compliqués que cela et il est possible d'en créer qui sont étonnamment efficaces en utilisant Tracery. @@ -259,7 +259,7 @@ Les modificateurs `.capitalize` et `.s` sont ajoutés à l'intérieur du `#` du ## Utiliser des emoji -Les emojis peuvent être utilisés avec beaucoup d'efficacité dans des bots Twitter. Vous pouvez copier et coller des emojis directement dans l'éditeur Cheap Bots Done Quick, en les plaçant chacun entre guillemets comme toute autre valeur qui vous sert de règle. Utilisez [cette liste](http://unicode.org/emoji/charts/full-emoji-list.html) pour repérer les emojis que vous souhaitez utiliser, en veillant à les copier/coller depuis la colonne Twitter pour vous assurer qu'ils vont bien s'afficher. +Les emojis peuvent être utilisés avec beaucoup d'efficacité dans des bots Twitter. Vous pouvez copier et coller des emojis directement dans l'éditeur Cheap Bots Done Quick, en les plaçant chacun entre guillemets comme toute autre valeur qui vous sert de règle. 
Utilisez [cette liste](https://unicode.org/emoji/charts/full-emoji-list.html) pour repérer les emojis que vous souhaitez utiliser, en veillant à les copier/coller depuis la colonne Twitter pour vous assurer qu'ils vont bien s'afficher. ## Réutilisation de symboles générés avec la fonctionnalité action @@ -303,7 +303,7 @@ Tracery lit le symbole `origin` (*N.D.L.R.: si vous travaillez sur un exemple en ## Répondre à des mentions dans Cheap Bots Done Quick -[Cheap Bots Done Quick](http://cheapbotsdonequick.com/) possède une fonctionnalité bêta qui permet à votre robot de répondre aux mentions. Attention, si vous créez deux bots configurés pour que l'un mentionne l'autre, la « conversation » qui s'ensuit peut durer très longtemps. A noter qu'il y a 5% de chances dans tout échange que le bot ne réponde pas, interrompant ainsi la conversation. +[Cheap Bots Done Quick](https://cheapbotsdonequick.com/) possède une fonctionnalité bêta qui permet à votre robot de répondre aux mentions. Attention, si vous créez deux bots configurés pour que l'un mentionne l'autre, la « conversation » qui s'ensuit peut durer très longtemps. A noter qu'il y a 5% de chances dans tout échange que le bot ne réponde pas, interrompant ainsi la conversation. Pour configurer un modèle de réponse, cliquez au bas de la page pour paramétrer le bouton pour répondre aux tweets (`Reply`). Dans la fenêtre de modification JSON qui apparaît, configurez le modèle pour les phrases auxquelles votre bot va répondre. 
Par exemple, voici ci-dessous une partie de ce que mon bot @tinyarchae détecte : @@ -331,7 +331,7 @@ Tout en bas de la page, vous pouvez tester vos mentions en écrivant un exemple {% include figure.html filename="bot-lesson-response.png" caption="Tester la réponse du bot" %} ## Graphiques SVG -Puisque le [SVG](https://fr.wikipedia.org/wiki/Scalable_Vector_Graphics) est un format de données qui décrit la géométrie d'un graphique vectoriel, Tracery peut être utilisé pour réaliser un travail plutôt artistique. Par exemple, il existe le bot [Tiny Space Adventure](https://twitter.com/TinyAdv) qui dessine un champ d'étoiles, un vaisseau spatial et un descriptif. Sa grammaire [peut être consultée ici](https://pastebin.com/YYtZnzZ0). Pour que le format SVG soit généré correctement, il est d'une importance capitale d'avoir paramétré correctement Tracery. N'hésitez donc pas de prendre comme modèle le code source du [bot softlandscapes](http://cheapbotsdonequick.com/source/softlandscapes) qui commence par définir le texte critique qui délimite le SVG : +Puisque le [SVG](https://fr.wikipedia.org/wiki/Scalable_Vector_Graphics) est un format de données qui décrit la géométrie d'un graphique vectoriel, Tracery peut être utilisé pour réaliser un travail plutôt artistique. Par exemple, il existe le bot [Tiny Space Adventure](https://twitter.com/TinyAdv) qui dessine un champ d'étoiles, un vaisseau spatial et un descriptif. Sa grammaire [peut être consultée ici](https://pastebin.com/YYtZnzZ0). Pour que le format SVG soit généré correctement, il est d'une importance capitale d'avoir paramétré correctement Tracery. 
N'hésitez donc pas à prendre comme modèle le code source du [bot softlandscapes](https://cheapbotsdonequick.com/source/softlandscapes) qui commence par définir le texte critique qui délimite le SVG : ``` "origin2": ["#preface##defs##bg##mountains##clouds##ending#"], @@ -351,12 +351,12 @@ Note : cette fonctionnalité est encore en développement, le bouton tweet sur c Les bots qui génèrent du SVG dépassent le cadre de cette leçon, mais une étude minutieuse des bots existants devrait pouvoir vous aider, si vous souhaitez approfondir cette question. ## Musique -À proprement parler, il ne s'agit plus de bots, mais comme la musique peut être écrite en texte, on peut utiliser Tracery pour composer de la musique et utiliser d'autres bibliothèques pour convertir cette notation en fichiers Midi. Pour aller plus loin, vous pouvez consulter [cet article-ci](http://www.codingblocks.net/videos/generating-music-in-javascript/) et mon [propre retour d'expérience](https://electricarchaeology.ca/2017/04/07/tracery-continues-to-be-awesome/). +À proprement parler, il ne s'agit plus de bots, mais comme la musique peut être écrite en texte, on peut utiliser Tracery pour composer de la musique et utiliser d'autres bibliothèques pour convertir cette notation en fichiers Midi. Pour aller plus loin, vous pouvez consulter [cet article-ci](https://www.codingblocks.net/videos/generating-music-in-javascript/) et mon [propre retour d'expérience](https://electricarchaeology.ca/2017/04/07/tracery-continues-to-be-awesome/). 
# Autres tutoriels et ressources sur les bots **En anglais:** -- Zach Whalen, [How to make a Twitter Bot with Google Spreadsheets](http://www.zachwhalen.net/posts/how-to-make-a-twitter-bot-with-google-spreadsheets-version-04/), site web de Zach Whalen, http://www.zachwhalen.net/, 7 mai 2015 +- Zach Whalen, [How to make a Twitter Bot with Google Spreadsheets](https://www.zachwhalen.net/posts/how-to-make-a-twitter-bot-with-google-spreadsheets-version-04/), site web de Zach Whalen, https://www.zachwhalen.net/, 7 mai 2015 - Casey Bergman, [Keeping Up With the Scientific Literature using Twitterbots: The FlyPapers Experiment](https://caseybergman.wordpress.com/2014/02/24/keeping-up-with-the-scientific-literature-using-twitterbots-the-flypapers-experiment/) (et aussi [ce repository de Robert Lanfear sur Github](https://github.com/roblanf/phypapers)). Cette méthode consiste à collecter les flux RSS des articles de revues, puis à utiliser un service tel que [Dlvr.it](https://dlvrit.com/) pour rediriger les liens vers un compte Twitter. - Abandonnée: Stefan Bohacek propose des modèles de code pour différents types de bots sur le site de remixage de code Glitch.com. Si vous vous rendez sur sa page, vous verrez une liste de différents types de bots. Séléctionnez-en un, cliquez sur le bouton `remix` puis étudiez attentivement la documentation `README.md` qui s'affiche sur la page. Glitch nécessite une identification (login) via un compte Github ou Facebook. - Enfin, je suggère de rejoindre le groupe BotMakers Slack pour découvrir d'autres tutoriels, des personnes partageant les mêmes intérêts, et d'autres ressources : [Inscrivez-vous ici](https://botmakers.org). 
diff --git a/fr/lecons/intro-donnees-ouvertes-liees.md b/fr/lecons/intro-donnees-ouvertes-liees.md index 7d24054228..9653109eb9 100644 --- a/fr/lecons/intro-donnees-ouvertes-liees.md +++ b/fr/lecons/intro-donnees-ouvertes-liees.md @@ -98,7 +98,7 @@ Les paires attribut-valeur peuvent également contenir de l’information sur d lieu=2655524 ``` -À ce stade, vous pourriez penser qu’il s’agit en fait de la fonction d’un catalogue de bibliothèque. En effet, il s’agit bien du concept de la [notice d’autorité](https://perma.cc/K9GL-R435), idée centrale en sciences de l’information (un fichier d’autorité est une liste normalisée de termes qui peuvent être utilisés dans un contexte particulier, par exemple pour le catalogage d’un livre). Dans les deux exemples esquissés plus haut, nous avons utilisé des fichiers d’autorité pour assigner des numéros (les identifiants uniques) aux Jack et à Blackburn. Les numéros utilisés pour les deux Jack Straw viennent du [Virtual International Authority File](https://perma.cc/4F7W-AUQN) (VIAF), qui est maintenu par un consortium international de bibliothèques afin de gérer les problèmes qui découlent de la myriade de possibilités dont nous disposons pour nous référer à une même personne. L’identifiant unique utilisé pour la circonscription de Blackburn est tiré de [GeoNames](http://www.geonames.org), une base de données géographique en accès libre. +À ce stade, vous pourriez penser qu’il s’agit en fait de la fonction d’un catalogue de bibliothèque. En effet, il s’agit bien du concept de la [notice d’autorité](https://perma.cc/K9GL-R435), idée centrale en sciences de l’information (un fichier d’autorité est une liste normalisée de termes qui peuvent être utilisés dans un contexte particulier, par exemple pour le catalogage d’un livre). Dans les deux exemples esquissés plus haut, nous avons utilisé des fichiers d’autorité pour assigner des numéros (les identifiants uniques) aux Jack et à Blackburn. 
Les numéros utilisés pour les deux Jack Straw viennent du [Virtual International Authority File](https://perma.cc/4F7W-AUQN) (VIAF), qui est maintenu par un consortium international de bibliothèques afin de gérer les problèmes qui découlent de la myriade de possibilités dont nous disposons pour nous référer à une même personne. L’identifiant unique utilisé pour la circonscription de Blackburn est tiré de [GeoNames](https://www.geonames.org), une base de données géographique en accès libre. Mais essayons d’être plus précis quant à ce que nous voulons dire par Blackburn dans cet exemple. Jack Straw représentait la circonscription parlementaire (un territoire représenté par un seul membre du parlement) de Blackburn dont les frontières ont changé à travers le temps. Le projet [Digging Into Linked Parliamentary Data](https://perma.cc/5VAA-8MHY)[^1], sur lequel j’ai travaillé, a produit des identifiants uniques pour les affiliations parlementaires et les circonscriptions de chaque membre du parlement. Dans cet exemple, Jack Straw représentait la circonscription connue comme « Blackburn » dans sa version existante à partir de 1955 : @@ -201,7 +201,7 @@ Une ontologie est plus flexible, car elle n’est pas strictement hiérarchique. Peu importe ce que vous cherchez à représenter avec les DOL, nous vous suggérons de trouver un vocabulaire existant et de l’utiliser, plutôt que de tenter de créer le vôtre. Vous trouvez plusieurs des vocabulaires les plus utilisés en consultant le site [Vocabulaires Ouverts](https://vocabulaires-ouverts.inrae.fr/). -Puisque l’exemple présenté plus haut se concentre sur les pianistes, il serait convenable de repérer une ontologie appropriée plutôt que de créer notre propre système. Justement, il existe une [ontologie pour la musique](http://web.archive.org/web/20170715094229/http://www.musicontology.com/) (en anglais). En plus d’offrir une spécification aboutie, la documentation propose aussi des exemples utiles d’utilisations courantes. 
Vous pouvez visiter les [pages d’introduction](http://web.archive.org/web/20170718143925/http://musicontology.com/docs/getting-started.html) (en anglais) pour vous faire une meilleure idée de la façon dont vous pourriez utiliser cette ontologie en particulier. +Puisque l’exemple présenté plus haut se concentre sur les pianistes, il serait convenable de repérer une ontologie appropriée plutôt que de créer notre propre système. Justement, il existe une [ontologie pour la musique](https://web.archive.org/web/20170715094229/http://www.musicontology.com/) (en anglais). En plus d’offrir une spécification aboutie, la documentation propose aussi des exemples utiles d’utilisations courantes. Vous pouvez visiter les [pages d’introduction](https://web.archive.org/web/20170718143925/http://musicontology.com/docs/getting-started.html) (en anglais) pour vous faire une meilleure idée de la façon dont vous pourriez utiliser cette ontologie en particulier. Malheureusement, je ne trouve rien qui décrive la relation entre un professeur et son élève dans la *Music Ontology*. Mais elle est publiée de façon ouverte, ce qui permet de l’utiliser pour décrire d’autres caractéristiques du domaine de la musique, puis éventuellement de créer ma propre extension de ce modèle. Si je publie ainsi ouvertement mon extension, d’autres pourront l’utiliser à leur tour s’ils le souhaitent et elle pourrait même devenir un standard. Si la *Music Ontology* n’offre pas la relation dont j’ai besoin, le [projet Linked Jazz](https://perma.cc/AGM6-H9BM) (en anglais) permet l’utilisation du terme `mentorOf` qui semble bien fonctionner dans notre cas. Ce n’est pas une solution idéale, mais c’en est une qui s’efforce d’utiliser ce qui existe déjà. @@ -305,7 +305,7 @@ lccn:no2010025398 dc:creator viaf:96994048 , Nous déclarons ici que Shakespeare (96994048) et John Fletcher (12323361) étaient ensemble les créateurs de l’œuvre *The Two Noble Kinsmen*. 
-Les ontologies que je vous ai suggérées précédemment vous ont permis de jeter un œil sur les exemples de la [Music Ontology](http://web.archive.org/web/20170718143925/http://musicontology.com/docs/getting-started.html). J’espère qu’ils ne vous ont pas découragé. +Les ontologies que je vous ai suggérées précédemment vous ont permis de jeter un œil sur les exemples de la [Music Ontology](https://web.archive.org/web/20170718143925/http://musicontology.com/docs/getting-started.html). J’espère qu’ils ne vous ont pas découragé. Examinez-les de nouveau. Cela demeure compliqué, mais ont-ils plus de sens maintenant ? *Friend of a Friend* ([FOAF](https://perma.cc/2Q8U-S2JY)) est l’une des ontologies les plus accessibles. Elle est conçue pour décrire des personnes et les relations entre elles. Pour cette raison, elle est assez intuitive. Par exemple, si vous souhaitez m’écrire pour me dire que cette leçon est la meilleure chose que vous ayez jamais lue, voici mon adresse courriel exprimée par un triplet avec FOAF : @@ -370,7 +370,7 @@ Vous souvenez-vous de la façon dont les prédicats et les objets sont imbriqué Si vous êtes familier avec XML, ce sera un jeu d’enfant pour vous. Autrement, vous pourriez préférer un format comme Turtle. Mais l’avantage de RDF/XML, c’est que vous pouvez utiliser les outils de l’écosystème XML, comme les éditeurs spécialisés et les processeurs XML permettant, par exemple, de vérifier que votre document est bien formé. Si vous n’êtes pas du type XML, je vous recommande Turtle. -Pour valider la syntaxe de Turtle, vous pourrez utiliser des outils en ligne ([Easy RDF Converter](http://www.easyrdf.org/converter) ou [IDLab Turtle Validator](http://ttl.summerofcode.be)) ou encore [TurtleValidator](https://github.com/IDLabResearch/TurtleValidator), un outil facile à utiliser en [ligne de commande](https://perma.cc/Z8KQ-YNQ6). 
+Pour valider la syntaxe de Turtle, vous pourrez utiliser des outils en ligne ([Easy RDF Converter](https://www.easyrdf.org/converter) ou [IDLab Turtle Validator](https://ttl.summerofcode.be)) ou encore [TurtleValidator](https://github.com/IDLabResearch/TurtleValidator), un outil facile à utiliser en [ligne de commande](https://perma.cc/Z8KQ-YNQ6). ## Explorer des données RDF avec SPARQL diff --git a/fr/lecons/introduction-a-heurist.md b/fr/lecons/introduction-a-heurist.md index ddbaf2f12d..68d0513c17 100644 --- a/fr/lecons/introduction-a-heurist.md +++ b/fr/lecons/introduction-a-heurist.md @@ -69,7 +69,7 @@ Pour les personnes souhaitant tester le logiciel hors ligne, il est également p Cette installation demande des compétences techniques minimales d'administration d'un serveur web pour pouvoir être effectuée.
    -Heurist s'appuie sur une conception [relationnelle](https://perma.cc/5MJU-FV2G) des données, mais simplifie certains aspects de cette modélisation afin de faciliter son utilisation. Nous abordons brièvement quelques concepts clés du modèle relationnel dans la partie [« Modélisation des données »](#modelisation). +Heurist s'appuie sur une conception [relationnelle](https://perma.cc/5MJU-FV2G) des données, mais simplifie certains aspects de cette modélisation afin de faciliter son utilisation. Nous abordons brièvement quelques concepts clés du modèle relationnel dans la partie [« Modélisation des données »](#modélisation-des-données). @@ -139,7 +139,7 @@ Heurist s'appuie sur une conception relationnelle des données, il est ainsi imp -- Une organisation structurée des connaissances : les éléments d'une base de données sont organisés entre eux à travers des concepts descriptifs définis qui peuvent prendre la forme de [vocabulaires contrôlés](#vocabulaires2) ou d'[ontologies](https://perma.cc/AJL5-H5MK). Cela permet une cohérence dans la manière de décrire les objets au sein d'une même base de données ou d'un collectif de travail. +- Une organisation structurée des connaissances : les éléments d'une base de données sont organisés entre eux à travers des concepts descriptifs définis qui peuvent prendre la forme de [vocabulaires contrôlés](#vocabulaires2) ou d'[ontologies](https://perma.cc/AJL5-H5MK). Cela permet une cohérence dans la manière de décrire les objets au sein d'une même base de données ou d'un collectif de travail.
    La conception intellectuelle d'une base de données fait partie intégrante de la réflexion scientifique. Elle doit être effectuée en amont de la modélisation dans Heurist et aura un impact significatif sur les résultats et analyses qui résulteront de son exploitation. Toutefois, Heurist apporte une aide non négligeable pour débuter dans la conception d'une base de données, en mettant à disposition un ensemble de types d'enregistrement qui peut servir de socle à un nombre important de base de données en SHS (cf. Fig. 3). Par ailleurs, Heurist offre une grande souplesse dans la conception et l'alimentation des bases de données qu'il héberge, offrant ainsi une fonctionnalité de type « bac à sable », permettant de faire et défaire au gré de l'évolution conceptuelle de la base de données. @@ -223,7 +223,7 @@ Nous ne détaillerons pas ici la formalisation de la notation de ces cardinalit Nous venons de rencontrer le cas des thèmes ou périodes qui pouvaient décrire une même intervention. De même, **Type d'intervention** fait référence à un vocabulaire, même si celui-ci compte uniquement deux termes. -Nous avons déjà abordé rapidement ce point dans la partie sur la [cohérence des données](#vocabulaires1). +Nous avons déjà abordé rapidement ce point dans la partie sur la [cohérence des données](#vocabulaires1). Fonctionnellement, les [vocabulaires contrôlés](https://perma.cc/4ESL-P4UZ) s'opposent à une saisie textuelle libre. 
Concrètement, il s'agit de lister, de catégoriser et de hiérarchiser des concepts, en nombre fini, afin d'éviter certains biais courants lors des saisies textuelles libres tels que : - L'incohérence orthographique (M ≠ m, Moyen-Âge ≠ Moyen Âge) diff --git a/fr/lecons/introduction-a-la-stylometrie-avec-python.md b/fr/lecons/introduction-a-la-stylometrie-avec-python.md index 7f91476781..ed3d09aa4f 100644 --- a/fr/lecons/introduction-a-la-stylometrie-avec-python.md +++ b/fr/lecons/introduction-a-la-stylometrie-avec-python.md @@ -66,7 +66,7 @@ Ce tutoriel utilise un jeu de données et des logiciels que vous devrez téléch ### Le jeu de données -Pour compléter les exercices de ce tutoriel, vous devrez télécharger et ouvrir l'archive des _Federalist Papers_ [.zip](/assets/introduction-to-stylometry-with-python/stylometry-federalist.zip) qui contient les 85 articles dont nous aurons besoin pour effectuer notre analyse. L'archive contient également le [livre électronique du Projet Gutenberg](http://www.gutenberg.org/cache/epub/1404/pg1404.txt) dont ces 85 documents ont été extraits. L'ouverture du fichier .zip créera un [répertoire](https://fr.wikipedia.org/wiki/R%C3%A9pertoire_(informatique)) nommé `data` dans votre répertoire de travail courant. Assurez-vous de rester dans ce répertoire de travail courant et d'y sauvegarder tout le travail que vous réaliserez en suivant le tutoriel. +Pour compléter les exercices de ce tutoriel, vous devrez télécharger et ouvrir l'archive des _Federalist Papers_ [.zip](/assets/introduction-to-stylometry-with-python/stylometry-federalist.zip) qui contient les 85 articles dont nous aurons besoin pour effectuer notre analyse. L'archive contient également le [livre électronique du Projet Gutenberg](https://www.gutenberg.org/cache/epub/1404/pg1404.txt) dont ces 85 documents ont été extraits. 
L'ouverture du fichier .zip créera un [répertoire](https://fr.wikipedia.org/wiki/R%C3%A9pertoire_(informatique)) nommé `data` dans votre répertoire de travail courant. Assurez-vous de rester dans ce répertoire de travail courant et d'y sauvegarder tout le travail que vous réaliserez en suivant le tutoriel. ### Le logiciel @@ -80,11 +80,11 @@ Certaines de ces ressources peuvent être absentes de votre ordinateur. Si vous ## Quelques notes au sujet des langues -Ce tutoriel applique des méthodes d'analyse stylométrique à un ensemble de textes rédigés en anglais à l'aide d'un module Python nommé `nltk`. Plusieurs des fonctions offertes par `nltk` sont cependant disponibles dans d'autres langues. Pour peu qu'une langue écrite divise ses mots de façon claire et précise, `nltk` devrait fonctionner correctement. Les langues pour lesquelles il n'y a pas de séparation nette entre les mots à l'écrit, comme par exemple le chinois, pourraient poser problème. J'ai utilisé `nltk` avec des textes français sans difficulté; les autres langues qui utilisent des [signes diacritiques](https://fr.wikipedia.org/wiki/Diacritique), comme l'espagnol et l'allemand, devraient être compatibles avec `nltk` elles aussi. Veuillez consulter la [documentation de nltk (en anglais seulement)](http://www.nltk.org/book/) pour plus de détails. +Ce tutoriel applique des méthodes d'analyse stylométrique à un ensemble de textes rédigés en anglais à l'aide d'un module Python nommé `nltk`. Plusieurs des fonctions offertes par `nltk` sont cependant disponibles dans d'autres langues. Pour peu qu'une langue écrite divise ses mots de façon claire et précise, `nltk` devrait fonctionner correctement. Les langues pour lesquelles il n'y a pas de séparation nette entre les mots à l'écrit, comme par exemple le chinois, pourraient poser problème. 
J'ai utilisé `nltk` avec des textes français sans difficulté; les autres langues qui utilisent des [signes diacritiques](https://fr.wikipedia.org/wiki/Diacritique), comme l'espagnol et l'allemand, devraient être compatibles avec `nltk` elles aussi. Veuillez consulter la [documentation de nltk (en anglais seulement)](https://www.nltk.org/book/) pour plus de détails. Une seule des tâches de ce tutoriel exige du code qui varie en fonction de la langue. Pour diviser un texte en un ensemble de mots en français ou en espagnol, vous devrez spécifier la langue appropriée à [l'analyseur lexical](https://fr.wikipedia.org/wiki/Analyse_lexicale#Analyseur_lexical) de `nltk`. La procédure à suivre sera expliquée au moment venu. -Enfin, veuillez noter que certaines tâches linguistiques, comme [l'étiquetage grammatical](https://fr.wikipedia.org/wiki/%C3%89tiquetage_morpho-syntaxique) des mots, peuvent ne pas être supportées par `nltk` dans les langues autres que l'anglais. Ce tutoriel ne couvre pas l'étiquetage grammatical. Si vos propres projets en ont besoin, veuillez consulter la [documentation de nltk](http://www.nltk.org/book/) pour obtenir des conseils. +Enfin, veuillez noter que certaines tâches linguistiques, comme [l'étiquetage grammatical](https://fr.wikipedia.org/wiki/%C3%89tiquetage_morpho-syntaxique) des mots, peuvent ne pas être supportées par `nltk` dans les langues autres que l'anglais. Ce tutoriel ne couvre pas l'étiquetage grammatical. Si vos propres projets en ont besoin, veuillez consulter la [documentation de nltk](https://www.nltk.org/book/) pour obtenir des conseils. # Les _Federalist Papers_ - Contexte historique @@ -344,7 +344,7 @@ Cependant, le khi carré constitue toujours une méthode approximative. Par exem Dans certaines langues, il peut être utile d'étiqueter grammaticalement les occurrences de mots avant de les compter, pour que les occurrences de certains mots polysémiques puissent être divisées entre deux traits distincts. 
Par exemple, en français, les mots "le" et "la" servent à la fois d'articles et de pronoms. Ce tutoriel n'applique pas l'étiquetage grammatical puisqu'il est rarement utile pour l'analyse stylométrique de textes en anglais contemporain et parce que l'analyseur syntaxique de `nltk` ne gère pas très bien les autres langues. -Si vous avez besoin d'étiqueter les occurrences pour vos propres projets, il est possible de télécharger des analyseurs extérieurs, d'obtenir des outils séparés comme [Tree Tagger](http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/), ou même d'entraîner vos propres modèles d'étiquetage, mais ces techniques sont hors du cadre de ce tutoriel. +Si vous avez besoin d'étiqueter les occurrences pour vos propres projets, il est possible de télécharger des analyseurs extérieurs, d'obtenir des outils séparés comme [Tree Tagger](https://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/), ou même d'entraîner vos propres modèles d'étiquetage, mais ces techniques sont hors du cadre de ce tutoriel. # Troisième test stylométrique : la méthode du Delta de John Burrows (Concepts avancés) @@ -631,7 +631,7 @@ La première itération de ce projet a été développée dans le cadre des sém [^16]: John Burrows, "'Delta': a Measure of Stylistic Difference and a Guide to Likely Authorship", _Literary and Linguistic Computing_, vol. 17, no. 3 (2002), p. 267-287. -[^17]: José Calvo Tello, “Entendiendo Delta desde las Humanidades,” _Caracteres_, 27 mai 2016, http://revistacaracteres.net/revista/vol5n1mayo2016/entendiendo-delta/. +[^17]: José Calvo Tello, “Entendiendo Delta desde las Humanidades,” _Caracteres_, 27 mai 2016, https://revistacaracteres.net/revista/vol5n1mayo2016/entendiendo-delta/. [^18]: Stefan Evert et al., "Understanding and explaining Delta measures for authorship attribution", _Digital Scholarship in the Humanities_, vol. 32, no. suppl_2 (2017), p. ii4-ii16. 
diff --git a/fr/lecons/introduction-aux-carnets-jupyter-notebooks.md b/fr/lecons/introduction-aux-carnets-jupyter-notebooks.md index d4dc90fce0..6738453038 100644 --- a/fr/lecons/introduction-aux-carnets-jupyter-notebooks.md +++ b/fr/lecons/introduction-aux-carnets-jupyter-notebooks.md @@ -141,7 +141,7 @@ Veuillez noter qu'il ne s'agit pas de la seule manière de faire apparaître des ### Créer un nouveau carnet - + Nous allons maintenant créer un nouveau carnet dans votre répertoire *carnets* afin de convertir les dates de votre projet de recherche. Cliquez sur le bouton *New* (*nouveau*) dans le coin supérieur-droit de votre interface Jupyter Notebook. Si vous venez d'installer Anaconda en suivant la procédure décrite ci-dessus, vous n'aurez pas d'autre choix que de créer un carnet Jupyter qui utilise le *noyau* Python 3 (le noyau est l'infrastructure cachée qui exécute le code du carnet) mais nous expliquerons bientôt comment ajouter des noyaux pour d'autres langages de programmation. Cliquez sur *Python 3* et Jupyter Notebook ouvrira dans votre navigateur web un onglet où s'affichera votre nouveau carnet. Par défaut, celui-ci s'appellera *Untitled* ("sans titre"); vous pouvez cliquer sur ce mot au haut de l'écran pour donner à votre carnet un nom plus significatif. {% include figure.html filename="jupyter-createnew.png" caption="Création d'un nouveau carnet Jupyter" %} @@ -359,7 +359,7 @@ Enfin, si vous employez déjà les carnets Jupyter pour documenter les flots de Il existe plusieurs cours et ateliers d'introduction à la programmation en Python pour les humanités numériques, dont [Introduction à Python et au développement web avec Python pour les sciences humaines](https://github.com/PonteIneptique/cours-python) de Thibault Clérice, qui traduit des contenus développés par Matthew Munson. 
Les carnets Jupyter servent aussi couramment dans les ateliers d'analyse textuelle, comme celui portant sur le plongement lexical qui a été animé par Eun Seo Jo, Javier de la Rosa et Scott Bailey [lors du congrès DH 2018](https://github.com/sul-cidr/dh2018-word-vector-workshops). - + Enseigner avec les carnets Jupyter n'exige pas forcément que l'on investisse le temps nécessaire pour télécharger et installer Anaconda, surtout si vous prévoyez que seulement une ou deux leçons utiliseront des carnets. Si vos activités en classe impliquent l'utilisation de données que vous avez préparées au préalable et que vous avez déjà rédigé au moins une partie du code, il pourrait être avantageux d'exécuter vos carnets dans des environnements infonuagiques gratuits - à condition que vos étudiant(e)s disposent d'une connexion Internet fiable en classe. Exécuter des carnets dans le nuage présente aussi l'avantage d'assurer un environnement de travail identique à tous et à toutes (ce qui vous évitera d'avoir à gérer les différences entre Windows et Mac), en plus d'offrir aux étudiant(e)s un moyen de participer même si leurs ordinateurs ne disposent pas de l'espace disque ou de la mémoire nécessaires pour exécuter Anaconda efficacement. Notez qu'il vaut mieux faire appel à votre moteur de recherche favori pour connaître la liste des environnements infonuagiques susceptibles d'accueillir vos carnets Jupyter puisque la liste des options varie constamment. Parmi les sites qui ont acquis une certaine popularité en milieu universitaire, notons [MyBinder](https://mybinder.org/), qui accepte un dépôt GitHub contenant des carnets Jupyter (fichiers .ipynb), les fichiers de données associés (images intégrées, jeux de données auxquels appliquer le code, etc.) et l'information au sujet des modules nécessaires et autres dépendances (dans un fichier intitulé requirements.txt ou environment.yml) et en fait un exécutable capable de rouler sur un serveur infonuagique. 
Une fois que MyBinder aura encapsulé votre dépôt GitHub, vous pourrez ajouter un « badge » Binder au fichier lisez-moi de votre dépôt. Quiconque visitera votre dépôt pourra lancer le carnet directement à partir de son navigateur web sans devoir télécharger ni installer quoi que ce soit. @@ -417,7 +417,7 @@ Qu'il s'agisse d'expérimenter avec la programmation, de documenter les processu [^2]: Millman, KJ et Fernando Perez. 2014. « Developing open source scientific practice », dans *Implementing Reproducible Research*, édité par Victoria Stodden, Friedrich Leisch et Roger D. Peng. https://osf.io/h9gsd/ -[^3]: Sinclair, Stéfan et Geoffrey Rockwell. 2013. « Voyant Notebooks: Literate Programming and Programming Literacy ». Journal of Digital Humanities, Vol. 2, No. 3 Été 2013. http://journalofdigitalhumanities.org/2-3/voyant-notebooks-literate-programming-and-programming-literacy/ +[^3]: Sinclair, Stéfan et Geoffrey Rockwell. 2013. « Voyant Notebooks: Literate Programming and Programming Literacy ». Journal of Digital Humanities, Vol. 2, No. 3 Été 2013. https://journalofdigitalhumanities.org/2-3/voyant-notebooks-literate-programming-and-programming-literacy/ [^4]: Haley Di Pressi, Stephanie Gorman, Miriam Posner, Raphael Sasayama et Tori Schmitt, avec la collaboration de Roderic Crooks, Megan Driscoll, Amy Earhart, Spencer Keralis, Tiffany Naiman et Todd Presner. « A Student Collaborator's Bill of Rights ». 
https://humtech.ucla.edu/news/a-student-collaborators-bill-of-rights/ diff --git a/fr/lecons/introduction-et-installation.md b/fr/lecons/introduction-et-installation.md index 11bbfe7722..9b4a802273 100644 --- a/fr/lecons/introduction-et-installation.md +++ b/fr/lecons/introduction-et-installation.md @@ -42,7 +42,7 @@ Dans cette leçon d'introduction, vous allez installer le [langage de programmat Le langage de programmation Python ------------------------------- -Dans le cadre de cette série de leçons, nous utiliserons Python, un langage de programmation gratuit et à code ouvert. À moins d'indications contraires, nous utiliserons la **version 3** de Python, la version 2 n'étant plus soutenue. Il se pourrait cependant que vous rencontriez du code Python 2 dans des projets ou des tutoriels plus anciens. [Python 3 présente des différences avec ses prédécesseurs](http://sebastianraschka.com/Articles/2014_python_2_3_key_diff.html) - à titre de comparaison, imaginez une langue dont les règles de grammaire évoluent avec le temps. Méfiez-vous donc d'exemples que vous pourriez trouver en ligne utilisant Python 2, car il est possible qu'ils ne fonctionnent pas en Python 3. +Dans le cadre de cette série de leçons, nous utiliserons Python, un langage de programmation gratuit et à code ouvert. À moins d'indications contraires, nous utiliserons la **version 3** de Python, la version 2 n'étant plus soutenue. Il se pourrait cependant que vous rencontriez du code Python 2 dans des projets ou des tutoriels plus anciens. [Python 3 présente des différences avec ses prédécesseurs](https://sebastianraschka.com/Articles/2014_python_2_3_key_diff.html) - à titre de comparaison, imaginez une langue dont les règles de grammaire évoluent avec le temps. Méfiez-vous donc d'exemples que vous pourriez trouver en ligne utilisant Python 2, car il est possible qu'ils ne fonctionnent pas en Python 3. Sauvegardez votre travail! 
----------------- @@ -60,11 +60,11 @@ Pour utiliser les techniques présentées ici, vous devrez d'abord télécharger - [Installation de Python pour Windows](/fr/lecons/installation-windows-py) - [Installation de Python pour Linux](/en/lessons/linux-installation) - [langage de programmation Python]: http://www.python.org/ - [l'analyseur HTML / XML Beautiful Soup]: http://www.crummy.com/software/BeautifulSoup/ + [langage de programmation Python]: https://www.python.org/ + [l'analyseur HTML / XML Beautiful Soup]: https://www.crummy.com/software/BeautifulSoup/ [Komodo Edit]: https://github.com/ActiveState/OpenKomodoIDE [éditeurs compatibles avec Python]: https://wiki.python.org/python/PythonEditors - [Zotero]: http://www.zotero.org/ + [Zotero]: https://www.zotero.org/ [Jungle Disk]: https://www.jungledisk.com/ [Dropbox]: https://www.dropbox.com/ [Affichage des fichiers HTML]: /lessons/viewing-html-files diff --git a/fr/lecons/manipuler-chaines-caracteres-python.md b/fr/lecons/manipuler-chaines-caracteres-python.md index cb61798696..6ab3637b02 100644 --- a/fr/lecons/manipuler-chaines-caracteres-python.md +++ b/fr/lecons/manipuler-chaines-caracteres-python.md @@ -213,7 +213,7 @@ Deux autres séquences d'échappement vous permettent d'imprimer des tabulateurs ``` ## Bibliographie -- Mark Lutz, *[Learning Python](http://www.worldcat.org/oclc/1061273329)* +- Mark Lutz, *[Learning Python](https://www.worldcat.org/oclc/1061273329)* - Ch. 7: Strings *(Chaînes de caractères)* - Ch. 8: Lists and Dictionaries *(Listes et dictionnaires)* - Ch. 
10: Introducing Python Statements *(Introduction aux déclarations en Python)* diff --git a/fr/lecons/nettoyer-ses-donnees-avec-openrefine.md b/fr/lecons/nettoyer-ses-donnees-avec-openrefine.md index 518f82f751..930f66c801 100644 --- a/fr/lecons/nettoyer-ses-donnees-avec-openrefine.md +++ b/fr/lecons/nettoyer-ses-donnees-avec-openrefine.md @@ -132,13 +132,13 @@ Une fois vos données nettoyées, vous pouvez passer à l'étape suivante et exp Si vous devez vous souvenir d'une seule chose de ce cours, ce doit être celle-ci : *toutes les données sont sales, mais vous pouvez y faire quelque chose*. Comme nous l'avons montré ici, il y a déjà beaucoup de choses que vous pouvez faire par vous-mêmes pour améliorer la qualité de vos données. Vous avez ainsi appris comment avoir un rapide aperçu du nombre de valeurs vides que contient votre jeu de données et à quelle fréquence une valeur particulière (par exemple un mot-clé) est utilisée dans une collection. Ces cours vous ont également montré comment résoudre des problèmes récurrents tels que les doublons et les incohérences orthographiques de manière automatisée à l'aide d'*OpenRefine*. 
-[*OpenRefine*]: http://openrefine.org "OpenRefine" +[*OpenRefine*]: https://openrefine.org "OpenRefine" [Powerhouse museum]: https://powerhouse.com.au/ "Powerhouse museum" [*Potter’s Wheel ABC*]: https://perma.cc/Q6QD-E64N "Potter's Wheel ABC " - [*Wrangler*]: http://vis.stanford.edu/papers/wrangler/ "Wrangler" + [*Wrangler*]: https://vis.stanford.edu/papers/wrangler/ "Wrangler" [profilage]: https://fr.wikipedia.org/wiki/Data_profiling [reconnaissance d'entités nommées]: https://fr.wikipedia.org/wiki/Reconnaissance_d%27entit%C3%A9s_nomm%C3%A9es - [Bibliothèque du Congrès]: http://www.loc.gov/index.html "Bibliothèque du Congrès" + [Bibliothèque du Congrès]: https://www.loc.gov/index.html "Bibliothèque du Congrès" [OCLC]: https://www.oclc.org/fr/home.html "OCLC" [site web]: https://api.maas.museum/docs "site web" [licence Creative Commons Attribution - Partage dans les Mêmes Conditions]: https://creativecommons.org/licenses/by-sa/4.0/deed.fr diff --git a/fr/lecons/normaliser-donnees-textuelles-python.md b/fr/lecons/normaliser-donnees-textuelles-python.md index 27e6126bcb..d88365ceb7 100644 --- a/fr/lecons/normaliser-donnees-textuelles-python.md +++ b/fr/lecons/normaliser-donnees-textuelles-python.md @@ -96,7 +96,7 @@ En effet, la fonction `stripTags()` du module `obo.py` retourne une chaine de ca Modifier `html-to-list1.py` pour y appliquer la méthode `lower()` à `obo.stripTags(html)` :
    -Attention : à cause des modifications faites au site du Old Bailey Online depuis la publication de cette leçon, le lien http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33 ne fonctionnera plus dans le code ci-dessous. Vous avez deux options pour contourner le problème . Si vous suivez actuellement cette leçon en utilisant un autre site qui fonctionne, vous pouvez simplement remplacer le lien du Old Bailey Online avec votre propre lien correspondant (en d'autres termes, il suffit de modifier la variable url) : +Attention : à cause des modifications faites au site du Old Bailey Online depuis la publication de cette leçon, le lien https://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33 ne fonctionnera plus dans le code ci-dessous. Vous avez deux options pour contourner le problème. Si vous suivez actuellement cette leçon en utilisant un autre site qui fonctionne, vous pouvez simplement remplacer le lien du Old Bailey Online avec votre propre lien correspondant (en d'autres termes, il suffit de modifier la variable url) :
    ``` python @@ -193,11 +193,11 @@ wordlist = obo.stripNonAlphaNum(text) print(wordlist) ``` -En exécutant le programme et en regardant ce qu'il en ressort dans le panneau `Command Output`, vous verrez qu'il fait plutôt du bon travail. Ce code sépare les mots composés avec un trait d'union comme _coach-wheels_ en deux mots, et compte le possessif anglais _'s_ ou la forme _o'clock_ comme des mots distincts, en retirant l'apostrophe. Il s'agit cependant d'une approximation satisfaisante de ce que nous voulions obtenir, et nous pouvons continuer d'avancer vers nos mesures de fréquences avant d'essayer de l'améliorer. (Si les sources sur lesquelles vous travaillez sont dans plus d'une langue, vous aurez besoin d'en apprendre plus sur le standard [Unicode](https://home.unicode.org/) et sur sa [prise en charge Python](https://web.archive.org/web/20180502053841/http://www.diveintopython.net/xml_processing/unicode.html).) +En exécutant le programme et en regardant ce qu'il en ressort dans le panneau `Command Output`, vous verrez qu'il fait plutôt du bon travail. Ce code sépare les mots composés avec un trait d'union comme _coach-wheels_ en deux mots, et compte le possessif anglais _'s_ ou la forme _o'clock_ comme des mots distincts, en retirant l'apostrophe. Il s'agit cependant d'une approximation satisfaisante de ce que nous voulions obtenir, et nous pouvons continuer d'avancer vers nos mesures de fréquences avant d'essayer de l'améliorer. (Si les sources sur lesquelles vous travaillez sont dans plus d'une langue, vous aurez besoin d'en apprendre plus sur le standard [Unicode](https://home.unicode.org/) et sur sa [prise en charge Python](https://web.archive.org/web/20180502053841/https://www.diveintopython.net/xml_processing/unicode.html).) 
## Pour aller plus loin -Si vous souhaitez pratiquer davantage les expressions régulières, le chapitre 7 de [Dive into Python](https://web.archive.org/web/20180416143856/http://www.diveintopython.net/regular_expressions/index.html) de Mark Pilgrim peut être un bon entrainement. +Si vous souhaitez pratiquer davantage les expressions régulières, le chapitre 7 de [Dive into Python](https://web.archive.org/web/20180416143856/https://www.diveintopython.net/regular_expressions/index.html) de Mark Pilgrim peut être un bon entrainement. ### Synchronisation du code diff --git a/fr/lecons/preserver-ses-donnees-de-recherche.md b/fr/lecons/preserver-ses-donnees-de-recherche.md index 84e8c00779..f722a54106 100644 --- a/fr/lecons/preserver-ses-donnees-de-recherche.md +++ b/fr/lecons/preserver-ses-donnees-de-recherche.md @@ -89,7 +89,7 @@ Le moment de la documentation dépend beaucoup de l'individu et du rythme de ses Les données de la recherche et la documentation devraient dans l'idéal être sauvegardées dans des formats ouverts, qui sont [compatibles avec tous les systèmes d'exploitation][], comme .txt pour les notes et .csv (comma-separated values) ou .tsv (tab-seperated values) pour les données tabulées. Ces formats texte brut sont préférables aux formats propriétaires utilisés par défaut avec Microsoft Office ou iWork parce qu'ils peuvent être ouverts avec de nombreux logiciels et ont une forte chance de rester lisibles et modifiables dans le futur. La plupart des suites bureautiques standards incluent une option permettant de sauvegarder les fichiers dans les formats .txt, .csv et .tsv, ce qui signifie que vous pouvez continuer à travailler avec vos logiciels habituels tout en faisant ce qu'il faut pour préserver votre travail. Comparés à du .doc ou du .xls, ces formats ont en plus l'atout, dans une perspective de préservation, de ne contenir que des éléments lisibles par la machine. 
Bien que l'utilisation des caractères gras, italiques ou colorés pour signifier des titres ou établir des connections visuelles entre des données soit une pratique courante, ces annotations servent à l'affichage sans être lisibles par les machines et ne peuvent pas être interrogées ni fouillées. Elles ne sont pas non plus appropriées pour les grandes quantités d'informations. Il est préférable d'utiliser des schémas de notations simples comme des doubles astérisques ou des triples croisillons pour représenter des caractéristiques: dans mes notes, par exemple, trois points d'interrogations indiquent un point que je dois suivre, j'ai choisi "???' car cette suite peut être facilement trouvée avec une recherche CTRL+F. -Dans de nombreuses occasions, il est probable que ces schémas de notation émergent de la pratique individuelle (et doivent par conséquent être documentés), alors qu'il existe des schémas comme le [Markdown][] de [GitHub](https://github.com) (les fichiers Markdown sont enregistrés au format .md). Un excellent aide-mémoire au Markdown est disponible sur [GitHub](https://github.com/adam-p/markdown-here) pour ceux qui veulent suivre - ou adapter - le schéma existant. [Notepad++](http://notepad-plus-plus.org) est recommandé pour les utilisateurs de Windows, bien que nullement essentiel pour travailler avec des fichiers .md. Les utilisateurs de Mac ou d'Unix peuvent utiliser [Komodo Edit](https://github.com/ActiveState/OpenKomodoIDE) ou [Text Wrangler](https://www.barebones.com/support/textwrangler/updates.html). +Dans de nombreuses occasions, il est probable que ces schémas de notation émergent de la pratique individuelle (et doivent par conséquent être documentés), alors qu'il existe des schémas comme le [Markdown][] de [GitHub](https://github.com) (les fichiers Markdown sont enregistrés au format .md). 
Un excellent aide-mémoire au Markdown est disponible sur [GitHub](https://github.com/adam-p/markdown-here) pour ceux qui veulent suivre - ou adapter - le schéma existant. [Notepad++](https://notepad-plus-plus.org) est recommandé pour les utilisateurs de Windows, bien que nullement essentiel pour travailler avec des fichiers .md. Les utilisateurs de Mac ou d'Unix peuvent utiliser [Komodo Edit](https://github.com/ActiveState/OpenKomodoIDE) ou [Text Wrangler](https://www.barebones.com/support/textwrangler/updates.html). * * * * * @@ -117,22 +117,22 @@ L'examen des URL est un bon moyen de réfléchir à la raison pour laquelle la s Les URL utilisés par les sites d’informations ou les blogues en sont un exemple typique. Les URL Wordpress suivent le format suivant : - *nom du site Web*/*année (4 chiffres)*/*mois (2 chiffres)*/*jour (2 chiffres)*/*mots du titre séparés par des traits d'union* -- +- Un usage similaire est utilisé par les agences de presse telles que le journal *The Guardian* : - *nom du site Web*/*section de section*/*année (4 chiffres)*/*mois (3 caractères)*/*jour (2 chiffres)*/*mots-descripteurs-contenus-séparés-par-tirets* -- +- De leur côté, les catalogues d’archives utilisent souvent des URL structurées avec un seul élément de données. La British Cartoon Archive structure ses archives en ligne en utilisant le format : - *nom du site*/record/*numéro de référence* -- +- Et l'Old Bailey (la Haute Cour criminelle britannique) utilise le format : - *nom du site*/browse.jsp?ref=*numéro de référence* -- +- Ce que nous apprenons de ces exemples, c’est qu’une combinaison de description parlante et d’éléments de données rend les structures de données cohérentes et intuitives, lisibles à la fois par les humains et par les machines. 
Appliqué aux données numériques accumulées au cours de recherches historiques, cela facilite la navigation, la recherche et l'interrogation des données de recherche à l'aide des outils standard fournis par nos systèmes d'exploitation (et, comme nous le verrons dans une prochaine leçon, d'outils plus perfectionnés). @@ -245,11 +245,11 @@ blog (17 octobre 2013) Hitchcock, Tim, 'Judging a book by its URLs', Historyonics blog (3 janvier 2014) - + Howard, Sharon, 'Unclean, unclean! What historians can do about sharing our messy research data', Early Modern Notes blog (18 mai 2013) - + Noble, William Stafford, A Quick Guide to Organizing Computational Biology Projects.PLoSComputBiol 5(7): e1000424 (2009) @@ -262,7 +262,7 @@ Information Management: Organising Humanities Material' (2011) Pennock, Maureen, 'The Twelve Principles of Digital Preservation (and a cartridge in a repository…)', British Library Collection Care blog (3 septembre 2013) - + Pritchard, Adam, 'Markdown Cheatsheet' (2013) diff --git a/fr/lecons/publier-archives-tei-ceteicean.md b/fr/lecons/publier-archives-tei-ceteicean.md index c6ee1315aa..64e3653fb8 100644 --- a/fr/lecons/publier-archives-tei-ceteicean.md +++ b/fr/lecons/publier-archives-tei-ceteicean.md @@ -39,7 +39,7 @@ Pour les personnes qui débutent avec la TEI, l'un des obstacles les plus couran Ce tutoriel vous guidera à travers les étapes nécessaires pour publier un fichier TEI en ligne en utilisant CETEIcean, une bibliothèque ouverte écrite dans le langage de programmation JavaScript. CETEIcean permet d'afficher les documents TEI dans un navigateur web sans les transformer au préalable en HTML. CETEIcean charge le fichier TEI dynamiquement dans le navigateur et convertit les éléments TEI en éléments HTML, de sorte que ceux-ci nous permettent de visualiser dans le navigateur web les phénomènes textuels que nous balisons dans nos fichiers en utilisant la TEI. 
-Tout d'abord, une clarification concernant la visualisation de votre travail : la méthode par défaut de CETEIcean pour afficher les fichiers TEI consiste à charger les fichiers depuis un autre emplacement. Cependant, tous les navigateurs ne vous permettront pas de charger les fichiers s'ils sont stockés sur votre ordinateur. Vous pouvez essayer, mais si cela ne fonctionne pas, vous devrez générer un serveur local, placer les fichiers sur un serveur en ligne, ou utiliser un éditeur de code avec des fonctions de prévisualisation. Pour ce tutoriel, nous suivrons cette dernière option, car nous utiliserons l'éditeur [Visual Studio Code](https://code.visualstudio.com/), avec l'extension *HTML Preview*. Néanmoins, il existe d'autres options libres pour éditer des fichiers TEI et générer des prévisualisations HTML, comme [jEdit](http://www.jedit.org/) ou [Atom](https://atom.io), ainsi que des versions propriétaires comme [Oxygen](https://www.oxygenxml.com/). +Tout d'abord, une clarification concernant la visualisation de votre travail : la méthode par défaut de CETEIcean pour afficher les fichiers TEI consiste à charger les fichiers depuis un autre emplacement. Cependant, tous les navigateurs ne vous permettront pas de charger les fichiers s'ils sont stockés sur votre ordinateur. Vous pouvez essayer, mais si cela ne fonctionne pas, vous devrez générer un serveur local, placer les fichiers sur un serveur en ligne, ou utiliser un éditeur de code avec des fonctions de prévisualisation. Pour ce tutoriel, nous suivrons cette dernière option, car nous utiliserons l'éditeur [Visual Studio Code](https://code.visualstudio.com/), avec l'extension *HTML Preview*. Néanmoins, il existe d'autres options libres pour éditer des fichiers TEI et générer des prévisualisations HTML, comme [jEdit](https://www.jedit.org/) ou [Atom](https://atom.io), ainsi que des versions propriétaires comme [Oxygen](https://www.oxygenxml.com/).
    Mise à jour de mars 2025 : La version originale en espagnol utilise l'éditeur Atom ; cependant nous ne recommandons pas d'utiliser Atom, car le logiciel n'a pas reçu de maintenance ni de mises à jour depuis sa fermeture en décembre 2022. Nous avons donc décidé d'utiliser VS Code de la même manière, à condition d'installer également l'extension HTML Preview depuis l'onglet Extensions. @@ -283,7 +283,7 @@ CETEIcean possède un certain nombre de comportements intégrés que vous pouvez Si vous faites cela, vous voudrez peut-être ajouter des styles CSS ou des comportements pour choisir la manière dont le contenu du TEI Header sera affiché dans le navigateur. -Dans ce tutoriel, nous n'avons pas épuisé toutes les possibilités pour la présentation de notre document source. Nous vous invitons à continuer à expérimenter par vous-même les différentes manières dont un balisage TEI peut être affiché dans un navigateur en utilisant CETEIcean. Vous pouvez trouver plus d'informations sur [CETEIcean](http://teic.github.io/CETEIcean/). Vous pouvez aussi trouver quelques lignes de code commentées dans cette leçon pour appliquer certains comportements supplémentaires qui pourront s'avérer utiles. +Dans ce tutoriel, nous n'avons pas épuisé toutes les possibilités pour la présentation de notre document source. Nous vous invitons à continuer à expérimenter par vous-même les différentes manières dont un balisage TEI peut être affiché dans un navigateur en utilisant CETEIcean. Vous pouvez trouver plus d'informations sur [CETEIcean](https://teic.github.io/CETEIcean/). Vous pouvez aussi trouver quelques lignes de code commentées dans cette leçon pour appliquer certains comportements supplémentaires qui pourront s'avérer utiles. ## Références bibliographiques @@ -301,9 +301,9 @@ Vaughan, Nicolás. « Introduction au codage de textes en TEI (partie - Atom. Un éditeur de texte hackable pour le 21e siècle. [https://atom.io](https://atom.io) -- Cayless, Hugh et Viglianti, Raffaele. CETEIcean. 
[http://teic.github.io/CETEIcean/](http://teic.github.io/CETEIcean/) +- Cayless, Hugh et Viglianti, Raffaele. CETEIcean. [https://teic.github.io/CETEIcean/](https://teic.github.io/CETEIcean/) -- Jedit. Éditeur de texte pour programmeurs. Version stable : 5.6.0. [http://www.jedit.org/](http://www.jedit.org/) +- Jedit. Éditeur de texte pour programmeurs. Version stable : 5.6.0. [https://www.jedit.org/](https://www.jedit.org/) - Oxygen. Éditeur XML. [https://www.oxygenxml.com/](https://www.oxygenxml.com/) diff --git a/fr/lecons/redaction-durable-avec-pandoc-et-markdown.md b/fr/lecons/redaction-durable-avec-pandoc-et-markdown.md index bc0c8bb783..84bdd070be 100644 --- a/fr/lecons/redaction-durable-avec-pandoc-et-markdown.md +++ b/fr/lecons/redaction-durable-avec-pandoc-et-markdown.md @@ -57,7 +57,7 @@ C'est ici qu'excelle Markdown. Markdown est une syntaxe qui permet le marquage s Écrire ce cette façon libère l'auteur(e) de son outil. Vous pouvez écrire en Markdown dans n'importe quel éditeur de texte brut, et la syntaxe dispose d'un riche écosystème de logiciels qui peuvent transformer ces textes en de magnifiques documents. C'est pour cette raison que Markdown connaît actuellement une hausse de popularité, non seulement comme outil de rédaction d'articles scientifiques, mais aussi comme norme pour l'édition en général. -[Atom](https://atom.io/) (disponible sur toutes les plateformes) et [Notepad++](http://notepad-plus-plus.org) (Windows seulement) sont parmi les éditeurs de texte brut tout usage les plus populaires. +[Atom](https://atom.io/) (disponible sur toutes les plateformes) et [Notepad++](https://notepad-plus-plus.org) (Windows seulement) sont parmi les éditeurs de texte brut tout usage les plus populaires. Il est important de comprendre que Markdown n'est qu'une convention d'écriture. Les fichiers Markdown sont enregistrés en texte brut, ce qui contribue d'autant plus à la flexibilité de ce format. 
Les fichiers en texte brut existent depuis l'apparition des machines à écrire électroniques. La longévité de cette norme fait du texte brut un format intrinsèquement plus durable et plus stable que les formats propriétaires. Alors que des fichiers créés avec Microsoft Word et Apple Pages il y a à peine dix ans peuvent provoquer des difficultés importantes lorsqu'ils sont affichés dans une version plus récente de ces logiciels, il est encore possible aujourd'hui d'afficher sans problème un fichier créé avec l'un des nombreux éditeurs de texte brut "disparus" depuis quelques décennies : AlphaPlus, Perfect Writer, Text Wizard, Spellbinder, WordStar, ou le préféré d'Isaac Asimov, SCRIPSIT 2.0, créé par la chaîne de magasins d'électronique Radio Shack. Écrire en texte brut permettra à vos fichiers d'être encore lisibles dans 10, 15 ou 20 ans. Cette leçon propose un processus de rédaction qui libère les chercheurs et chercheuses des logiciels de traitement de texte propriétaires et des formats non durables. @@ -315,7 +315,7 @@ Le filtre "citeproc" analysera toutes les clés de citation trouvées dans votre ## Changer de style de citation -Le style de citation par défaut de Pandoc est le Chicago Auteur-Date. On peut spécifier un style différent en utilisant une feuille de style écrite en "Citation Style Language" (une autre convention au format texte, qui décrit les styles de citation) et désignée par l'extension de fichier .csl. Heureusement, le projet CSL maintient un dépôt de styles de citation communs, certains étant même adaptés à des revues spécifiques. Visitez le site pour trouver le fichier .csl de la Modern Language Association (MLA), téléchargez `modern-language-association.csl`, et sauvegardez-le dans le répertoire de votre projet sous le nom `mla.csl`. Maintenant, nous devons signaler à Pandoc d'utiliser la feuille de style MLA au lieu du style par défaut, Chicago. 
Nous faisons ceci en mettant à jour l'en-tête YAML : +Le style de citation par défaut de Pandoc est le Chicago Auteur-Date. On peut spécifier un style différent en utilisant une feuille de style écrite en "Citation Style Language" (une autre convention au format texte, qui décrit les styles de citation) et désignée par l'extension de fichier .csl. Heureusement, le projet CSL maintient un dépôt de styles de citation communs, certains étant même adaptés à des revues spécifiques. Visitez le site pour trouver le fichier .csl de la Modern Language Association (MLA), téléchargez `modern-language-association.csl`, et sauvegardez-le dans le répertoire de votre projet sous le nom `mla.csl`. Maintenant, nous devons signaler à Pandoc d'utiliser la feuille de style MLA au lieu du style par défaut, Chicago. Nous faisons ceci en mettant à jour l'en-tête YAML : ``` --- @@ -354,13 +354,13 @@ Considérez vos fichiers sources comme une version faisant autorité de votre te ### En anglais : -En cas de problème, il n'y a pas de meilleur endroit pour commencer votre recherche que [le site web de Pandoc](https://pandoc.org/) et sa [liste de discussion](https://groups.google.com/g/pandoc-discuss). Des sites de type "Questions-réponses" peuvent répertorier des questions sur Pandoc, tel [Stack Overflow](http://stackoverflow.com/questions/tagged/pandoc); vous pouvez aussi consulter les archives du site [Digital Humanities Q&A](https://dhanswers.ach.org/) qui était actif de 2010 à 2019. Les questions peuvent également être posées en direct, sur la chaîne Pandoc de Freenode IRC, qui est fréquentée par un groupe d'habitué(e)s plutôt sympathiques. Au fur et à mesure que vous en apprendrez davantage sur Pandoc, vous pouvez également explorer l'une de ses fonctionnalités les plus puissantes : les [filtres](https://github.com/jgm/pandoc/wiki/Pandoc-Filters). 
+En cas de problème, il n'y a pas de meilleur endroit pour commencer votre recherche que [le site web de Pandoc](https://pandoc.org/) et sa [liste de discussion](https://groups.google.com/g/pandoc-discuss). Des sites de type "Questions-réponses" peuvent répertorier des questions sur Pandoc, tel [Stack Overflow](https://stackoverflow.com/questions/tagged/pandoc); vous pouvez aussi consulter les archives du site [Digital Humanities Q&A](https://dhanswers.ach.org/) qui était actif de 2010 à 2019. Les questions peuvent également être posées en direct, sur la chaîne Pandoc de Freenode IRC, qui est fréquentée par un groupe d'habitué(e)s plutôt sympathiques. Au fur et à mesure que vous en apprendrez davantage sur Pandoc, vous pouvez également explorer l'une de ses fonctionnalités les plus puissantes : les [filtres](https://github.com/jgm/pandoc/wiki/Pandoc-Filters). -Bien que nous suggérions de commencer avec un simple éditeur de texte, nombre d'alternatives à MS Word spécifiques à Markdown sont disponibles en ligne, et souvent sans frais (d'après [cette entrée de blogue](http://web.archive.org/web/20140120195538/http://mashable.com/2013/06/24/markdown-tools/) qui date de 2013, il en existait alors plus de 70). Parmi les projets autonomes, nous apprécions particulièrement [Mou](http://mouapp.com/), [Write Monkey](https://web.archive.org/web/20260327163157/http://writemonkey.com/), et [Sublime Text](http://www.sublimetext.com/). Plusieurs plateformes sur le Web ont récemment vu le jour et fournissent des interfaces graphiques élégantes pour l'écriture collaborative et le suivi des versions à l'aide de Markdown. Il s'agit entre autres de [prose.io](http://prose.io), [Authorea](http://www.authorea.com), [Draft](http://www.draftin.com), et [StackEdit](https://stackedit.io). 
+Bien que nous suggérions de commencer avec un simple éditeur de texte, nombre d'alternatives à MS Word spécifiques à Markdown sont disponibles en ligne, et souvent sans frais (d'après [cette entrée de blogue](https://web.archive.org/web/20140120195538/http://mashable.com/2013/06/24/markdown-tools/) qui date de 2013, il en existait alors plus de 70). Parmi les projets autonomes, nous apprécions particulièrement [Mou](https://mouapp.com/), [Write Monkey](https://web.archive.org/web/20260327163157/http://writemonkey.com/), et [Sublime Text](https://www.sublimetext.com/). Plusieurs plateformes sur le Web ont récemment vu le jour et fournissent des interfaces graphiques élégantes pour l'écriture collaborative et le suivi des versions à l'aide de Markdown. Il s'agit entre autres de [prose.io](https://prose.io), [Authorea](https://www.authorea.com), [Draft](https://www.draftin.com), et [StackEdit](https://stackedit.io). -Cependant, l'écosystème ne se limite pas aux éditeurs de texte. [Gitit](https://github.com/jgm/gitit) et [Ikiwiki](https://github.com/dubiousjim/pandoc-iki) supportent la rédaction en Markdown avec Pandoc comme analyseur de syntaxe. À cette liste, nous pouvons ajouter une gamme d'outils qui génèrent des pages web rapides et statiques, [Yst](https://github.com/jgm/yst), [Jekyll](http://github.com/fauno/jekyll-pandoc-multiple-formats), [Hakyll](http://jaspervdj.be/hakyll/), et [bash shell script](https://github.com/wcaleb/website), un projet de l'historien Caleb McDaniel. +Cependant, l'écosystème ne se limite pas aux éditeurs de texte. [Gitit](https://github.com/jgm/gitit) et [Ikiwiki](https://github.com/dubiousjim/pandoc-iki) supportent la rédaction en Markdown avec Pandoc comme analyseur de syntaxe. 
À cette liste, nous pouvons ajouter une gamme d'outils qui génèrent des pages web rapides et statiques, [Yst](https://github.com/jgm/yst), [Jekyll](https://github.com/fauno/jekyll-pandoc-multiple-formats), [Hakyll](https://jaspervdj.be/hakyll/), et [bash shell script](https://github.com/wcaleb/website), un projet de l'historien Caleb McDaniel. -Enfin, des plates-formes d'édition entières se développent autour de l'utilisation de Markdown. Un marché de l'édition en Markdown, comme le fait déjà [Leanpub](https://leanpub.com), pourrait être une alternative intéressante au modèle d'édition traditionnel. Nous-mêmes expérimentons avec la conception de revues universitaires basées sur GitHub et [readthedocs.org](http://readthedocs.org) (ces outils sont habituellement utilisés pour la documentation technique). +Enfin, des plates-formes d'édition entières se développent autour de l'utilisation de Markdown. Un marché de l'édition en Markdown, comme le fait déjà [Leanpub](https://leanpub.com), pourrait être une alternative intéressante au modèle d'édition traditionnel. Nous-mêmes expérimentons avec la conception de revues universitaires basées sur GitHub et [readthedocs.org](https://readthedocs.org) (ces outils sont habituellement utilisés pour la documentation technique). ### En français (N.D.L.R. : il s’agit de notes ajoutées à la version traduite) : @@ -378,7 +378,7 @@ Pour la gestion des bibliographies, consulter aussi: Raphaël Grolimund, Frédé [^2]: Les documents d'origine peuvent être [téléchargés à partir de GitHub](https://github.com/dhcolumbia/pandoc-workflow). Utilisez l'option "Raw" (brut) lors de la visualisation dans GitHub pour voir la source en Markdown. Les auteurs tiennent à remercier Alex Gil et ses collègues du Digital Humanities Center de Columbia, ainsi que les participants du studio openLab de la Bibliothèque Butler, qui ont testé le code de ce tutoriel sur diverses plateformes. 
-[^3]: Voir l'excellente réflexion de Charlie Stross sur ce sujet: [Why Microsoft Word Must Die](http://www.antipope.org/charlie/blog-static/2013/10/why-microsoft-word-must-die.html). +[^3]: Voir l'excellente réflexion de Charlie Stross sur ce sujet: [Why Microsoft Word Must Die](https://www.antipope.org/charlie/blog-static/2013/10/why-microsoft-word-must-die.html). [^4]: Il n'y a pas de bonnes solutions pour passer directement de LaTeX à MS Word. diff --git a/fr/lecons/telecharger-des-pages-web-avec-python.md b/fr/lecons/telecharger-des-pages-web-avec-python.md index c7c2e9beef..c0bc39b338 100644 --- a/fr/lecons/telecharger-des-pages-web-avec-python.md +++ b/fr/lecons/telecharger-des-pages-web-avec-python.md @@ -55,7 +55,7 @@ Voyons quelques exemples. http://oldbaileyonline.org ``` -Le type d'URL le plus élémentaire se contente de spécifier le protocole et l'hôte. Si vous fournissez cet URL à votre navigateur, vous obtiendrez la page d'accueil du site [Old Bailey Online](http://www.oldbaileyonline.org/). Par défaut, on assume que la page principale dans un répertoire donné est nommée 'index', le plus souvent `index.html`. +Le type d'URL le plus élémentaire se contente de spécifier le protocole et l'hôte. Si vous fournissez cet URL à votre navigateur, vous obtiendrez la page d'accueil du site [Old Bailey Online](https://www.oldbaileyonline.org/). Par défaut, on assume que la page principale dans un répertoire donné est nommée 'index', le plus souvent `index.html`. L'URL peut aussi inclure un *numéro de port* (optionnel). Sans entrer dans les détails, le protocole de communication qui gouverne les échanges d'information sur Internet permet aux ordinateurs de connecter de multiples façons. Les numéros de ports servent à identifier ces différentes manières de se connecter. 
Puisque le port par défaut pour les connexions HTTP est le 80, l'URL suivant est équivalent au précédent : @@ -95,7 +95,7 @@ En étudiant la structure de l'URL, il est possible d'apprendre plusieurs choses {% include figure.html filename="bowsey-trial-page.png" caption="Page de la transcription du procès de Benjamin Bowsey, 1780" %} -Examinez la page du procès de Benjamin Bowsey pendant quelques minutes. Concentrez-vous sur les caractéristiques de la page plutôt que sur la transcription elle-même. Par exemple, notez la présence du lien [View as XML](http://www.oldbaileyonline.org/browse.jsp?foo=bar&path=sessionsPapers/17800628.xml&div=t17800628-33&xml=yes) au bas de la page, qui vous amènera vers une version abondamment balisée du texte qui pourrait être utile pour certains types de recherche. Vous pouvez aussi consulter une [image numérisée du document d'origine](http://www.oldbaileyonline.org/images.jsp?doc=178006280084) qui a été transcrit pour construire cette ressource. +Examinez la page du procès de Benjamin Bowsey pendant quelques minutes. Concentrez-vous sur les caractéristiques de la page plutôt que sur la transcription elle-même. Par exemple, notez la présence du lien [View as XML](https://www.oldbaileyonline.org/browse.jsp?foo=bar&path=sessionsPapers/17800628.xml&div=t17800628-33&xml=yes) au bas de la page, qui vous amènera vers une version abondamment balisée du texte qui pourrait être utile pour certains types de recherche. Vous pouvez aussi consulter une [image numérisée du document d'origine](https://www.oldbaileyonline.org/images.jsp?doc=178006280084) qui a été transcrit pour construire cette ressource. Essayons maintenant d'ouvrir cette page en Python. Copiez le programme suivant dans votre éditeur de texte et sauvegardez-le sous le titre `open-webpage.py`. 
Lorsque vous exécuterez le programme, il ouvrira (`open`) la page du procès, il lira (`read`) son contenu dans une chaîne de caractères Python nommée contenu_web, puis il affichera les 300 premiers caractères du fichier à l'écran. Utilisez la commande `Outils -> Développement web -> Code source de la page` de Firefox (ou son équivalent dans votre propre navigateur) pour vérifier que le code source HTML de la page est bien identique à ce que vous venez de télécharger. Notez aussi que chaque navigateur possède son propre raccourci clavier qui permet d'accéder au code source HTML d'une page ; dans le cas de la version Windows de Firefox, il s'agit de `CTRL+u`. Si vous ne parvenez pas à trouver l'équivalent pour votre propre navigateur, essayez de faire appel à votre moteur de recherche favori pour y arriver. (Consultez la documentation de Python pour en savoir plus au sujet de [urllib](https://docs.python.org/fr/3/library/urllib.html?highlight=urllib).) diff --git a/fr/lecons/transcription-automatisee-graphies-non-latines.md b/fr/lecons/transcription-automatisee-graphies-non-latines.md index d02f331793..54693a5ba9 100644 --- a/fr/lecons/transcription-automatisee-graphies-non-latines.md +++ b/fr/lecons/transcription-automatisee-graphies-non-latines.md @@ -436,7 +436,7 @@ Une approche par *baselines* (en rouge sur la figure 10, il s'agit de la li ```xml - + Calfa 2022-08-23T14:48:18+00:00 @@ -838,7 +838,7 @@ Les données générées pour cet article et dans le cadre du projet CGPG sont d [^38]: *Ibid.* -[^39]: Bastien Kindt et Vidal-Gorène Chahan, « From Manuscript to Tagged Corpora. An Automated Process for Ancient Armenian or Other Under-Resourced Languages of the Christian East ». *Armeniaca. International Journal of Armenian Studies* 1, 73-96, 2022. [http://doi.org/10.30687/arm/9372-8175/2022/01/005]( http://doi.org/10.30687/arm/9372-8175/2022/01/005) +[^39]: Bastien Kindt et Vidal-Gorène Chahan, « From Manuscript to Tagged Corpora. 
An Automated Process for Ancient Armenian or Other Under-Resourced Languages of the Christian East ». *Armeniaca. International Journal of Armenian Studies* 1, 73-96, 2022. [https://doi.org/10.30687/arm/9372-8175/2022/01/005](https://doi.org/10.30687/arm/9372-8175/2022/01/005) [^40]: Vidal-Gorène, Lucas, Salah, Decours-Perez, et Dupin. « RASAM–A Dataset for the Recognition and Analysis of Scripts in Arabic Maghrebi », 265-281. diff --git a/fr/nos-soutiens.md b/fr/nos-soutiens.md index d89e1fe507..98379cd78a 100644 --- a/fr/nos-soutiens.md +++ b/fr/nos-soutiens.md @@ -1,7 +1,7 @@ --- layout: blank title: Nos Soutiens -redirect_from: /nos-soutiens +redirect_from: /nos-soutiens/ original: supporters --- @@ -18,7 +18,7 @@ _Programming Historian_ est reconnaissant envers ses contributeurs passés et ac - [Jisc](https://www.jisc.ac.uk/), Royaume-Uni ## Partenariat institutionnel -Les institutions suivantes font partie de notre programme de [Partenariat institutionnel](pi) : +Les institutions suivantes font partie de notre programme de [Partenariat institutionnel](/fr/pi) : - [Bibliothèques de la KU Leuven](https://bib.kuleuven.be/), Belgique - [Western University Library](https://www.lib.uwo.ca/), Canada @@ -29,7 +29,7 @@ Les institutions suivantes font partie de notre programme de [Partenariat instit - [Cambridge Digital Humanities](https://www.cdh.cam.ac.uk/), Royaume-Uni - [Georg-August-Universität Göttingen](https://www.uni-goettingen.de/), Allemagne - [MIT Libraries](https://libraries.mit.edu/), États-Unis -- [Center for Digital Research in the Humanities, University of Nebraska-Lincoln](http://cdrh.unl.edu/), États-Unis +- [Center for Digital Research in the Humanities, University of Nebraska-Lincoln](https://cdrh.unl.edu/), États-Unis - [The National Archives](https://www.nationalarchives.gov.uk/), Royaume-Uni - [College of the Liberal Arts, Penn State University](https://la.psu.edu/), États-Unis - [University of Bristol 
Library](https://www.bristol.ac.uk/library/), Royaume-Uni @@ -49,10 +49,10 @@ Les institutions suivantes font partie de notre programme de [Partenariat instit - [University of Cambridge](https://www.cam.ac.uk/), Royaume-Uni -Nous accueillons volontiers les demandes de renseignements des contributeurs potentiels au [Partenariat institutionnel](pi). +Nous accueillons volontiers les demandes de renseignements des contributeurs potentiels au [Partenariat institutionnel](/fr/pi). ## Partenariats révolus -Les institutions suivantes ont participé à de notre programme de [Partenariat institutionnel](pi) : +Les institutions suivantes ont participé à notre programme de [Partenariat institutionnel](/fr/pi) : - [Roy Rosenzweig Center for History and New Media, George Mason University](https://rrchnm.org/), États-Unis [2021-2022] - [UCL Centre for Digital Humanities](https://www.ucl.ac.uk/digital-humanities/), Royaume-Uni [2021-2022] diff --git a/fr/pi.md b/fr/pi.md index d9d8c1e99f..7bff3a3e7b 100644 --- a/fr/pi.md +++ b/fr/pi.md @@ -2,7 +2,7 @@ layout: blank title: Partenariat Institutionnele redirect_from: -- /pi +- /pi/ original: ipp --- @@ -45,7 +45,7 @@ Joindre le programme de Partenariat institutionnel vous donnera accès aux avant - Une invitation à notre [assemblée générale annuelle](https://beta.companieshouse.gov.uk/company/12192946) en tant que membre consultatif (une personne par Partenaire). - La ventilation annuelle des dépenses de ProgHist Ltd. -- La reconnaissance explicite de votre contribution dans la section [Nos soutiens](nos-soutiens). +- La reconnaissance explicite de votre contribution dans la section [Nos soutiens](/fr/nos-soutiens). - Le droit de souligner ce partenariat dans les activités de promotion de votre institution. 
- Des conseils ad hoc de notre équipe sur la gestion d'une publication en libre accès, la mise en œuvre de processus de publication multilingue ou l'adaptation d'articles du _Programming Historian_ pour un cadre d'atelier (consultation informelle sur demande). - Pour les bibliothèques partenaires: une liste d’articles publiés par toutes les versions du _Programming Historian_ (sur demande). @@ -55,7 +55,7 @@ Joindre le programme de Partenariat institutionnel vous donnera accès aux avant
    - + diff --git a/fr/politique-retrait-lecons.md b/fr/politique-retrait-lecons.md index 7c369967e1..8ac287deac 100644 --- a/fr/politique-retrait-lecons.md +++ b/fr/politique-retrait-lecons.md @@ -35,7 +35,7 @@ Qu'une leçon dérivée soit créée ou pas, voici les étapes à suivre pour re {% comment %} The following anchors need to be checked/replaced once all pages have been created and published in the FR branch) -[Author Guidelines for Writing Sustainably](/author-guidelines#write-sustainably) +[Author Guidelines for Writing Sustainably](/en/author-guidelines#write-sustainably) [Reviewer Guidelines for Assessing Lesson Sustainability](/reviewer-guidelines#sustainability) diff --git a/fr/reaction.md b/fr/reaction.md index b23e0b2aab..ad0b2374d4 100644 --- a/fr/reaction.md +++ b/fr/reaction.md @@ -13,7 +13,7 @@ Vous avez suivi méticuleusement les instructions d'une leçon et avez tout de m Nous définissons un bogue (ou *bug*) comme: "Une erreur dans un programme informatique qui produit un résultat inattendu ou qui se comporte différemment des instructions d'une leçon." Notez que nous ne pouvons pas nous occuper des erreurs causées par l'utilisateur qui modifie le code ou la documentation d'une leçon (ensemble de données, fichiers d'entrée, etc.) -Jetez un coup d'oeil aux [tickets qui traitent les bugs](https://github.com/orgs/programminghistorian/projects/6) pour voir si le problème que vous avez identifié n'a pas déjà été signalé. Si c'est le cas, mais que vous souhaitez partager des informations complémentaires, laissez un commentaire sur le ticket en question. Si le problème n'a pas déjà été signalé, vous pouvez procéder d'une des façons suivantes: +Jetez un coup d'oeil aux [tickets qui traitent les bugs](https://github.com/programminghistorian/jekyll/issues) pour voir si le problème que vous avez identifié n'a pas déjà été signalé. Si c'est le cas, mais que vous souhaitez partager des informations complémentaires, laissez un commentaire sur le ticket en question. 
Si le problème n'a pas déjà été signalé, vous pouvez procéder d'une des façons suivantes:
    S'il vous plaît, ne créez pas de pull request avec la correction. diff --git a/fr/recherche.md b/fr/recherche.md index a1ebe46999..4757bbf262 100644 --- a/fr/recherche.md +++ b/fr/recherche.md @@ -10,17 +10,17 @@ L'équipe du projet et les membres de la communauté plus large qui la compose s ## Édition originale du Programming Historian -* William J. Turkel et Alan MacEachern, [_The Programming Historian_](http://niche-canada.org/wp-content/uploads/2013/09/programming-historian-1.pdf) 1ère édition (Network in Canadian History & Environment: 2007-2008). +* William J. Turkel et Alan MacEachern, [_The Programming Historian_](https://niche-canada.org/wp-content/uploads/2013/09/programming-historian-1.pdf) 1ère édition (Network in Canadian History & Environment: 2007-2008). * Traduction en japonais de William J. Turkel et Alan MacEachern, [_The Programming Historian_](https://www.dh.ku-orcas.kansai-u.ac.jp/?cat=2) 1ère édition (Network in Canadian History & Environment: 2007-2008). ## Comptes-rendus -* Björn Ekström, Elisa Tattersall Wallin and Hana Marčetić, '[_Programming Historian_: Novice-friendly tutorials on digital methods](http://www.diva-portal.org/smash/record.jsf?pid=diva2%3A1508542&dswid=7551)', _Tidskrift för ABM_, Vol. 5, no 1 (2020), pp. 71-75. +* Björn Ekström, Elisa Tattersall Wallin and Hana Marčetić, '[_Programming Historian_: Novice-friendly tutorials on digital methods](https://www.diva-portal.org/smash/record.jsf?pid=diva2%3A1508542&dswid=7551)', _Tidskrift för ABM_, Vol. 5, no 1 (2020), pp. 71-75. * Dries Daems, '[A Review and Roadmap of Online Learning Platforms and Tutorials in Digital Archaeology](https://doi.org/10.1017/aap.2019.47)', _Advances in Archaeological Practice_, vol. 8, issue 1 (2020), pp. 87-92. * Martin Dröge, '[Review of: The Programming Historian](https://www.hsozkult.de/webreview/id/rezwww-184)', _H-Soz-Kult_ (2019). 
* Priscila Pilatowsky Goñi, '[Reseña a The programming historian](https://revistas.uned.es/index.php/RHD/article/view/22420)', _Revista de Humanidades Digitales_, vol. 2 (2018). * Lincoln Mullen, '[Review of the Programming Historian](https://academic.oup.com/jah/article-abstract/103/1/299/1751315)', _The Journal of American History_, vol. 103, no. 1 (2016), pp. 299-301. -* Cameron Blevins, '[Review of the Programming Historian](http://jitp.commons.gc.cuny.edu/review-of-the-programming-historian/)', _The Journal of Interactive Technology & Pedagogy_, vol. 8 (2015). +* Cameron Blevins, '[Review of the Programming Historian](https://jitp.commons.gc.cuny.edu/review-of-the-programming-historian/)', _The Journal of Interactive Technology & Pedagogy_, vol. 8 (2015). ## Publications scientifiques @@ -31,22 +31,22 @@ L'équipe du projet et les membres de la communauté plus large qui la compose s * Jennifer Isasi, Riva Quiroga, Nabeel Sidiqqui, Joana Vieira Paulino, Alex Wermer-Colan, [“A Model for Multilingual and Multicultural Digital Scholarship Methods Publishing"](https://www.taylorfrancis.com/chapters/edit/10.4324/9781003393696-3/model-multilingual-multicultural-digital-scholarship-methods-publishing-jennifer-isasi-riva-quiroga-nabeel-siddiqui-joana-vieira-paulino-alex-wermer-colan), dans _Multilingual Digital Humanities_, edité par Viola, L., & Spence, P., Routledge, 2023. * Adam Crymble & Charlotte M. H. Im, ['Measuring digital humanities learning requirements in Spanish & English-speaking practitioner communities'](https://doi.org/10.1007/s42803-023-00066-x), International Journal of Digital Humanities, (2023). * Eric Brasil, '[_pyHDB - Ferramenta Heurística para a Hemeroteca Digital Brasileira: utilizando técnicas de web scraping para a pesquisa em História_'](https://doi.org/10.15848/hh.v15i40.1904), _História Da Historiografia: International Journal of Theory and History of Historiography_, 15(40) (2022), 186–217. 
-* Matthew Lincoln, Sarah Melton, Jennifer Isasi, François Dominic Laramée, '[Relocating Complexity: The Programming Historian and Multilingual Static Site Generation](http://www.digitalhumanities.org/dhq/vol/16/2/000585/000585.html)', _Digital Humanities Quarterly_ 16, 2 (2022). +* Matthew Lincoln, Sarah Melton, Jennifer Isasi, François Dominic Laramée, '[Relocating Complexity: The Programming Historian and Multilingual Static Site Generation](https://www.digitalhumanities.org/dhq/vol/16/2/000585/000585.html)', _Digital Humanities Quarterly_ 16, 2 (2022). * Jennifer Isasi et Antonio Rojas Castro, ‘[¿Sin equivalencia? Una reflexión sobre la traducción al español de recursos educativos abiertos](https://muse.jhu.edu/article/842253)’, _Hispania_, 104, no. 4 (2021), 613-624. * Adam Crymble et Maria José Afanador Llach, ‘The Globally Unequal Promise of Digital Tools for History: UK and Colombia Case Study’ dans _Teaching History for the Contemporary World_, edité par Adele Nye, 85-98, Springer, 2021. * Daniel Alves, ['Ensinar Humanidades Digitais sem as Humanidades Digitais: um olhar a partir das licenciaturas em História'](https://novaresearch.unl.pt/files/32228034/Ensinar_Humanidades_Digitais.pdf), _Revista EducaOnline_, v. 15, n. 2 (2021). * Adam Crymble, [_Technology & the Historian: Transformations in the Digital Age_](https://www.press.uillinois.edu/books/catalog/57hxp7wr9780252043710.html), (University of Illinois Press, 2021). * Anna-Maria Sichani, James Baker, Maria José Afanador Llach, et Brandon Walsh, [‘Diversity and Inclusion in Digital Scholarship and Pedagogy: The Case of The Programming Historian’](https://doi.org/10.1629/uksg.465), _Insights_, (2019). -* Katrina Navickas et Adam Crymble, ['From Chartist Newspaper to Digital Map of Grass-roots Meetings, 1841-44: Documenting Workflows'](http://www.tandfonline.com/doi/full/10.1080/13555502.2017.1301179), _Journal of Victorian Culture_, (2017). 
+* Katrina Navickas et Adam Crymble, ['From Chartist Newspaper to Digital Map of Grass-roots Meetings, 1841-44: Documenting Workflows'](https://www.tandfonline.com/doi/full/10.1080/13555502.2017.1301179), _Journal of Victorian Culture_, (2017). * Adam Crymble, ['Identifying and Removing Gender Barriers in Open Learning Communities: The Programming Historian'](https://www.herts.ac.uk/__data/assets/pdf_file/0016/138013/Blip-2016-Autumn-2016-Final-Autumn-2016.pdf), _Blended Learning in Practice_, (2016), 49-60. [[pre-print pdf](/researchpapers/openLearningCommunities2016.pdf)] -* Fred Gibbs, ['Editorial Sustainability and Open Peer Review at Programming Historian',](http://web.archive.org/web/20180713014622/http://dhcommons.org/journal/issue-1/editorial-sustainability-and-open-peer-review-programming-historian) _DH Commons_, Vol. 1 (2015). -* Shawn Graham, Ian Milligan et Scott Weingart, [_Exploring Big Historical Data: The Historian's Macroscope_](http://www.themacroscope.org/2.0/), (Imperial College Press, 2015). +* Fred Gibbs, ['Editorial Sustainability and Open Peer Review at Programming Historian',](https://web.archive.org/web/20180713014622/https://dhcommons.org/journal/issue-1/editorial-sustainability-and-open-peer-review-programming-historian) _DH Commons_, Vol. 1 (2015). +* Shawn Graham, Ian Milligan et Scott Weingart, [_Exploring Big Historical Data: The Historian's Macroscope_](https://www.themacroscope.org/2.0/), (Imperial College Press, 2015). ## Rapports * Maria José Afanador-Llach & Andrés Rivera, '[Segundo ciclo de talleres: Herramientas y procesos digitales para la investigación y creación en artes y humanidades](/researchpapers/Informe_final_Talleres%20EHCN_2023-ENG_PH.pdf)', (2023). * Incllewsion et l'équipe du Programming Historian, 'Initial Accessibility Testing: Summary of Findings', (2021). 
-* Penny Andrews et l'équipe du Programming Historian, ['The Programming Historian: developing and sustaining impact in the Global South'](http://doi.org/10.5281/zenodo.3813763) (2020). +* Penny Andrews et l'équipe du Programming Historian, ['The Programming Historian: developing and sustaining impact in the Global South'](https://doi.org/10.5281/zenodo.3813763) (2020). * Amy Kavanagh et l'équipe du Programming Historian, 'Programming Historian – Access for visually impaired researchers', (n.d.). ## Ateliers et évènements @@ -65,7 +65,7 @@ L'équipe du projet et les membres de la communauté plus large qui la compose s * Alex Wermer-Colan, ['Learning Digital Methods with the _Programming Historian_'](https://charlesstudy.temple.edu/event/11953011), Temple University [En ligne], (22 février 2024). * Carlo Blum, Adam Crymble, Vicky Garnett, Timothée Giraud, Alíz Horváth, Stefan Krebs, Ralph Marschall, Sofia Papastamkou, & Lorella Viola, 'Invisible College of Digital History: Workshop on Multilingual Educational Resources', C²DH [En ligne], (8 novembre 2023). * Nabeel Siddiqui, 'Convolutional Neural Networks for Image Classification', University of Edinburgh [En ligne], (7 novembre 2023). -* Eric Brasil, '[História Digital e História Digital da Educação: Caminhos Cruzados](http://www.iea.usp.br/eventos/historia-digital-educacao-caminhos-cruzados)', Instituto de Estudos Avançados, USP, São Paulo, Brésil, (17 octobre 2023). +* Eric Brasil, '[História Digital e História Digital da Educação: Caminhos Cruzados](https://www.iea.usp.br/eventos/historia-digital-educacao-caminhos-cruzados)', Instituto de Estudos Avançados, USP, São Paulo, Brésil, (17 octobre 2023). * Scott Kleinman, Alex Wermer-Colan, Joana Vieira Paulino, Nabeel Siddiqui, Zoe LeBlanc, 'Developing a Digital Humanities Tutorial', [DH 2023](https://dh2023.adho.org/), Graz, Autriche, (10 juillet 2023). 
* Daphné Mathelier, 'Atelier Markdown', [11e journées du réseau Medici](https://web.archive.org/web/20230629084307/https://medici2023.sciencesconf.org/resource/page/id/2), Université de Liège, Belgique, (29 juin 2023). * María José Afanador Llach, Jennifer Isasi, Riva Quiroga, 'Sobre _Programming Historian en español_ y cómo contribuir a la publicación', Semana de Humanidades Digitales 2023 [En ligne], (10 mai 2023). @@ -152,10 +152,10 @@ L'équipe du projet et les membres de la communauté plus large qui la compose s * Adam Crymble, 'Facilitating Making in Digital Humanities', The Archaeology of Making, Université de Londres, Royaume-Uni, 5 mai 2021. * Daniel Alves, Jennifer Isasi, Sarah Melton, Sofia Papastamkou, Jessica Parr, Riva Quiroga, Nabeel Siddiqui, Brandon Walsh, '[The Programming Historian: A Global Case Study in Multilingual Open Access and DH Tutelage/Instruction](https://msuglobaldh.org/abstracts/#programming-historian)' (panel), _Global Digital Humanities Symposium_, Michigan State University, East Lansing, USA, 12 avril, 2021. * Jessica Parr, '[Cambridge Cultural Heritage Data School: Final plenary](https://www.cdh.cam.ac.uk/events/cambridge-cultural-heritage-data-school-final-plenary)', University of Cambridge, Royaume-Uni, 30 mars 2021. -* Jennifer Isasi & Riva Quiroga, ['_Programming Historian_: Un proyecto colaborativo para poner la programación al alcance de los humanistas'](http://ixa2.si.ehu.eus/intele/?q=webinars), _INTELE : INfraestructura de TEcnologías del LEnguaje_, España, 25 mars, 2021. +* Jennifer Isasi & Riva Quiroga, ['_Programming Historian_: Un proyecto colaborativo para poner la programación al alcance de los humanistas'](https://ixa2.si.ehu.eus/intele/?q=webinars), _INTELE : INfraestructura de TEcnologías del LEnguaje_, España, 25 mars, 2021. 
* Sofia Papastamkou, Jessica Parr & Riva Quiroga, 'Challenges for Digital Literacy in the Humanities: The Open, Community-Based and Multilinguistic Approach of _The Programming Historian_', NewsEye’s International Conference, France, 17 mars, 2021. * Riva Quiroga, ['Multilingual Digital Humanites'](https://mediacentral.ucl.ac.uk/Play/59506), Digital Humanities Long View Seminar, UCLDH, UK & CESTA, USA, 10 mars, 2021. -* Brandon Walsh, '[The Programming Historian and Editorial Process in Digital Publishing](http://walshbr.com/blog/the-programming-historian-and-editorial-process-in-digital-publishing/)', Modern Languages Association Conference 2021, 7-10 janvier 2021. +* Brandon Walsh, '[The Programming Historian and Editorial Process in Digital Publishing](https://walshbr.com/blog/the-programming-historian-and-editorial-process-in-digital-publishing/)', Modern Languages Association Conference 2021, 7-10 janvier 2021. * Sofia Papastamkou, François Dominic Laramée, Martin Grandjean, '[Le Programming Historian en français: quelles ressources éducatives libres pour les méthodes numériques ?](https://zenodo.org/record/3819954)', *Humanistica 2020*, Bordeaux, France, 12-14 mai 2020. * Sofia Papastamkou, 'A Beating Heart of Digital History: The Programming Historian', [Atelier Teaching Digital History](https://cas.au.dk/en/cedhar/events/show/artikel/teaching-digital-history-workshop), Center for Digital History Aarhus, Université d'Aarhus, Danemark, 23 Octobre 2019. * Jennifer Isasi, Maria José Afanador y Antonio Rojas Castro, 'Retos en la producción de tutoriales de HD en contexto hispanohablantes', Conferencia ACH 2019, The Association for Computers and the Humanities, Pittsburgh, 23-26 juillet, 2019, Pittsburgh. 
@@ -168,7 +168,7 @@ L'équipe du projet et les membres de la communauté plus large qui la compose s * Victor Gayol, 'La investigación del pasado y la historia digital: análisis de datos y cómo aprender (The Programming Historian en español)', _Humanidades Digitales_, IV Feria Internacional de Ciencias Sociales y Humanidades, Centro Universitario de Los Lagos - Universidad de Guadalajara, Lagos de Moreno, Jalisco (9 mars 2017). * Victor Gayol, 'The Programming Historian: 'un modelo colaborativo para la investigación y la ensenñanza en ciencias sociales y humanidades digitales', _Mesa de Trabajo sobre Ciencias Sociales y Humanidades Digitales_, El Colegio De Michoacán, Mexico (21 février 2017). * Adam Crymble, 'Bringing Digital Humanities into the University for Free', Université de Cape Town, Afrique du Sud (27-28 juin 2016). -* Fred Gibbs, 'The Programming Historian' (Poster), _American Historical Association_, New York (janvier 2015). +* Fred Gibbs, 'The Programming Historian' (Poster), _American Historical Association_, New York (janvier 2015). * Adam Crymble, 'The Programming Historian 2', _Digital History Seminar_, Institute of Historical Research, Londres (13 octobre 2013). * Adam Crymble, 'The Programming Historian 2', _Digital Humanities 2012_, Hamburg (juillet 2012). @@ -179,11 +179,11 @@ L'équipe du projet et les membres de la communauté plus large qui la compose s * Matthew Lincoln, 'Multilingual Jekyll: How The Programming Historian Does That', *matthewlincoln.net*, 1er mars 2020, . * Sue Levine, 'The Early-Stage Ph.D.'s Guide to Summer', _Inside Higher Education_, 10 juin 2019, . * 'Championing open access with online digital history journal', _University of Sussex Press Office_, 9 octobre 2018, . -* Adam Crymble, 'A Decade of Programming Historians', _Network in Canadian History & Environment_, 23 mars 2018, . +* Adam Crymble, 'A Decade of Programming Historians', _Network in Canadian History & Environment_, 23 mars 2018, . 
* Fred Gibbs, "Sustainable Publishing: Reflections of a Former Programming Historian Editor", FredGibbs.net, 2017, . -* Anaclet Pons, "The Programming Historian en español", _Clionauta: Blog de historia_, 14 juin 2017, . +* Anaclet Pons, "The Programming Historian en español", _Clionauta: Blog de historia_, 14 juin 2017, . * Seth Denbo, “Historian, Program! Self-Help for Digital Neophytes,” _Perspectives on History: The Newsmagazine of the American Historical Association_, mai 2017, . -* Víctor Gayol, '*The Programming Historian* en español', *Blog de Humanidades Digitales*, 17 mars 2017, . +* Víctor Gayol, '*The Programming Historian* en español', *Blog de Humanidades Digitales*, 17 mars 2017, . ## Projets utilisant le Programming Historian diff --git a/htmlproofer-output.txt b/htmlproofer-output.txt new file mode 100644 index 0000000000..6ced9a4734 --- /dev/null +++ b/htmlproofer-output.txt @@ -0,0 +1,12 @@ +/Users/zleblanc/programming_historian_materials/jekyll/vendor/bundle/ruby/3.2.0/gems/io-event-1.11.2/lib/io/event/support.rb:48: warning: IO::Buffer is experimental and both the Ruby and C interface may change in the future! +Running 3 checks (Images, Links, Scripts) in ["_site"] on *.html files ... + + +Checking 5338 external links +Checking 5120 internal links +Checking internal link hashes in 302 files +Ran on 493 files! + + +HTML-Proofer finished successfully. 
+Finished in 167.34 seconds diff --git a/htmlproofer-report.csv b/htmlproofer-report.csv new file mode 100644 index 0000000000..e83aa1120f --- /dev/null +++ b/htmlproofer-report.csv @@ -0,0 +1 @@ +File,Line,Message diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-01.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-01.png new file mode 100644 index 0000000000..d52dd0c94f Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-01.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-02.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-02.png new file mode 100644 index 0000000000..d38deff3e8 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-02.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-03.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-03.png new file mode 100644 index 0000000000..6134b4725c Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-03.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-04.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-04.png new file mode 100644 index 0000000000..9b55a43eca Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-04.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-05.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-05.png new file mode 100644 index 0000000000..2e0f8f101e Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-05.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-06.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-06.png new file mode 100644 index 0000000000..49cccd164b Binary files /dev/null and 
b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-06.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-07.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-07.png new file mode 100644 index 0000000000..99d642bf52 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-07.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-08.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-08.png new file mode 100644 index 0000000000..ee17746932 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-08.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-09.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-09.png new file mode 100644 index 0000000000..93a87a7de7 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-09.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-10.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-10.png new file mode 100644 index 0000000000..989b033d2b Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-10.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-11.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-11.png new file mode 100644 index 0000000000..db7ef09bca Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-11.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-12.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-12.png new file mode 100644 index 0000000000..16b4967f00 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-12.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-13.png 
b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-13.png new file mode 100644 index 0000000000..4c5d8f69e1 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-13.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-14.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-14.png new file mode 100644 index 0000000000..28dd8288bc Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-14.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-15.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-15.png new file mode 100644 index 0000000000..684f52dc09 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-15.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-16.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-16.png new file mode 100644 index 0000000000..b58979f8e2 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-16.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-17.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-17.png new file mode 100644 index 0000000000..43ee28a684 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-17.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-18.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-18.png new file mode 100644 index 0000000000..a9762e1916 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-18.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-19.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-19.png new file mode 100644 index 0000000000..13ff3eac6e Binary files /dev/null and 
b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-19.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-20.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-20.png new file mode 100644 index 0000000000..c4e040c466 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-20.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-21.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-21.png new file mode 100644 index 0000000000..0417992c70 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-21.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-22.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-22.png new file mode 100644 index 0000000000..08faa40737 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-22.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-23.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-23.png new file mode 100644 index 0000000000..d1c8783011 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-23.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-24.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-24.png new file mode 100644 index 0000000000..7d2099a6f7 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-24.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-25.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-25.png new file mode 100644 index 0000000000..4c7a5eb307 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-25.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-26.png 
b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-26.png new file mode 100644 index 0000000000..7f20e4d228 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-26.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-27.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-27.png new file mode 100644 index 0000000000..3bf5c1bc01 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-27.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-28.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-28.png new file mode 100644 index 0000000000..4aeb2025e0 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-28.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-29.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-29.png new file mode 100644 index 0000000000..911d8d2408 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-29.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-30.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-30.png new file mode 100644 index 0000000000..c2ad6fdc1f Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-30.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-31.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-31.png new file mode 100644 index 0000000000..c2ca8d06a9 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-31.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-32.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-32.png new file mode 100644 index 0000000000..577445fc43 Binary files /dev/null and 
b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-32.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-33.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-33.png new file mode 100644 index 0000000000..9e0beac579 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-33.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-34.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-34.png new file mode 100644 index 0000000000..5ea5c81487 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-34.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-35.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-35.png new file mode 100644 index 0000000000..39bd311cd9 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-35.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-36.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-36.png new file mode 100644 index 0000000000..63e92f74c2 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-36.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-37.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-37.png new file mode 100644 index 0000000000..27e15297e9 Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-37.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-38.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-38.png new file mode 100644 index 0000000000..3bf33b8cfe Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-38.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-39.png 
b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-39.png new file mode 100644 index 0000000000..6352ec005c Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-39.png differ diff --git a/images/googlemaps-googleearth/en-or-googlemaps-googleearth-40.png b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-40.png new file mode 100644 index 0000000000..f999c6988d Binary files /dev/null and b/images/googlemaps-googleearth/en-or-googlemaps-googleearth-40.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-01.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-01.png new file mode 100644 index 0000000000..30b5076e70 Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-01.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-02.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-02.png new file mode 100644 index 0000000000..8ca5667e6c Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-02.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-03.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-03.png new file mode 100644 index 0000000000..3dd9739f82 Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-03.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-04.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-04.png new file mode 100644 index 0000000000..989e357513 Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-04.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-05.png 
b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-05.png new file mode 100644 index 0000000000..6bfe17e9db Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-05.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-06.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-06.png new file mode 100644 index 0000000000..2c70cb9ee2 Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-06.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-07.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-07.png new file mode 100644 index 0000000000..c7e9f83646 Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-07.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-08.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-08.png new file mode 100644 index 0000000000..c265944cdf Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-08.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-09.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-09.png new file mode 100644 index 0000000000..7fe9e16a46 Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-09.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-10.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-10.png new file mode 100644 index 0000000000..248570a449 Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-10.png differ diff --git 
a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-11.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-11.png new file mode 100644 index 0000000000..e14259d4ee Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-11.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-13.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-13.png new file mode 100644 index 0000000000..3ba049171b Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-13.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-14.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-14.png new file mode 100644 index 0000000000..231b3be272 Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-14.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-15.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-15.png new file mode 100644 index 0000000000..e67af1409f Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-15.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-16.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-16.png new file mode 100644 index 0000000000..ccb2c1c144 Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-16.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-17.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-17.png new file mode 100644 index 0000000000..31b3ca294b Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-17.png differ 
diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-18.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-18.png new file mode 100644 index 0000000000..3f261a4c8f Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-18.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-19.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-19.png new file mode 100644 index 0000000000..21e5b6fd59 Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-19.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-20.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-20.png new file mode 100644 index 0000000000..5a3557f055 Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-20.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-21.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-21.png new file mode 100644 index 0000000000..caf42c2704 Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-21.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-22.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-22.png new file mode 100644 index 0000000000..e1cbc4152b Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-22.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-23.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-23.png new file mode 100644 index 0000000000..ec22aabf55 Binary files /dev/null and 
b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-23.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-24.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-24.png new file mode 100644 index 0000000000..a1cfe3f351 Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-24.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-25.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-25.png new file mode 100644 index 0000000000..f608d392b5 Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-25.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-26.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-26.png new file mode 100644 index 0000000000..27674017e8 Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-26.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-27.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-27.png new file mode 100644 index 0000000000..88add889ad Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-27.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-28.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-28.png new file mode 100644 index 0000000000..4aeb2025e0 Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-28.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-29.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-29.png new file mode 100644 index 0000000000..911d8d2408 Binary files 
/dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-29.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-30.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-30.png new file mode 100644 index 0000000000..ea6b0db176 Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-30.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-31.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-31.png new file mode 100644 index 0000000000..6cdfe426f6 Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-31.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-32.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-32.png new file mode 100644 index 0000000000..c66110319e Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-32.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-33.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-33.png new file mode 100644 index 0000000000..10d78502ee Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-33.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-34.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-34.png new file mode 100644 index 0000000000..4a5f7c8a1e Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-34.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-35.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-35.png new file mode 100644 index 0000000000..4323532f25 
Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-35.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-36.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-36.png new file mode 100644 index 0000000000..5d395a8671 Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-36.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-37.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-37.png new file mode 100644 index 0000000000..d5f780810e Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-37.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-38.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-38.png new file mode 100644 index 0000000000..27ede6b54c Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-38.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-39.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-39.png new file mode 100644 index 0000000000..1fcddd5a2b Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-39.png differ diff --git a/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-40.png b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-40.png new file mode 100644 index 0000000000..f999c6988d Binary files /dev/null and b/images/googlemaps-googleearth/es-tr-intro-a-google-maps-y-google-earth-40.png differ diff --git a/index.md b/index.md index 3414600e0a..ae149e537e 100644 --- a/index.md +++ b/index.md @@ -1,6 +1,7 @@ --- layout: base title: Programming Historian +lang: en ---
    diff --git a/parse_htmlproofer_log.rb b/parse_htmlproofer_log.rb new file mode 100644 index 0000000000..979d0873ba --- /dev/null +++ b/parse_htmlproofer_log.rb @@ -0,0 +1,41 @@ +require 'csv' + +input_path = "htmlproofer-output.txt" +output_path = "htmlproofer-report.csv" + +rows = [] +lines = File.readlines(input_path) +i = 0 + +while i < lines.size + line = lines[i] + + if line =~ /^\* At (.+?):(\d+):/ + file = $1.strip + lineno = $2.strip + + # Skip blank lines and fetch next non-empty message + message = nil + j = i + 1 + while j < lines.size + candidate = lines[j].strip + if !candidate.empty? + message = candidate + break + end + j += 1 + end + + rows << [file, lineno, message] if message + i = j + else + i += 1 + end +end + +CSV.open(output_path, "w") do |csv| + csv << ["File", "Line", "Message"] + rows.each { |row| csv << row } +end + +puts "✅ Parsed #{rows.size} errors to #{output_path}" \ No newline at end of file diff --git a/pt/apoiadores.md b/pt/apoiadores.md index 3615ba8eec..e27131b961 100644 --- a/pt/apoiadores.md +++ b/pt/apoiadores.md @@ -31,7 +31,7 @@ Contribuintes para o nosso [Programa de Parceria Institucional](/pt/ppi): - [Cambridge Digital Humanities](https://www.cdh.cam.ac.uk/), Reino Unido - [Georg-August-Universität Göttingen](https://www.uni-goettingen.de/), Alemanhã - [MIT Libraries](https://libraries.mit.edu/), Estados Unidos -- [Center for Digital Research in the Humanities, University of Nebraska-Lincoln](http://cdrh.unl.edu/), Estados Unidos +- [Center for Digital Research in the Humanities, University of Nebraska-Lincoln](https://cdrh.unl.edu/), Estados Unidos - [The National Archives](https://www.nationalarchives.gov.uk/), Reino Unido - [College of the Liberal Arts, Penn State University](https://la.psu.edu/), Estados Unidos - [University of Bristol Library](https://www.bristol.ac.uk/library/), Reino Unido diff --git a/pt/contribua.md b/pt/contribua.md index f6ef9ce59a..c5266196e1 100755 --- a/pt/contribua.md +++ 
b/pt/contribua.md @@ -22,7 +22,7 @@ Se desejar propor uma lição (escrita por si ou para outra pessoa escrever), [e {{ site.data.snippets.editor-guidelines-image-alt[page.lang] }} -Os membros do nosso conselho editorial ajudam a facilitar a revisão por pares e trabalham em estreita colaboração com os autores para melhorar as suas lições. As [nossas diretrizes para editores](directrizes-editor) visam assegurar que todos, desde os autores aos revisores, passando pelos membros da comunidade em geral, recebam uma experiência justa e consistente durante a revisão por pares. +Os membros do nosso conselho editorial ajudam a facilitar a revisão por pares e trabalham em estreita colaboração com os autores para melhorar as suas lições. As [nossas diretrizes para editores](/pt/directrizes-editor) visam assegurar que todos, desde os autores aos revisores, passando pelos membros da comunidade em geral, recebam uma experiência justa e consistente durante a revisão por pares. Periodicamente, podemos anunciar a procura de mais editores. @@ -48,9 +48,9 @@ Estamos especialmente gratos por dicas sobre lições que apresentam links quebr Este projeto é a nossa tentativa de demonstrar que a publicação acadêmica pode e deve ser de acesso aberto. Por favor, ajude-nos a divulgar essa mensagem e a proporcionar o maior acesso possível a este recurso, solicitando ao bibliotecário que inclua o projeto no catálogo da sua biblioteca. -O _Programming Historian_ está listado no WorldCat (em [português](https://search.worldcat.org/title/1332987197), [inglês](http://www.worldcat.org/title/programming-historian/oclc/951537099), [espanhol](https://www.worldcat.org/title/programming-historian-en-espanol/oclc/1061292935&referer=brief_results), e [francês](https://uva.worldcat.org/title/programming-historian-en-franais/oclc/1104391842)). 
+O _Programming Historian_ está listado no WorldCat (em [português](https://search.worldcat.org/title/1332987197), [inglês](https://www.worldcat.org/title/programming-historian/oclc/951537099), [espanhol](https://www.worldcat.org/title/programming-historian-en-espanol/oclc/1061292935&referer=brief_results), e [francês](https://uva.worldcat.org/title/programming-historian-en-franais/oclc/1104391842)). -Os nossos agradecimentos à [Biblioteca da Universidade de Purdue](http://purdue-primo-prod.hosted.exlibrisgroup.com/primo_library/libweb/action/dlDisplay.do?vid=PURDUE&search_scope=everything&docId=PURDUE_ALMA51671812890001081&fn=permalink), à Amanda Visconti e à Universidade da Virgínia. +Os nossos agradecimentos à [Biblioteca da Universidade de Purdue](https://purdue-primo-prod.hosted.exlibrisgroup.com/primo_library/libweb/action/dlDisplay.do?vid=PURDUE&search_scope=everything&docId=PURDUE_ALMA51671812890001081&fn=permalink), à Amanda Visconti e à Universidade da Virgínia. A versão em Inglês do projeto está indexada no [Directory of Open Access Journals](https://doaj.org/toc/2397-2068). diff --git a/pt/directrizes-editor.md b/pt/directrizes-editor.md index 2f6fcc9272..5e705e9902 100755 --- a/pt/directrizes-editor.md +++ b/pt/directrizes-editor.md @@ -22,12 +22,12 @@ Obrigado por editar uma lição para o *Programming Historian em português*. So Incentivamos sempre potenciais autores ou tradutores de lições a apresentar as suas ideias antes de começarem a escrever. Se uma proposta não é adequada para o *Programming Historian em português*, o papel do editor é informar o autor antes que ele tenha escrito a lição completa. Queremos com isto poupar o tempo e a energia de todos. Uma vez conversado com o autor e encorajadas as suas ideias, o objetivo é apoiar o autor até que a lição esteja pronta para publicar. O objetivo é ajudá-lo da forma mais eficiente possível e com directrizes claras. 
Para isso pode ser útil conhecer as nossas [diretrizes para autores](/pt/directrizes-autor). ### Espaços seguros -O *Programming Historian em português* compromete-se em manter um espaço seguro para a troca de ideias, onde todos possam falar sem medo de assédio ou abuso. O editor desempenha um papel fundamental ao garantir a tolerância neste espaço. O trabalho inclui reforçar permanentemente a nossa política anti-assédio. Se for preciso ajuda, basta perguntar a um dos [outros editores ou ao nosso mediador independente](/pt/equipe#programming-historian-em-português). Para saber mais, pode ler sobre o [compromisso com espaços seguros](/posts/PH-commitment-to-diversity) no blog do projeto. +O *Programming Historian em português* compromete-se em manter um espaço seguro para a troca de ideias, onde todos possam falar sem medo de assédio ou abuso. O editor desempenha um papel fundamental ao garantir a tolerância neste espaço. O trabalho inclui reforçar permanentemente a nossa política anti-assédio. Se for preciso ajuda, basta perguntar a um dos [outros editores ou ao nosso mediador independente](/pt/equipe). Para saber mais, pode ler sobre o [compromisso com espaços seguros](/posts/PH-commitment-to-diversity) no blog do projeto. ### Política anti-assédio Esta é uma declaração dos princípios do *Programming Historian em português* onde são definidas as expectativas para o tom e estilo de toda a comunicação entre revisores, autores, editores e participantes dos nossos fóruns públicos. -O *Programming Historian em português* dedica-se a criar um ambiente académico aberto em que os membros da comunidade podem examinar em liberdade e detalhadamente ideias, fazer perguntas, sugestões ou pedir esclarecimentos. Este espaço tem que ser livre de assédio para todos no projeto, independentemente do género, identidade e expressão de género, orientação sexual, deficiência, aparência física, raça, idade ou religião ou experiência técnica. 
Não é tolerado de nenhuma forma qualquer assédio ou ataque *ad hominem* a membros da comunidade. Os membros que violarem estas regras podem ser expulsos da comunidade, por avaliação do conselho editorial. Se alguém testemunhar ou sentir que foi vítima das atividades descritas acima, deve entrar em [contato com o nosso mediador independente](/pt/equipe#programming-historian-em-português). Obrigado por nos ajudar a criar um espaço seguro. +O *Programming Historian em português* dedica-se a criar um ambiente académico aberto em que os membros da comunidade podem examinar em liberdade e detalhadamente ideias, fazer perguntas, sugestões ou pedir esclarecimentos. Este espaço tem que ser livre de assédio para todos no projeto, independentemente do género, identidade e expressão de género, orientação sexual, deficiência, aparência física, raça, idade ou religião ou experiência técnica. Não é tolerado de nenhuma forma qualquer assédio ou ataque *ad hominem* a membros da comunidade. Os membros que violarem estas regras podem ser expulsos da comunidade, por avaliação do conselho editorial. Se alguém testemunhar ou sentir que foi vítima das atividades descritas acima, deve entrar em [contato com o nosso mediador independente](/pt/equipe). Obrigado por nos ajudar a criar um espaço seguro. ### Acompanhar as lições propostas Depois que uma proposta de lição receber "sinal verde" da equipa editorial e ter um editor atribuído, este trabalha com o autor para definir os objetivos da lição e acordar um prazo de submissão. O prazo recomendado é de 90 dias a contar do início da conversa editorial, embora possa ser ajustado, se necessário. @@ -63,7 +63,7 @@ O *Programming Historian em português* usa um modelo de revisão por pares aber Antes de solicitar revisões externas, o editor deve ler e experimentar o tutorial da lição e usar da sua experiência com o Programming Historian em português para ajudar o autor a fazer melhorias (se necessário). 
Não se espera que o editor seja um especialista no conteúdo da lição, esse é o papel dos [revisores](/pt/directrizes-revisor). -O editor deve ter uma visão geral para a sustentabilidade da lição e garantir que estão identificadas as versões, dependências e requisitos do software, as capturas de tela estão limitadas às necessárias para concluir a lição e que existe referência à documentação do próprio software (se disponível e apropriada). Os editores também devem garantir que as lições evitem, na medida do possível, instruções específicas do software, como "Clique com o botão direito do mouse no ícone x para aceder ao menu x", e sim favorecer visões gerais metodológicas. A [lista de validação editorial](/pt/directrizes-editor#c-revis%C3%A3o-para-a-sustentabilidade-e-internacionaliza%C3%A7%C3%A3o) contém mais detalhes sobre práticas de sustentabilidade do Programming Historian em português. +O editor deve ter uma visão geral para a sustentabilidade da lição e garantir que estão identificadas as versões, dependências e requisitos do software, as capturas de tela estão limitadas às necessárias para concluir a lição e que existe referência à documentação do próprio software (se disponível e apropriada). Os editores também devem garantir que as lições evitem, na medida do possível, instruções específicas do software, como "Clique com o botão direito do mouse no ícone x para aceder ao menu x", e sim favorecer visões gerais metodológicas. A [lista de validação editorial](#c-revisão-para-a-sustentabilidade-e-internacionalização) contém mais detalhes sobre práticas de sustentabilidade do Programming Historian em português. Muitas vezes, os editores precisam de ajuda para definir qual o público-alvo de uma lição ou identificar o jargão que precisa de explicação. Esta revisão inicial vai ajudar os revisores externos a concentrar-se em melhorar a lição. 
Normalmente, é feito abertamente no nosso sistema de submissão (abaixo), mas poderá ser uma revisão fechada a pedido de qualquer das partes. @@ -293,7 +293,7 @@ Aqui estão alguns locais para procurar imagens para a lição: - [British Library](https://www.flickr.com/photos/britishlibrary) - [Internet Archive Book Images](https://archive.org/details/bookimages) - [Virtual Manuscript Library of Switzerland](https://www.e-codices.unifr.ch/en) - - [Library of Congress Maps](http://www.loc.gov/maps/collections) + - [Library of Congress Maps](https://www.loc.gov/maps/collections) É preciso verificar se a imagem corresponde ao estilo das anteriores (deve ser uma imagem de livro, não uma fotografia), ter pelo menos 200 pixels em ambas as dimensões e não ter restrições de direitos de autor. A imagem não pode ser ofensiva e deve seguir o nosso [compromisso com a diversidade (em inglês)](/posts/PH-commitment-to-diversity). Convém encontrar algo que não perpetue estereótipos ou tenha uma mensagem subliminar de machismo ou superioridade branca. @@ -358,7 +358,7 @@ Existem várias maneiras de executar um *pull request* para publicar os ficheiro * A) Siga as nossas ["Orientações para contribuições técnicas"](https://github.com/programminghistorian/jekyll/wiki/Making-Technical-Contributions), que usam a GUI do Github. -* B) Use `git` na linha de comando. As instruções que se seguem assumem que os repositórios `jekyll` e` ph-submissions` já estão copiados para a máquina local. (a [lição sobre o uso do GitHub Desktop](/lessons/getting-started-with-github-desktop) pode ser útil se tudo isto for novidade.) Em caso de incertezas ou dúvidas sobre como fazer, entrar em contato com Matthew Lincoln para ajudar. +* B) Use `git` na linha de comando. As instruções que se seguem assumem que os repositórios `jekyll` e `ph-submissions` já estão copiados para a máquina local.
(a [lição sobre o uso do GitHub Desktop](/en/lessons/getting-started-with-github-desktop) pode ser útil se tudo isto for novidade.) Em caso de incertezas ou dúvidas sobre como fazer, entrar em contato com Matthew Lincoln para ajudar. 1. Ir para o diretório do repositório local `ph-submissions`. 2. Usar o comando `git pull` para obter todas as alterações mais recentes na máquina local (ou `sync` se estiver no GitHub Desktop) @@ -405,7 +405,7 @@ O status destes testes (geralmente chamado de "Status de Compilação (Build sta Assim, pode ser vista a lista de todas as alterações feitas no repositório principal, juntamente com o ícone do status: -- Check verde: indica que a página está pronta para ir ao ar! Todos os links da página foram verificados e considerados válidos. [**O restante desta lição pode ser ignorado e passar directamente para a secção de agradecimento.**](#11-thank-everyone-and-encourage-promotion) +- Check verde: indica que a página está pronta para ir ao ar! Todos os links da página foram verificados e considerados válidos. [**O restante desta lição pode ser ignorado e passar directamente para a secção de agradecimento.**](#7-agradecer-a-todos-e-incentivar-a-divulgação) - Círculo amarelo: o último commit ainda está a compilar. Dentro de 1-2 minutos deve estar pronto. - X vermelho: houve um erro na compilação. diff --git a/pt/directrizes-revisor.md b/pt/directrizes-revisor.md index b541ac787c..1b6a399856 100755 --- a/pt/directrizes-revisor.md +++ b/pt/directrizes-revisor.md @@ -26,12 +26,12 @@ Valorizamos muito a transparência no nosso processo de produção e revisão de Portanto, o trabalho como revisor - e a sua identidade - será totalmente visível para o autor. Os comentários devem envolver diretamente o autor e a lição, e não o revisor. Se em algum momento não tiver certeza do seu papel ou do que fazer a seguir, sinta-se à vontade para fazer uma pergunta e um editor responderá assim que puder. 
-Seguindo as ideias de investigação pública e de revisão aberta por pares, geralmente incentivamos as discussões a permanecer no GitHub, conforme descrito no fluxo de trabalho editorial. No entanto, queremos também que todos se sintam à vontade. Em alguns casos, uma conversa privada pode ser mais apropriada. Se sentir a necessidade de discutir um assunto relacionado a um tutorial ou relacionado à revisão, por favor, sinta-se à vontade para enviar um [email diretamente para o editor designado](/pt/equipe#programming-historian-em-português), ou entrar em contato com o [nosso mediador independente](/pt/equipe#programming-historian-em-português). +Seguindo as ideias de investigação pública e de revisão aberta por pares, geralmente incentivamos as discussões a permanecer no GitHub, conforme descrito no fluxo de trabalho editorial. No entanto, queremos também que todos se sintam à vontade. Em alguns casos, uma conversa privada pode ser mais apropriada. Se sentir a necessidade de discutir um assunto relacionado a um tutorial ou relacionado à revisão, por favor, sinta-se à vontade para enviar um [email diretamente para o editor designado](/pt/equipe), ou entrar em contato com o [nosso mediador independente](/pt/equipe). A menos que seja dada indicação contrária, o nome dos revisores estará na página da lição quando for publicada oficialmente e também estará listado na nossa página de colaboradores. ### Abertura e Inclusão -O _Programming Historian em português_ se dedica a prover um ambiente académico aberto que oferece aos participantes da comunidade a liberdade de examinar minuciosamente as ideias, fazer perguntas, fazer sugestões ou pedidos de esclarecimento. Insistimos em ter um espaço livre de assédio para todos os colaboradores do projeto, independentemente do género, identidade e expressão de género, orientação sexual, deficiência, aparência física, tamanho corporal, raça, idade ou religião ou experiência técnica. 
Não toleramos o assédio ou ataques _ad hominem_ aos participantes da comunidade, de nenhuma forma. Os participantes que violem estas regras podem ser expulsos da comunidade, a critério do conselho editorial. Se alguém testemunhar ou sentir que foi vítima da atividade acima descrita, por favor [contacte o nosso mediador independente](/pt/equipe#programming-historian-em-português). Obrigado por nos ajudar a criar um espaço seguro. +O _Programming Historian em português_ se dedica a prover um ambiente académico aberto que oferece aos participantes da comunidade a liberdade de examinar minuciosamente as ideias, fazer perguntas, fazer sugestões ou pedidos de esclarecimento. Insistimos em ter um espaço livre de assédio para todos os colaboradores do projeto, independentemente do género, identidade e expressão de género, orientação sexual, deficiência, aparência física, tamanho corporal, raça, idade ou religião ou experiência técnica. Não toleramos o assédio ou ataques _ad hominem_ aos participantes da comunidade, de nenhuma forma. Os participantes que violem estas regras podem ser expulsos da comunidade, a critério do conselho editorial. Se alguém testemunhar ou sentir que foi vítima da atividade acima descrita, por favor [contacte o nosso mediador independente](/pt/equipe). Obrigado por nos ajudar a criar um espaço seguro. ## O que comentar O estilo informal das lições do _Programming Historian em português_ pode fazer com que estas pareçam enganosamente simples de escrever. De fato, escrever um bom tutorial é tão exigente, se não mais, do que qualquer outro tipo de escrita académica. Abaixo estão algumas perguntas comuns a ter em mente ao rever uma lição. Algumas serão mais relevantes do que outras, dependendo do tema, do público-alvo e da dificuldade do tutorial. É importante ressaltar que esta não é uma lista restritiva ou abrangente; não pedimos aos revisores que respondam a cada uma das perguntas, mas esperamos que possam fornecer alguma orientação geral. 
@@ -79,4 +79,4 @@ Integrando a lição no _Programming Historian em português_ - A lição está ligada a outras lições existentes e tem os links apropriados? ## Submeter a revisão -Gerimos todos os comentários de revisão por pares através do GitHub. Quando uma nova lição estiver pronta para revisão, o editor fornecerá um link para a lição e um link para o fórum de discussão onde podem ser feitos comentários construtivos. Esta discussão é realizada no Github, um ambiente de codificação social livre. É preciso se inscrever numa conta gratuita do GitHub para publicar o comentário. Encorajamos que a discussão permaneça no Github, mas pode enviar um e-mail ao editor em particular ou entrar em contato com o [nosso mediador independente](/pt/equipe#programming-historian-em-português). +Gerimos todos os comentários de revisão por pares através do GitHub. Quando uma nova lição estiver pronta para revisão, o editor fornecerá um link para a lição e um link para o fórum de discussão onde podem ser feitos comentários construtivos. Esta discussão é realizada no Github, um ambiente de codificação social livre. É preciso se inscrever numa conta gratuita do GitHub para publicar o comentário. Encorajamos que a discussão permaneça no Github, mas pode enviar um e-mail ao editor em particular ou entrar em contato com o [nosso mediador independente](/pt/equipe). diff --git a/pt/directrizes-tradutor.md b/pt/directrizes-tradutor.md index 933e88830f..cf5a09e71b 100644 --- a/pt/directrizes-tradutor.md +++ b/pt/directrizes-tradutor.md @@ -32,7 +32,7 @@ Todas as nossas lições também devem ser escritas em Markdown e seguir as noss ## Submeter uma lição traduzida Depois do ficheiro de tradução ter as especificações acima mencionadas, estará pronto a ser enviado para revisão por pares. 
-Temos uma página do [_Programming Historian em português_ no GitHub](https://github.com/programminghistorian), onde mantemos dois repositórios (um repositório é um local para armazenar ficheiros e pastas relacionados, ou seja, um tipo de pasta). Um deles, chamado [jekyll](https://github.com/programminghistorian/jekyll), hospeda o código da versão online do site disponível em http://programminghistorian.org. O outro repositório é chamado [ph-submissions](https://github.com/programminghistorian/ph-submissions). +Temos uma página do [_Programming Historian em português_ no GitHub](https://github.com/programminghistorian), onde mantemos dois repositórios (um repositório é um local para armazenar ficheiros e pastas relacionados, ou seja, um tipo de pasta). Um deles, chamado [jekyll](https://github.com/programminghistorian/jekyll), hospeda o código da versão online do site disponível em https://programminghistorian.org. O outro repositório é chamado [ph-submissions](https://github.com/programminghistorian/ph-submissions). A melhor maneira para enviar uma tradução é adicioná-la diretamente ao repositório [ph-submissions](https://github.com/programminghistorian/ph-submissions). Graças aos recursos do GitHub, pode fazer isso usando ações de arrastar e soltar, com as quais provavelmente já está familiarizado. Para os novos tradutores, estas são as etapas: @@ -55,7 +55,7 @@ A etapa seguinte mais importante é que seu editor crie um ticket para a nova tr ### Aguardar comentários do revisor O nosso objetivo é concluir o processo de revisão dentro de quatro semanas, mas às vezes ocorrem atrasos ou as pessoas ficam ocupadas, pelo que o processo pode demorar mais do que esperávamos. -Seguindo os ideais da investigação académica pública e de revisão aberta por pares, incentivamos as discussões a permanecer no GitHub, para serem abertas a todos. No entanto, também queremos que todos se sintam confortáveis com o processo. 
Se precisar de discutir algo em particular, não hesite em enviar um [e-mail diretamente ao seu editor](/pt/equipe#programming-historian-em-português) ou entre em [contato com o nosso mediador independente](/pt/equipe#programming-historian-em-português). +Seguindo os ideais da investigação académica pública e de revisão aberta por pares, incentivamos as discussões a permanecer no GitHub, para serem abertas a todos. No entanto, também queremos que todos se sintam confortáveis com o processo. Se precisar de discutir algo em particular, não hesite em enviar um [e-mail diretamente ao seu editor](/pt/equipe) ou entre em [contato com o nosso mediador independente](/pt/equipe). ### Responder aos comentários YO editor e os revisores provavelmente farão algumas sugestões de melhoria no ticket da sua tradução. O editor deve esclarecer quais sugestões são essenciais, quais são opcionais e quais podem ser deixadas de lado. diff --git a/pt/doacoes.md b/pt/doacoes.md index c02fae4f11..6316d0e38f 100644 --- a/pt/doacoes.md +++ b/pt/doacoes.md @@ -19,7 +19,7 @@ O seu suporte contribui diretamente para manter a infraestrutura de divulgação
    - + diff --git a/pt/index.md b/pt/index.md index 2cfb95b62f..1c3762cf97 100644 --- a/pt/index.md +++ b/pt/index.md @@ -2,6 +2,7 @@ layout: base title: O Programming Historian em português original: index +permalink: /pt/ ---
    @@ -31,13 +32,13 @@ original: index

    Contribua

    -

    Escreva uma lição, junte-se à nossa equipe de revisores, dê o seu feedback. Estamos sempre interessados em ouvir os nossos leitores!

    +

    Escreva uma lição, junte-se à nossa equipe de revisores, dê o seu feedback. Estamos sempre interessados em ouvir os nossos leitores!

    A Nossa Equipe

    -

    Sendo o Programming Historian em português uma iniciativa voluntária, voltada para a comunidade, nos orgulhamos de mostrar e dar crédito às muitas pessoas que têm contribuído com o seu tempo e energia para o projeto.

    +

    Sendo o Programming Historian em português uma iniciativa voluntária, voltada para a comunidade, nos orgulhamos de mostrar e dar crédito às muitas pessoas que têm contribuído com o seu tempo e energia para o projeto.

    diff --git a/pt/licoes-politica-remocao.md b/pt/licoes-politica-remocao.md index 97871730d3..460feb104b 100755 --- a/pt/licoes-politica-remocao.md +++ b/pt/licoes-politica-remocao.md @@ -29,11 +29,11 @@ Quer uma versão derivada seja criada ou não, a lição a remover seguirá as s ## Mais sobre sustentabilidade das lições -[Directrizes para Autores para escrever de maneira sustentável](/directrizes-autor#escrita-sustentável) +[Directrizes para Autores para escrever de maneira sustentável](/pt/directrizes-autor#escrita-sustentável) -[Diretrizes para Revisores para avaliar a sustentabilidade da lição](/directrizes-revisor#sustentabilidade) +[Diretrizes para Revisores para avaliar a sustentabilidade da lição](/pt/directrizes-revisor#sustentabilidade) -[Directrizes para Editores para promoverem a sustentabilidade da lição](/directrizes-editor#c-revisão-para-a-sustentabilidade-e-internacionalização) +[Directrizes para Editores para promoverem a sustentabilidade da lição](/pt/directrizes-editor#c-revisão-para-a-sustentabilidade-e-internacionalização) ## Lições removidas diff --git a/pt/licoes/HTML-lista-palavras-1.md b/pt/licoes/HTML-lista-palavras-1.md index 107882f503..f339c4177e 100644 --- a/pt/licoes/HTML-lista-palavras-1.md +++ b/pt/licoes/HTML-lista-palavras-1.md @@ -1,152 +1,152 @@ ---- -title: De HTML para Lista de Palavras (parte 1) -layout: lesson -collection: lessons -slug: HTML-lista-palavras-1 -date: 2012-07-17 -translation_date: 2022-10-27 -authors: -- William J. 
Turkel -- Adam Crymble -reviewers: -- Jim Clifford -- Frederik Elwert -editors: -- Miriam Posner -translator: -- Felipe Lamarca -translation-editor: -- Jimmy Medeiros -translation-reviewer: -- Daniel Bonatto Seco -- Diana Rebelo Rodriguez -difficulty: 2 -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/442 -next: HTML-lista-palavras-1 -series_total: 2 lessons -sequence: 1 -activity: transforming -topics: [python] -abstract: "Nesta lição de duas partes, aprofundaremos o que aprendeu sobre o Download de Páginas Web com Python, aprendendo como remover a marcação HTML de uma página web da transcrição do julgamento criminal de Benjamin Bowsey em 1780. Faremos isso usando uma variedade de operadores de string, métodos de string e habilidades de leitura atenta. Introduziremos looping e branching de modo que os programas possam repetir tarefas e testar certas condições, tornando possível a separação do conteúdo das tags HTML. Finalmente, faremos a conversão do conteúdo de uma string longa para uma lista de palavras, que podem ser ordenadas, indexadas e contabilizadas posteriormente." -original: from-html-to-list-of-words-1 -avatar_alt: Uma girafa a ser imitada por um humano -doi: 10.46430/phpt0027 ---- - -{% include toc.html %} - -
    -O site do Old Bailey Online foi recentemente atualizado. Infelizmente, devido às diversas mudanças, muitos (se não todos) os elementos do site de exemplo usado nesta lição não funcionarão conforme descrito. No entanto, as metodologias ensinadas por esta lição permanecem relevantes e podem ser adaptadas pelos leitores para um site de exemplo diferente. Estamos trabalhando na adaptação da lição para o novo site do Old Bailey Online, mas ainda não temos cronograma preciso de quando a lição será atualizada. [Abril de 2024] -
    - -## Objetivos da lição - -Nesta lição de duas partes, aprofundaremos o que aprendeu sobre o [Download de Páginas Web com Python](/pt/licoes/download-paginas-web-python), aprendendo como remover a *marcação HTML* de uma página web da [transcrição do julgamento criminal de Benjamin Bowsey em 1780](https://perma.cc/8LM6-W39K). Faremos isso usando uma variedade de *operadores de string*, *métodos de string* e habilidades de leitura atenta. Introduziremos *looping* e *branching* de modo que os programas possam repetir tarefas e testar certas condições, tornando possível a separação do conteúdo das tags HTML. Finalmente, faremos a conversão do conteúdo de uma string longa para uma *lista de palavras*, que podem ser ordenadas, indexadas e contabilizadas posteriormente. - -## O Desafio - -Para ter uma ideia mais clara da tarefa que temos pela frente, abra o ficheiro *obo-t17800628-33.html* que criou em [Download de Páginas Web com Python](/pt/licoes/download-paginas-web-python) (ou faça o [download e guarde a transcrição do julgamento](/assets/from-html-to-list-of-words-1/obo-t17800628-33.html) caso ainda não tenha uma cópia) e depois verifique o código-fonte do HTML clicando em *Ferramentas -> Ferramentas do Navegador -> Fonte da página* (para usuários do navegador Firefox). À medida que for olhando o código-fonte, notará que há tags HTML misturadas com texto. Caso não tenha experiência com HTML, recomendamos que faça o tutorial do W3 Schools [HTML](http://www.w3schools.com/html/) para se familiarizar com a marcação HTML. Se o seu trabalho frequentemente requer que remova a marcação HTML, certamente será útil entendê-la ao visualizá-la. - -## Ficheiros Necessários para esta Lição - -- *[obo-t17800628-33.html](/assets/from-html-to-list-of-words-1/obo-t17800628-33.html)* - -## Idealizando um Algoritmo - -Uma vez que o objetivo é nos livrarmos do HTML, o primeiro passo é criar um algoritmo que retorna apenas o texto (removendo as tags HTML) do artigo. 
Um algoritmo é um procedimento suficientemente detalhado a ponto de poder ser implementado em um computador. Facilita escrever o seu algoritmo no português direto; é uma ótima maneira de delinear exatamente o que deseja fazer antes de mergulhar no código. Para construir esse algoritmo, utilizaremos as nossas habilidades de leitura atenta para descobrir um modo de capturar apenas o conteúdo textual da biografia. - -Ao verificar o código-fonte do *obo-t17800628-33.html*, notará que a transcrição real não começa imediatamente. Na verdade, há um número de tags HTML e algumas informações de citação. Nesse caso, o conteúdo não começa antes da linha 81! - -``` xml -

    324. BENJAMIN BOWSEY (a blackmoor ) was indicted for that he together with five hundred other persons and more, did, unlawfully, riotously, and tumultuously assemble on the 6th of June -``` - -Estamos interessados apenas na transcrição em si e não nos metadados extras contidos nas tags. No entanto, irá notar que o final dos metadados corresponde ao início da transcrição. Isso torna a localização dos metadados uma marcação potencialmente útil para isolar o texto transcrito. - -À primeira vista, percebemos que a transcrição do julgamento em si começa com uma tag HTML: `

    `, que significa 'parágrafo'. Essa é coincidentemente a primeira tag de parágrafo no documento. Podemos usar isso para encontrar o ponto de partida do nosso texto transcrito. Temos sorte nesse caso porque essa tag é uma maneira confiável de encontrar o início do texto transcrito no julgamento (caso deseje, dê uma olhada em alguns outros julgamentos para verificar). - -O texto do julgamento termina na linha 82 com outra tag HTML: `
    `, que significa uma quebra de linha. Essa é a última quebra de linha no documento. Essas duas tags (tag de primeiro parágrafo e última quebra de linha), portanto, nos oferecem uma forma de isolar o texto desejado. Sites bem formatados quase sempre terão uma forma única de sinalizar o fim de um conteúdo. Você frequentemente só precisa verificar de forma atenta. - -A próxima tarefa é remover toda a marcação HTML que permanece mesclada ao conteúdo. Como sabe que tags HTML são sempre encontradas em pares correspondentes de parênteses angulares, é provavelmente uma aposta segura o fato de que, se remover tudo o que estiver entre parênteses angulares, todo o HTML será removido e restará somente a transcrição. Note que estamos assumindo que a transcrição não possuirá os símbolos matemáticos de "menor que" ou "maior que". Se Bowsey fosse um matemático, essa suposição não seria tão segura. - -A seguir, descreve-se o algoritmo em palavras. - -Para isolar o conteúdo: - -- Fazer o download do texto transcrito -- Buscar no HTML e guardar a localização da primeira tag `

    ` -- Buscar no HTML e guardar a localização da última tag `
    ` -- Armazenar tudo que vier após a tag `

    ` e antes da tag `
    ` numa string: *pageContents* - -Neste ponto, temos o texto da transcrição do julgamento, além da marcação HTML. Em seguida: - -- Verificar cada caractere na string *pageContents*, um por um -- Se o caractere for um colchete angular esquerdo (\<), estamos dentro de uma tag e deve-se ignorar os caracteres subsequentes -- Se o caractere for um colchete angular direito (\>), estamos deixando a tag; deve-se ignorar este caractere, mas verificar cada um dos caracteres subsequentes -- Se não estivermos dentro de uma tag, adiciona-se cada caractere a uma nova variável: *text* - -Finalmente: - -- Separar a string de texto em uma lista de palavras individuais, que podem ser manipuladas posteriormente. - -## Isolar o Conteúdo Desejado - -Os próximos passos utilizam os comandos de Python introduzidos na lição [Manipular strings com Python](/pt/licoes/manipular-strings-python) para implementar a primeira metade do algoritmo: remover todo o conteúdo antes da tag `

    ` e depois da tag `
    `. Para recapitular, o algoritmo era o seguinte: - -- Fazer o download do texto transcrito -- Buscar no HTML e guardar a localização da primeira tag `

    ` -- Buscar no HTML e guardar a localização da última tag `
    ` -- Armazenar tudo que vier após a tag `

    ` e antes da tag `
    ` numa string: *pageContents* - -Para fazer isso, você utilizará o método de string 'find', o método .rfind() (que encontra a última correspondência de algo) e criará uma nova substring contendo apenas o conteúdo desejado entre essas posições de índice. - -Enquanto trabalha, desenvolverá ficheiros separados para armazenar o seu código. Um deles será chamado `obo.py` (para "Old Bailey Online"). Esse ficheiro conterá todo o código que deseja reutilizar; em outras palavras, `obo.py` é um módulo. Discutimos a ideia de módulo em [Reutilização de código e modularidade em Python](/pt/licoes/reutilizacao-codigo-modularidade-python), quando salvamos nossas funções em `cumprimento.py`. - -Crie um novo ficheiro chamado `obo.py` e armazene-o no seu diretório *programming-historian*. Utilizaremos esse ficheiro para manter cópias das funções necessárias para processar o The Old Bailey Online. Digite ou copie o código a seguir no seu ficheiro: - -``` python -# obo.py - -def stripTags(pageContents): - pageContents = str(pageContents) - startLoc = pageContents.find("

    ") - endLoc = pageContents.rfind("
    ") - - pageContents = pageContents[startLoc:endLoc] - return pageContents -``` - -Crie um segundo ficheiro, `trial-content.py`, e salve o programa mostrado abaixo: - - -``` python -# trial-content.py - -import urllib.request, urllib.error, urllib.parse, obo - -url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33' - -response = urllib.request.urlopen(url) -HTML = response.read().decode('UTF-8') - -print((obo.stripTags(HTML))) -``` - -Quando executar o `trial-content.py`, ele acessará a página web da transcrição do julgamento de Bowsey e depois verificará o módulo `obo.py` para buscar a função *stripTags*. Ele utilizará essa função para extrair tudo após a primeira tag `

    ` e antes da última tag `
    `. Com alguma sorte, esse deve ser o conteúdo textual da transcrição de Bowsey, além de alguma marcação HTML. Não se preocupe se a sua tela de Saída de Comando terminar em uma linha preta grossa. A tela de saída do Komodo Edit possui um número máximo de caracteres para exibição, após o qual os caracteres começarão a literalmente escrever uns sobre os outros na tela, dando a aparência de uma linha preta. Não se preocupe: o texto está lá, ainda que não consiga vê-lo; pode cortá-lo e colá-lo em um ficheiro de texto para verificar. - -Vamos reservar um momento para ter certeza de que entendemos como `trial-contents.py` é capaz de usar as funções armazenadas em `obo.py`. A função *stripTags* que salvamos em `obo.py` requer um argumento. Em outras palavras, para que seja executada apropriadamente ela precisa que uma informação seja oferecida. Lembre-se do exemplo do cão treinado na lição anterior. Para latir, o cachorro precisa de duas coisas: ar e uma guloseima deliciosa. A função *stripTags* em `obo.py` precisa de uma coisa: a string chamada *pageContents*. Mas você perceberá que, quando chamamos *stripTags* no programa final (`trial-contents.py`), não há menção ao "*pageContents*". Em vez disso, a função recebe HTML como um argumento. Isso pode ser confuso para muitas pessoas quando começam a programar. Uma vez que uma função foi declarada, não precisamos usar o mesmo nome de variável quando chamamos a função. Desde que forneçamos o mesmo tipo de argumento, tudo deve funcionar bem, independente de como o chamarmos. Nesse caso, queríamos que *pageContents* usasse o conteúdo da nossa variável HTML. Você poderia ter passado qualquer string, inclusive uma que você insira diretamente entre aspas. Tente executar novamente `trial-content.py`, alterando o argumento de *stripTags* para "Eu gosto muito de cachorros" e veja o que acontece. 
Note que, dependendo de como defina a sua função (e o que ela faz), o seu argumento pode precisar ser algo que não seja uma string: um número inteiro (*integer*), por exemplo. - -Leituras sugeridas ------------------ - -- Lutz, *Learning Python* - - Ch. 7: Strings - - Ch. 8: Lists and Dictionaries - - Ch. 10: Introducing Python Statements - - Ch. 15: Function Basics - -## Sincronização de Código - -Para acompanhar lições futuras, é importante ter os ficheiros e programas corretos no seu diretório “programming-historian”. No final de cada lição, é possível fazer o download do ficheiro zip “programming-historian” para ter a certeza de que o ficheiro correto está a ser utilizado. Observe que removemos os ficheiros desnecessários das lições anteriores. Seu diretório pode conter mais ficheiros e não há problema! - -- programming-historian-2 ([zip](/assets/python-lessons2.zip)) +--- +title: De HTML para Lista de Palavras (parte 1) +layout: lesson +collection: lessons +slug: HTML-lista-palavras-1 +date: 2012-07-17 +translation_date: 2022-10-27 +authors: +- William J. Turkel +- Adam Crymble +reviewers: +- Jim Clifford +- Frederik Elwert +editors: +- Miriam Posner +translator: +- Felipe Lamarca +translation-editor: +- Jimmy Medeiros +translation-reviewer: +- Daniel Bonatto Seco +- Diana Rebelo Rodriguez +difficulty: 2 +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/442 +next: /pt/licoes/HTML-lista-palavras-2 +series_total: 2 lessons +sequence: 1 +activity: transforming +topics: [python] +abstract: "Nesta lição de duas partes, aprofundaremos o que aprendeu sobre o Download de Páginas Web com Python, aprendendo como remover a marcação HTML de uma página web da transcrição do julgamento criminal de Benjamin Bowsey em 1780. Faremos isso usando uma variedade de operadores de string, métodos de string e habilidades de leitura atenta.
Introduziremos looping e branching de modo que os programas possam repetir tarefas e testar certas condições, tornando possível a separação do conteúdo das tags HTML. Finalmente, faremos a conversão do conteúdo de uma string longa para uma lista de palavras, que podem ser ordenadas, indexadas e contabilizadas posteriormente." +original: from-html-to-list-of-words-1 +avatar_alt: Uma girafa a ser imitada por um humano +doi: 10.46430/phpt0027 +--- + +{% include toc.html %} + +

    +O site do Old Bailey Online foi recentemente atualizado. Infelizmente, devido às diversas mudanças, muitos (se não todos) os elementos do site de exemplo usado nesta lição não funcionarão conforme descrito. No entanto, as metodologias ensinadas por esta lição permanecem relevantes e podem ser adaptadas pelos leitores para um site de exemplo diferente. Estamos trabalhando na adaptação da lição para o novo site do Old Bailey Online, mas ainda não temos cronograma preciso de quando a lição será atualizada. [Abril de 2024] +
    + +## Objetivos da lição + +Nesta lição de duas partes, aprofundaremos o que aprendeu sobre o [Download de Páginas Web com Python](/pt/licoes/download-paginas-web-python), aprendendo como remover a *marcação HTML* de uma página web da [transcrição do julgamento criminal de Benjamin Bowsey em 1780](https://perma.cc/8LM6-W39K). Faremos isso usando uma variedade de *operadores de string*, *métodos de string* e habilidades de leitura atenta. Introduziremos *looping* e *branching* de modo que os programas possam repetir tarefas e testar certas condições, tornando possível a separação do conteúdo das tags HTML. Finalmente, faremos a conversão do conteúdo de uma string longa para uma *lista de palavras*, que podem ser ordenadas, indexadas e contabilizadas posteriormente. + +## O Desafio + +Para ter uma ideia mais clara da tarefa que temos pela frente, abra o ficheiro *obo-t17800628-33.html* que criou em [Download de Páginas Web com Python](/pt/licoes/download-paginas-web-python) (ou faça o [download e guarde a transcrição do julgamento](/assets/from-html-to-list-of-words-1/obo-t17800628-33.html) caso ainda não tenha uma cópia) e depois verifique o código-fonte do HTML clicando em *Ferramentas -> Ferramentas do Navegador -> Fonte da página* (para usuários do navegador Firefox). À medida que for olhando o código-fonte, notará que há tags HTML misturadas com texto. Caso não tenha experiência com HTML, recomendamos que faça o tutorial do W3 Schools [HTML](https://www.w3schools.com/html/) para se familiarizar com a marcação HTML. Se o seu trabalho frequentemente requer que remova a marcação HTML, certamente será útil entendê-la ao visualizá-la. + +## Ficheiros Necessários para esta Lição + +- *[obo-t17800628-33.html](/assets/from-html-to-list-of-words-1/obo-t17800628-33.html)* + +## Idealizando um Algoritmo + +Uma vez que o objetivo é nos livrarmos do HTML, o primeiro passo é criar um algoritmo que retorna apenas o texto (removendo as tags HTML) do artigo. 
Um algoritmo é um procedimento suficientemente detalhado a ponto de poder ser implementado em um computador. Facilita escrever o seu algoritmo no português direto; é uma ótima maneira de delinear exatamente o que deseja fazer antes de mergulhar no código. Para construir esse algoritmo, utilizaremos as nossas habilidades de leitura atenta para descobrir um modo de capturar apenas o conteúdo textual da transcrição. + +Ao verificar o código-fonte do *obo-t17800628-33.html*, notará que a transcrição real não começa imediatamente. Na verdade, há um número de tags HTML e algumas informações de citação. Nesse caso, o conteúdo não começa antes da linha 81! + +``` xml +

    <p>324. BENJAMIN BOWSEY (a blackmoor ) was indicted for that he together with five hundred other persons and more, did, unlawfully, riotously, and tumultuously assemble on the 6th of June +``` + +Estamos interessados apenas na transcrição em si e não nos metadados extras contidos nas tags. No entanto, irá notar que o final dos metadados corresponde ao início da transcrição. Isso torna a localização dos metadados uma marcação potencialmente útil para isolar o texto transcrito. + +À primeira vista, percebemos que a transcrição do julgamento em si começa com uma tag HTML: `<p>`, que significa 'parágrafo'. Essa é coincidentemente a primeira tag de parágrafo no documento. Podemos usar isso para encontrar o ponto de partida do nosso texto transcrito. Temos sorte nesse caso porque essa tag é uma maneira confiável de encontrar o início do texto transcrito no julgamento (caso deseje, dê uma olhada em alguns outros julgamentos para verificar). + +O texto do julgamento termina na linha 82 com outra tag HTML: `<br/>`, que significa uma quebra de linha. Essa é a última quebra de linha no documento. Essas duas tags (tag de primeiro parágrafo e última quebra de linha), portanto, nos oferecem uma forma de isolar o texto desejado. Sites bem formatados quase sempre terão uma forma única de sinalizar o fim de um conteúdo. Você frequentemente só precisa verificar de forma atenta. + +A próxima tarefa é remover toda a marcação HTML que permanece mesclada ao conteúdo. Como sabe que tags HTML são sempre encontradas em pares correspondentes de parênteses angulares, é provavelmente uma aposta segura o fato de que, se remover tudo o que estiver entre parênteses angulares, todo o HTML será removido e restará somente a transcrição. Note que estamos assumindo que a transcrição não possuirá os símbolos matemáticos de "menor que" ou "maior que". Se Bowsey fosse um matemático, essa suposição não seria tão segura. + +A seguir, descreve-se o algoritmo em palavras. + +Para isolar o conteúdo: + +- Fazer o download do texto transcrito +- Buscar no HTML e guardar a localização da primeira tag `

    ` +- Buscar no HTML e guardar a localização da última tag `
    ` +- Armazenar tudo que vier após a tag `

    ` e antes da tag `
    ` numa string: *pageContents* + +Neste ponto, temos o texto da transcrição do julgamento, além da marcação HTML. Em seguida: + +- Verificar cada caractere na string *pageContents*, um por um +- Se o caractere for um colchete angular esquerdo (\<), estamos dentro de uma tag e deve-se ignorar os caracteres subsequentes +- Se o caractere for um colchete angular direito (\>), estamos deixando a tag; deve-se ignorar este caractere, mas verificar cada um dos caracteres subsequentes +- Se não estivermos dentro de uma tag, adiciona-se cada caractere a uma nova variável: *text* + +Finalmente: + +- Separar a string de texto em uma lista de palavras individuais, que podem ser manipuladas posteriormente. + +## Isolar o Conteúdo Desejado + +Os próximos passos utilizam os comandos de Python introduzidos na lição [Manipular strings com Python](/pt/licoes/manipular-strings-python) para implementar a primeira metade do algoritmo: remover todo o conteúdo antes da tag `

    ` e depois da tag `
    `. Para recapitular, o algoritmo era o seguinte: + +- Fazer o download do texto transcrito +- Buscar no HTML e guardar a localização da primeira tag `

    ` +- Buscar no HTML e guardar a localização da última tag `
    ` +- Armazenar tudo que vier após a tag `

    ` e antes da tag `
    ` numa string: *pageContents* + +Para fazer isso, você utilizará o método de string 'find', o método .rfind() (que encontra a última correspondência de algo) e criará uma nova substring contendo apenas o conteúdo desejado entre essas posições de índice. + +Enquanto trabalha, desenvolverá ficheiros separados para armazenar o seu código. Um deles será chamado `obo.py` (para "Old Bailey Online"). Esse ficheiro conterá todo o código que deseja reutilizar; em outras palavras, `obo.py` é um módulo. Discutimos a ideia de módulo em [Reutilização de código e modularidade em Python](/pt/licoes/reutilizacao-codigo-modularidade-python), quando salvamos nossas funções em `cumprimento.py`. + +Crie um novo ficheiro chamado `obo.py` e armazene-o no seu diretório *programming-historian*. Utilizaremos esse ficheiro para manter cópias das funções necessárias para processar o The Old Bailey Online. Digite ou copie o código a seguir no seu ficheiro: + +``` python +# obo.py + +def stripTags(pageContents): + pageContents = str(pageContents) + startLoc = pageContents.find("<p>") + endLoc = pageContents.rfind("<br/>") + + pageContents = pageContents[startLoc:endLoc] + return pageContents +``` + +Crie um segundo ficheiro, `trial-content.py`, e salve o programa mostrado abaixo: + + +``` python +# trial-content.py + +import urllib.request, urllib.error, urllib.parse, obo + +url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33' + +response = urllib.request.urlopen(url) +HTML = response.read().decode('UTF-8') + +print((obo.stripTags(HTML))) +``` + +Quando executar o `trial-content.py`, ele acessará a página web da transcrição do julgamento de Bowsey e depois verificará o módulo `obo.py` para buscar a função *stripTags*. Ele utilizará essa função para extrair tudo após a primeira tag `

    ` e antes da última tag `
    `. Com alguma sorte, esse deve ser o conteúdo textual da transcrição de Bowsey, além de alguma marcação HTML. Não se preocupe se a sua tela de Saída de Comando terminar em uma linha preta grossa. A tela de saída do Komodo Edit possui um número máximo de caracteres para exibição, após o qual os caracteres começarão a literalmente escrever uns sobre os outros na tela, dando a aparência de uma linha preta. Não se preocupe: o texto está lá, ainda que não consiga vê-lo; pode cortá-lo e colá-lo em um ficheiro de texto para verificar. + +Vamos reservar um momento para ter certeza de que entendemos como `trial-content.py` é capaz de usar as funções armazenadas em `obo.py`. A função *stripTags* que salvamos em `obo.py` requer um argumento. Em outras palavras, para que seja executada apropriadamente ela precisa que uma informação seja oferecida. Lembre-se do exemplo do cão treinado na lição anterior. Para latir, o cachorro precisa de duas coisas: ar e uma guloseima deliciosa. A função *stripTags* em `obo.py` precisa de uma coisa: a string chamada *pageContents*. Mas você perceberá que, quando chamamos *stripTags* no programa final (`trial-content.py`), não há menção ao "*pageContents*". Em vez disso, a função recebe HTML como um argumento. Isso pode ser confuso para muitas pessoas quando começam a programar. Uma vez que uma função foi declarada, não precisamos usar o mesmo nome de variável quando chamamos a função. Desde que forneçamos o mesmo tipo de argumento, tudo deve funcionar bem, independente de como o chamarmos. Nesse caso, queríamos que *pageContents* usasse o conteúdo da nossa variável HTML. Você poderia ter passado qualquer string, inclusive uma que você insira diretamente entre aspas. Tente executar novamente `trial-content.py`, alterando o argumento de *stripTags* para "Eu gosto muito de cachorros" e veja o que acontece.
Note que, dependendo de como defina a sua função (e o que ela faz), o seu argumento pode precisar ser algo que não seja uma string: um número inteiro (*integer*), por exemplo. + +Leituras sugeridas +----------------- + +- Lutz, *Learning Python* + - Ch. 7: Strings + - Ch. 8: Lists and Dictionaries + - Ch. 10: Introducing Python Statements + - Ch. 15: Function Basics + +## Sincronização de Código + +Para acompanhar lições futuras, é importante ter os ficheiros e programas corretos no seu diretório “programming-historian”. No final de cada lição, é possível fazer o download do ficheiro zip “programming-historian” para ter a certeza de que o ficheiro correto está a ser utilizado. Observe que removemos os ficheiros desnecessários das lições anteriores. Seu diretório pode conter mais ficheiros e não há problema! + +- programming-historian-2 ([zip](/assets/python-lessons2.zip)) diff --git a/pt/licoes/HTML-lista-palavras-2.md b/pt/licoes/HTML-lista-palavras-2.md index 6556921bda..52441d6e3b 100644 --- a/pt/licoes/HTML-lista-palavras-2.md +++ b/pt/licoes/HTML-lista-palavras-2.md @@ -22,7 +22,7 @@ translation-reviewer: - Diana Rebelo Rodriguez difficulty: 2 review-ticket: https://github.com/programminghistorian/ph-submissions/issues/443 -previous: HTML-lista-palavras-1 +previous: /pt/licoes/HTML-lista-palavras-1 series_total: 2 lessons sequence: 2 activity: transforming diff --git a/pt/licoes/analise-correspondencia-pesquisa-historica-R.md b/pt/licoes/analise-correspondencia-pesquisa-historica-R.md index 44c8b6450f..af0e9b31b3 100644 --- a/pt/licoes/analise-correspondencia-pesquisa-historica-R.md +++ b/pt/licoes/analise-correspondencia-pesquisa-historica-R.md @@ -1,474 +1,474 @@ ---- -title: "Análise de Correspondência para Pesquisa Histórica com R" -slug: analise-correspondencia-pesquisa-historica-R -original: correspondence-analysis-in-R -layout: lesson -collection: lessons -date: 2017-09-13 -translation_date: 2023-05-23 -authors: -- Ryan Deschamps -reviewers: -- Sandra 
van Ginhoven -- Taylor Arnold -editors: -- Matthew Lincoln -translator: -- Diana Rodriguez -translation-editor: -- Jimmy Medeiros -translation-reviewer: -- Yuri Pires -- André Salvo -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/422 -difficulty: 3 -activity: analyzing -topics: [data-manipulation, network-analysis, r, data-visualization] -abstract: Esta lição explica como realizar e interpretar uma análise de correspondência com R, que pode ser usada para identificar relacionamentos dentro de dados categóricos. -avatar_alt: Diagrama de um cubo com arestas legendadas -mathjax: true -doi: 10.46430/phpt0042 ---- - -{% include toc.html %} - -A análise de correspondência (*correspondence analysis* ou CA) produz um gráfico bidimensional ou tridimensional baseado nas relações entre duas ou mais categorias de dados. Essas categorias poderiam ser "membros e clubes", "palavras e livros" ou "países e acordos comerciais". Por exemplo, um membro do clube pode ser equivalente a outro membro com base nos clubes compartilhados aos quais ele pertence. Os membros que frequentam os mesmos clubes provavelmente têm mais em comum do que aqueles que frequentam clubes diferentes. Da mesma forma, os clubes que compartilham membros provavelmente terão mais em comum do que aqueles que compartilham membros diferentes.[^1] - -Discernir essas correspondências significativas pode ser muito difícil de fazer quando há muitos elementos em cada uma de suas categorias (por exemplo, se tivermos centenas de membros espalhados por dezenas de clubes.) A CA mede as correspondências mais fortes em um *dataset* e as projeta em um espaço multidimensional, possibilitando sua visualização e interpretação. Normalmente, as duas principais dimensões são mostradas de uma só vez, embora seja possível mostrar três dimensões em um display 3D. 
- -Uma vez que a CA visualiza as relações entre elementos de seus dados como distâncias em um gráfico, muitas vezes é possível descobrir padrões amplos com base em que elementos de uma categoria aparecem próximos a elementos da outra. Assim, a CA pode ser um bom primeiro passo para filtrar os principais padrões de um grande *dataset*. É uma ferramenta particularmente poderosa para entender informações históricas dentro de coleções digitais. - -Depois de ler este tutorial, deve ser possível: - -* Saber o que é a CA e para que é usada. -* Saber como executar a CA usando o pacote FactoMineR do R. -* Descrever com exatidão os resultados de uma CA. - -## Pré-requisitos - -Este tutorial é para historiadores e pesquisadores com habilidades intermédias em programação. Pressupõe que já se tem um conhecimento básico de R e alguns conhecimentos básicos de estatística. - -O tutorial [Noções básicas de R com dados tabulares](/pt/licoes/nocoes-basicas-R-dados-tabulares) tem informações sobre como organizar e configurar o R e o tutorial [Processamento Básico de Texto em R](/pt/licoes/processamento-basico-texto-r) também pode ser útil como treinamento. - -Como a CA é uma espécie de *social network analysis* (análise de redes sociais), pode ser interessante olhar a lição [From Hermeneutics to Data to Networks: Data Extraction and Network Visualization of Historical Sources](/en/lessons/creating-network-diagrams-from-historical-sources) (em inglês), que também tem algumas informações úteis sobre a estruturação de dados para análise de redes. - -## O que é a Análise de Correspondência? - -A análise de correspondência (CA), também chamada "escala multidimensional" ou "análise bivariada de rede", permite observar a inter-relação de dois grupos em um gráfico de dispersão com dois eixos (*two-way graph plot*). 
Por exemplo, foi utilizada pelo sociólogo francês Pierre Bourdieu para mostrar como categorias sociais como a ocupação influenciam a opinião política.[^2] É especialmente poderosa como ferramenta para encontrar padrões em grandes *datasets*. - -A CA funciona com qualquer tipo de dados categóricos (*datasets* que foram agrupados em categorias). Vamos começar com um exemplo simples. Se quisesse entender o papel dos acordos internacionais de livre comércio na interconexão das nações do G8, seria possível criar uma tabela para os países e as relações de livre comércio que eles mantinham em um determinado momento. - -Uma pequena seleção de acordos comerciais (em azul) incluindo o Espaço Económico Europeu (*European Economic Area* ou EEA), o Acordo Comercial Canadá-UE (*Canada-EU Trade Agreement* ou CETA), o Acordo de Livre Comércio Norte-Americano (*North American Free Trade Agreement* ou NAFTA), a Parceria Trans-Pacífico (*Trans Pacific Partnership* ou TPP) e a Associação das Nações do Sudeste Asiático (*Association of Southeast Asian Nations* ou ASEAN) corresponde aos países do G8. Os países (de cor vermelha) agrupam-se geograficamente, com países do Pacífico à direita, países europeus à esquerda e países da América do Norte ao centro. O Canadá e os Estados Unidos, como previsto, estão juntos. Alemanha, Itália, França e Reino Unido pertencem todos aos mesmos dois acordos (CETA e EEA), portanto todos caem exatamente no mesmo ponto. - -{% include figure.html filename="tr-pt-analise-correspondenciaR-1.png" alt="Imagem representando um gráfico de correspondência sobre acordos comerciais" caption="Figura 1. Análise de correspondência de países selecionados do G8 e seus acordos comerciais" %} - -Por outro lado, enquanto a Rússia e os Estados Unidos estão um pouco próximos no eixo horizontal, estão em polos opostos no eixo vertical. A Rússia só compartilha um acordo de comércio com um outro país (Japão) e os Estados Unidos com dois (Japão e Canadá). 
Em um gráfico de CA, unidades com poucas correlações ficarão nos arreadores, enquanto aquelas unidades com maior quantidade de correlações ficarão mais próximo do centro do gráfico. A conexão relativa ou falta de conexão de um *datapoint* é quantificada como *inertia* (inércia) na CA. A falta relativa de conexão produz uma inércia maior. - -Um ponto mais substancial sobre a Rússia e os Estados Unidos é que a Rússia é um país do Pacífico que não pertence à TPP. Observando esta relação, um historiador poder-se-ia perguntar se isto ocorre por causa de uma relação comercial tensa entre a Rússia e os Estados Unidos em comparação com outros países do G8, ou por atitudes gerais em relação a acordos comerciais para estes países.[^3] - -Com mais dados, a CA pode descobrir distinções mais subtis entre grupos dentro de uma categoria particular. Neste tutorial, analisaremos a vida política canadense - especificamente, como representantes políticos são organizados em comités durante um ou outro governo. Semelhante aos acordos comerciais, esperaríamos que os comités que têm membros semelhantes estivessem mais próximos uns dos outros. Além disso, os comités que têm poucos representantes em comum se encontrarão nos cantos do gráfico. - -## Comités Parlamentares Canadenses (CPCs) - -No sistema parlamentar canadense, os cidadãos elegem representantes chamados membros do Parlamento, ou deputados, para a Câmara dos Comuns. Os parlamentares são responsáveis por votar e propor alterações à legislação no Canadá. Os [Comités Parlamentares (CPCs)](https://perma.cc/3PT6-77DB) (em inglês) consistem de parlamentares que informam à Câmara sobre detalhes importantes da política em uma área temática. Exemplos de tais comités incluem os CPCs sobre Finanças, Justiça e Saúde. - -Usaremos abreviações para os comités parlamentares, porque os nomes podem ficar longos, tornando-os difíceis de ler em um gráfico. 
É possível usar esta tabela como um guia de referência para as abreviações e seus respectivos nomes de comités: - -| Abbreviation (Abreviação) | Committee Name (Tradução do Nome do Comité) | -| :----------- | :----------------------------------------------------------------------------------------: | -| INAN | Indigenous and Northern Affairs (Assuntos Indígenas e do Norte) | -| HUMA | Human Resources, Skills and Social Development and the Status of Persons with Disabilities (Recursos Humanos, Habilidades e Desenvolvimento Social e o Status das Pessoas com Deficiência) | -| FINA | Finance (Finanças) | -| FAAE | Foreign Affairs and International Development (Relações Exteriores e Desenvolvimento Internacional) | -| ETHI | Access to Information, Privacy and Ethics (Acesso à Informação, Privacidade e Ética) | -| ENVI | Environment and Sustainable Development (Meio Ambiente e Desenvolvimento Sustentável) | -| CHPC | Canadian Heritage (Herança Canadense) | -| CIMM | Citizenship and Immigration (Cidadania e Imigração) | -| ACVA | Veterans Affairs (Assuntos de Veteranos) | -| HESA | Health (Saúde) | -| TRAN | Transport, Infrastructure and Communities (Transporte, Infraestrutura e Comunidades) | -| FOPO | Fisheries and Oceans (Pesca e Oceanos) | -| RNNR | Natural Resources (Recursos Naturais) | -| FEWO | Status of Women (Status das Mulheres) | -| ESPE | Pay Equity (Igualdade de Remuneração) | -| IWFA | Violence against Indigenous Women (Violência Contra as Mulheres Indígenas) | -| BILI | Library of Parliament (Biblioteca do Parlamento) | -| AGRI | Agriculture and Agri-food (Agricultura e Agroalimentação) | -| JUST | Justice and Human Rights (Justiça e Direitos Humanos) | - -O autor da lição, o historiador Ryan Deschamps, suspeitava que os deputados estariam organizados de acordo com os tópicos do comité de forma diferente de governo para governo. 
Por exemplo, os comités formados durante o primeiro gabinete do governo conservador de Stephen Harper podem ser organizados de forma diferente do gabinete inicial do Liberal de Justin Trudeau. Há uma série de razões para esta suspeita. Primeiro, os CPCs são formados por lideranças partidárias e as decisões dos comités precisam de coordenação entre os membros da Câmara. Em outras palavras, os partidos políticos usarão os CPCs como ferramentas para marcar pontos políticos, e os governos devem garantir que as pessoas certas sejam membros dos comités certos para proteger suas agendas políticas. Em segundo lugar, os dois governos têm um enfoque político diferente. O governo conservador de Harper se concentrou mais em questões de desenvolvimento económico, enquanto os Liberais de Trudeau enfatizaram, em primeiro lugar a igualdade social. Em resumo, pode haver algumas decisões calculadas sobre quem entra em que comité, fornecendo evidências sobre as atitudes do governo em relação ou contra certos tópicos. - -## Preparando o R para a CA - -Para fazer uma CA, precisaremos de um pacote de álgebra linear. Para os mais inclinados à matemática, há um apêndice com alguns detalhes sobre como isto é feito. Em R, há várias opções para CA, mas usaremos o [pacote FactoMineR](http://factominer.free.fr/) (em inglês), focado na "análise de dados exploratórios multivariados".[^4] A FactoMineR pode ser usada para conduzir todos os tipos de análises multivariadas diferentes, incluindo *clusters* hierárquicos, análise fatorial e assim por diante. - -Mas, primeiro, aqui está como instalar e puxar os pacotes, depois colocá-los em um objeto R para que possam ser discutidos. - -```R - -## Estes comandos só precisam ser feitos na primeira vez que se realiza uma análise. -## FactoMineR é um pacote bastante grande, portanto pode levar algum tempo para ser carregado. - -install.packages("FactoMineR") # Inclui um módulo para a condução de CA. 
-install.packages("factoextra") # Pacote para embelezar os nossos gráficos de CA. - -# Importar os pacotes: -library(FactoMineR) -library(factoextra) - -# set.seed(189981) # Opcional para reprodução. - -# Leia os ficheiros csv: - -harper_df <- read.csv("http://programminghistorian.org/assets/correspondence-analysis-in-R/HarperCPC.csv", stringsAsFactors = FALSE) -``` - - -## Os dados - -Se quiser ver os dados brutos, os dados para este tutorial podem ser encontrados no [Zenodo](https://doi.org/10.5281/zenodo.889846) (em inglês). Foram convenientemente incluídos também no formato tabular (nota: não é necessário baixar estes ficheiros manualmente. Usaremos o R para baixá-los diretamente): - -1) [CPCs do Harper](/assets/correspondence-analysis-in-R/HarperCPC.csv) -2) [CPCs do Trudeau's](/assets/correspondence-analysis-in-R/TrudeauCPC.csv) - -Uma amostra dos dados para a primeira sessão do governo de Stephen Harper. As filas representam comités e as colunas são membros específicos. Se um membro pertence a um comité, a célula terá um 1; se não, terá um 0. - -``` -harper_df - C Bennett D Wilks DV Kesteren G Rickford J Crowder K Block K Seeback -FAAE 0 0 1 0 0 0 0 -FEWO 0 0 0 0 0 0 0 -FINA 0 0 1 0 0 0 0 -HESA 0 1 0 0 0 1 0 -INAN 1 0 0 1 1 0 1 -IWFA 1 0 0 1 1 1 0 -JUST 0 1 0 0 0 0 1 - - L Davies N Ashton R Goguen R Saganash S Ambler S Truppe -FAAE 0 0 0 1 0 0 -FEWO 0 1 0 0 1 1 -FINA 0 0 0 0 0 0 -HESA 1 0 0 0 0 0 -INAN 0 0 0 0 1 0 -IWFA 1 1 1 1 1 1 -JUST 0 0 1 0 0 0 -``` - -Estruturado de outra forma (através de uma tabela R) podemos mostrar que os comités têm muitos deputados e alguns deputados são membros de vários comités. Por exemplo, a deputada liberal Carolyn Bennett era membro do "INAN" (Assuntos Indígenas e do Norte) e do "IWFA" (Violência contra Mulheres Indígenas) e o "HESA" (Comité Parlamentar de Saúde) incluía tanto o D Wilks como o K Block. Em geral, os comités têm entre nove e doze membros. 
Alguns parlamentares são membros de apenas um comité, enquanto outros podem pertencer a vários comités. - - -## Análise de Correspondência dos Comités Parlamentares Canadenses 2006 e 2016 - -O nosso *data frame* `harper_df` consiste em nomes completos de comités e nomes de deputados, mas alguns dos nomes dos comités (por exemplo, "Recursos Humanos, Habilidades e Desenvolvimento Social" e o "Status das Pessoas com Deficiência") são muito longos para serem bem mostrados em um gráfico: vamos usar as abreviações. - -```R -harper_table <- table(harper_df$abbr, harper_df$membership) -``` - -O comando `table` (tabela) faz um *dataset* de dados cruzados de duas categorias no *data frame*. As colunas são MPs individuais e as linhas são comités. Cada célula contém um 0 ou um 1 baseado na existência ou não de uma conexão. Se olhássemos a presença real em cada reunião, poderíamos também incluir valores ponderados (por exemplo, 5 para um membro do parlamento que participa de uma reunião de comité 5 vezes). Como regra geral, usar valores ponderados quando as quantidades importam (quando as pessoas investem dinheiro, por exemplo), e usar 0s e 1s quando não importam. - -Infelizmente, temos mais um problema. Muitos deputados são membros de apenas 1 comité. Isso fará com que esses deputados se sobreponham quando criarmos o gráfico, tornando-o menos legível. Vamos exigir que os parlamentares pertençam a pelo menos 2 comités antes de executarmos o comando CA da FactoMineR. - -```R -harper_table <- harper_table[,colSums(harper_table) > 1] -CA_harper <- CA(harper_table) -plot(CA_harper) -``` - -O comando `colSums` soma os valores para cada coluna da tabela. `rowSums` poderia ser usado para somar as linhas se isso fosse necessário (não é para nós, porque todos os comités têm mais de um deputado). - -O comando `CA` traça os resultados para as duas dimensões superiores e armazena o resumo dos dados em uma variável chamada `CA_harper`. 
Na maioria das vezes, `CA` faz a maior parte do trabalho. Como discutido, mais detalhes sobre a matemática por trás da CA são fornecidos no [apêndice](#Apêndice:AMatemáticaportrásdaAnálisedeCorrespondência). - -Deve-se obter um gráfico que se parece com isto: - -{% include figure.html filename="tr-pt-analise-correspondenciaR-2.png" alt="Imagem representando um gráfico de correspondências sobre comités parlamentares" caption="Figura 2. Análise de correspondência dos Comités Parlamentares para a 1ª Sessão do Governo Harper" %} - -Vamos tratar os dados do governo Trudeau exatamente da mesma maneira. - -```R -trudeau_df <- read.csv("http://programminghistorian.org/assets/correspondence-analysis-in-R/TrudeauCPC.csv", stringsAsFactors = FALSE) -trudeau_table <- table(trudeau_df$abbr, trudeau_df$membership) -trudeau_table <- trudeau_table[,colSums(trudeau_table) > 1] -CA_trudeau <- CA(trudeau_table) -plot(CA_trudeau) -``` -{% include figure.html filename="tr-pt-analise-correspondenciaR-3.png" alt="Imagem representando um gráfico de correspondências sobre comités parlamentares" caption="Figura 3. Análise de correspondência dos Comités Parlamentares para a 1ª Sessão do Governo de Justin Trudeau" %} - -As nossas etiquetas de dados não são muito legíveis no momento. Mesmo com a mudança para abreviações, as etiquetas estão sobrepostas. O pacote [factoextra](https://cran.r-project.org/web/packages/factoextra/index.html) (em inglês) tem uma característica de repelir que ajuda a mostrar as coisas mais claramente.[^5] - -``` -fviz_ca_biplot(CA_harper, repel = TRUE) -``` - -{% include figure.html filename="tr-pt-analise-correspondenciaR-4.png" alt="Imagem representando um gráfico de correspondências sobre comités parlamentares" caption="Figura 4. 
Análise de correspondência dos Comités Parlamentares para a 1ª Sessão do Governo Harper" %} - -``` -fviz_ca_biplot(CA_trudeau, repel = TRUE) -``` - -{% include figure.html filename="tr-pt-analise-correspondenciaR-5.png" alt="Imagem representando um gráfico de correspondências sobre comités parlamentares" caption="Figura 5. Análise de correspondência dos Comités Parlamentares para a 1ª Sessão do Governo de Justin Trudeau" %} - -Em vez de se sobrepor, as etiquetas agora usam setas para mostrar sua localização onde for apropriado. - -## Interpretando a Análise de Correspondência (CA) - -Os gráficos de dados parecem mais bonitos, mas quão bem podemos confiar na validade desses dados? A nossa primeira dica é olhar para as dimensões. Nos dados Harper, apenas onze e dez por cento de valor explicativo aparecem no eixo horizontal e vertical respectivamente para um total de 21%![^6] Isso não soa promissor para a nossa análise. Lembrando que o número total de dimensões é igual ao número de filas ou colunas (o que for menor), isto pode ser preocupante. Quando tais valores baixos ocorrem, geralmente significa que os pontos de dados são distribuídos de forma bastante uniforme, e que os MPs são distribuídos de forma uniforme nos CPCs é uma convenção bastante bem estabelecida do parlamento. - -Outra maneira de olhar para os dados é através de valores de inércia.[^7] Mais detalhes sobre inércia podem ser encontrados no [apêndice](#Apêndice:AMatemáticaportrásdaAnálisedeCorrespondência) mas, no gráfico, os pontos de dados distantes da origem têm maior inércia. Pontos de inércia elevados sugerem *outliers* (valores atípicos) - atores ou eventos que têm menos conexões do que aqueles próximos ao centro. Os baixos valores de inércia sugerem pontos de dados que têm mais em comum com o grupo como um todo. Como uma ferramenta de análise, pode ser útil para encontrar atores ou subgrupos renegados no *dataset*. 
Se todos os pontos tiverem alta inércia, pode ser um indicador de alta diversidade ou fragmentação para as redes. A baixa inércia geral pode ser um indicador de maior coesão ou convergência geral. O que isso significa dependerá do *dataset*. Para os nossos gráficos, nenhum projeto de *datapoint* vai muito além de 2 passos da média. Mais uma vez, este é um indicador de que as relações estão relativamente distribuídas de maneira uniforme. - -Vamos analisar os dados mais de perto: - -```R -summary(CA_harper) -``` - -Isto nos retorna - -``` -HARPER - -O qui-quadrado da independência entre as duas variáveis é igual a 655.6636 -(p-value = 0.7420958 ). - -Eigenvalues - Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 -Variance 0.831 0.779 0.748 0.711 0.666 0.622 -% of var. 11.024 10.342 9.922 9.440 8.839 8.252 -Cumulative % of var. 11.024 21.366 31.288 40.729 49.568 57.820 - - Dim.7 Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 -Variance 0.541 0.498 0.463 0.346 0.305 0.263 -% of var. 7.174 6.604 6.138 4.591 4.041 3.488 -Cumulative % of var. 64.995 71.599 77.736 82.328 86.368 89.856 - - Dim.13 Dim.14 Dim.15 Dim.16 Dim.17 -Variance 0.240 0.195 0.136 0.105 0.088 -% of var. 3.180 2.591 1.807 1.396 1.170 -Cumulative % of var. 93.036 95.627 97.434 98.830 100.000 -``` - -O cabeçalho `Eigenvalues` do resumo apresenta métricas sobre as dimensões recém computadas, listando a percentagem de variância contida em cada uma delas. Infelizmente, a percentagem de variância encontrada nas duas dimensões superiores é muito baixa. Mesmo se conseguíssemos visualizar 7 ou 8 dimensões dos dados, capturaríamos apenas uma percentagem acumulada de cerca de 70%. O teste de independência do [qui-quadrado](https://perma.cc/8B82-YAX6) nos diz que não podemos rejeitar a hipótese de que nossas duas categorias (CPCs e MPs) são independentes. 
O valor p (ou *p-value*) é 0,74, bem acima do 0,05 comumente usado como um recorte para rejeitar uma hipótese nula.[^8] Um valor p menor ocorreria, por exemplo, se todos ou a maioria dos deputados fossem membros de um ou dois comités. A propósito, o valor de p quadrado de chi da amostra de Trudeau é menor em 0,54, mas ainda não o suficiente para rejeitar a hipótese de categorias mutuamente independentes. - -Como discutido, este resultado não é muito surpreendente. Esperamos que os deputados sejam distribuídos de forma relativamente uniforme entre os comités. Se optarmos por ponderar as nossas medidas com base na participação dos parlamentares em cada reunião de comité ou em seu desejo de 1-100 de ser membro de cada comité, poderemos ver resultados diferentes (por exemplo, pode ser mais comum que os parlamentares participem regularmente nas reuniões financeiras em comparação com outras reuniões). - -A CA falhou conosco? Bem, na verdade não. Isto significa apenas que não podemos simplesmente lançar dados em um algoritmo e esperar responder a perguntas reais de história. Mas nós não somos apenas programadores, mas historiadores de programação. Vamos colocar nossos bonés da história e ver se podemos refinar as nossas pesquisas! - -## Trudeau ampliou a Agenda para a Igualdade das Mulheres no Parlamento? - -Uma das primeiras medidas políticas que Justin Trudeau tomou foi garantir que o Canadá tinha um gabinete com 50% de mulheres. É discutível que o objetivo deste anúncio era professar uma agenda de igualdade de género. Na sua primeira sessão, o governo de Trudeau também criou um novo Comité Parlamentar sobre igualdade de remuneração para as mulheres. Além disso, o governo de Trudeau apresentou uma moção para que houvesse um inquérito sobre Mulheres Indígenas Desaparecidas e Assassinadas, substituindo o mandato do comité parlamentar de Harper para a Violência Contra as Mulheres Indígenas. 
- -Se Trudeau tivesse a intenção de levar a igualdade das mulheres a sério, poderíamos esperar que mais membros do comité do Status da Mulher estivessem ligados a pastas maiores, como Justiça, Finanças, Saúde e Relações Exteriores, em comparação com o governo de Harper. Como o regime de Harper não tinha um CPC de salário igual, incluiremos o CPC para "Violência contra Mulheres Indígenas". - -```R -# Inclua apenas os comités desejados: -# HESA: Health, JUST: Justice, FEWO: Status of Women -# INAN: Indigenous and Northern Affairs, FINA: Finance -# FAAE: Foreign Affairs and International Trade -# IWFA: Violence against Indigenous Women - -harper_df2 <- harper_df[which(harper_df$abbr %in% - c("HESA", "JUST", "FEWO", "INAN", "FINA", "FAAE", "IWFA")),] -harper_table2 <- table(harper_df2$abbr, harper_df2$membership) - -# Remova os singles de novo. -harper_table2 <- harper_table2[, colSums(harper_table2) > 1] -CA_Harper2 <- CA(harper_table2) -plot(CA_Harper2) -``` - -Isto produz o seguinte gráfico: - -{% include figure.html filename="tr-pt-analise-correspondenciaR-6.png" alt="Imagem representando um gráfico de correspondências sobre comités parlamentares" caption="Figura 6. Análise de correspondência de Comités Parlamentares selecionados para a 1ª Sessão do Governo de Stephen Harper" %} - -O valor p do qui-quadrado para este resultado se move apenas ligeiramente em direção a zero, para 0,71. Ainda não podemos tirar nenhuma conclusão quantitativa sobre uma relação clara entre CPCs e MPs. Para os nossos dados, este não é um resultado muito importante. Se pesquisássemos os CPCs sobre qual CPC era o mais produtivo ou importante, talvez encontrássemos valores p mais baixos. A inércia no eixo horizontal praticamente dobrou, sugerindo que o FINA (Finance) é um valor mais baixo no gráfico em comparação com os outros portfólios. - -O significado de um CA depende de uma interpretação qualitativa da trama. 
Por exemplo, observando os elementos do gráfico Harper podemos dizer que as preocupações económicas caem para a direita do eixo y e as preocupações sociais caem para a esquerda. Portanto, uma das "razões" para escolher os parlamentares para participar de comités no governo Harper parece ser a distinção entre preocupações sociais e económicas. - -Entretanto, quando fazemos a mesma análise com o governo de Trudeau... - -```R -trudeau_df2 <- trudeau_df[which(trudeau_df$abbr %in% - c("HESA", "JUST", "FEWO", "INAN", "FINA", "FAAE", "ESPE")),] -trudeau_table2 <- table(trudeau_df2$abbr, trudeau_df2$membership) -trudeau_table2 <- trudeau_table2[, colSums(trudeau_table2) > 1] # remova os singles de novo -CA_trudeau2 <- CA(trudeau_table2) -plot(CA_trudeau2) -``` - -Produzimos um gráfico incompleto e esta mensagem aparece: - -``` -Warning message: -In CA(trudeau_table2) : -The rows FAAE, INAN, JUST sum at 0. They were suppressed from the analysis. -``` - -Isto significa que o gráfico produzido não nos mostra as colunas FAEE, INAN e JUST. Como o valor de cada uma delas é 0, elas foram suprimidas da análise. Olhando para a tabela `trudeau_table2`, vemos que: - -``` - A Vandenbeld D Albas M Gladu R Harder S Sidhu -ESPE 1 1 1 0 1 -FAAE 0 0 0 0 0 -FEWO 1 0 1 1 0 -FINA 0 1 0 0 0 -HESA 0 0 0 1 1 -INAN 0 0 0 0 0 -JUST 0 0 0 0 0 -``` - -Não há nenhuma associação cruzada para FAEE, INAN ou JUST! Bem, isso é um resultado em si mesmo. Podemos concluir, em geral, que as agendas dos dois governos são bastante diferentes, e que houve uma abordagem diferente utilizada para organizar os parlamentares em comités. - -Para um historiador canadense, o resultado faz algum sentido, dado que a Violência contra as Mulheres Indígenas (IWFA) tem muito mais probabilidade de estar ligada aos Assuntos Indígenas e do Norte (INAN), e à Justiça e Direitos Humanos (JUST), do que à Igualdade de Remuneração (ESPE). 
Afinal, a história da Violência contra as Mulheres Indígenas está ligada a uma série de casos criminais de alto nível no Canadá. Como discutido anteriormente, a análise de CA requer uma quantidade de interpretação para se tornar significativa. - -Talvez possamos observar alguns comités diferentes em seu lugar. Ao retirar “JUST”, “INAN” e “FAAE” (Relações Exteriores) e substituí-los por “CIMM” (Imigração), “ETHI” (Ética e Acesso à Informação) e “HUMA” (Recursos Humanos), podemos obter uma imagem melhor da estrutura dos comités parlamentares neste contexto. - -```R -trudeau_df3 <- trudeau_df[which(trudeau_df$abbr %in% - c("HESA", "CIMM", "FEWO", "ETHI", "FINA", "HUMA", "ESPE")),] -trudeau_table3 <- table(trudeau_df3$abbr, trudeau_df3$membership) -trudeau_table3 <- trudeau_table3[, colSums(trudeau_table3) > 1] # remova os singles de novo -CA_trudeau3 <- CA(trudeau_table3) -plot(CA_trudeau3) -``` - -{% include figure.html filename="tr-pt-analise-correspondenciaR-7.png" alt="Imagem representando um gráfico de correspondências sobre comités parlamentares" caption="Figura 7. Análise de correspondência de Comités Parlamentares selecionados para a 1ª Sessão do Governo de Justin Trudeau" %} - -Em geral, a inércia no eixo horizontal é menor que a do governo de Harper, mas a separação tem "HUMA" (Recursos Humanos) e "ETHI" (Ética) contra os outros portfólios à direita. A delimitação entre questões sociais e económicas não é tão evidente como para Harper, sugerindo uma filosofia diferente para a seleção. Dito isto, também há menos deputados compartilhando as posições. Isto pode ser outro mistério para uma maior exploração. No entanto, o processo CA nos fornece uma visão sólida das relações que ocorrem dentro dos comités com um olhar rápido e com muito poucos comandos. - -## Análise - -Como na maioria das pesquisas interpretativas, não obtemos uma resposta direta à nossa pergunta sobre o poder para as mulheres nos governos parlamentares. 
No caso Harper, vemos uma divisão no eixo horizontal entre questões sociais como Saúde e Justiça e questões económicas como Finanças e Relações Exteriores, respondendo por 35% da variação. Pela visualização, podemos adivinhar que Finanças (FINA) e Relações Exteriores (FAAE) têm um membro comum e que Relações Exteriores (FAAE) tem um membro comum com Violência contra Mulheres Indígenas (IWFA). Este resultado é, possivelmente, uma preocupação, pois as agendas mais divulgadas de Stephen Harper tendiam a se concentrar em preocupações económicas como o comércio e a contenção fiscal. A separação dos comités implica que a filosofia de governança de Harper separava as preocupações económicas das sociais e que os direitos das mulheres eram principalmente uma preocupação social. A própria pasta Status da Mulher (FEWO) é separada do resto das pastas, encontrando-se ligada às outras pastas somente através de parlamentares comuns com os comités Violência contra Mulheres Indígenas (IWFA) e Assuntos Indígenas e do Norte (INAN). - -O gráfico do governo de Trudeau não mostra conexões cruzadas do Status da Mulher com a Justiça, Relações Exteriores e Povos Indígenas, mas conexões mais fortes com Finanças, Cidadania, Recursos Humanos e Ética. Os Direitos da Mulher estão ligados às Finanças e à Imigração através da carteira de Igualdade de Remuneração. - -É discutível que o regime do governo Harper alinhou os Direitos das Mulheres às pastas sociais como Justiça e Saúde, enquanto Trudeau elevou o perfil do Status da Mulher até certo ponto ao incluir o comité de Igualdade de Remuneração. A conexão entre os comités focados nos Direitos da Mulher e fortes carteiras como Saúde, Finanças e Cidadania e Imigração no governo Trudeau é digna de uma análise mais detalhada. Neste contexto, o Status da Mulher parece ter uma posição mais central (mais próxima da origem) do que o comité Status da Mulher no governo de Harper. 
Dito isto, o número de pontos de dados neste caso ainda é bastante pequeno para se chegar a uma conclusão definitiva. Talvez outras fontes de evidência possam ser visualizadas de maneira semelhante para confirmar ou negar este ponto. - -A agenda anteriormente mantida entre as mulheres e os povos indígenas foi deslocada no caso Trudeau. Como discutido anteriormente, o [National Inquiry into Missing and Murdered Indigenous Women and Girls](https://perma.cc/U38Y-4CY9) (Inquérito Nacional sobre Mulheres Indígenas Desaparecidas e Assassinadas) (em inglês) deslocou o mandato para o comité Violência contra as Mulheres Indígenas que existia durante o mandato de Harper. A história desta transição é complexa, mas a pressão política foi aplicada ao governo Harper para criar o Inquérito Nacional sobre Mulheres Indígenas Desaparecidas e Assassinadas após o julgamento de Robert Pickton e relatos de investigações policiais insuficientes para mulheres indígenas desaparecidas. Harper recusou-se a conduzir um inquérito citando que o CPC era a melhor abordagem.[^9] Trudeau fez uma promessa eleitoral de incluir o inquérito, deslocando assim o comité. Até certo ponto, Harper parece ter dado à violência contra as mulheres indígenas um papel bastante central no planejamento do Comité Parlamentar. Esta evidência é um contraponto às críticas de que Harper não levou a sério a questão das Mulheres Indígenas Desaparecidas e Assassinadas. - -As diferenças entre as duas relações levantam questões importantes sobre o papel do Status da Mulher no discurso político e suas interconexões entre identidade racial, finanças públicas, saúde e justiça social, a serem exploradas talvez em um trabalho qualitativo mais detalhado. Também levanta questões importantes sobre o foco no género em geral (de acordo com a carteira do Status da Mulher) ou mais especificamente, uma vez que se aplica a um grupo marginalizado (Mulheres Indígenas Desaparecidas e Assassinadas). 
Um documento de política relacionado aos benefícios de um Inquérito versus discussão do Comité Parlamentar parece razoável após examinar esta evidência. Talvez haja um argumento de que a troca do "IWFA" por "ESPE" é uma espécie de teto de vidro, colocando artificialmente uma cota em questões de mulheres enquanto as carteiras estabelecidas permanecem intocadas. Como uma ferramenta exploratória, a CA nos ajuda a identificar tais temas a partir da observação empírica, em vez de confiar na teoria ou em preconceitos pessoais. - -## Conclusão - -Agora que este tutorial está completo, é possível ter alguma noção do que é a CA e como pode ser usada para responder perguntas exploratórias sobre dados. Usamos o comando `CA` do FactoMineR para criar a análise e traçar os resultados em duas dimensões. Quando as etiquetas se cruzaram, aplicamos o comando `viz_ca_biplot` do pacote factoextra para exibir os dados em um formato mais legível. - -Também aprendemos como interpretar uma CA e como detectar potenciais armadilhas analíticas, incluindo casos em que as relações entre categorias são distribuídas de forma muito uniforme e têm baixo valor explicativo. Neste caso, refinamos a nossa pergunta e os dados de pesquisa para fornecer uma imagem mais significativa do que aconteceu. - -Em geral, o benefício desta análise é fornecer uma rápida visão geral do *dataset* de duas categorias, como um guia para questões históricas mais substantivas. O uso de membros e reuniões ou eventos em todas as áreas da vida (negócios, sem fins lucrativos, reuniões municipais, *hashtags* de twitter, etc.) é uma abordagem comum para tal análise. Os grupos sociais e as suas preferências são outro uso comum para a CA. Em cada caso, a visualização oferece um mapa com o qual se pode observar um retrato da vida social, cultural e política. - -Os próximos passos podem incluir a adição de outras dimensões categóricas à nossa análise, como a incorporação do partido político, idade ou sexo. 
Quando se faz CA com mais de duas categorias, é chamada de [Análise de Correspondência Múltipla ou MCA](https://www.youtube.com/watch?v=RDexHE5Iqrg) (em inglês). Enquanto a matemática para a MCA é mais complicada, os resultados finais são bastante semelhantes aos da CA. - -Esperamos que, agora, estes métodos sejam aplicados aos seus próprios dados, ajudando a descobrir perguntas e hipóteses que enriquecem a sua pesquisa histórica. Boa sorte! - -## Apêndice: A Matemática por trás da Análise de Correspondência - -Como a matemática da CA será interessante para alguns e não para outros, optamos por discuti-la neste Apêndice. A secção também contém um pouco mais de detalhes sobre outros tópicos, tais como inércia (*inertia*), dimensões (*dimensions*) e decomposição de valores singulares (*singular value decomposition* ou SVD). - -A fim de facilitar a compreensão, começaremos com apenas alguns comités. "FEWO" (Status das Mulheres ou *Status of Women*), "HESA" (Saúde ou *Health*), "INAN" (Assuntos Indígenas e do Norte ou *Indigenous and Northern Affairs*), "IWFA" (Violência contra as Mulheres Indígenas ou *Violence Against Indigenous Women*) e "JUST" (Justiça ou *Justice*). - -``` - C Bennett D Wilks G Rickford J Crowder K Block K Seeback L Davies N Ashton -FEWO 0 0 0 0 0 0 0 1 -HESA 0 1 0 0 1 0 1 0 -INAN 1 0 1 1 0 1 0 0 -IWFA 1 0 1 1 1 0 1 1 -JUST 0 1 0 0 0 1 0 0 - - R Goguen S Ambler S Truppe -FEWO 0 1 1 -HESA 0 0 0 -INAN 0 1 0 -IWFA 1 1 1 -JUST 1 0 0 -``` - -A CA é feita em um *dataset* “normalizado” que é criado pela divisão do valor de cada célula pela raiz quadrada do produto da coluna e totais de linhas, ou célula \\(\frac{1}{\sqrt{column total \times row total}}\\). 
Por exemplo, a célula de "FEWO" e S Ambler é \\(\frac{1}{\sqrt{3 \times 3}}\\) ou 0.333.[^10] - -A tabela “normalizada” se parece com isto: - -``` - C Bennett D Wilks G Rickford J Crowder K Block K Seeback L Davies N Ashton -FEWO 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.408 -HESA 0.000 0.408 0.000 0.000 0.408 0.000 0.408 0.000 -INAN 0.316 0.000 0.316 0.316 0.000 0.316 0.000 0.000 -IWFA 0.235 0.000 0.235 0.235 0.235 0.000 0.235 0.235 -JUST 0.000 0.408 0.000 0.000 0.000 0.408 0.000 0.000 - - R Goguen S Ambler S Truppe -FEWO 0.000 0.333 0.408 -HESA 0.000 0.000 0.000 -INAN 0.000 0.258 0.000 -IWFA 0.235 0.192 0.235 -JUST 0.408 0.000 0.000 -``` - -O processo de normalização faz algo interessante. Aqueles que são membros de múltiplos comités e/ou que pertencem a comités com muitos membros tendem a ter notas de normalização mais baixas, sugerindo que são mais centrais para a rede. Estes membros serão colocados mais próximos do centro da matriz. Por exemplo, a célula pertencente a S Ambler e "IWFA" tem a pontuação mais baixa de 0,192 porque S Ambler é membro de três comités e o comité "IWFA" tem nove membros no gráfico representado. - -A próxima etapa é encontrar a decomposição de valor singular destes dados normalizados. Isto envolve álgebra linear bastante complexa que não será abordada aqui, mas pode-se aprender mais com este tutorial de *[Single Value Decomposition](https://perma.cc/CD5F-AL7W)* (Decomposição de Valores Singulares) (em inglês) ou com mais detalhes [neste pdf sobre SVD](https://perma.cc/F7MJ-EGET) (em inglês). Vou tentar resumir o que acontece em termos leigos. - -* Duas novas matrizes são criadas que mostram pontuações de “dimensão” para as linhas (comités) e as colunas (MPs) baseadas em vetores próprios. -* O número de dimensões é igual ao tamanho das colunas ou filas menos 1, que é sempre menor. Neste caso, há cinco comités em comparação com as MPs onze, portanto o número de dimensões é 4. 
-* Uma outra matriz mostra os valores singulares (valores próprios ou *eigenvalues*), que podem ser usados para mostrar a influência de cada dimensão na análise. -* Um dos vários “tratamentos” é aplicado aos dados para facilitar a plotagem. O mais comum é a abordagem de “coordenadas padrão”, que compara cada pontuação normalizada de forma positiva ou negativa com a pontuação média. - -Ao usar coordenadas padrão, a nossa tabela de dados mostra o seguinte: -``` -Columns (MPs): - -Dim 1 Dim 2 Dim 3 Dim 4 -C Bennett -0.4061946 -0.495800254 0.6100171 0.07717508 -D Wilks 1.5874119 0.147804035 -0.4190637 -0.34058221 -G Rickford -0.4061946 -0.495800254 0.6100171 0.07717508 -J Crowder -0.4061946 -0.495800254 0.6100171 0.07717508 -K Block 0.6536800 0.897240970 0.5665289 0.04755678 -K Seeback 0.5275373 -1.245237189 -0.3755754 -0.31096392 -L Davies 0.6536800 0.897240970 0.5665289 0.04755678 -N Ashton -0.8554566 0.631040866 -0.6518568 0.02489229 -R Goguen 0.6039463 -0.464503802 -0.6602408 0.73424971 -S Ambler -0.7311723 -0.004817303 -0.1363437 -0.30608465 -S Truppe -0.8554566 0.631040866 -0.6518568 0.02489229 - -$inertia -[1] 0.06859903 0.24637681 0.06859903 0.06859903 0.13526570 0.17971014 0.13526570 -[8] 0.13526570 0.13526570 0.08438003 0.13526570 - -Rows (Committees): - -Dim 1 Dim 2 Dim 3 Dim 4 -FEWO -1.0603194 0.6399308 -0.8842978 -0.30271466 -HESA 1.2568696 0.9885976 0.4384432 -0.28992174 -INAN -0.3705046 -0.8359969 0.4856563 -0.27320374 -IWFA -0.2531830 0.1866016 0.1766091 0.31676507 -JUST 1.1805065 -0.7950050 -0.8933999 0.09768076 - -$inertia -[1] 0.31400966 0.36956522 0.24927536 0.09017713 0.36956522 -``` - -Cada pontuação para uma “dimensão” pode ser usada como uma coordenada nesse gráfico. Como não podemos visualizar em quatro dimensões, as saídas CA normalmente se concentram nas primeiras duas ou três dimensões para produzir um gráfico (por exemplo, "HESA" será plotado em `[1.245, 0.989]` ou `[1.245, 0.989, 0.438]` em um gráfico 3D). 
- -{% include figure.html filename="tr-pt-analise-correspondenciaR-8.png" alt="Imagem representando um gráfico de correspondências sobre comités parlamentares" caption="Figura 8. Análise de correspondência de Comités Parlamentares selecionados para a 1ª Sessão do Governo Stephen Harper, 2006" %} - -As pontuações de inércia são uma forma de mostrar a variação nos dados. Saúde e Justiça possuem a menor quantidade de membros com uma alta pontuação de inércia, enquanto o comité mais popular - "IWFA" - tem uma pequena inércia. Assim, a inércia é uma forma de quantificar a distância dos pontos em relação ao centro do gráfico. - -Outra pontuação importante é visível no gráfico de CA - a percentagem do valor explicativo para cada dimensão. Isto significa que o eixo horizontal explica 42,32% da variação no gráfico, enquanto o eixo vertical explica quase 31%. O que estes eixos significam deve ser interpretado com base no gráfico. Por exemplo, podemos dizer que o lado esquerdo representa questões relativas à identidade social e os do lado direito são mais reguladores. Uma análise histórica mais aprofundada das atas destes comités poderia, por sua vez, oferecer uma maior compreensão sobre o significado da participação destes membros na época. - -## Notas -[^1]: A CA tem uma história ramificada de várias disciplinas e, assim, a terminologia pode ser confusa. Para simplificar, as categorias se referem aos tipos de dados que estão sendo comparados (por exemplo, membros e clubes) enquanto cada item dentro dessas categorias (por exemplo, “The Tennis Club” ou “John McEnroe”) será um elemento dentro dessa categoria. A localização quantitativa dos elementos (coordenadas x e y) são *datapoints*. - -[^2]: Brigitte Le Roux and Henry Rouanet, *Multiple Correspondence Analysis* (Los Angeles: SAGE Publications, 2010): 3. - -[^3]: Não pretendemos sugerir que esta análise seja de forma alguma conclusiva sobre os laços comerciais entre os EUA e a Rússia. 
A questão é que, como a Rússia não faz parte da TPP neste acordo, ela se separa dos EUA. Por outro lado, se a adesão à TPP pudesse ser comprovada como representando laços tensos entre os EUA e a Rússia, apareceria no gráfico de CA. - -[^4]: Sebastien Le, Julie Josse, Francois Husson (2008). FactoMineR: An R Package for Multivariate Analysis. Journal of Statistical Software, 25(1), 1-18. [10.18637/jss.v025.i01](https://doi.org/10.18637/jss.v025.i01). - -[^5]: Alboukadel Kassambara and Fabian Mundt (2017). factoextra: Extract and Visualize the Results of Multivariate Data Analyses. R package version 1.0.4. [https://CRAN.R-project.org/package=factoextra](https://perma.cc/Z2RC-F4J7). - -[^6]: O valor explicativo é a distância dos *datapoints* afastados do centro do gráfico. Cada dimensão é responsável por parte da distância que os *datapoints* divergem do centro. - -[^7]: Em geral, a inércia nas estatísticas refere-se à variação ou “disseminação” de um *dataset*. Esta é análoga ao desvio padrão nos dados de distribuição. - -[^8]: Ver Laura Kane (3 de abril de 2017), "Missing and murdered women's inquiry not reaching out to families, say advocates." *CBC News Indigenous*. [http://www.cbc.ca/news/indigenous/mmiw-inquiry-not-reaching-out-to-families-says-advocates-1.4053694](https://perma.cc/UQ6J-8QVZ). - -[^9]: Em estatística, um valor p (*p-value*), abreviação de valor de probabilidade, é um indicador de quão provável um resultado teria ocorrido em circunstâncias aleatórias. Um baixo valor de p sugere uma probabilidade baixa de que o resultado teria ocorrido ao acaso e, portanto, fornece algumas evidências de que uma hipótese nula (neste caso, que os MPs e CPCs são categorias independentes) é improvável. - -[^10]: Katherine Faust (2005) "Using Correspondence Analysis for Joint Displays of Affiliation Network" in *Models and Methods in Social Network Analysis* eds. Peter J. Carrington, John Scott and Stanley Wasserman. 
+--- +title: "Análise de Correspondência para Pesquisa Histórica com R" +slug: analise-correspondencia-pesquisa-historica-R +original: correspondence-analysis-in-R +layout: lesson +collection: lessons +date: 2017-09-13 +translation_date: 2023-05-23 +authors: +- Ryan Deschamps +reviewers: +- Sandra van Ginhoven +- Taylor Arnold +editors: +- Matthew Lincoln +translator: +- Diana Rodriguez +translation-editor: +- Jimmy Medeiros +translation-reviewer: +- Yuri Pires +- André Salvo +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/422 +difficulty: 3 +activity: analyzing +topics: [data-manipulation, network-analysis, r, data-visualization] +abstract: Esta lição explica como realizar e interpretar uma análise de correspondência com R, que pode ser usada para identificar relacionamentos dentro de dados categóricos. +avatar_alt: Diagrama de um cubo com arestas legendadas +mathjax: true +doi: 10.46430/phpt0042 +--- + +{% include toc.html %} + +A análise de correspondência (*correspondence analysis* ou CA) produz um gráfico bidimensional ou tridimensional baseado nas relações entre duas ou mais categorias de dados. Essas categorias poderiam ser "membros e clubes", "palavras e livros" ou "países e acordos comerciais". Por exemplo, um membro do clube pode ser equivalente a outro membro com base nos clubes compartilhados aos quais ele pertence. Os membros que frequentam os mesmos clubes provavelmente têm mais em comum do que aqueles que frequentam clubes diferentes. Da mesma forma, os clubes que compartilham membros provavelmente terão mais em comum do que aqueles que têm membros diferentes.[^1] + +Discernir essas correspondências significativas pode ser muito difícil de fazer quando há muitos elementos em cada uma de suas categorias (por exemplo, se tivermos centenas de membros espalhados por dezenas de clubes).
A CA mede as correspondências mais fortes em um *dataset* e as projeta em um espaço multidimensional, possibilitando sua visualização e interpretação. Normalmente, as duas principais dimensões são mostradas de uma só vez, embora seja possível mostrar três dimensões em um display 3D. + +Uma vez que a CA visualiza as relações entre elementos de seus dados como distâncias em um gráfico, muitas vezes é possível descobrir padrões amplos com base em que elementos de uma categoria aparecem próximos a elementos da outra. Assim, a CA pode ser um bom primeiro passo para filtrar os principais padrões de um grande *dataset*. É uma ferramenta particularmente poderosa para entender informações históricas dentro de coleções digitais. + +Depois de ler este tutorial, deve ser possível: + +* Saber o que é a CA e para que é usada. +* Saber como executar a CA usando o pacote FactoMineR do R. +* Descrever com exatidão os resultados de uma CA. + +## Pré-requisitos + +Este tutorial é para historiadores e pesquisadores com habilidades intermédias em programação. Pressupõe que já se tem um conhecimento básico de R e alguns conhecimentos básicos de estatística. + +O tutorial [Noções básicas de R com dados tabulares](/pt/licoes/nocoes-basicas-R-dados-tabulares) tem informações sobre como organizar e configurar o R e o tutorial [Processamento Básico de Texto em R](/pt/licoes/processamento-basico-texto-r) também pode ser útil como treinamento. + +Como a CA é uma espécie de *social network analysis* (análise de redes sociais), pode ser interessante olhar a lição [From Hermeneutics to Data to Networks: Data Extraction and Network Visualization of Historical Sources](/en/lessons/creating-network-diagrams-from-historical-sources) (em inglês), que também tem algumas informações úteis sobre a estruturação de dados para análise de redes. + +## O que é a Análise de Correspondência? 
+ +A análise de correspondência (CA), também chamada "escala multidimensional" ou "análise bivariada de rede", permite observar a inter-relação de dois grupos em um gráfico de dispersão com dois eixos (*two-way graph plot*). Por exemplo, foi utilizada pelo sociólogo francês Pierre Bourdieu para mostrar como categorias sociais como a ocupação influenciam a opinião política.[^2] É especialmente poderosa como ferramenta para encontrar padrões em grandes *datasets*. + +A CA funciona com qualquer tipo de dados categóricos (*datasets* que foram agrupados em categorias). Vamos começar com um exemplo simples. Se quisesse entender o papel dos acordos internacionais de livre comércio na interconexão das nações do G8, seria possível criar uma tabela para os países e as relações de livre comércio que eles mantinham em um determinado momento. + +Uma pequena seleção de acordos comerciais (em azul) incluindo o Espaço Económico Europeu (*European Economic Area* ou EEA), o Acordo Comercial Canadá-UE (*Canada-EU Trade Agreement* ou CETA), o Acordo de Livre Comércio Norte-Americano (*North American Free Trade Agreement* ou NAFTA), a Parceria Trans-Pacífico (*Trans Pacific Partnership* ou TPP) e a Associação das Nações do Sudeste Asiático (*Association of Southeast Asian Nations* ou ASEAN) corresponde aos países do G8. Os países (de cor vermelha) agrupam-se geograficamente, com países do Pacífico à direita, países europeus à esquerda e países da América do Norte ao centro. O Canadá e os Estados Unidos, como previsto, estão juntos. Alemanha, Itália, França e Reino Unido pertencem todos aos mesmos dois acordos (CETA e EEA), portanto todos caem exatamente no mesmo ponto. + +{% include figure.html filename="tr-pt-analise-correspondenciaR-1.png" alt="Imagem representando um gráfico de correspondência sobre acordos comerciais" caption="Figura 1. 
Análise de correspondência de países selecionados do G8 e seus acordos comerciais" %} + +Por outro lado, enquanto a Rússia e os Estados Unidos estão um pouco próximos no eixo horizontal, estão em polos opostos no eixo vertical. A Rússia só compartilha um acordo de comércio com um outro país (Japão) e os Estados Unidos com dois (Japão e Canadá). Em um gráfico de CA, unidades com poucas correlações ficarão nos arredores, enquanto aquelas unidades com maior quantidade de correlações ficarão mais próximas do centro do gráfico. A conexão relativa ou falta de conexão de um *datapoint* é quantificada como *inertia* (inércia) na CA. A falta relativa de conexão produz uma inércia maior. + +Um ponto mais substancial sobre a Rússia e os Estados Unidos é que a Rússia é um país do Pacífico que não pertence à TPP. Observando esta relação, um historiador poderia perguntar-se se isto ocorre por causa de uma relação comercial tensa entre a Rússia e os Estados Unidos em comparação com outros países do G8, ou por atitudes gerais em relação a acordos comerciais para estes países.[^3] + +Com mais dados, a CA pode descobrir distinções mais subtis entre grupos dentro de uma categoria particular. Neste tutorial, analisaremos a vida política canadense - especificamente, como representantes políticos são organizados em comités durante um ou outro governo. Semelhante aos acordos comerciais, esperaríamos que os comités que têm membros semelhantes estivessem mais próximos uns dos outros. Além disso, os comités que têm poucos representantes em comum se encontrarão nos cantos do gráfico. + +## Comités Parlamentares Canadenses (CPCs) + +No sistema parlamentar canadense, os cidadãos elegem representantes chamados membros do Parlamento, ou deputados, para a Câmara dos Comuns. Os parlamentares são responsáveis por votar e propor alterações à legislação no Canadá. 
Os [Comités Parlamentares (CPCs)](https://perma.cc/3PT6-77DB) (em inglês) consistem de parlamentares que informam à Câmara sobre detalhes importantes da política em uma área temática. Exemplos de tais comités incluem os CPCs sobre Finanças, Justiça e Saúde. + +Usaremos abreviações para os comités parlamentares, porque os nomes podem ficar longos, tornando-os difíceis de ler em um gráfico. É possível usar esta tabela como um guia de referência para as abreviações e seus respectivos nomes de comités: + +| Abbreviation (Abreviação) | Committee Name (Tradução do Nome do Comité) | +| :----------- | :----------------------------------------------------------------------------------------: | +| INAN | Indigenous and Northern Affairs (Assuntos Indígenas e do Norte) | +| HUMA | Human Resources, Skills and Social Development and the Status of Persons with Disabilities (Recursos Humanos, Habilidades e Desenvolvimento Social e o Status das Pessoas com Deficiência) | +| FINA | Finance (Finanças) | +| FAAE | Foreign Affairs and International Development (Relações Exteriores e Desenvolvimento Internacional) | +| ETHI | Access to Information, Privacy and Ethics (Acesso à Informação, Privacidade e Ética) | +| ENVI | Environment and Sustainable Development (Meio Ambiente e Desenvolvimento Sustentável) | +| CHPC | Canadian Heritage (Herança Canadense) | +| CIMM | Citizenship and Immigration (Cidadania e Imigração) | +| ACVA | Veterans Affairs (Assuntos de Veteranos) | +| HESA | Health (Saúde) | +| TRAN | Transport, Infrastructure and Communities (Transporte, Infraestrutura e Comunidades) | +| FOPO | Fisheries and Oceans (Pesca e Oceanos) | +| RNNR | Natural Resources (Recursos Naturais) | +| FEWO | Status of Women (Status das Mulheres) | +| ESPE | Pay Equity (Igualdade de Remuneração) | +| IWFA | Violence against Indigenous Women (Violência Contra as Mulheres Indígenas) | +| BILI | Library of Parliament (Biblioteca do Parlamento) | +| AGRI | Agriculture and Agri-food (Agricultura e 
Agroalimentação) | +| JUST | Justice and Human Rights (Justiça e Direitos Humanos) | + +O autor da lição, o historiador Ryan Deschamps, suspeitava que os deputados estariam organizados de acordo com os tópicos do comité de forma diferente de governo para governo. Por exemplo, os comités formados durante o primeiro gabinete do governo conservador de Stephen Harper podem ser organizados de forma diferente do gabinete inicial do Liberal de Justin Trudeau. Há uma série de razões para esta suspeita. Primeiro, os CPCs são formados por lideranças partidárias e as decisões dos comités precisam de coordenação entre os membros da Câmara. Em outras palavras, os partidos políticos usarão os CPCs como ferramentas para marcar pontos políticos, e os governos devem garantir que as pessoas certas sejam membros dos comités certos para proteger suas agendas políticas. Em segundo lugar, os dois governos têm um enfoque político diferente. O governo conservador de Harper se concentrou mais em questões de desenvolvimento económico, enquanto os Liberais de Trudeau enfatizaram, em primeiro lugar a igualdade social. Em resumo, pode haver algumas decisões calculadas sobre quem entra em que comité, fornecendo evidências sobre as atitudes do governo em relação ou contra certos tópicos. + +## Preparando o R para a CA + +Para fazer uma CA, precisaremos de um pacote de álgebra linear. Para os mais inclinados à matemática, há um apêndice com alguns detalhes sobre como isto é feito. Em R, há várias opções para CA, mas usaremos o [pacote FactoMineR](https://factominer.free.fr/) (em inglês), focado na "análise de dados exploratórios multivariados".[^4] A FactoMineR pode ser usada para conduzir todos os tipos de análises multivariadas diferentes, incluindo *clusters* hierárquicos, análise fatorial e assim por diante. + +Mas, primeiro, aqui está como instalar e puxar os pacotes, depois colocá-los em um objeto R para que possam ser discutidos. 
+ +```R + +## Estes comandos só precisam ser feitos na primeira vez que se realiza uma análise. +## FactoMineR é um pacote bastante grande, portanto pode levar algum tempo para ser carregado. + +install.packages("FactoMineR") # Inclui um módulo para a condução de CA. +install.packages("factoextra") # Pacote para embelezar os nossos gráficos de CA. + +# Importar os pacotes: +library(FactoMineR) +library(factoextra) + +# set.seed(189981) # Opcional para reprodução. + +# Leia os ficheiros csv: + +harper_df <- read.csv("http://programminghistorian.org/assets/correspondence-analysis-in-R/HarperCPC.csv", stringsAsFactors = FALSE) +``` + + +## Os dados + +Se quiser ver os dados brutos, os dados para este tutorial podem ser encontrados no [Zenodo](https://doi.org/10.5281/zenodo.889846) (em inglês). Foram convenientemente incluídos também no formato tabular (nota: não é necessário baixar estes ficheiros manualmente. Usaremos o R para baixá-los diretamente): + +1) [CPCs do Harper](/assets/correspondence-analysis-in-R/HarperCPC.csv) +2) [CPCs do Trudeau](/assets/correspondence-analysis-in-R/TrudeauCPC.csv) + +Uma amostra dos dados para a primeira sessão do governo de Stephen Harper. As linhas representam comités e as colunas são membros específicos. Se um membro pertence a um comité, a célula terá um 1; se não, terá um 0. + +``` +harper_df + C Bennett D Wilks DV Kesteren G Rickford J Crowder K Block K Seeback +FAAE 0 0 1 0 0 0 0 +FEWO 0 0 0 0 0 0 0 +FINA 0 0 1 0 0 0 0 +HESA 0 1 0 0 0 1 0 +INAN 1 0 0 1 1 0 1 +IWFA 1 0 0 1 1 1 0 +JUST 0 1 0 0 0 0 1 + + L Davies N Ashton R Goguen R Saganash S Ambler S Truppe +FAAE 0 0 0 1 0 0 +FEWO 0 1 0 0 1 1 +FINA 0 0 0 0 0 0 +HESA 1 0 0 0 0 0 +INAN 0 0 0 0 1 0 +IWFA 1 1 1 1 1 1 +JUST 0 0 1 0 0 0 +``` + +Estruturado de outra forma (através de uma tabela R) podemos mostrar que os comités têm muitos deputados e alguns deputados são membros de vários comités. 
Por exemplo, a deputada liberal Carolyn Bennett era membro do "INAN" (Assuntos Indígenas e do Norte) e do "IWFA" (Violência contra Mulheres Indígenas) e o "HESA" (Comité Parlamentar de Saúde) incluía tanto o D Wilks como o K Block. Em geral, os comités têm entre nove e doze membros. Alguns parlamentares são membros de apenas um comité, enquanto outros podem pertencer a vários comités. + + +## Análise de Correspondência dos Comités Parlamentares Canadenses 2006 e 2016 + +O nosso *data frame* `harper_df` consiste em nomes completos de comités e nomes de deputados, mas alguns dos nomes dos comités (por exemplo, "Recursos Humanos, Habilidades e Desenvolvimento Social" e o "Status das Pessoas com Deficiência") são muito longos para serem bem mostrados em um gráfico: vamos usar as abreviações. + +```R +harper_table <- table(harper_df$abbr, harper_df$membership) +``` + +O comando `table` (tabela) faz um *dataset* de dados cruzados de duas categorias no *data frame*. As colunas são MPs individuais e as linhas são comités. Cada célula contém um 0 ou um 1 baseado na existência ou não de uma conexão. Se olhássemos a presença real em cada reunião, poderíamos também incluir valores ponderados (por exemplo, 5 para um membro do parlamento que participa de uma reunião de comité 5 vezes). Como regra geral, usar valores ponderados quando as quantidades importam (quando as pessoas investem dinheiro, por exemplo), e usar 0s e 1s quando não importam. + +Infelizmente, temos mais um problema. Muitos deputados são membros de apenas 1 comité. Isso fará com que esses deputados se sobreponham quando criarmos o gráfico, tornando-o menos legível. Vamos exigir que os parlamentares pertençam a pelo menos 2 comités antes de executarmos o comando CA da FactoMineR. + +```R +harper_table <- harper_table[,colSums(harper_table) > 1] +CA_harper <- CA(harper_table) +plot(CA_harper) +``` + +O comando `colSums` soma os valores para cada coluna da tabela. 
`rowSums` poderia ser usado para somar as linhas se isso fosse necessário (não é para nós, porque todos os comités têm mais de um deputado). + +O comando `CA` traça os resultados para as duas dimensões superiores e armazena o resumo dos dados em uma variável chamada `CA_harper`. Na maioria das vezes, `CA` faz a maior parte do trabalho. Como discutido, mais detalhes sobre a matemática por trás da CA são fornecidos no [apêndice](#apêndice-a-matemática-por-trás-da-análise-de-correspondência). + +Deve-se obter um gráfico que se parece com isto: + +{% include figure.html filename="tr-pt-analise-correspondenciaR-2.png" alt="Imagem representando um gráfico de correspondências sobre comités parlamentares" caption="Figura 2. Análise de correspondência dos Comités Parlamentares para a 1ª Sessão do Governo Harper" %} + +Vamos tratar os dados do governo Trudeau exatamente da mesma maneira. + +```R +trudeau_df <- read.csv("http://programminghistorian.org/assets/correspondence-analysis-in-R/TrudeauCPC.csv", stringsAsFactors = FALSE) +trudeau_table <- table(trudeau_df$abbr, trudeau_df$membership) +trudeau_table <- trudeau_table[,colSums(trudeau_table) > 1] +CA_trudeau <- CA(trudeau_table) +plot(CA_trudeau) +``` +{% include figure.html filename="tr-pt-analise-correspondenciaR-3.png" alt="Imagem representando um gráfico de correspondências sobre comités parlamentares" caption="Figura 3. Análise de correspondência dos Comités Parlamentares para a 1ª Sessão do Governo de Justin Trudeau" %} + +As nossas etiquetas de dados não são muito legíveis no momento. Mesmo com a mudança para abreviações, as etiquetas estão sobrepostas. 
O pacote [factoextra](https://cran.r-project.org/web/packages/factoextra/index.html) (em inglês) tem uma característica de repelir que ajuda a mostrar as coisas mais claramente.[^5] + +``` +fviz_ca_biplot(CA_harper, repel = TRUE) +``` + +{% include figure.html filename="tr-pt-analise-correspondenciaR-4.png" alt="Imagem representando um gráfico de correspondências sobre comités parlamentares" caption="Figura 4. Análise de correspondência dos Comités Parlamentares para a 1ª Sessão do Governo Harper" %} + +``` +fviz_ca_biplot(CA_trudeau, repel = TRUE) +``` + +{% include figure.html filename="tr-pt-analise-correspondenciaR-5.png" alt="Imagem representando um gráfico de correspondências sobre comités parlamentares" caption="Figura 5. Análise de correspondência dos Comités Parlamentares para a 1ª Sessão do Governo de Justin Trudeau" %} + +Em vez de se sobrepor, as etiquetas agora usam setas para mostrar sua localização onde for apropriado. + +## Interpretando a Análise de Correspondência (CA) + +Os gráficos de dados parecem mais bonitos, mas quão bem podemos confiar na validade desses dados? A nossa primeira dica é olhar para as dimensões. Nos dados Harper, apenas onze e dez por cento de valor explicativo aparecem no eixo horizontal e vertical respectivamente para um total de 21%![^6] Isso não soa promissor para a nossa análise. Lembrando que o número total de dimensões é igual ao número de filas ou colunas (o que for menor), isto pode ser preocupante. Quando tais valores baixos ocorrem, geralmente significa que os pontos de dados são distribuídos de forma bastante uniforme, e que os MPs são distribuídos de forma uniforme nos CPCs é uma convenção bastante bem estabelecida do parlamento. + +Outra maneira de olhar para os dados é através de valores de inércia.[^7] Mais detalhes sobre inércia podem ser encontrados no [apêndice](#apêndice-a-matemática-por-trás-da-análise-de-correspondência) mas, no gráfico, os pontos de dados distantes da origem têm maior inércia. 
Pontos de inércia elevados sugerem *outliers* (valores atípicos) - atores ou eventos que têm menos conexões do que aqueles próximos ao centro. Os baixos valores de inércia sugerem pontos de dados que têm mais em comum com o grupo como um todo. Como uma ferramenta de análise, pode ser útil para encontrar atores ou subgrupos renegados no *dataset*. Se todos os pontos tiverem alta inércia, pode ser um indicador de alta diversidade ou fragmentação para as redes. A baixa inércia geral pode ser um indicador de maior coesão ou convergência geral. O que isso significa dependerá do *dataset*. Para os nossos gráficos, nenhum projeto de *datapoint* vai muito além de 2 passos da média. Mais uma vez, este é um indicador de que as relações estão relativamente distribuídas de maneira uniforme. + +Vamos analisar os dados mais de perto: + +```R +summary(CA_harper) +``` + +Isto nos retorna + +``` +HARPER + +O qui-quadrado da independência entre as duas variáveis é igual a 655.6636 +(p-value = 0.7420958 ). + +Eigenvalues + Dim.1 Dim.2 Dim.3 Dim.4 Dim.5 Dim.6 +Variance 0.831 0.779 0.748 0.711 0.666 0.622 +% of var. 11.024 10.342 9.922 9.440 8.839 8.252 +Cumulative % of var. 11.024 21.366 31.288 40.729 49.568 57.820 + + Dim.7 Dim.8 Dim.9 Dim.10 Dim.11 Dim.12 +Variance 0.541 0.498 0.463 0.346 0.305 0.263 +% of var. 7.174 6.604 6.138 4.591 4.041 3.488 +Cumulative % of var. 64.995 71.599 77.736 82.328 86.368 89.856 + + Dim.13 Dim.14 Dim.15 Dim.16 Dim.17 +Variance 0.240 0.195 0.136 0.105 0.088 +% of var. 3.180 2.591 1.807 1.396 1.170 +Cumulative % of var. 93.036 95.627 97.434 98.830 100.000 +``` + +O cabeçalho `Eigenvalues` do resumo apresenta métricas sobre as dimensões recém computadas, listando a percentagem de variância contida em cada uma delas. Infelizmente, a percentagem de variância encontrada nas duas dimensões superiores é muito baixa. Mesmo se conseguíssemos visualizar 7 ou 8 dimensões dos dados, capturaríamos apenas uma percentagem acumulada de cerca de 70%. 
O teste de independência do [qui-quadrado](https://perma.cc/8B82-YAX6) nos diz que não podemos rejeitar a hipótese de que nossas duas categorias (CPCs e MPs) são independentes. O valor p (ou *p-value*) é 0,74, bem acima do 0,05 comumente usado como um recorte para rejeitar uma hipótese nula.[^8] Um valor p menor ocorreria, por exemplo, se todos ou a maioria dos deputados fossem membros de um ou dois comités. A propósito, o valor p do qui-quadrado da amostra de Trudeau é menor, 0,54, mas ainda não o suficiente para rejeitar a hipótese de categorias mutuamente independentes. + +Como discutido, este resultado não é muito surpreendente. Esperamos que os deputados sejam distribuídos de forma relativamente uniforme entre os comités. Se optarmos por ponderar as nossas medidas com base na participação dos parlamentares em cada reunião de comité ou em seu desejo de 1-100 de ser membro de cada comité, poderemos ver resultados diferentes (por exemplo, pode ser mais comum que os parlamentares participem regularmente nas reuniões financeiras em comparação com outras reuniões). + +A CA falhou conosco? Bem, na verdade não. Isto significa apenas que não podemos simplesmente lançar dados em um algoritmo e esperar responder a perguntas reais de história. Mas nós não somos apenas programadores, mas historiadores de programação. Vamos colocar nossos bonés da história e ver se podemos refinar as nossas pesquisas! + +## Trudeau ampliou a Agenda para a Igualdade das Mulheres no Parlamento? + +Uma das primeiras medidas políticas que Justin Trudeau tomou foi garantir que o Canadá tivesse um gabinete com 50% de mulheres. É discutível que o objetivo deste anúncio era professar uma agenda de igualdade de género. Na sua primeira sessão, o governo de Trudeau também criou um novo Comité Parlamentar sobre igualdade de remuneração para as mulheres. 
Além disso, o governo de Trudeau apresentou uma moção para que houvesse um inquérito sobre Mulheres Indígenas Desaparecidas e Assassinadas, substituindo o mandato do comité parlamentar de Harper para a Violência Contra as Mulheres Indígenas. + +Se Trudeau tivesse a intenção de levar a igualdade das mulheres a sério, poderíamos esperar que mais membros do comité do Status da Mulher estivessem ligados a pastas maiores, como Justiça, Finanças, Saúde e Relações Exteriores, em comparação com o governo de Harper. Como o regime de Harper não tinha um CPC de salário igual, incluiremos o CPC para "Violência contra Mulheres Indígenas". + +```R +# Inclua apenas os comités desejados: +# HESA: Health, JUST: Justice, FEWO: Status of Women +# INAN: Indigenous and Northern Affairs, FINA: Finance +# FAAE: Foreign Affairs and International Trade +# IWFA: Violence against Indigenous Women + +harper_df2 <- harper_df[which(harper_df$abbr %in% + c("HESA", "JUST", "FEWO", "INAN", "FINA", "FAAE", "IWFA")),] +harper_table2 <- table(harper_df2$abbr, harper_df2$membership) + +# Remova os singles de novo. +harper_table2 <- harper_table2[, colSums(harper_table2) > 1] +CA_Harper2 <- CA(harper_table2) +plot(CA_Harper2) +``` + +Isto produz o seguinte gráfico: + +{% include figure.html filename="tr-pt-analise-correspondenciaR-6.png" alt="Imagem representando um gráfico de correspondências sobre comités parlamentares" caption="Figura 6. Análise de correspondência de Comités Parlamentares selecionados para a 1ª Sessão do Governo de Stephen Harper" %} + +O valor p do qui-quadrado para este resultado se move apenas ligeiramente em direção a zero, para 0,71. Ainda não podemos tirar nenhuma conclusão quantitativa sobre uma relação clara entre CPCs e MPs. Para os nossos dados, este não é um resultado muito importante. Se pesquisássemos os CPCs sobre qual CPC era o mais produtivo ou importante, talvez encontrássemos valores p mais baixos. 
A inércia no eixo horizontal praticamente dobrou, sugerindo que o FINA (Finance) é um valor atípico (*outlier*) no gráfico em comparação com os outros portfólios. + +O significado de uma CA depende de uma interpretação qualitativa do gráfico. Por exemplo, observando os elementos do gráfico Harper podemos dizer que as preocupações económicas caem para a direita do eixo y e as preocupações sociais caem para a esquerda. Portanto, uma das "razões" para escolher os parlamentares para participar de comités no governo Harper parece ser a distinção entre preocupações sociais e económicas. + +Entretanto, quando fazemos a mesma análise com o governo de Trudeau... + +```R +trudeau_df2 <- trudeau_df[which(trudeau_df$abbr %in% + c("HESA", "JUST", "FEWO", "INAN", "FINA", "FAAE", "ESPE")),] +trudeau_table2 <- table(trudeau_df2$abbr, trudeau_df2$membership) +trudeau_table2 <- trudeau_table2[, colSums(trudeau_table2) > 1] # remova os singles de novo +CA_trudeau2 <- CA(trudeau_table2) +plot(CA_trudeau2) +``` + +Produzimos um gráfico incompleto e esta mensagem aparece: + +``` +Warning message: +In CA(trudeau_table2) : +The rows FAAE, INAN, JUST sum at 0. They were suppressed from the analysis. +``` + +Isto significa que o gráfico produzido não nos mostra as linhas FAAE, INAN e JUST. Como a soma de cada uma delas é 0, elas foram suprimidas da análise. Olhando para a tabela `trudeau_table2`, vemos que: + +``` + A Vandenbeld D Albas M Gladu R Harder S Sidhu +ESPE 1 1 1 0 1 +FAAE 0 0 0 0 0 +FEWO 1 0 1 1 0 +FINA 0 1 0 0 0 +HESA 0 0 0 1 1 +INAN 0 0 0 0 0 +JUST 0 0 0 0 0 +``` + +Não há nenhuma associação cruzada para FAAE, INAN ou JUST! Bem, isso é um resultado em si mesmo. Podemos concluir, em geral, que as agendas dos dois governos são bastante diferentes, e que houve uma abordagem diferente utilizada para organizar os parlamentares em comités. 
+ +Para um historiador canadense, o resultado faz algum sentido, dado que a Violência contra as Mulheres Indígenas (IWFA) tem muito mais probabilidade de estar ligada aos Assuntos Indígenas e do Norte (INAN), e à Justiça e Direitos Humanos (JUST), do que à Igualdade de Remuneração (ESPE). Afinal, a história da Violência contra as Mulheres Indígenas está ligada a uma série de casos criminais de alto nível no Canadá. Como discutido anteriormente, a análise de CA requer uma quantidade de interpretação para se tornar significativa. + +Talvez possamos observar alguns comités diferentes em seu lugar. Ao retirar “JUST”, “INAN” e “FAAE” (Relações Exteriores) e substituí-los por “CIMM” (Imigração), “ETHI” (Ética e Acesso à Informação) e “HUMA” (Recursos Humanos), podemos obter uma imagem melhor da estrutura dos comités parlamentares neste contexto. + +```R +trudeau_df3 <- trudeau_df[which(trudeau_df$abbr %in% + c("HESA", "CIMM", "FEWO", "ETHI", "FINA", "HUMA", "ESPE")),] +trudeau_table3 <- table(trudeau_df3$abbr, trudeau_df3$membership) +trudeau_table3 <- trudeau_table3[, colSums(trudeau_table3) > 1] # remova os singles de novo +CA_trudeau3 <- CA(trudeau_table3) +plot(CA_trudeau3) +``` + +{% include figure.html filename="tr-pt-analise-correspondenciaR-7.png" alt="Imagem representando um gráfico de correspondências sobre comités parlamentares" caption="Figura 7. Análise de correspondência de Comités Parlamentares selecionados para a 1ª Sessão do Governo de Justin Trudeau" %} + +Em geral, a inércia no eixo horizontal é menor que a do governo de Harper, mas a separação tem "HUMA" (Recursos Humanos) e "ETHI" (Ética) contra os outros portfólios à direita. A delimitação entre questões sociais e económicas não é tão evidente como para Harper, sugerindo uma filosofia diferente para a seleção. Dito isto, também há menos deputados compartilhando as posições. Isto pode ser outro mistério para uma maior exploração. 
No entanto, o processo CA nos fornece uma visão sólida das relações que ocorrem dentro dos comités com um olhar rápido e com muito poucos comandos. + +## Análise + +Como na maioria das pesquisas interpretativas, não obtemos uma resposta direta à nossa pergunta sobre o poder para as mulheres nos governos parlamentares. No caso Harper, vemos uma divisão no eixo horizontal entre questões sociais como Saúde e Justiça e questões económicas como Finanças e Relações Exteriores, respondendo por 35% da variação. Pela visualização, podemos adivinhar que Finanças (FINA) e Relações Exteriores (FAAE) têm um membro comum e que Relações Exteriores (FAAE) tem um membro comum com Violência contra Mulheres Indígenas (IWFA). Este resultado é, possivelmente, uma preocupação, pois as agendas mais divulgadas de Stephen Harper tendiam a se concentrar em preocupações económicas como o comércio e a contenção fiscal. A separação dos comités implica que a filosofia de governança de Harper separava as preocupações económicas das sociais e que os direitos das mulheres eram principalmente uma preocupação social. A própria pasta Status da Mulher (FEWO) é separada do resto das pastas, encontrando-se ligada às outras pastas somente através de parlamentares comuns com os comités Violência contra Mulheres Indígenas (IWFA) e Assuntos Indígenas e do Norte (INAN). + +O gráfico do governo de Trudeau não mostra conexões cruzadas do Status da Mulher com a Justiça, Relações Exteriores e Povos Indígenas, mas conexões mais fortes com Finanças, Cidadania, Recursos Humanos e Ética. Os Direitos da Mulher estão ligados às Finanças e à Imigração através da carteira de Igualdade de Remuneração. + +É discutível que o regime do governo Harper alinhou os Direitos das Mulheres às pastas sociais como Justiça e Saúde, enquanto Trudeau elevou o perfil do Status da Mulher até certo ponto ao incluir o comité de Igualdade de Remuneração. 
A conexão entre os comités focados nos Direitos da Mulher e fortes carteiras como Saúde, Finanças e Cidadania e Imigração no governo Trudeau é digna de uma análise mais detalhada. Neste contexto, o Status da Mulher parece ter uma posição mais central (mais próxima da origem) do que o comité Status da Mulher no governo de Harper. Dito isto, o número de pontos de dados neste caso ainda é bastante pequeno para se chegar a uma conclusão definitiva. Talvez outras fontes de evidência possam ser visualizadas de maneira semelhante para confirmar ou negar este ponto. + +A agenda anteriormente mantida entre as mulheres e os povos indígenas foi deslocada no caso Trudeau. Como discutido anteriormente, o [National Inquiry into Missing and Murdered Indigenous Women and Girls](https://perma.cc/U38Y-4CY9) (Inquérito Nacional sobre Mulheres Indígenas Desaparecidas e Assassinadas) (em inglês) deslocou o mandato para o comité Violência contra as Mulheres Indígenas que existia durante o mandato de Harper. A história desta transição é complexa, mas a pressão política foi aplicada ao governo Harper para criar o Inquérito Nacional sobre Mulheres Indígenas Desaparecidas e Assassinadas após o julgamento de Robert Pickton e relatos de investigações policiais insuficientes para mulheres indígenas desaparecidas. Harper recusou-se a conduzir um inquérito citando que o CPC era a melhor abordagem.[^9] Trudeau fez uma promessa eleitoral de incluir o inquérito, deslocando assim o comité. Até certo ponto, Harper parece ter dado à violência contra as mulheres indígenas um papel bastante central no planejamento do Comité Parlamentar. Esta evidência é um contraponto às críticas de que Harper não levou a sério a questão das Mulheres Indígenas Desaparecidas e Assassinadas. 
+ +As diferenças entre as duas relações levantam questões importantes sobre o papel do Status da Mulher no discurso político e suas interconexões entre identidade racial, finanças públicas, saúde e justiça social, a serem exploradas talvez em um trabalho qualitativo mais detalhado. Também levanta questões importantes sobre o foco no género em geral (de acordo com a carteira do Status da Mulher) ou mais especificamente, uma vez que se aplica a um grupo marginalizado (Mulheres Indígenas Desaparecidas e Assassinadas). Um documento de política relacionado aos benefícios de um Inquérito versus discussão do Comité Parlamentar parece razoável após examinar esta evidência. Talvez haja um argumento de que a troca do "IWFA" por "ESPE" é uma espécie de teto de vidro, colocando artificialmente uma cota em questões de mulheres enquanto as carteiras estabelecidas permanecem intocadas. Como uma ferramenta exploratória, a CA nos ajuda a identificar tais temas a partir da observação empírica, em vez de confiar na teoria ou em preconceitos pessoais. + +## Conclusão + +Agora que este tutorial está completo, é possível ter alguma noção do que é a CA e como pode ser usada para responder perguntas exploratórias sobre dados. Usamos o comando `CA` do FactoMineR para criar a análise e traçar os resultados em duas dimensões. Quando as etiquetas se cruzaram, aplicamos o comando `fviz_ca_biplot` do pacote factoextra para exibir os dados em um formato mais legível. + +Também aprendemos como interpretar uma CA e como detectar potenciais armadilhas analíticas, incluindo casos em que as relações entre categorias são distribuídas de forma muito uniforme e têm baixo valor explicativo. Neste caso, refinamos a nossa pergunta e os dados de pesquisa para fornecer uma imagem mais significativa do que aconteceu. + +Em geral, o benefício desta análise é fornecer uma rápida visão geral do *dataset* de duas categorias, como um guia para questões históricas mais substantivas. 
O uso de membros e reuniões ou eventos em todas as áreas da vida (negócios, sem fins lucrativos, reuniões municipais, *hashtags* de twitter, etc.) é uma abordagem comum para tal análise. Os grupos sociais e as suas preferências são outro uso comum para a CA. Em cada caso, a visualização oferece um mapa com o qual se pode observar um retrato da vida social, cultural e política. + +Os próximos passos podem incluir a adição de outras dimensões categóricas à nossa análise, como a incorporação do partido político, idade ou sexo. Quando se faz CA com mais de duas categorias, é chamada de [Análise de Correspondência Múltipla ou MCA](https://www.youtube.com/watch?v=RDexHE5Iqrg) (em inglês). Enquanto a matemática para a MCA é mais complicada, os resultados finais são bastante semelhantes aos da CA. + +Esperamos que, agora, estes métodos sejam aplicados aos seus próprios dados, ajudando a descobrir perguntas e hipóteses que enriquecem a sua pesquisa histórica. Boa sorte! + +## Apêndice: A Matemática por trás da Análise de Correspondência + +Como a matemática da CA será interessante para alguns e não para outros, optamos por discuti-la neste Apêndice. A secção também contém um pouco mais de detalhes sobre outros tópicos, tais como inércia (*inertia*), dimensões (*dimensions*) e decomposição de valores singulares (*singular value decomposition* ou SVD). + +A fim de facilitar a compreensão, começaremos com apenas alguns comités. "FEWO" (Status das Mulheres ou *Status of Women*), "HESA" (Saúde ou *Health*), "INAN" (Assuntos Indígenas e do Norte ou *Indigenous and Northern Affairs*), "IWFA" (Violência contra as Mulheres Indígenas ou *Violence Against Indigenous Women*) e "JUST" (Justiça ou *Justice*). 
+ +``` + C Bennett D Wilks G Rickford J Crowder K Block K Seeback L Davies N Ashton +FEWO 0 0 0 0 0 0 0 1 +HESA 0 1 0 0 1 0 1 0 +INAN 1 0 1 1 0 1 0 0 +IWFA 1 0 1 1 1 0 1 1 +JUST 0 1 0 0 0 1 0 0 + + R Goguen S Ambler S Truppe +FEWO 0 1 1 +HESA 0 0 0 +INAN 0 1 0 +IWFA 1 1 1 +JUST 1 0 0 +``` + +A CA é feita em um *dataset* “normalizado” que é criado pela divisão do valor de cada célula pela raiz quadrada do produto da coluna e totais de linhas, ou célula \\(\frac{1}{\sqrt{column total \times row total}}\\). Por exemplo, a célula de "FEWO" e S Ambler é \\(\frac{1}{\sqrt{3 \times 3}}\\) ou 0.333.[^10] + +A tabela “normalizada” se parece com isto: + +``` + C Bennett D Wilks G Rickford J Crowder K Block K Seeback L Davies N Ashton +FEWO 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.408 +HESA 0.000 0.408 0.000 0.000 0.408 0.000 0.408 0.000 +INAN 0.316 0.000 0.316 0.316 0.000 0.316 0.000 0.000 +IWFA 0.235 0.000 0.235 0.235 0.235 0.000 0.235 0.235 +JUST 0.000 0.408 0.000 0.000 0.000 0.408 0.000 0.000 + + R Goguen S Ambler S Truppe +FEWO 0.000 0.333 0.408 +HESA 0.000 0.000 0.000 +INAN 0.000 0.258 0.000 +IWFA 0.235 0.192 0.235 +JUST 0.408 0.000 0.000 +``` + +O processo de normalização faz algo interessante. Aqueles que são membros de múltiplos comités e/ou que pertencem a comités com muitos membros tendem a ter notas de normalização mais baixas, sugerindo que são mais centrais para a rede. Estes membros serão colocados mais próximos do centro da matriz. Por exemplo, a célula pertencente a S Ambler e "IWFA" tem a pontuação mais baixa de 0,192 porque S Ambler é membro de três comités e o comité "IWFA" tem nove membros no gráfico representado. + +A próxima etapa é encontrar a decomposição de valor singular destes dados normalizados. 
Isto envolve álgebra linear bastante complexa que não será abordada aqui, mas pode-se aprender mais com este tutorial de *[Singular Value Decomposition](https://perma.cc/CD5F-AL7W)* (Decomposição de Valores Singulares) (em inglês) ou com mais detalhes [neste pdf sobre SVD](https://perma.cc/F7MJ-EGET) (em inglês). Vou tentar resumir o que acontece em termos leigos. + +* Duas novas matrizes são criadas que mostram pontuações de “dimensão” para as linhas (comités) e as colunas (MPs) baseadas em vetores próprios. +* O número de dimensões é igual ao número de colunas ou de linhas (o que for menor) menos 1. Neste caso, há cinco comités em comparação com onze MPs, portanto o número de dimensões é 4. +* Uma outra matriz mostra os valores singulares (valores próprios ou *eigenvalues*), que podem ser usados para mostrar a influência de cada dimensão na análise. +* Um dos vários “tratamentos” é aplicado aos dados para facilitar a plotagem. O mais comum é a abordagem de “coordenadas padrão”, que compara cada pontuação normalizada de forma positiva ou negativa com a pontuação média. 
+ +Ao usar coordenadas padrão, a nossa tabela de dados mostra o seguinte: +``` +Columns (MPs): + +Dim 1 Dim 2 Dim 3 Dim 4 +C Bennett -0.4061946 -0.495800254 0.6100171 0.07717508 +D Wilks 1.5874119 0.147804035 -0.4190637 -0.34058221 +G Rickford -0.4061946 -0.495800254 0.6100171 0.07717508 +J Crowder -0.4061946 -0.495800254 0.6100171 0.07717508 +K Block 0.6536800 0.897240970 0.5665289 0.04755678 +K Seeback 0.5275373 -1.245237189 -0.3755754 -0.31096392 +L Davies 0.6536800 0.897240970 0.5665289 0.04755678 +N Ashton -0.8554566 0.631040866 -0.6518568 0.02489229 +R Goguen 0.6039463 -0.464503802 -0.6602408 0.73424971 +S Ambler -0.7311723 -0.004817303 -0.1363437 -0.30608465 +S Truppe -0.8554566 0.631040866 -0.6518568 0.02489229 + +$inertia +[1] 0.06859903 0.24637681 0.06859903 0.06859903 0.13526570 0.17971014 0.13526570 +[8] 0.13526570 0.13526570 0.08438003 0.13526570 + +Rows (Committees): + +Dim 1 Dim 2 Dim 3 Dim 4 +FEWO -1.0603194 0.6399308 -0.8842978 -0.30271466 +HESA 1.2568696 0.9885976 0.4384432 -0.28992174 +INAN -0.3705046 -0.8359969 0.4856563 -0.27320374 +IWFA -0.2531830 0.1866016 0.1766091 0.31676507 +JUST 1.1805065 -0.7950050 -0.8933999 0.09768076 + +$inertia +[1] 0.31400966 0.36956522 0.24927536 0.09017713 0.36956522 +``` + +Cada pontuação para uma “dimensão” pode ser usada como uma coordenada nesse gráfico. Como não podemos visualizar em quatro dimensões, as saídas CA normalmente se concentram nas primeiras duas ou três dimensões para produzir um gráfico (por exemplo, "HESA" será plotado em `[1.257, 0.989]` ou `[1.257, 0.989, 0.438]` em um gráfico 3D). + +{% include figure.html filename="tr-pt-analise-correspondenciaR-8.png" alt="Imagem representando um gráfico de correspondências sobre comités parlamentares" caption="Figura 8. Análise de correspondência de Comités Parlamentares selecionados para a 1ª Sessão do Governo Stephen Harper, 2006" %} + +As pontuações de inércia são uma forma de mostrar a variação nos dados. 
Saúde e Justiça possuem a menor quantidade de membros com uma alta pontuação de inércia, enquanto o comité mais popular - "IWFA" - tem uma pequena inércia. Assim, a inércia é uma forma de quantificar a distância dos pontos em relação ao centro do gráfico. + +Outra pontuação importante é visível no gráfico de CA - a percentagem do valor explicativo para cada dimensão. Isto significa que o eixo horizontal explica 42,32% da variação no gráfico, enquanto o eixo vertical explica quase 31%. O que estes eixos significam deve ser interpretado com base no gráfico. Por exemplo, podemos dizer que o lado esquerdo representa questões relativas à identidade social e os do lado direito são mais reguladores. Uma análise histórica mais aprofundada das atas destes comités poderia, por sua vez, oferecer uma maior compreensão sobre o significado da participação destes membros na época. + +## Notas +[^1]: A CA tem uma história ramificada de várias disciplinas e, assim, a terminologia pode ser confusa. Para simplificar, as categorias se referem aos tipos de dados que estão sendo comparados (por exemplo, membros e clubes) enquanto cada item dentro dessas categorias (por exemplo, “The Tennis Club” ou “John McEnroe”) será um elemento dentro dessa categoria. A localização quantitativa dos elementos (coordenadas x e y) são *datapoints*. + +[^2]: Brigitte Le Roux and Henry Rouanet, *Multiple Correspondence Analysis* (Los Angeles: SAGE Publications, 2010): 3. + +[^3]: Não pretendemos sugerir que esta análise seja de forma alguma conclusiva sobre os laços comerciais entre os EUA e a Rússia. A questão é que, como a Rússia não faz parte da TPP neste acordo, ela se separa dos EUA. Por outro lado, se a adesão à TPP pudesse ser comprovada como representando laços tensos entre os EUA e a Rússia, apareceria no gráfico de CA. + +[^4]: Sebastien Le, Julie Josse, Francois Husson (2008). FactoMineR: An R Package for Multivariate Analysis. Journal of Statistical Software, 25(1), 1-18. 
[10.18637/jss.v025.i01](https://doi.org/10.18637/jss.v025.i01). + +[^5]: Alboukadel Kassambara and Fabian Mundt (2017). factoextra: Extract and Visualize the Results of Multivariate Data Analyses. R package version 1.0.4. [https://CRAN.R-project.org/package=factoextra](https://perma.cc/Z2RC-F4J7). + +[^6]: O valor explicativo é a distância dos *datapoints* afastados do centro do gráfico. Cada dimensão é responsável por parte da distância que os *datapoints* divergem do centro. + +[^7]: Em geral, a inércia nas estatísticas refere-se à variação ou “disseminação” de um *dataset*. Esta é análoga ao desvio padrão nos dados de distribuição. + +[^8]: Ver Laura Kane (3 de abril de 2017), "Missing and murdered women's inquiry not reaching out to families, say advocates." *CBC News Indigenous*. [https://www.cbc.ca/news/indigenous/mmiw-inquiry-not-reaching-out-to-families-says-advocates-1.4053694](https://perma.cc/UQ6J-8QVZ). + +[^9]: Em estatística, um valor p (*p-value*), abreviação de valor de probabilidade, é um indicador de quão provável um resultado teria ocorrido em circunstâncias aleatórias. Um baixo valor de p sugere uma probabilidade baixa de que o resultado teria ocorrido ao acaso e, portanto, fornece algumas evidências de que uma hipótese nula (neste caso, que os MPs e CPCs são categorias independentes) é improvável. + +[^10]: Katherine Faust (2005) "Using Correspondence Analysis for Joint Displays of Affiliation Network" in *Models and Methods in Social Network Analysis* eds. Peter J. Carrington, John Scott and Stanley Wasserman. 
diff --git a/pt/licoes/analise-sentimento-R-syuzhet.md b/pt/licoes/analise-sentimento-R-syuzhet.md index 1f29228bd7..d946878db4 100644 --- a/pt/licoes/analise-sentimento-R-syuzhet.md +++ b/pt/licoes/analise-sentimento-R-syuzhet.md @@ -1,505 +1,505 @@ ---- -title: Análise de sentimentos em R com 'syuzhet' -layout: lesson -slug: analise-sentimento-R-syuzhet -date: 2021-03-23 -translation_date: 2022-03-02 -authors: -- Jennifer Isasi -editors: -- Maria José Afanador-Llach -reviewers: -- Riva Quiroga -translator: -- Diana Rebelo Rodriguez -translation-editor: -- Jimmy Medeiros -translation-reviewer: -- Ana Giulia Aldgeire -- Ian Araujo -original: analisis-de-sentimientos-r -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/467 -difficulty: 2 -activity: analyzing -topics: [distant-reading, r, data-visualization] -abstract: "Esta lição ensina uma maneira de obter e analisar dados sobre emoções e sentimentos em uma narrativa." -avatar_alt: "Gravura com três rostos que expressam emoções distintas" -doi: 10.46430/phpt0022 ---- - -{% include toc.html %} - -# Objetivos - -Esta lição usa a metodologia de análise de sentimentos e emoções através da linguagem de programação R para investigar documentos textuais de modo individual. Embora a lição não seja destinada a usuários avançados de R, é necessário que se tenha algum conhecimento dessa linguagem; assumimos que se tenha o R instalado e saiba como importar pacotes. Também recomendamos o download do RStudio. Se não estiver familiarizado com R, é melhor trabalhar primeiro através das lições [Processamento básico de texto em R](/pt/licoes/processamento-basico-texto-r), [Noções básicas de R com dados tabulares](/pt/licoes/nocoes-basicas-R-dados-tabulares) ou [Data Wrangling and Management in R](/en/lessons/data-wrangling-and-management-in-r) (em inglês). 
Ao final desta lição, o(a) pesquisador(a) será capaz de: - -- Colocar perguntas de pesquisa com base na análise quantitativa de sentimentos em textos de tipo ensaístico e/ou narrativo. -- Usar a linguagem de programação R, o ambiente RStudio e o pacote `syuzhet` com o dicionário NRC para gerar o indicador de sentimento de um texto em diferentes linguagens. -- Analisar criticamente os resultados do processamento de texto. -- Visualizar os dados gerais e sua evolução ao longo de um texto. - -Esta lição foi construída com a versão R 4.0.2, mas acreditamos que funcionará corretamente em versões futuras do programa. - -> O uso do R é geralmente o mesmo para Windows, Mac e Linux. Entretanto, como vamos trabalhar com textos em português, precisaremos escrever algum código extra para indicar o formato UTF-8 em máquinas Windows. Nesses casos, o código para o sistema operacional correspondente é exibido. - -# Antes de começar - -## Análise de sentimentos - -A análise dos sentimentos ou a mineração de opinião é utilizada para extrair automaticamente informações sobre a conotação negativa ou positiva da linguagem de um documento. Embora seja uma tarefa que vem sendo utilizada há muito tempo no campo do marketing ou da política, em estudos literários ainda é uma abordagem recente e não há um método único. Além disso, há a possibilidade de extrair a polaridade dos sentimentos e também das emoções. - -É importante especificar o que estamos procurando com os termos “sentimento” e “emoções”, pois eles são frequentemente usados de forma intercambiável, de modo geral, mas são diferentes. Para Antonio R. Damasio, as emoções são reações corporais instigantes de nosso corpo, determinadas por estímulos ambientais e derivadas do desenvolvimento da regulamentação biológica (12). Elas podem ser divididas em primárias e secundárias. 
Embora não haja um acordo final sobre o número de emoções básicas, geralmente são seis: raiva, alegria, repugnância, medo, tristeza e surpresa, embora Damasio considere esta última como sendo secundária. Além disso, no caso do sistema automático que utilizaremos, as emoções secundárias de antecipação e confiança também aparecem. - -Por outro lado, podemos definir sentimento como a ação e o efeito de sentir uma emoção ou, em outras palavras, é o resultado do fato de que “quando um objeto, uma pessoa, uma situação ou um pensamento provoca em nós a emoção da alegria, começa um processo que pode concluir no sentimento de estar alegre ou feliz” (Pereira Zazo 32) porque é uma emoção positiva. Durante a lição faremos uma distinção entre os dois termos, pois usaremos o resultado do sentimento para ver a sua evolução ao longo do texto e as emoções para ver o uso das palavras em geral. - -## Dicionário de léxicos NRC - -O pacote `syuzhet` funciona com quatro dicionários de sentimentos: Bing, Afinn, Stanford e NRC. Nesta lição, trabalharemos com este último, pois é o único disponível em vários idiomas, incluindo o português. Este vocabulário com valores de sentimentos negativos ou positivos e oito emoções foi desenvolvido por Saif M. Mohammad, um cientista do Conselho Nacional de Pesquisa do Canadá (NRC). O conjunto de dados foi construído manualmente através de pesquisas usando a técnica Maximum Difference Scaling ou MaxDiff, que avalia a preferência por uma série de alternativas (Mohammad e Turney). Assim, o léxico tem 14.182 palavras com as categorias de sentimentos positivos e negativos e as emoções de raiva, antecipação, repugnância, medo, alegria, tristeza, surpresa e confiança. Além disso, está disponível em mais de 100 idiomas (através de tradução automática). - -Seus termos de uso estabelecem que o vocabulário pode ser usado gratuitamente para fins de pesquisa, portanto, todos os dados estão disponíveis para download. 
- -Se trabalhamos com o inglês, podemos interagir com as diferentes categorias no site do [NRC Word-Emotion Association Lexicon](http://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm). Lá também podemos encontrar trabalhos publicados sobre a obtenção dos valores para o vocabulário, sua organização, extensão, etc. - -## Pacote `syuzhet` - -O [pacote `syuzhet`](https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html) foi desenvolvido em 2015 por Matthew Jockers; que o mantém funcionando até hoje e continuamente apresenta novas versões (no momento da preparação desta lição, foi usada a versão de dezembro de 2017). Uma série de posts no blog acompanham o desenvolvimento do pacote, e estão disponíveis no blog do professor desde [5 de junho de 2014](http://www.matthewjockers.net/page/2/) (em inglês). - -Naturalmente, o pacote foi desenvolvido com testes em textos escritos ou traduzidos para o inglês e não sem debate sobre sua utilidade, para atribuir valores a textos literários que muitas vezes são, por natureza, bastante subjetivos. - -> Atenção: A lista de palavras do dicionário está sendo preparada em inglês como língua principal e os dados quantitativos atribuídos a cada palavra são o resultado da avaliação humana pelos participantes americanos. Portanto, vários fatores devem ser levados em consideração ao utilizar esta metodologia: -> -> - O léxico em português é uma tradução direta realizada por tradução automática (estes sistemas já são muito confiáveis entre o inglês e o português, mas não em outros idiomas que o NRC afirma ser capaz de analisar como, por exemplo, o basco). -> - A pontuação de cada palavra, ou seja, a valência sentimental e emocional, tem um viés cultural e temporal que deve ser levado em conta, e um termo que pareceu positivo para os participantes da pesquisa pode nos parecer negativo. -> - O uso desta metodologia não é recomendado para textos que são altamente metafóricos e simbólicos. 
-> - O método não vai captar a negação de um sentimento positivo como, por exemplo, a frase “Eu não estou feliz”. -> -> Seguindo o espírito de adaptabilidade das lições do *Programming Historian* a outras línguas, foi decidido usar `syuzhet` em sua forma original, mas ao final da lição indicamos algumas funções avançadas para usar seu próprio dicionário de sentimentos com o mesmo pacote. - -Como os resultados nos *dataframes* aparecerão em inglês, se achar necessário, dedique um momento para aprender esta tradução: - -| anger | anticipation | disgust | fear | joy | sadness | surprise | trust | negative | positive | -| ------ | ------------ | -------- | ----- | ------- | -------- | -------- | --------- | -------- | -------- | -| raiva | anticipação | desgosto | medo | alegria | tristeza | surpresa | confiança | negativo | positivo | - -## Um pequeno exemplo - -Antes de começar a realizar a análise de nossos textos, é útil saber de forma geral qual é o processo de análise realizado pela função de obter sentimentos de `syuzhet`, com o dicionário NRC e os resultados obtidos sobre os quais trabalharemos. - -O sistema irá processar nosso texto e transformá-lo em um vetor de caracteres (aqui palavras), para analisá-los individualmente (também é possível fazê-lo por sentenças). Sem entrar ainda no código para realizar a análise, dê uma olhada neste breve exemplo (nota de tradução: para a versão em português foi usado o texto _Dom Casmurro_ de Machado de Assis, mantendo o tipo de exercícios e o código da lição original): - -> “Contando aquela crise do meu amor adolescente, sinto uma coisa que não sei se explico bem, e é que as dores daquela quadra, a tal ponto se espiritualizaram com o tempo, que chegam a diluir-se no prazer. Não é claro isto, mas nem tudo é claro na vida ou nos livros. A verdade é que sinto um gosto particular em referir tal aborrecimento, quando é certo que ele me lembra outros que não quisera lembrar por nada.” -> -> *Dom Casmurro* de Machado de Assis. 
- -Este fragmento é transformado em um vetor de caracteres: - -```R -> print(exemplo_2) -[1] "contando" "aquela" "crise" "do" "meu" -[6] "amor" "adolescente" "sinto" "uma" "coisa" -[11] "que" "não" "sei" "se" "explico" ... -``` - -Com a função de obter sentimentos, obtém-se a valência positiva e negativa de cada palavra, assim como a valência das oito emoções classificadas pelo NRC. O resultado para este fragmento é o seguinte: - -```R -> print(sentimentos_exemplo_df, row.names = exemplo_2) - anger anticipation disgust fear joy sadness surprise trust negative positive -contando 0 0 0 0 0 0 0 0 0 0 -aquela 0 0 0 0 0 0 0 0 0 0 -crise 1 0 0 0 0 1 0 0 3 0 -do 0 0 0 0 0 0 0 0 0 0 -meu 0 0 0 0 0 0 0 0 0 0 -amor 0 1 0 0 1 1 0 1 0 1 -adolescente 0 0 0 0 0 0 0 0 0 0 -sinto 0 0 0 0 0 0 0 0 0 0 -uma 0 0 0 0 0 0 0 0 0 0 -coisa 0 0 0 0 0 0 0 0 0 0 -que 0 0 0 0 0 0 0 0 0 0 -não 0 0 0 0 0 0 0 0 0 0 -sei 0 0 0 0 0 0 0 0 0 0 -se 0 0 0 0 0 0 0 0 0 0 -explico 0 0 0 0 0 0 0 0 0 0 -bem 0 0 0 0 0 0 0 0 0 0 -... -``` - -> Nota de tradução: na lição original, os autores não explicaram o passo a passo para se obter esses resultados em um primeiro momento. Apesar de a lição explicar detalhadamente o processo, julguei ser interessante demonstrar aqui como obtive esses outputs: - -```R -exemplo <- "Contando aquela crise do meu amor adolescente, sinto uma coisa que não sei se explico bem, e é que as dores daquela quadra, a tal ponto se espiritualizaram com o tempo, que chegam a diluir-se no prazer. Não é claro isto, mas nem tudo é claro na vida ou nos livros. A verdade é que sinto um gosto particular em referir tal aborrecimento, quando é certo que ele me lembra outros que não quisera lembrar por nada." 
- -exemplo_2 <- get_tokens(exemplo) - -print(exemplo_2) - -sentimentos_exemplo_df <- get_nrc_sentiment(exemplo_2, lang="portuguese") - -print(sentimentos_exemplo_df, row.names = exemplo_2) -``` - -Como podemos ver nos resultados deste objeto tipo *data frame* ou tabela, cada palavra ou ficha tem um valor padrão de 0 nas dez colunas. Se houver um valor maior que 0 significa, em primeiro lugar, que este termo existe no dicionário NRC e, em segundo lugar, que tem um valor atribuído para alguma emoção e/ou sentimento. Neste exemplo, podemos ver que a palavra “amor” é entendida de forma positiva, ainda que represente tristeza (*sadness*). Por outro lado, a palavra “crise” possui uma conotação negativa muito forte, pois há menos margem para dúvidas. - -As possibilidades de explorar, analisar e visualizar estes resultados dependem, em grande parte, das suas habilidades de programação mas, acima de tudo, da sua questão de pesquisa. Para ajudar o pesquisador, nesta lição introdutória aprenderemos como analisar os dados utilizando várias formas de visualização. - -## Pergunta de pesquisa - -Para essa lição, vamos utilizar o romance Dom Casmurro, escrito por [Machado de Assis](https://pt.wikipedia.org/wiki/Machado_de_Assis), publicado em 1899, de caráter realista e ambientado no Rio de Janeiro na segunda metade do século XIX. O protagonista e narrador é Bento Santiago (também conhecido como Bentinho ou Dom Casmurro), que apresenta relatos desde a sua juventude até à sua vida adulta, quando escreve. Nesse intervalo de tempo passa por experiências como viver em um seminário e se preparar para ser Padre, mas também desistir dessa vida ao se apaixonar por Capitu. O enredo central da trama é o ciúme envolvido nessa relação. - -É possível observar a queda emocional desta trama ao se extrair automaticamente os valores de sentimento do romance? Ou, em outras palavras, nossa recepção dos ciúmes de Bentinho coincide com os resultados desse cálculo automático? 
Além disso, quais são as palavras mais usadas na descrição das emoções do texto? - - -# Obter valores de sentimentos e emoções - -## Instalar e carregar pacotes - -A primeira coisa que precisamos fazer para poder obter o sentimento de nosso texto, é instalar e carregar o pacote R correspondente, neste caso, o `syuzhet`. Além disso, para facilitar a visualização dos dados, vamos utilizar os pacotes `RColorBrewer`, `wordcloud`, `tm` e `NLP`. Para fazer isto, digite e execute os dois comandos seguintes em seu console; o primeiro para instalar o pacote e o segundo para carregá-lo (se já os tiver instalado, só precisa carregá-los). Note que a instalação destes pacotes pode levar alguns minutos. - -```R -# Instale os pacotes: -install.packages("syuzhet") -install.packages("RColorBrewer") -install.packages("wordcloud") -install.packages("tm") - -# Carregue os pacotes -library(syuzhet) -library(RColorBrewer) -library(wordcloud) -library(tm) -``` - -## Carregar e preparar o texto - -Faça o download do texto do romance [Dom Casmurro](/assets/analise-sentimento-R-syuzhet/domCasmurro.txt). Como podemos ver, o documento está em formato de texto simples, pois isto é essencial para realizar seu processamento e análise em R. - -Com o texto em mãos, a primeira coisa que vamos fazer é carregá-lo como um objeto de _string_. Certifique-se de mudar o caminho para o texto para corresponder ao seu computador. - -**Em Mac e Linux** - -Em sistemas Mac podemos usar a função `get_text_as_string` integrada no pacote `syuzhet`: - -```R -texto <- get_text_as_string("https://raw.githubusercontent.com/programminghistorian/jekyll/gh-pages/assets/analise-sentimento-R-syuzhet/domCasmurro.txt") -``` - -**Em Windows** - -Os sistemas Windows não lêem diretamente caracteres com acentos ou outras marcações típicas do espanhol, português ou francês, então temos que dizer ao sistema que o nosso texto está no formato UTF-8 usando a função `scan`. 
- -```R -texto <- scan(file = "https://raw.githubusercontent.com/programminghistorian/jekyll/gh-pages/assets/analise-sentimento-R-syuzhet/domCasmurro.txt", fileEncoding = "UTF-8", what = character(), sep = "\n", allowEscapes = T) -``` - -Como a análise que vamos realizar precisa de uma lista, seja de palavras ou de frases (aqui só prestaremos atenção a palavras individuais), precisamos de um passo intermediário entre o carregamento do texto e a extração dos valores de sentimento. Assim, vamos dividir o texto (*string*) em uma lista de palavras (*tokens*). Isto é muito comum na análise distante de textos. - -Para isso, usamos a função `get_tokens()` do pacote e geramos um novo objeto, neste caso um vetor de *tokens* (palavras). Conforme veremos, com esta função nos livramos da pontuação no texto e temos uma lista de palavras. - -```R -texto_palavras <- get_tokens(texto) -head(texto_palavras) -[1] "dom" "casmurro" "texto" "de" "referência" "obras" -``` -Agora podemos ver quantas palavras ou tokens estão neste texto com a função `length()`: -```R -length(texto_palavras) -[1] 66931 -``` - -Se quiser realizar a análise para orações, utilize a função `get_sentences()` e siga o mesmo processo, com exceção da criação da nuvem de palavras: - -```R -oracoes_vetor <- get_sentences(texto) -length(oracoes_vetor) -[1] 8637 -``` - - - -## Extração de dados com o NRC Sentiment Lexicon - -Agora podemos executar a função `get_nrc_sentiment` para obter os sentimentos no romance *Dom Casmurro*. Como a função executa por padrão o vocabulário inglês, nós a escrevemos com o argumento “lang” (de *language*, ou idioma) para usar o vocabulário português (“portuguese”). Por sua vez, criamos um novo objeto para armazenar os dados extraídos. Este será um objeto do tipo *data frame*. Esta função procura a presença das oito emoções e dos dois sentimentos para cada palavra em nosso vetor, e atribui um número maior que 0 se elas existirem. 
Dependendo do desempenho de seu computador e de acordo com as características de nosso texto, este processo pode levar de 15 a 30 minutos. - -```R -sentimentos_df <- get_nrc_sentiment(texto_palavras, lang="portuguese") -``` - -Quando o código terminar de ser executado, um aviso aparecerá porque o `syuzhet` usa uma função que é descontinuada dentro de sua função `get_nrc_sentiment`: - -```R -Warning message: -`data_frame()` is deprecated as of tibble 1.1.0. -Please use `tibble()` instead. -This warning is displayed once every 8 hours. -Call `lifecycle::last_warnings()` to see where this warning was generated. -``` - -Quando o processo terminar, se desejarmos, podemos ler os resultados no novo objeto, simplesmente selecionando o objeto e executando-o. Mas para evitar “imprimir” milhares de linhas no console, também podemos usar a função `head()` para ver os primeiros seis *tokens*. No caso do texto que estamos usando, quando executarmos essa função, devemos ver o seguinte, que não é nada interessante: - -```R -> head(sentimientos_df) - anger anticipation disgust fear joy sadness surprise trust negative positive -1 0 0 0 0 0 0 0 1 0 1 -2 0 0 0 0 0 0 0 0 0 0 -3 0 0 0 0 0 0 0 0 0 0 -4 0 0 0 0 0 0 0 0 0 0 -5 0 0 0 0 0 0 0 0 0 0 -6 0 0 0 0 0 0 0 0 0 0 -``` - -## Resumo do texto - -O que é interessante é ver um resumo de cada um dos valores que obtivemos utilizando a função geral `summary()`. Isto pode ser muito útil ao comparar vários textos, pois permite ver diferentes medidas, tais como a média dos resultados para cada uma das emoções e os dois sentimentos. Por exemplo, podemos ver que o romance *Dom Casmurro* é, em [média](https://pt.wikipedia.org/wiki/M%C3%A9dia) (*mean*), um pouco mais positivo (0,03892) do que negativo (0,03559). Mas se olharmos para as emoções, parece que a tristeza (0,02116) aparece em mais momentos do que a alegria (0,01593). 
Vários dos valores fornecidos pela função de resumo do texto aparecem com um valor igual a 0, incluindo a [mediana](https://pt.wikipedia.org/wiki/Mediana_(estat%C3%ADstica)) (*median*). Isto indica que poucas palavras do romance aparecem no dicionário que estamos usando (NRC) ou, inversamente, que poucas têm uma atribuição de sentimento ou emoção no dicionário. - -```R -> summary(sentimentos_df) - anger anticipation disgust fear joy - Min. :0.00000 Min. :0.00000 Min. :0.000000 Min. :0.00000 Min. :0.00000 - 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.00000 - Median :0.00000 Median :0.00000 Median :0.000000 Median :0.00000 Median :0.00000 - Mean :0.01116 Mean :0.01337 Mean :0.008815 Mean :0.01288 Mean :0.01593 - 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:0.00000 - Max. :5.00000 Max. :2.00000 Max. :3.000000 Max. :4.00000 Max. :7.00000 - sadness surprise trust negative positive - Min. :0.00000 Min. :0.000000 Min. :0.00000 Min. :0.00000 Min. :0.00000 - 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 - Median :0.00000 Median :0.000000 Median :0.00000 Median :0.00000 Median :0.00000 - Mean :0.02116 Mean :0.008965 Mean :0.02299 Mean :0.03559 Mean :0.03892 - 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 - Max. :4.00000 Max. :2.000000 Max. :3.00000 Max. :5.00000 Max. :7.00000 -``` - -> Parabéns! Já temos os resultados da análise de sentimentos! E, agora, o que podemos fazer com esses números? - - - -# Análise das emoções em um texto - -## Gráfico de barras - -Para ver quais as emoções que estão mais presentes no texto, a maneira mais simples é criar um *barplot*. Para isso, usamos a função `barplot()` com o resumo das colunas 1 a 8, ou seja, as colunas de raiva (*anger*), antecipação (*antecipation*), desgosto (*disgust*), medo (*fear*), alegria (*joy*), tristeza (*sadness*), surpresa (*surprise*) e confiança (*trust*). 
Os resultados obtidos vêm do processamento da função `prop.table()` dos resultados das oito colunas com cada uma das palavras da tabela. - -> Para cada barra, todos os valores da coluna de emoções correspondentes são somados. Então, o resultado de todas as emoções que adicionamos na saída do gráfico é somado. No final, a soma de cada emoção é dividida pelo total de todas as colunas ou emoções. Isto não acrescenta as colunas negativas e positivas. [^1] - -```R -barplot( -colSums(prop.table(sentimentos_df[, 1:8])), -space = 0.2, -horiz = FALSE, -las = 1, -cex.names = 0.7, -col = brewer.pal(n = 8, name = "Set3"), -main = "'Dom Casmurro' de Machado de Assis", -sub = "Análise realizada por Diana Rebelo Rodriguez", -xlab="emoções", ylab = NULL) -``` -O resto dos parâmetros que vemos no código são “extras”, pois são uma forma de configurar o formato visual do gráfico. Assim, indicamos um espaço (*space*) de 0,2 entre as barras, que estará na posição vertical ao indicar falsamente (*FALSE*) sua horizontalidade (*horiz*) e, ao contrário, a horizontalidade para os valores no eixo Y com `las = 1`. Além disso, reduzimos o tamanho do nome de cada barra (*cex.names*) para 0,7 para evitar que elas desapareçam, por exemplo, se fizermos um pequeno gráfico. Graças ao pacote que instalamos no início, `RColorBrewer`, podemos dar cor às colunas automaticamente, neste caso, com a paleta de cores (*brewer.pal*) do conjunto número 3 do pacote, com oito cores, uma para cada coluna. Finalmente, vamos colocar um título e subtítulo em nosso gráfico com os parâmetros `main` e `sub`, assim como a palavra “emoções” no eixo X e nada no eixo Y. 
- -Gráfico de barras com os valores das seis emoções capturadas em Dom Casmurro por Machado de Assis - -Se esses parâmetros não forem do seu interesse, basta executar o seguinte código para obter o gráfico padrão: - -```R -barplot(colSums(prop.table(sentimentos_df[, 1:8]))) -``` - -> Certifique-se de que há espaço suficiente na seção de exibição de gráficos do R para poder ver os nomes de cada coluna. - -Estas informações já indicam que as emoções de tristeza e confiança prevalecem mais do que as de desgosto ou surpresa. Mas quais são as palavras usadas por Machado na expressão dessa tristeza? Com que frequência cada uma aparece no romance como um todo? - -## Contando o número de palavras com cada emoção - -A fim de realizar uma análise do texto, é muito interessante saber quais são as palavras usadas com mais frequência no texto em relação à sua identificação com cada emoção. Para isso, primeiro temos que criar um objeto de caracteres com todas as palavras que tenham um valor maior que 0 na coluna “tristeza” (*sadness*). Para selecionar somente essa coluna, usamos o sinal de dólar após o nome do *data frame*: -```R -palavras_tristeza <- texto_palavras[sentimentos_df$sadness > 0] -``` - -O conteúdo de `palavras_tristeza` nos indica que esta lista não diz muito, pois retorna apenas a listagem de palavras sem maiores informações. 
Para obter a contagem das vezes que cada palavra relacionada à tristeza aparece no romance, geramos uma tabela do primeiro conjunto de caracteres com as funções `unlist` e `table`, que depois ordenamos em ordem decrescente (se quisermos uma ordem ascendente mudamos TRUE para FALSE); criamos um novo objeto de tipo tabela e imprimimos as primeiras 12 palavras da lista com sua frequência: - -```R -palavras_tristeza_ordem <- sort(table(unlist(palavras_tristeza)), decreasing = TRUE) -head(palavras_tristeza_ordem, n = 12) -head(palavras_tristeza_ordem, n = 12) - - nada mal tarde entre - 135 80 53 50 - caso morte sair medo - 34 34 32 23 - amor pecado pena defunto - 20 17 17 14 -``` - -Se quisermos saber quantas palavras únicas foram relacionadas à tristeza, basta usar a função `length` no objeto que agora agrupa as palavras em ordem: - -```R -length(palabras_tristeza_orden) -[1] 163 -``` - -Podemos repetir a mesma operação com o resto das emoções ou com aquela que nos interessa, assim como com os sentimentos positivos e negativos. Tente obter os resultados para a emoção “alegria” e compare os resultados [^2]. - -Dependendo do tipo de análise que se deseje fazer, tal resultado é eficiente. Agora, para o propósito introdutório da lição, vamos gerar uma nuvem de palavras que ajuda a visualizar facilmente os termos associados a cada emoção (embora iremos visualizar aqui apenas quatro para facilitar a leitura). - -## Nuvem de emoções - -A fim de gerar uma nuvem com as palavras que correspondem a cada emoção em *Dom Casmurro*, criaremos primeiro um vetor no qual armazenaremos todas as palavras que, nas colunas que indicamos após o símbolo `$`, têm um valor maior que 0. É gerado um novo objeto do tipo vetor, que contém um elemento para a lista de cada emoção. - -Neste caso, devemos indicar novamente à função que temos caracteres acentuados se for uma máquina Windows. 
- -**Em Mac e Linux** - -```R -nuvem_emocoes_vetor <- c( -paste(texto_palavras[sentimentos_df$sadness> 0], collapse = " "), -paste(texto_palavras[sentimentos_df$joy > 0], collapse = " "), -paste(texto_palavras[sentimentos_df$anger > 0], collapse = " "), -paste(texto_palavras[sentimentos_df$fear > 0], collapse = " ")) -``` -**Em Windows** - -Uma vez gerado o vetor, deve convertê-lo em caracteres UTF-8 utilizando a função `iconv`. - -```R -nuvem_emocoes_vetor <- c( -paste(texto_palavras[sentimentos_df$sadness> 0], collapse = " "), -paste(texto_palavras[sentimentos_df$joy > 0], collapse = " "), -paste(texto_palavras[sentimentos_df$anger > 0], collapse = " "), -paste(texto_palavras[sentimentos_df$fear > 0], collapse = " ")) - -nuvem_emocoes_vetor <- iconv(nuvem_emocoes_vetor, "latin1", "UTF-8") -``` -Agora que temos o vetor, criamos um _corpus_ de palavras com quatro “documentos” para a nuvem: - -```R -nuvem_corpus <- Corpus(VectorSource(nuvem_emocoes_vetor)) -``` - -Em seguida, transformamos este corpus em uma matriz termo-documento com a função `TermDocumentMatrix()`. Com isto, agora usamos a função `as.matrix()` para converter o TDM em uma matriz que, como podemos ver, lista os termos no texto com um valor maior que zero para cada uma das quatro emoções que extraímos aqui. Para ver o início desta informação, use novamente a função `head`: - -```R -nuvem_tdm <- TermDocumentMatrix(nuvem_corpus) -nuvem_tdm <- as.matrix(nuvem_tdm) -head(nuvem_tdm) - Docs -Terms 1 2 3 4 - abismo 1 0 0 1 - acidente 1 0 1 1 - afligir 3 0 0 3 - agonia 1 0 1 1 - amargamente 1 0 1 0 - amor 20 20 0 0 -``` - -Agora, atribua um nome a cada um dos grupos de palavras ou documentos (*Docs*) em nossa matriz. Aqui vamos usar o termo em português para as colunas que selecionamos para exibir na nuvem. Mais uma vez, podemos ver a mudança feita ao executar a função `head`. 
- -```R -colnames(nuvem_tdm) <- c('tristeza', 'felicidade', 'raiva', 'confiança') -head(nuvem_tdm) - Docs -Terms tristeza felicidade raiva confiança - abismo 1 0 0 1 - acidente 1 0 1 1 - afligir 3 0 0 3 - agonia 1 0 1 1 - amargamente 1 0 1 0 - amor 20 20 0 0 -``` - - -Finalmente, podemos visualizar a nuvem de palavras que estamos acostumados a ver na mídia ou em estudos académicos. O tamanho e a localização da palavra correspondem à sua maior ou menor ocorrência com valor emocional atribuído no texto. Primeiro, executamos a função `set.seed()` para que quando reproduzirmos o resultado visual seja o mesmo que o nosso (se não o fizer, será o mesmo, mas as palavras aparecerão em posições diferentes). E, para gerar a nuvem, vamos usar a função `comparison.cloud` do pacote `wordcloud`. Indicamos o objeto a representar, aqui ‘nuvem_tdm’, indicamos uma ordem não aleatória das palavras, atribuímos uma cor para cada grupo de palavras e damos tamanhos ao título e à escala geral, e atribuímos um número máximo de termos que serão exibidos. -```R -set.seed(757) # pode ser qualquer número -comparison.cloud(nuvem_tdm, random.order = FALSE, - colors = c("green", "red", "orange", "blue"), - title.size = 1, max.words = 50, scale = c(2.5, 1), rot.per = 0.4) -``` - -O resultado deve ser semelhante à imagem abaixo, mas a localização das palavras pode ser diferente uma vez que a figura é gerada segundo o tamanho da tela: - -Nuvem das palavras mais frequentes correspondentes às emoções de tristeza, felicidade, raiva e confiança no romance Dom Casmurro de Machado de Assis. - -O que sugere o resultado desta nuvem? Ficamos impressionados com o aparecimento de palavras como “entre” no conjunto da tristeza e “cavalo” no conjunto da raiva. Este “disparate” está relacionado com o aviso já anunciado no início da lição. O vocabulário para análise de sentimentos que estamos usando aqui é traduzido do inglês, um tradutor automático que não é “perfeito”. 
- -# Visualizando a evolução dos sentimentos em um texto - -Para complementar a leitura isolada das emoções, estudando a flutuação dos sentimentos positivos e negativos ao longo de um texto, há uma maneira de normalizar e visualizar estas informações. Como a análise da função de extração de sentimento atribui um valor positivo tanto ao sentimento positivo quanto ao negativo, precisamos gerar dados entre um intervalo de -1 para o momento mais negativo e 1 para o mais positivo, e onde 0 é neutro. Para isso, calculamos a valência do texto multiplicando os valores na coluna de valores negativos de nosso *data frame* com os resultados por -1 e adicionamos o valor na coluna de valores positivos. - -```R -sentimentos_valencia <- (sentimentos_df$negative * -1) + sentimentos_df$positive -``` - -Finalmente, podemos gerar um gráfico com a função `simple_plot()` integrada no pacote `syuzhet`, que nos dará duas imagens diferentes; a primeira, tem todas as medidas que o algoritmo calcula e, a segunda, é uma normalização das mesmas. O eixo horizontal apresenta o texto em 100 fragmentos normalizados e o eixo vertical nos informa sobre a valência do sentimento no texto. Dependendo das características de seu computador, este gráfico pode levar até 20-30 minutos para ser gerado. - -```R -simple_plot(sentimentos_valencia) -``` - -> Assegure-se de possuir espaço suficiente no espaço de visualização de gráficos do R para que ele seja gerado. Caso contrário, aparecerá o erro: *Error in plot.new() : figure margins too large* - -Evolução das emoções ao longo do texto - -Assim, neste caso, podemos interpretar que o romance *Dom Casmurro* varia bastante entre momentos positivos e negativos. Começa de forma mais negativa, fica mais positivo, sendo seguido por um novo momento negativo e um segundo positivo (porém menos do que o primeiro) para um desfecho negativo. Qualquer pessoa que tenha lido o romance pode confirmar esta variação de sentimentos vivida pelo protagonista. 
- -## Salvar seus dados - -Se quiser salvar seus dados para retornar a eles mais tarde, é possível fazê-lo em um ficheiro de valores separados por vírgula (CSV) com a função `write.csv()`. Aqui dizemos para salvar o *data frame*, que contém o resultado das oito emoções e os dois sentimentos de texto em um ficheiro com uma extensão `.csv`. Além disso, podemos acrescentar a palavra à qual cada linha de resultados corresponde, em uma coluna à esquerda usando a palavra vetor feita no início da análise. - -```R -write.csv(sentimentos_df, file = "analise_sent_domCasmurro.csv", row.names = texto_palavras) -``` - -Agora, pode começar a analisar seus próprios textos e compará-los uns com os outros! - -# Outras funcionalidades e suas limitações - -Talvez esteja trabalhando em um projeto onde já tem um dicionário de sentimentos criado, ou talvez precise personalizar o vocabulário e sua valência sentimental por razões culturais ou temporais, ou talvez esteja procurando melhorar os resultados traduzidos automaticamente do NRC usado aqui. Em qualquer um destes casos, a partir do final de 2020, também é possível usar o seu próprio conjunto de dados no *script* graças à função `custom` e realizar algumas das operações que foram aprendidas nesta lição. 
- -Para carregar seu próprio “dicionário de sentimentos”, é preciso primeiro criar (ou modificar) uma tabela contendo, pelo menos, uma coluna para palavras e uma coluna para sua valência, por exemplo: - -|word|value| -|---|---| -|amor|1| -|cólera|-1| -|tapete|0| -|catástrofe|-2| - - -Em seguida, carregue os seus dados salvos como um CSV com a função `read.csv`, que criará um novo conjunto disponível como `data.frame`, no qual é possível verificar seu texto: -```R -vocabulario_personalizado <- read.csv("ficheiro.csv") -method <- "custom" -sentimentos_oracoes <- get_sentiment(oracoes_vetor, method = method, lexicon = vocabulario_personalizado) -``` -Se quiser visualizar o progresso do sentimento ao longo de seu texto, podemos usar a função `plot` com outros parâmetros que já vimos: - -```R -plot(sentimentos_oracoes, - type = "l", - main = "'Dom Casmurro' de Machado de Assis", - sub = "Análise realizada por Diana Rebelo Rodriguez", - xlab="emocoes", ylab = " " - ) -``` -Entretanto, tenha em mente que esta forma de análise será limitada e não será possível realizar as mesmas operações como explicado acima. Por exemplo, seguindo o modelo do exemplo, não conseguimos as informações sobre emoções, portanto não somos capazes de fazer uma nuvem de palavras. - -# Referências -Assis, Machado de. _Dom Casmurro_. São Paulo: Editora Ática, 1996. - -Jockers, Matthew L. _Syuzhet: Extract Sentiment and Plot Arcs from Text_, 2015. [https://github.com/mjockers/syuzhet](https://github.com/mjockers/syuzhet) - -Jockers, Matthew L. "Introduction to the Syuzhet Package", CRAN R Project, 2017. https://mran.microsoft.com/snapshot/2017-12-31/web/packages/syuzhet/vignettes/syuzhet-vignette.html - -Damasio, Antonio R. *El error de Descartes: La razón de las emociones*. Barcelona: Andres Bello, 1999. - -Mohammad, Saif, and Peter D. Turney. "Crowdsourcing a Word–Emotion Association Lexicon". 
*Computational intelligence* 29 (2013): 436-465, doi: 10.1111/j.1467-8640.2012.00460.x - -Pereira Zazo, Óscar. *El analisis de la comunicación en español*. Iowa: Kendal Hunt, 2015. - -Rodríguez Aldape, Fernando Manuel. *Cuantificación del Interés de un usuario en un tema mediante minería de texto y análisis de sentimiento.* Tese de Mestrado, Universidad Autónoma de Nuevo León, 2013. - -# Notas - -[^1]:Agradecemos Mounika Puligurthi, estagiária da Universidade do Texas (UT), pelo seu auxílio na compreensão deste cálculo (durante a primavera de 2019). - -[^2]:Perceba que a palavra “amor”, por exemplo, aparece em ambas as emoções com um valor de 20 pontos. O que será que isso significa? - +--- +title: Análise de sentimentos em R com 'syuzhet' +layout: lesson +slug: analise-sentimento-R-syuzhet +date: 2021-03-23 +translation_date: 2022-03-02 +authors: +- Jennifer Isasi +editors: +- Maria José Afanador-Llach +reviewers: +- Riva Quiroga +translator: +- Diana Rebelo Rodriguez +translation-editor: +- Jimmy Medeiros +translation-reviewer: +- Ana Giulia Aldgeire +- Ian Araujo +original: analisis-de-sentimientos-r +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/467 +difficulty: 2 +activity: analyzing +topics: [distant-reading, r, data-visualization] +abstract: "Esta lição ensina uma maneira de obter e analisar dados sobre emoções e sentimentos em uma narrativa." +avatar_alt: "Gravura com três rostos que expressam emoções distintas" +doi: 10.46430/phpt0022 +--- + +{% include toc.html %} + +# Objetivos + +Esta lição usa a metodologia de análise de sentimentos e emoções através da linguagem de programação R para investigar documentos textuais de modo individual. Embora a lição não seja destinada a usuários avançados de R, é necessário que se tenha algum conhecimento dessa linguagem; assumimos que se tenha o R instalado e saiba como importar pacotes. Também recomendamos o download do RStudio. 
Se não estiver familiarizado com R, é melhor trabalhar primeiro através das lições [Processamento básico de texto em R](/pt/licoes/processamento-basico-texto-r), [Noções básicas de R com dados tabulares](/pt/licoes/nocoes-basicas-R-dados-tabulares) ou [Data Wrangling and Management in R](/en/lessons/data-wrangling-and-management-in-r) (em inglês). Ao final desta lição, o(a) pesquisador(a) será capaz de: + +- Colocar perguntas de pesquisa com base na análise quantitativa de sentimentos em textos de tipo ensaístico e/ou narrativo. +- Usar a linguagem de programação R, o ambiente RStudio e o pacote `syuzhet` com o dicionário NRC para gerar o indicador de sentimento de um texto em diferentes linguagens. +- Analisar criticamente os resultados do processamento de texto. +- Visualizar os dados gerais e sua evolução ao longo de um texto. + +Esta lição foi construída com a versão R 4.0.2, mas acreditamos que funcionará corretamente em versões futuras do programa. + +> O uso do R é geralmente o mesmo para Windows, Mac e Linux. Entretanto, como vamos trabalhar com textos em português, precisaremos escrever algum código extra para indicar o formato UTF-8 em máquinas Windows. Nesses casos, o código para o sistema operacional correspondente é exibido. + +# Antes de começar + +## Análise de sentimentos + +A análise dos sentimentos ou a mineração de opinião é utilizada para extrair automaticamente informações sobre a conotação negativa ou positiva da linguagem de um documento. Embora seja uma tarefa que vem sendo utilizada há muito tempo no campo do marketing ou da política, em estudos literários ainda é uma abordagem recente e não há um método único. Além disso, há a possibilidade de extrair a polaridade dos sentimentos e também das emoções. + +É importante especificar o que estamos procurando com os termos “sentimento” e “emoções”, pois eles são frequentemente usados de forma intercambiável, de modo geral, mas são diferentes. Para Antonio R. 
Damasio, as emoções são reações corporais instigantes de nosso corpo, determinadas por estímulos ambientais e derivadas do desenvolvimento da regulamentação biológica (12). Elas podem ser divididas em primárias e secundárias. Embora não haja um acordo final sobre o número de emoções básicas, geralmente são seis: raiva, alegria, repugnância, medo, tristeza e surpresa, embora Damasio considere esta última como sendo secundária. Além disso, no caso do sistema automático que utilizaremos, as emoções secundárias de antecipação e confiança também aparecem. + +Por outro lado, podemos definir sentimento como a ação e o efeito de sentir uma emoção ou, em outras palavras, é o resultado do fato de que “quando um objeto, uma pessoa, uma situação ou um pensamento provoca em nós a emoção da alegria, começa um processo que pode concluir no sentimento de estar alegre ou feliz” (Pereira Zazo 32) porque é uma emoção positiva. Durante a lição faremos uma distinção entre os dois termos, pois usaremos o resultado do sentimento para ver a sua evolução ao longo do texto e as emoções para ver o uso das palavras em geral. + +## Dicionário de léxicos NRC + +O pacote `syuzhet` funciona com quatro dicionários de sentimentos: Bing, Afinn, Stanford e NRC. Nesta lição, trabalharemos com este último, pois é o único disponível em vários idiomas, incluindo o português. Este vocabulário com valores de sentimentos negativos ou positivos e oito emoções foi desenvolvido por Saif M. Mohammad, um cientista do Conselho Nacional de Pesquisa do Canadá (NRC). O conjunto de dados foi construído manualmente através de pesquisas usando a técnica Maximum Difference Scaling ou MaxDiff, que avalia a preferência por uma série de alternativas (Mohammad e Turney). Assim, o léxico tem 14.182 palavras com as categorias de sentimentos positivos e negativos e as emoções de raiva, antecipação, repugnância, medo, alegria, tristeza, surpresa e confiança. 
Além disso, está disponível em mais de 100 idiomas (através de tradução automática). + +Seus termos de uso estabelecem que o vocabulário pode ser usado gratuitamente para fins de pesquisa, portanto, todos os dados estão disponíveis para download. + +Se trabalhamos com o inglês, podemos interagir com as diferentes categorias no site do [NRC Word-Emotion Association Lexicon](https://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm). Lá também podemos encontrar trabalhos publicados sobre a obtenção dos valores para o vocabulário, sua organização, extensão, etc. + +## Pacote `syuzhet` + +O [pacote `syuzhet`](https://cran.r-project.org/web/packages/syuzhet/vignettes/syuzhet-vignette.html) foi desenvolvido em 2015 por Matthew Jockers; que o mantém funcionando até hoje e continuamente apresenta novas versões (no momento da preparação desta lição, foi usada a versão de dezembro de 2017). Uma série de posts no blog acompanham o desenvolvimento do pacote, e estão disponíveis no blog do professor desde [5 de junho de 2014](https://www.matthewjockers.net/page/2/) (em inglês). + +Naturalmente, o pacote foi desenvolvido com testes em textos escritos ou traduzidos para o inglês e não sem debate sobre sua utilidade, para atribuir valores a textos literários que muitas vezes são, por natureza, bastante subjetivos. + +> Atenção: A lista de palavras do dicionário está sendo preparada em inglês como língua principal e os dados quantitativos atribuídos a cada palavra são o resultado da avaliação humana pelos participantes americanos. Portanto, vários fatores devem ser levados em consideração ao utilizar esta metodologia: +> +> - O léxico em português é uma tradução direta realizada por tradução automática (estes sistemas já são muito confiáveis entre o inglês e o português, mas não em outros idiomas que o NRC afirma ser capaz de analisar como, por exemplo, o basco). 
+> - A pontuação de cada palavra, ou seja, a valência sentimental e emocional, tem um viés cultural e temporal que deve ser levado em conta, e um termo que pareceu positivo para os participantes da pesquisa pode nos parecer negativo. +> - O uso desta metodologia não é recomendado para textos que são altamente metafóricos e simbólicos. +> - O método não vai captar a negação de um sentimento positivo como, por exemplo, a frase “Eu não estou feliz”. +> +> Seguindo o espírito de adaptabilidade das lições do *Programming Historian* a outras línguas, foi decidido usar `syuzhet` em sua forma original, mas ao final da lição indicamos algumas funções avançadas para usar seu próprio dicionário de sentimentos com o mesmo pacote. + +Como os resultados nos *dataframes* aparecerão em inglês, se achar necessário, dedique um momento para aprender esta tradução: + +| anger | anticipation | disgust | fear | joy | sadness | surprise | trust | negative | positive | +| ------ | ------------ | -------- | ----- | ------- | -------- | -------- | --------- | -------- | -------- | +| raiva | antecipação | desgosto | medo | alegria | tristeza | surpresa | confiança | negativo | positivo | + +## Um pequeno exemplo + +Antes de começar a realizar a análise de nossos textos, é útil saber de forma geral qual é o processo de análise realizado pela função de obter sentimentos de `syuzhet`, com o dicionário NRC e os resultados obtidos sobre os quais trabalharemos. + +O sistema irá processar nosso texto e transformá-lo em um vetor de caracteres (aqui palavras), para analisá-los individualmente (também é possível fazê-lo por sentenças).
Sem entrar ainda no código para realizar a análise, dê uma olhada neste breve exemplo (nota de tradução: para a versão em português foi usado o texto _Dom Casmurro_ de Machado de Assis, mantendo o tipo de exercícios e o código da lição original): + +> “Contando aquela crise do meu amor adolescente, sinto uma coisa que não sei se explico bem, e é que as dores daquela quadra, a tal ponto se espiritualizaram com o tempo, que chegam a diluir-se no prazer. Não é claro isto, mas nem tudo é claro na vida ou nos livros. A verdade é que sinto um gosto particular em referir tal aborrecimento, quando é certo que ele me lembra outros que não quisera lembrar por nada.” +> +> *Dom Casmurro* de Machado de Assis. + +Este fragmento é transformado em um vetor de caracteres: + +```R +> print(exemplo_2) +[1] "contando" "aquela" "crise" "do" "meu" +[6] "amor" "adolescente" "sinto" "uma" "coisa" +[11] "que" "não" "sei" "se" "explico" ... +``` + +Com a função de obter sentimentos, obtém-se a valência positiva e negativa de cada palavra, assim como a valência das oito emoções classificadas pelo NRC. O resultado para este fragmento é o seguinte: + +```R +> print(sentimentos_exemplo_df, row.names = exemplo_2) + anger anticipation disgust fear joy sadness surprise trust negative positive +contando 0 0 0 0 0 0 0 0 0 0 +aquela 0 0 0 0 0 0 0 0 0 0 +crise 1 0 0 0 0 1 0 0 3 0 +do 0 0 0 0 0 0 0 0 0 0 +meu 0 0 0 0 0 0 0 0 0 0 +amor 0 1 0 0 1 1 0 1 0 1 +adolescente 0 0 0 0 0 0 0 0 0 0 +sinto 0 0 0 0 0 0 0 0 0 0 +uma 0 0 0 0 0 0 0 0 0 0 +coisa 0 0 0 0 0 0 0 0 0 0 +que 0 0 0 0 0 0 0 0 0 0 +não 0 0 0 0 0 0 0 0 0 0 +sei 0 0 0 0 0 0 0 0 0 0 +se 0 0 0 0 0 0 0 0 0 0 +explico 0 0 0 0 0 0 0 0 0 0 +bem 0 0 0 0 0 0 0 0 0 0 +... +``` + +> Nota de tradução: na lição original, os autores não explicaram o passo a passo para se obter esses resultados em um primeiro momento. 
Apesar de a lição explicar detalhadamente o processo, julguei ser interessante demonstrar aqui como obtive esses outputs: + +```R +exemplo <- "Contando aquela crise do meu amor adolescente, sinto uma coisa que não sei se explico bem, e é que as dores daquela quadra, a tal ponto se espiritualizaram com o tempo, que chegam a diluir-se no prazer. Não é claro isto, mas nem tudo é claro na vida ou nos livros. A verdade é que sinto um gosto particular em referir tal aborrecimento, quando é certo que ele me lembra outros que não quisera lembrar por nada." + +exemplo_2 <- get_tokens(exemplo) + +print(exemplo_2) + +sentimentos_exemplo_df <- get_nrc_sentiment(exemplo_2, lang="portuguese") + +print(sentimentos_exemplo_df, row.names = exemplo_2) +``` + +Como podemos ver nos resultados deste objeto tipo *data frame* ou tabela, cada palavra ou ficha tem um valor padrão de 0 nas dez colunas. Se houver um valor maior que 0 significa, em primeiro lugar, que este termo existe no dicionário NRC e, em segundo lugar, que tem um valor atribuído para alguma emoção e/ou sentimento. Neste exemplo, podemos ver que a palavra “amor” é entendida de forma positiva, ainda que represente tristeza (*sadness*). Por outro lado, a palavra “crise” possui uma conotação negativa muito forte, pois há menos margem para dúvidas. + +As possibilidades de explorar, analisar e visualizar estes resultados dependem, em grande parte, das suas habilidades de programação mas, acima de tudo, da sua questão de pesquisa. Para ajudar o pesquisador, nesta lição introdutória aprenderemos como analisar os dados utilizando várias formas de visualização. + +## Pergunta de pesquisa + +Para essa lição, vamos utilizar o romance Dom Casmurro, escrito por [Machado de Assis](https://pt.wikipedia.org/wiki/Machado_de_Assis), publicado em 1899, de caráter realista e ambientado no Rio de Janeiro na segunda metade do século XIX. 
O protagonista e narrador é Bento Santiago (também conhecido como Bentinho ou Dom Casmurro), que apresenta relatos desde a sua juventude até à sua vida adulta, quando escreve. Nesse intervalo de tempo passa por experiências como viver em um seminário e se preparar para ser Padre, mas também desistir dessa vida ao se apaixonar por Capitu. O enredo central da trama é o ciúme envolvido nessa relação. + +É possível observar a queda emocional desta trama ao se extrair automaticamente os valores de sentimento do romance? Ou, em outras palavras, nossa recepção dos ciúmes de Bentinho coincide com os resultados desse cálculo automático? Além disso, quais são as palavras mais usadas na descrição das emoções do texto? + + +# Obter valores de sentimentos e emoções + +## Instalar e carregar pacotes + +A primeira coisa que precisamos fazer para poder obter o sentimento de nosso texto, é instalar e carregar o pacote R correspondente, neste caso, o `syuzhet`. Além disso, para facilitar a visualização dos dados, vamos utilizar os pacotes `RColorBrewer`, `wordcloud`, `tm` e `NLP`. Para fazer isto, digite e execute os dois comandos seguintes em seu console; o primeiro para instalar o pacote e o segundo para carregá-lo (se já os tiver instalado, só precisa carregá-los). Note que a instalação destes pacotes pode levar alguns minutos. + +```R +# Instale os pacotes: +install.packages("syuzhet") +install.packages("RColorBrewer") +install.packages("wordcloud") +install.packages("tm") + +# Carregue os pacotes +library(syuzhet) +library(RColorBrewer) +library(wordcloud) +library(tm) +``` + +## Carregar e preparar o texto + +Faça o download do texto do romance [Dom Casmurro](/assets/analise-sentimento-R-syuzhet/domCasmurro.txt). Como podemos ver, o documento está em formato de texto simples, pois isto é essencial para realizar seu processamento e análise em R. + +Com o texto em mãos, a primeira coisa que vamos fazer é carregá-lo como um objeto de _string_. 
Certifique-se de alterar o caminho do ficheiro de texto para que corresponda ao do seu computador. + +**Em Mac e Linux** + +Em sistemas Mac e Linux podemos usar a função `get_text_as_string` integrada no pacote `syuzhet`: + +```R +texto <- get_text_as_string("https://raw.githubusercontent.com/programminghistorian/jekyll/gh-pages/assets/analise-sentimento-R-syuzhet/domCasmurro.txt") +``` + +**Em Windows** + +Os sistemas Windows não leem diretamente caracteres com acentos ou outras marcações típicas do espanhol, português ou francês, então temos que dizer ao sistema que o nosso texto está no formato UTF-8 usando a função `scan`. + +```R +texto <- scan(file = "https://raw.githubusercontent.com/programminghistorian/jekyll/gh-pages/assets/analise-sentimento-R-syuzhet/domCasmurro.txt", fileEncoding = "UTF-8", what = character(), sep = "\n", allowEscapes = T) +``` + +Como a análise que vamos realizar precisa de uma lista, seja de palavras ou de frases (aqui só prestaremos atenção a palavras individuais), precisamos de um passo intermediário entre o carregamento do texto e a extração dos valores de sentimento. Assim, vamos dividir o texto (*string*) em uma lista de palavras (*tokens*). Isto é muito comum na análise distante de textos. + +Para isso, usamos a função `get_tokens()` do pacote e geramos um novo objeto, neste caso um vetor de *tokens* (palavras). Conforme veremos, com esta função nos livramos da pontuação no texto e temos uma lista de palavras.
+ +```R +texto_palavras <- get_tokens(texto) +head(texto_palavras) +[1] "dom" "casmurro" "texto" "de" "referência" "obras" +``` +Agora podemos ver quantas palavras ou tokens estão neste texto com a função `length()`: +```R +length(texto_palavras) +[1] 66931 +``` + +Se quiser realizar a análise para orações, utilize a função `get_sentences()` e siga o mesmo processo, com exceção da criação da nuvem de palavras: + +```R +oracoes_vetor <- get_sentences(texto) +length(oracoes_vetor) +[1] 8637 +``` + + + +## Extração de dados com o NRC Sentiment Lexicon + +Agora podemos executar a função `get_nrc_sentiment` para obter os sentimentos no romance *Dom Casmurro*. Como a função executa por padrão o vocabulário inglês, nós a escrevemos com o argumento “lang” (de *language*, ou idioma) para usar o vocabulário português (“portuguese”). Por sua vez, criamos um novo objeto para armazenar os dados extraídos. Este será um objeto do tipo *data frame*. Esta função procura a presença das oito emoções e dos dois sentimentos para cada palavra em nosso vetor, e atribui um número maior que 0 se elas existirem. Dependendo do desempenho de seu computador e de acordo com as características de nosso texto, este processo pode levar de 15 a 30 minutos. + +```R +sentimentos_df <- get_nrc_sentiment(texto_palavras, lang="portuguese") +``` + +Quando o código terminar de ser executado, um aviso aparecerá porque o `syuzhet` usa uma função que é descontinuada dentro de sua função `get_nrc_sentiment`: + +```R +Warning message: +`data_frame()` is deprecated as of tibble 1.1.0. +Please use `tibble()` instead. +This warning is displayed once every 8 hours. +Call `lifecycle::last_warnings()` to see where this warning was generated. +``` + +Quando o processo terminar, se desejarmos, podemos ler os resultados no novo objeto, simplesmente selecionando o objeto e executando-o. Mas para evitar “imprimir” milhares de linhas no console, também podemos usar a função `head()` para ver os primeiros seis *tokens*. 
No caso do texto que estamos usando, quando executarmos essa função, devemos ver o seguinte, que não é nada interessante: + +```R +> head(sentimentos_df) + anger anticipation disgust fear joy sadness surprise trust negative positive +1 0 0 0 0 0 0 0 1 0 1 +2 0 0 0 0 0 0 0 0 0 0 +3 0 0 0 0 0 0 0 0 0 0 +4 0 0 0 0 0 0 0 0 0 0 +5 0 0 0 0 0 0 0 0 0 0 +6 0 0 0 0 0 0 0 0 0 0 +``` + +## Resumo do texto + +O que é interessante é ver um resumo de cada um dos valores que obtivemos utilizando a função geral `summary()`. Isto pode ser muito útil ao comparar vários textos, pois permite ver diferentes medidas, tais como a média dos resultados para cada uma das emoções e os dois sentimentos. Por exemplo, podemos ver que o romance *Dom Casmurro* é, em [média](https://pt.wikipedia.org/wiki/M%C3%A9dia) (*mean*), um pouco mais positivo (0,03892) do que negativo (0,03559). Mas se olharmos para as emoções, parece que a tristeza (0,02116) aparece em mais momentos do que a alegria (0,01593). Vários dos valores fornecidos pela função de resumo do texto aparecem com um valor igual a 0, incluindo a [mediana](https://pt.wikipedia.org/wiki/Mediana_(estat%C3%ADstica)) (*median*). Isto indica que poucas palavras do romance aparecem no dicionário que estamos usando (NRC) ou, inversamente, que poucas têm uma atribuição de sentimento ou emoção no dicionário. + +```R +> summary(sentimentos_df) + anger anticipation disgust fear joy + Min. :0.00000 Min. :0.00000 Min. :0.000000 Min. :0.00000 Min. :0.00000 + 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.00000 + Median :0.00000 Median :0.00000 Median :0.000000 Median :0.00000 Median :0.00000 + Mean :0.01116 Mean :0.01337 Mean :0.008815 Mean :0.01288 Mean :0.01593 + 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:0.00000 + Max. :5.00000 Max. :2.00000 Max. :3.000000 Max. :4.00000 Max. :7.00000 + sadness surprise trust negative positive + Min. :0.00000 Min. :0.000000 Min. :0.00000 Min. :0.00000 Min.
:0.00000 + 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 + Median :0.00000 Median :0.000000 Median :0.00000 Median :0.00000 Median :0.00000 + Mean :0.02116 Mean :0.008965 Mean :0.02299 Mean :0.03559 Mean :0.03892 + 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 + Max. :4.00000 Max. :2.000000 Max. :3.00000 Max. :5.00000 Max. :7.00000 +``` + +> Parabéns! Já temos os resultados da análise de sentimentos! E, agora, o que podemos fazer com esses números? + + + +# Análise das emoções em um texto + +## Gráfico de barras + +Para ver quais as emoções que estão mais presentes no texto, a maneira mais simples é criar um *barplot*. Para isso, usamos a função `barplot()` com o resumo das colunas 1 a 8, ou seja, as colunas de raiva (*anger*), antecipação (*anticipation*), desgosto (*disgust*), medo (*fear*), alegria (*joy*), tristeza (*sadness*), surpresa (*surprise*) e confiança (*trust*). Os resultados obtidos vêm do processamento da função `prop.table()` dos resultados das oito colunas com cada uma das palavras da tabela. + +> Para cada barra, todos os valores da coluna de emoções correspondentes são somados. Então, o resultado de todas as emoções que adicionamos na saída do gráfico é somado. No final, a soma de cada emoção é dividida pelo total de todas as colunas ou emoções. Isto não acrescenta as colunas negativas e positivas. [^1] + +```R +barplot( +colSums(prop.table(sentimentos_df[, 1:8])), +space = 0.2, +horiz = FALSE, +las = 1, +cex.names = 0.7, +col = brewer.pal(n = 8, name = "Set3"), +main = "'Dom Casmurro' de Machado de Assis", +sub = "Análise realizada por Diana Rebelo Rodriguez", +xlab="emoções", ylab = NULL) +``` +O resto dos parâmetros que vemos no código são “extras”, pois são uma forma de configurar o formato visual do gráfico.
Assim, indicamos um espaço (*space*) de 0,2 entre as barras, que estarão na posição vertical ao indicarmos como falsa (*FALSE*) a sua horizontalidade (*horiz*) e, ao contrário, a horizontalidade para os valores no eixo Y com `las = 1`. Além disso, reduzimos o tamanho do nome de cada barra (*cex.names*) para 0,7 para evitar que eles desapareçam, por exemplo, se fizermos um pequeno gráfico. Graças ao pacote que instalamos no início, `RColorBrewer`, podemos dar cor às colunas automaticamente, neste caso, com a paleta de cores (*brewer.pal*) do conjunto número 3 do pacote, com oito cores, uma para cada coluna. Finalmente, vamos colocar um título e subtítulo em nosso gráfico com os parâmetros `main` e `sub`, assim como a palavra “emoções” no eixo X e nada no eixo Y. + +Gráfico de barras com os valores das oito emoções capturadas em Dom Casmurro por Machado de Assis + +Se esses parâmetros não forem do seu interesse, basta executar o seguinte código para obter o gráfico padrão: + +```R +barplot(colSums(prop.table(sentimentos_df[, 1:8]))) +``` + +> Certifique-se de que há espaço suficiente na seção de exibição de gráficos do R para poder ver os nomes de cada coluna. + +Estas informações já indicam que as emoções de tristeza e confiança prevalecem mais do que as de desgosto ou surpresa. Mas quais são as palavras usadas por Machado na expressão dessa tristeza? Com que frequência cada uma aparece no romance como um todo? + +## Contando o número de palavras com cada emoção + +A fim de realizar uma análise do texto, é muito interessante saber quais são as palavras usadas com mais frequência no texto em relação à sua identificação com cada emoção. Para isso, primeiro temos que criar um objeto de caracteres com todas as palavras que tenham um valor maior que 0 na coluna “tristeza” (*sadness*).
Para selecionar somente essa coluna, usamos o sinal de dólar após o nome do *data frame*: +```R +palavras_tristeza <- texto_palavras[sentimentos_df$sadness > 0] +``` + +O conteúdo de `palavras_tristeza` nos indica que esta lista não diz muito, pois retorna apenas a listagem de palavras sem maiores informações. Para obter a contagem das vezes que cada palavra relacionada à tristeza aparece no romance, geramos uma tabela do primeiro conjunto de caracteres com as funções `unlist` e `table`, que depois ordenamos em ordem decrescente (se quisermos uma ordem ascendente mudamos TRUE para FALSE); criamos um novo objeto de tipo tabela e imprimimos as primeiras 12 palavras da lista com sua frequência: + +```R +palavras_tristeza_ordem <- sort(table(unlist(palavras_tristeza)), decreasing = TRUE) +head(palavras_tristeza_ordem, n = 12) +head(palavras_tristeza_ordem, n = 12) + + nada mal tarde entre + 135 80 53 50 + caso morte sair medo + 34 34 32 23 + amor pecado pena defunto + 20 17 17 14 +``` + +Se quisermos saber quantas palavras únicas foram relacionadas à tristeza, basta usar a função `length` no objeto que agora agrupa as palavras em ordem: + +```R +length(palavras_tristeza_ordem) +[1] 163 +``` + +Podemos repetir a mesma operação com o resto das emoções ou com aquela que nos interessa, assim como com os sentimentos positivos e negativos. Tente obter os resultados para a emoção “alegria” e compare os resultados [^2]. + +Dependendo do tipo de análise que se deseje fazer, tal resultado é eficiente. Agora, para o propósito introdutório da lição, vamos gerar uma nuvem de palavras que ajuda a visualizar facilmente os termos associados a cada emoção (embora aqui visualizemos apenas quatro para facilitar a leitura). + +## Nuvem de emoções + +A fim de gerar uma nuvem com as palavras que correspondem a cada emoção em *Dom Casmurro*, criaremos primeiro um vetor no qual armazenaremos todas as palavras que, nas colunas que indicamos após o símbolo `$`, têm um valor maior que 0.
É gerado um novo objeto do tipo vetor, que contém um elemento para a lista de cada emoção. + +Neste caso, devemos indicar novamente à função que temos caracteres acentuados se for uma máquina Windows. + +**Em Mac e Linux** + +```R +nuvem_emocoes_vetor <- c( +paste(texto_palavras[sentimentos_df$sadness> 0], collapse = " "), +paste(texto_palavras[sentimentos_df$joy > 0], collapse = " "), +paste(texto_palavras[sentimentos_df$anger > 0], collapse = " "), +paste(texto_palavras[sentimentos_df$fear > 0], collapse = " ")) +``` +**Em Windows** + +Uma vez gerado o vetor, deve convertê-lo em caracteres UTF-8 utilizando a função `iconv`. + +```R +nuvem_emocoes_vetor <- c( +paste(texto_palavras[sentimentos_df$sadness> 0], collapse = " "), +paste(texto_palavras[sentimentos_df$joy > 0], collapse = " "), +paste(texto_palavras[sentimentos_df$anger > 0], collapse = " "), +paste(texto_palavras[sentimentos_df$fear > 0], collapse = " ")) + +nuvem_emocoes_vetor <- iconv(nuvem_emocoes_vetor, "latin1", "UTF-8") +``` +Agora que temos o vetor, criamos um _corpus_ de palavras com quatro “documentos” para a nuvem: + +```R +nuvem_corpus <- Corpus(VectorSource(nuvem_emocoes_vetor)) +``` + +Em seguida, transformamos este corpus em uma matriz termo-documento com a função `TermDocumentMatrix()`. Com isto, agora usamos a função `as.matrix()` para converter o TDM em uma matriz que, como podemos ver, lista os termos no texto com um valor maior que zero para cada uma das quatro emoções que extraímos aqui. Para ver o início desta informação, use novamente a função `head`: + +```R +nuvem_tdm <- TermDocumentMatrix(nuvem_corpus) +nuvem_tdm <- as.matrix(nuvem_tdm) +head(nuvem_tdm) + Docs +Terms 1 2 3 4 + abismo 1 0 0 1 + acidente 1 0 1 1 + afligir 3 0 0 3 + agonia 1 0 1 1 + amargamente 1 0 1 0 + amor 20 20 0 0 +``` + +Agora, atribua um nome a cada um dos grupos de palavras ou documentos (*Docs*) em nossa matriz. Aqui vamos usar o termo em português para as colunas que selecionamos para exibir na nuvem. 
Mais uma vez, podemos ver a mudança feita ao executar a função `head`. + +```R +colnames(nuvem_tdm) <- c('tristeza', 'felicidade', 'raiva', 'medo') +head(nuvem_tdm) + Docs +Terms tristeza felicidade raiva medo + abismo 1 0 0 1 + acidente 1 0 1 1 + afligir 3 0 0 3 + agonia 1 0 1 1 + amargamente 1 0 1 0 + amor 20 20 0 0 +``` + + +Finalmente, podemos visualizar a nuvem de palavras que estamos acostumados a ver na mídia ou em estudos académicos. O tamanho e a localização da palavra correspondem à sua maior ou menor ocorrência com valor emocional atribuído no texto. Primeiro, executamos a função `set.seed()` para que quando reproduzirmos o resultado visual seja o mesmo que o nosso (se não o fizer, será o mesmo, mas as palavras aparecerão em posições diferentes). E, para gerar a nuvem, vamos usar a função `comparison.cloud` do pacote `wordcloud`. Indicamos o objeto a representar, aqui ‘nuvem_tdm’, indicamos uma ordem não aleatória das palavras, atribuímos uma cor para cada grupo de palavras e damos tamanhos ao título e à escala geral, e atribuímos um número máximo de termos que serão exibidos. +```R +set.seed(757) # pode ser qualquer número +comparison.cloud(nuvem_tdm, random.order = FALSE, + colors = c("green", "red", "orange", "blue"), + title.size = 1, max.words = 50, scale = c(2.5, 1), rot.per = 0.4) +``` + +O resultado deve ser semelhante à imagem abaixo, mas a localização das palavras pode ser diferente uma vez que a figura é gerada segundo o tamanho da tela: + +Nuvem das palavras mais frequentes correspondentes às emoções de tristeza, felicidade, raiva e medo no romance Dom Casmurro de Machado de Assis. + +O que sugere o resultado desta nuvem? Ficamos impressionados com o aparecimento de palavras como “entre” no conjunto da tristeza e “cavalo” no conjunto da raiva. Este “disparate” está relacionado com o aviso já anunciado no início da lição. 
O vocabulário para análise de sentimentos que estamos usando aqui foi traduzido do inglês por um tradutor automático que não é “perfeito”. + +# Visualizando a evolução dos sentimentos em um texto + +Para complementar a leitura isolada das emoções, estudando a flutuação dos sentimentos positivos e negativos ao longo de um texto, há uma maneira de normalizar e visualizar estas informações. Como a análise da função de extração de sentimento atribui um valor positivo tanto ao sentimento positivo quanto ao negativo, precisamos gerar dados entre um intervalo de -1 para o momento mais negativo e 1 para o mais positivo, e onde 0 é neutro. Para isso, calculamos a valência do texto multiplicando os valores na coluna de valores negativos de nosso *data frame* com os resultados por -1 e adicionamos o valor na coluna de valores positivos. + +```R +sentimentos_valencia <- (sentimentos_df$negative * -1) + sentimentos_df$positive +``` + +Finalmente, podemos gerar um gráfico com a função `simple_plot()` integrada no pacote `syuzhet`, que nos dará duas imagens diferentes; a primeira, tem todas as medidas que o algoritmo calcula e, a segunda, é uma normalização das mesmas. O eixo horizontal apresenta o texto em 100 fragmentos normalizados e o eixo vertical nos informa sobre a valência do sentimento no texto. Dependendo das características de seu computador, este gráfico pode levar até 20-30 minutos para ser gerado. + +```R +simple_plot(sentimentos_valencia) +``` + +> Assegure-se de possuir espaço suficiente no espaço de visualização de gráficos do R para que ele seja gerado. Caso contrário, aparecerá o erro: *Error in plot.new() : figure margins too large* + +Evolução das emoções ao longo do texto + +Assim, neste caso, podemos interpretar que o romance *Dom Casmurro* varia bastante entre momentos positivos e negativos. 
Começa de forma mais negativa, fica mais positivo, sendo seguido por um novo momento negativo e um segundo positivo (porém menos do que o primeiro) para um desfecho negativo. Qualquer pessoa que tenha lido o romance pode confirmar esta variação de sentimentos vivida pelo protagonista. + +## Salvar seus dados + +Se quiser salvar seus dados para retornar a eles mais tarde, é possível fazê-lo em um ficheiro de valores separados por vírgula (CSV) com a função `write.csv()`. Aqui dizemos para salvar o *data frame*, que contém o resultado das oito emoções e os dois sentimentos de texto em um ficheiro com uma extensão `.csv`. Além disso, podemos acrescentar a palavra à qual cada linha de resultados corresponde, em uma coluna à esquerda, usando o vetor de palavras criado no início da análise. + +```R +write.csv(sentimentos_df, file = "analise_sent_domCasmurro.csv", row.names = texto_palavras) +``` + +Agora, pode começar a analisar seus próprios textos e compará-los uns com os outros! + +# Outras funcionalidades e suas limitações + +Talvez esteja trabalhando em um projeto onde já tem um dicionário de sentimentos criado, ou talvez precise personalizar o vocabulário e sua valência sentimental por razões culturais ou temporais, ou talvez esteja procurando melhorar os resultados traduzidos automaticamente do NRC usado aqui. Em qualquer um destes casos, a partir do final de 2020, também é possível usar o seu próprio conjunto de dados no *script* graças à função `custom` e realizar algumas das operações que foram aprendidas nesta lição. 
+ +Para carregar seu próprio “dicionário de sentimentos”, é preciso primeiro criar (ou modificar) uma tabela contendo, pelo menos, uma coluna para palavras e uma coluna para sua valência, por exemplo: + +|word|value| +|---|---| +|amor|1| +|cólera|-1| +|tapete|0| +|catástrofe|-2| + + +Em seguida, carregue os seus dados salvos como um CSV com a função `read.csv`, que criará um novo conjunto disponível como `data.frame`, no qual é possível verificar seu texto: +```R +vocabulario_personalizado <- read.csv("ficheiro.csv") +method <- "custom" +sentimentos_oracoes <- get_sentiment(oracoes_vetor, method = method, lexicon = vocabulario_personalizado) +``` +Se quiser visualizar o progresso do sentimento ao longo de seu texto, podemos usar a função `plot` com outros parâmetros que já vimos: + +```R +plot(sentimentos_oracoes, + type = "l", + main = "'Dom Casmurro' de Machado de Assis", + sub = "Análise realizada por Diana Rebelo Rodriguez", + xlab="emocoes", ylab = " " + ) +``` +Entretanto, tenha em mente que esta forma de análise será limitada e não será possível realizar as mesmas operações como explicado acima. Por exemplo, seguindo o modelo do exemplo, não conseguimos as informações sobre emoções, portanto não somos capazes de fazer uma nuvem de palavras. + +# Referências +Assis, Machado de. _Dom Casmurro_. São Paulo: Editora Ática, 1996. + +Jockers, Matthew L. _Syuzhet: Extract Sentiment and Plot Arcs from Text_, 2015. [https://github.com/mjockers/syuzhet](https://github.com/mjockers/syuzhet) + +Jockers, Matthew L. "Introduction to the Syuzhet Package", CRAN R Project, 2017. https://mran.microsoft.com/snapshot/2017-12-31/web/packages/syuzhet/vignettes/syuzhet-vignette.html + +Damasio, Antonio R. *El error de Descartes: La razón de las emociones*. Barcelona: Andres Bello, 1999. + +Mohammad, Saif, and Peter D. Turney. "Crowdsourcing a Word–Emotion Association Lexicon". 
*Computational intelligence* 29 (2013): 436-465, doi: 10.1111/j.1467-8640.2012.00460.x + +Pereira Zazo, Óscar. *El analisis de la comunicación en español*. Iowa: Kendal Hunt, 2015. + +Rodríguez Aldape, Fernando Manuel. *Cuantificación del Interés de un usuario en un tema mediante minería de texto y análisis de sentimiento.* Tese de Mestrado, Universidad Autónoma de Nuevo León, 2013. + +# Notas + +[^1]:Agradecemos Mounika Puligurthi, estagiária da Universidade do Texas (UT), pelo seu auxílio na compreensão deste cálculo (durante a primavera de 2019). + +[^2]:Perceba que a palavra “amor”, por exemplo, aparece em ambas as emoções com um valor de 20 pontos. O que será que isso significa? + diff --git a/pt/licoes/analise-sentimento-exploracao-dados.md b/pt/licoes/analise-sentimento-exploracao-dados.md index 0ed4de6d40..c93a38d2ae 100644 --- a/pt/licoes/analise-sentimento-exploracao-dados.md +++ b/pt/licoes/analise-sentimento-exploracao-dados.md @@ -1,426 +1,426 @@ ---- -title: Análise de sentimento para exploração de dados -layout: lesson -slug: analise-sentimento-exploracao-dados -date: 2018-01-15 -translation_date: 2021-06-14 -authors: -- Zoë Wilkinson Saldaña -reviewers: -- Anandi Silva Knuppel -- Puteri Zarina Megat Khalid -editors: -- Adam Crymble -translator: -- Caio Mello -translation-editor: -- Josir Cardoso Gomes -translation-reviewer: -- Bruno Ponne -- Ian Araujo -original: sentiment-analysis -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/375 -difficulty: 2 -activity: analyzing -topics: [distant-reading] -abstract: "Nesta lição, você aprenderá a conduzir uma 'análise de sentimento' em textos e a interpretar os resultados. Esta é uma forma de análise exploratória de dados baseada no processamento de linguagem natural (PLN). Você aprenderá a instalar todos os softwares apropriados e a construir um programa reutilizável que pode ser aplicado aos seus próprios textos." 
-avatar_alt: Um homem sorridente e um homem rabugento -doi: 10.46430/phpt0017 ---- - -{% include toc.html %} - - -# Objetivos da lição - -Esta lição usa a análise de sentimento como base para uma análise exploratória de dados de um grande corpus textual. Portanto, é indicada para leitores com alguma experiência prévia em programação utilizando Python. Caso não tenha experiência com Python ou programação, a autora recomenda trabalhar nas primeiras lições da série “Introdução ao Python”. Ao final desta lição, você terá o conhecimento necessário para: - -* Elaborar questões de pesquisa que usem Processamento de Linguagem Natural (PLN) em um corpus textual. -* Utilizar Python e o Natural Language Processing Toolkit (NLTK) para gerar medidas de sentimento para um texto. -* Avaliar criticamente os resultados da análise de sentimento e ajustar os parâmetros e a metodologia conforme necessário. -* Identificar as próximas etapas para continuar o aprendizado sobre análise exploratória de dados e abordagens programáticas para dados qualitativos. - -Nota do tradutor: Devido à falta de uma biblioteca de código que funcione bem com os textos em português, optamos por manter os textos dos exercícios na língua original. - -## O que é análise exploratória de dados? - -A análise exploratória de dados é um conjunto de estratégias que trazem à tona características importantes num conjunto de dados que normalmente não são facilmente identificadas por meio da leitura tradicional. Com os insights da análise exploratória de dados em mãos, os pesquisadores podem tomar decisões mais adequadas ao selecionar um método ou abordagem para sua questão de pesquisa, ou até mesmo, identificar novas questões. - -Em 1977, o matemático John Tukey descreveu a análise exploratória de dados como uma forma de trabalho de detetive, sem a qual, os estudiosos muitas vezes perderiam descobertas interessantes, porém menos óbvias: - -> “A menos que o detetive encontre pistas, o juiz ou júri não terá como julgar. 
Caso a análise exploratória de dados não revele indícios, geralmente quantitativos, é provável que se considere não haver nada a ser comprovado. ” (Tukey 1977: 3, tradução livre) - -## Explorando Texto com Análise de Sentimento - -Quando confrontado com um corpus promissor, porém muito grande, como o pesquisador pode encontrar aquilo de mais importante, que pode levar às descobertas de pesquisa mais interessantes? - -O Processamento de Linguagem Natural (PLN) abrange uma ampla gama de técnicas que se baseiam na aplicação de métodos analíticos computacionais ao conteúdo textual, fornecendo meios de categorizar e quantificar o texto. Essas abordagens de PLN, que incluem análise de sentimento, podem ajudar os pesquisadores a explorar seus textos. Nas palavras de Tukey, podem ajudar o pesquisador a encontrar “pistas” sobre seus textos e “indícios” de que pode valer a pena investigar algo mais a fundo. - -Nesta lição, vamos nos concentrar numa ferramenta do kit de ferramentas do PLN: a análise de sentimento. A análise de sentimento busca quantificar a intensidade emocional de palavras e frases num texto. Algumas ferramentas de análise de sentimento levam em consideração, inclusive, o peso emocional de sinais linguísticos como a pontuação ou mesmo os emojis. As ferramentas de análise de sentimento geralmente processam uma unidade de texto (uma frase, um parágrafo, um livro, etc.) e produzem pontuações (“scores”, em inglês) ou classificações quantitativas para indicar se o algoritmo considera que aquele texto transmite emoções positivas ou negativas. Algumas ferramentas também podem quantificar o *grau de positividade* ou o *grau de negatividade* num texto. Combinada com outros métodos de PLN, como modelagem de tópicos (“topic modelling”, em inglês), a análise de sentimento fornece meios de caracterizar as emoções expressas sobre diferentes tópicos de uma conversa. 
Quando usada em conjunto com a análise de rede, pode lançar luz sobre as maneiras como os indivíduos interagem uns com os outros. Um pesquisador interessado em interações sobre um evento político pode usar a análise de sentimento para estudar como os indivíduos descrevem aquele evento nas redes sociais. Com os dados certos para inserir na ferramenta, pode ser possível fazer comparações regionais ou entender como diferentes grupos demográficos vêem o evento de forma diferente. Como a ferramenta pode processar muitos dados sequencialmente, é até possível analisar o sentimento em centenas de milhares ou até milhões de eventos discursivos. - -Para começar, esta lição fornece uma introdução à análise de sentimento tanto prática quanto crítica. Como qualquer ferramenta computacional, a análise de sentimento tem uma série de limitações e vieses que os pesquisadores devem levar em consideração. Os pesquisadores devem ser especialmente cautelosos ao fazer afirmações empíricas com base nos resultados da análise de sentimento. Você poderá ser melhor atendido usando a análise de sentimento em situações provisórias e exploratórias, como meio de orientar o processo de pesquisa. Ao manejar essas ferramentas com ceticismo e eficácia, é possível realizar um trabalho de detetive bastante notável. - -## Análise de grandes coleções de correspondência textual - -Correspondências escritas como cartas, e-mails, registros de bate-papo, tweets e históricos de mensagens de texto podem fornecer aos pesquisadores uma visão inestimável de seus autores. Os textos geralmente são ricos em emoções e informações que não estão disponibilizadas em nenhum outro lugar. Um pesquisador pode aprender sobre as opiniões que as pessoas, objetos de seu estudo, tiveram sobre vários tópicos ou sobre determinados eventos. Também poderia ser possível aprender sobre os relacionamentos que os indivíduos desenvolveram e mantiveram em organizações ou redes complexas. 
- -Embora metodologias como etnografia, leitura “manual” e análise do discurso ajudem os pesquisadores a analisar a correspondência histórica, esses métodos trazem desafios significativos quando o número de textos cresce de dezenas ou centenas para milhares ou milhões. A análise textual computacional fornece um conjunto de métodos para tornar visíveis as tendências, dinâmicas e relacionamentos que podem estar ocultos para o leitor humano por problemas de escala. Além disso, muitos métodos de computação produzem descobertas que podem ser expressas quantitativamente e que podem subsequentemente permitir que o pesquisador realize modelagem estatística, visualização de informações e aprendizado de máquina (Machine Learning) para fazer outras análises. - -## Estudo de caso: corpus de e-mails da Enron - -Este tutorial usa a correspondência de e-mail da falida empresa americana de energia Enron. A Enron ocultou uma ampla variedade de práticas contábeis ilegais até que uma investigação federal em 2001 a levou à falência. Na época, o Escândalo Enron foi o maior colapso de uma empresa de capital aberto da história. Em 2001, a empresa começou a mostrar sinais de problemas financeiros que não se alinhavam com as divulgações financeiras da empresa até aquele momento. As ações da Enron negociadas em bolsa caíram de US$ 90,75 em meados de 2000 para menos de um dólar em novembro de 2001, o que levou os acionistas a processar a empresa. Uma investigação subsequente da Comissão de Valores Mobiliários dos Estados Unidos (SEC) revelou que os executivos da Enron cometeram fraude e negligência contábil em grande escala. A Enron declarou falência em dezembro daquele ano. Nos anos que se seguiram, vários executivos enfrentaram condenações criminais por sua participação no escândalo. 
Para os pesquisadores, o Escândalo Enron resultou na criação de um dos maiores (e mais infames) corpus de texto por correspondência já coletado: - -> “Um dos escândalos corporativos mais infames das últimas décadas deixou curiosamente em seu rastro um dos conjuntos de dados mais valiosos disponíveis publicamente. No final de 2001, o encobrimento de fraude contábil da Enron Corporation levou à falência da gigante da energia. A Federal Energy Regulatory Commission requereu todos os registros de e-mail da Enron como parte da investigação que se seguiu. Nos dois anos seguintes, a comissão divulgou, escondeu e depois divulgou novamente o corpus de e-mail para o público após excluir e-mails que continham informações pessoais, como números de previdência social. O corpus da Enron contém e-mails cujos assuntos variam de planejamento de férias de fim de semana a tópicos de discussão de estratégia política, e continua sendo o único grande exemplo de conjuntos de dados de e-mail do mundo real disponíveis para pesquisa ”. (Hardin, Sarkis e Urc, 2015) - -Quando o conjunto de dados de e-mail da Enron - organizado e editado - foi lançado em 2004, os pesquisadores descobriram uma oportunidade sem precedentes: acesso direto à maneira espontânea e sem censura como os funcionários de uma empresa condenada se comunicavam. De repente, os pesquisadores tiveram acesso a como as pessoas se comunicam no trabalho em uma escala sem precedentes. Isso era importante para pesquisadores interessados ​​no caso especial do escândalo e colapso da Enron, mas também para pesquisadores interessados ​​em um amplo espectro de questões sobre a comunicação cotidiana no trabalho. - -Na década seguinte, centenas de novos estudos surgiram a partir desses e-mails, realizados em diversos campos como teoria das redes sociais, comunidade e detecção de anomalias, gênero e comunicação dentro das organizações, mudança de comportamento durante uma crise organizacional, insularidade e formação de comunidade. 
O uso da teoria das redes sociais nas humanidades oferece algumas possibilidades fascinantes, mas não é tão simples. - -Além da grande quantidade de mensagens incluídas (o corpus contém mais de 600.000 mensagens), o corpus de e-mails da Enron também inclui os metadados necessários para que os pesquisadores realizem uma série de questões de pesquisa. Assim como a presença de envelopes com endereços legíveis do remetente e do destinatário seria um excelente trunfo para pesquisadores de correspondências de cartas históricas, a presença de endereços de e-mail do remetente e do destinatário permite que os pesquisadores associem os e-mails a determinados indivíduos conhecidos dentro da corporação. Como alguns indivíduos tinham vários endereços de e-mail, ou mais de um indivíduo pode ter compartilhado o mesmo endereço, os metadados não são de uso muito óbvio, mas são potencialmente elucidativos. O restante do tutorial explicará como aplicar e interpretar a análise de sentimento de e-mails neste corpus. - -# Usando Python com o Natural Language Toolkit (NLTK) - -

    -Programando pela primeira vez? Esta lição é destinada a iniciantes, mas pode ser conveniente revisar outras lições de Python no Programming Historian. No entanto, observe que, embora muitas lições usem o Python versão 2, esta lição requer o Python versão 3. As instruções de instalação do Python 3 serão apresentadas a seguir. -
    - -Neste tutorial, Python será usado junto com algumas ferramentas do Natural Language Toolkit (NLTK) para gerar indicadores de sentimento a partir de transcrições de e-mail. Para fazer isso, você primeiro aprenderá como carregar os dados textuais no Python, selecionar as ferramentas de PLN apropriadas para análise de sentimento e escrever um algoritmo que calcula pontuações de sentimento para um determinado texto. Também exploraremos como ajustar seu algoritmo para melhor atender a seu objetivo de pesquisa. Ao final, você irá arquivar seu algoritmo de solução de problemas como um pacote de código conhecido como *função*, que poderá ser reutilizado e reaproveitado (inclusive na parte 2 deste tutorial) - -## Instalação - -Para continuar, as seguintes instalações serão necessárias: - -* Python 3 (preferivelmente 3.5 ou superior) - [Instruções para baixar e instalar Python](https://wiki.python.org/moin/BeginnersGuide/Download) -* NLTK (3.2.5 or superior) - [Instruções para baixar e instalar NLTK](http://www.nltk.org/install.html) - -## Primeiros passos com NLTK - -O Natural Language Toolkit (NLTK) é uma coleção de ferramentas Python reutilizáveis (também conhecido como uma biblioteca Python) que ajuda os pesquisadores a aplicar um conjunto de métodos computacionais a textos. As ferramentas variam desde métodos que ajudam a quebrar o texto em pedaços menores, alguns que identificam se uma palavra pertence a um determinado idioma, até aqueles textos de amostra que os pesquisadores podem usar para fins de treinamento e desenvolvimento (como o texto completo de *Moby Dick*). - -Se você precisar de ajuda para baixar e instalar o módulo para [Python 3](https://www.python.org/download/releases/3.0/), dê uma olhada na lição Instalando Módulos Python com pip de [Fred Gibbs](/en/lessons/installing-python-modules-pip) (em inglês). 
- -Em nosso caso, usaremos duas ferramentas NLTK em particular: - -* A ferramenta ["Análise de sentimento VADER"](http://www.nltk.org/_modules/nltk/sentiment/vader.html) (que gera pontuações de sentimento positivas, negativas e neutras para uma determinada entrada) -* A ferramenta de toquenização ‘word_tokenize’ (divide um texto grande em uma sequência de unidades menores, como frases ou palavras) - -Para usar VADER e word_tokenize, primeiro precisamos baixar e instalar alguns dados extras para NLTK. O NLTK é um kit de ferramentas muito grande e várias de suas ferramentas requerem uma segunda etapa de download para reunir a coleção de dados necessária (geralmente léxicos codificados) para funcionar corretamente. - -Para instalar a análise de sentimento e o tokenizador de palavras que usaremos neste tutorial, escreva um novo script em Python com as três linhas a seguir: - -```python -import nltk -nltk.download('vader_lexicon') -nltk.download('punkt') -``` -Você pode salvar este arquivo como `“installation.py”`. Se você não tiver certeza de como salvar e executar scripts em Python, reveja o tutorial sobre como configurar um 'Ambiente de Desenvolvimento Integrado' usando Python, substituindo o comando '% (python)% f' por '% (python3)% f' quando você chegar a esse parte no tutorial. - -* Configurando um ambiente de desenvolvimento integrado para Python no [Windows](/pt/licoes/instalacao-windows). -* Configurando um ambiente de desenvolvimento integrado para Python no [Mac](/pt/licoes/instalacao-mac). -* Configurando um ambiente de desenvolvimento integrado para Python no [Linux](/pt/licoes/instalacao-linux). - - Se você sabe como executar scripts Python, execute o arquivo usando Python 3. - - [VADER](http://www.nltk.org/_modules/nltk/sentiment/vader.html) (Valence Aware Dictionary and sEntiment Reasoner) é uma ferramenta de atribuição de intensidade de sentimento acrescentada ao NLTK em 2014. 
Ao contrário de outras técnicas que exigem treinamento em textos parecidos antes do uso, o VADER está pronto para ser usado sem qualquer configuração especial. O VADER é o único que faz distinções refinadas entre vários graus de positividade e negatividade. Por exemplo, VADER pontua “conforto” como moderadamente positivo e “euforia” como extremamente positivo. Ele também tenta capturar e pontuar características textuais comuns em texto online informal, como letras maiúsculas, pontos de exclamação e emoticons, conforme mostrado na tabela abaixo: - - {% include figure.html filename="analise-sentimento1.png" caption="Vader captura pequenas gradações de entusiasmo. (Hutto e Gilbert, 2014). **Versão do tradutor**. Acesse a original [aqui](/en/lessons/sentiment-analysis)" %} - - Como qualquer ferramenta de análise de texto, o VADER deve ser avaliado com criticidade e de forma contextualizada. O VADER foi desenvolvido em meados da década de 2010 principalmente para analisar microblogs em inglês e sites de rede social (especialmente o Twitter). Esse tipo de texto tende a ser muito mais informal do que o e-mail profissional, e contém padrões de linguagem e de uso de recursos que diferem dos padrões de 1999-2002 quando os e-mails da Enron foram escritos. No entanto, VADER também foi desenvolvido como uma ferramenta de análise de sentimento de propósito geral, e o estudo inicial dos autores mostra que ele se compara favoravelmente com ferramentas que foram treinadas para domínios específicos, usam léxicos especializados ou técnicas de aprendizado de máquina com muitos recursos (Hutto e Gilbert, 2014 ). A sensibilidade da ferramenta em relação aos graus de afeto se mostrou útil para descrever as sutilezas das emoções expressas nos e-mails profissionais - como pesquisadores, podemos estar especialmente interessados ​​em capturar os momentos em que a emoção surge em um texto formal. 
No entanto, a análise de sentimento continua se dedicando a encontrar soluções para capturar sentimentos complexos como ironia, sarcasmo e zombaria, quando o leitor médio seria capaz de fazer a distinção entre o texto literal e seu significado pretendido. - - Embora o VADER seja uma boa ferramenta de uso geral para textos contemporâneos e históricos em inglês, a ferramenta fornece apenas suporte nativo parcial para textos em outras línguas (detecta emojis / maiúsculas / etc., mas não a escolha de palavras). No entanto, os desenvolvedores incentivam os usuários a usar a tradução automática para pré-processar textos que não sejam em inglês e, em seguida, inserir os resultados no VADER. O "VADER demo" inclui um código para enviar o texto de entrada automaticamente para o serviço web ‘My Memory Translation Service’, (leitores avançados podem encontrar no [Github](https://github.com/cjhutto/vaderSentiment/blob/master/vaderSentiment/vaderSentiment.py) a partir da linha 554 - no momento da escrita deste artigo). A implementação deste método de tradução é mais indicada para usuários intermediários de Python. Você pode aprender mais sobre o estado da arte da análise de sentimento multilíngue (que infelizmente quase sempre requer uma etapa de tradução) em ["Análise de sentimento multilíngue: o estado da arte e comparação independente de técnicas"](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4981629/), de Kia Dashtipour, et al (2016). - - -## Calculando Sentimento para um Parágrafo - -Leia o seguinte trecho: - ->“Like you, I am getting very frustrated with this process. I am genuinely trying to be as reasonable as possible. I am not trying to “hold up” the deal at the last minute. 
I’m afraid that I am being asked to take a fairly large leap of faith after this company (I don’t mean the two of you – I mean Enron) has screwed me and the people who work for me.” - -Este é o primeiro parágrafo do e-mail de janeiro de 2012 de Timothy Belden para Louise Kitchen e John Lavorato sobre o “Acordo de Contratos de Trabalho”. Belden dirigiu os Serviços de Energia da Enron e mais tarde seria condenado por conspiração a fim de aumentar os custos de energia na Califórnia, o que levou a uma crise energética em todo o estado. - -Apesar do sentimento de frustração e ansiedade que você pode deduzir do parágrafo como um todo, observe a ambivalência das frases específicas dentro do parágrafo. Alguns parecem expressar esforços de boa fé, por exemplo: “Não estou tentando ‘atrasar’ o negócio” e “genuinamente tentando”. E, no entanto, há declarações negativas ainda mais fortes sobre "ficar frustrado", "Receio" e "esta empresa [...] ferrou comigo e com as pessoas que trabalham para mim". - -Vamos calcular as pontuações de sentimento para este e-mail usando o VADER para ter uma ideia do que a ferramenta pode fazer. Para começar, crie um novo diretório de trabalho (pasta) em seu computador chamado `“sentimento”` em algum lugar onde você possa encontrá-lo. Dentro dessa pasta, crie um novo arquivo de texto e salve-o como `“sentimento.py”`. É aqui que escreveremos o código para esta tarefa. - -Primeiro, temos que dizer ao Python onde o código NLTK para a análise de sentimento VADER está localizado. No início do nosso arquivo, importaremos o código do VADER: - -```python -# primeiro, importamos os módulos relevantes da biblioteca NLTK -from nltk.sentiment.vader import SentimentIntensityAnalyzer -``` - -Também devemos habilitar o Python para usar este código com nosso conjunto particular de código. 
Embora tenhamos todas as instruções de que precisamos na biblioteca NLTK, o Python gosta de agrupar essas instruções em um único `objeto` (nossa ferramenta de Análise de Sentimentos) que nosso programa pode acessar. *SentimentIntensityAnalyzer* é uma `classe`, que é um “modelo” que instrui o Python a construir um `objeto` com um conjunto especial de `funções` e `variáveis`. No nosso caso, queremos construir um único `objeto`: nosso analisador de sentimento, que segue este “modelo”. Para fazer isso, executamos *SentimentIntensityAnalyzer( )* e atribuímos a saída - nosso novo analisador de sentimento - a uma variável, que chamaremos de *‘sid’*. - -```python -# em seguida, inicializamos o VADER para que possamos usá-lo em nosso script Python -sid = SentimentIntensityAnalyzer() -``` - -Fazendo isso, fornecemos à nossa nova variável *sid* todos os recursos do código de análise de sentimento VADER. Assim, *sid* se tornou nossa ferramenta de análise de sentimento, mas com um nome mais curto. - -Em seguida, precisamos armazenar o texto que queremos analisar em um lugar que o *sid* possa acessar. Em Python, podemos armazenar uma única sequência de texto como uma variável de `string` (Nota do tradutor: Optamos por manter a palavra 'string' como no original em inglês para facilitar o entendimento de seu uso mais comum em códigos ['str']). - -```python -# a variável 'message_text' agora contém o texto que iremos analisar. -message_text = '''Like you, I am getting very frustrated with this process. I am genuinely trying to be as reasonable as possible. I am not trying to "hold up" the deal at the last minute. I'm afraid that I am being asked to take a fairly large leap of faith after this company (I don't mean the two of you -- I mean Enron) has screwed me and the people who work for me.''' -``` - -Como este texto inclui aspas e apóstrofos, é necessário circundar todo o texto com três aspas (“”” ou ’’’). 
Isso significa que quaisquer aspas e apóstrofos no texto serão reconhecidos como tal. Essa abordagem também mantém qualquer espaçamento que nosso texto já inclua. - -Agora você está pronto para processar o texto. - -Para fazer isso, o texto *(message_text)* deve ser inserido na ferramenta *(sid)* e o programa deve ser executado. Estamos interessados na "pontuação de polaridade" do analisador de sentimento, que nos dá uma pontuação positiva ou negativa. Este recurso é integrado ao VADER e pode ser solicitado sob demanda. - -Queremos ter certeza de capturar a saída de sid.polarity_scores () atribuindo-a a uma variável que chamaremos de *scores*: - -```python -print(message_text) - -# Utilizar método polarity_scores no sid e passar dentro dele o message_text produz um dicionário com pontuações negativas, neutras, positivas e compostas para o texto de entrada -scores = sid.polarity_scores(message_text) -``` - -Quando você executa este código, os resultados da análise de sentimento agora são armazenados no `dicionário` de *pontuação* (scores). Um dicionário, muito parecido com o tipo que você usa para pesquisar a definição de palavras, é uma variável que armazena informações conhecidas como 'valores' que são acessíveis dando ao programa a 'chave' para a entrada que você deseja ler. Isso significa que um dicionário como *scores* pode armazenar muitos `pares de valores-chave`. Para solicitar os dados, você só precisa conhecer as `chaves`. Mas não sabemos as `chaves`. Felizmente, Python nos dará uma lista de todas as `chaves`, classificadas em ordem alfabética, se usarmos a função `sorted(scores)`. - -Para imprimir cada `chave` e `valor` armazenado no dicionário, precisamos de um `for loop`, que aplica o mesmo código sequencialmente a todas as `chaves` do dicionário. 
- -Aqui está o código para imprimir cada par de `valores-chave` dentro da variável de pontuação (score): - -```python -# Aqui, percorremos as chaves contidas nas pontuações (pos, neu, neg e pontuações compostas) e imprimimos os pares de valores-chave na tela para digitação classificada (pontuações): -for key in sorted(scores): - print('{0}: {1}, '.format(key, scores[key]), end='') -``` - -Aqui está todo o código em um único programa: - -```python -# primeiro, importamos os módulos relevantes da biblioteca NLTK -from nltk.sentiment.vader import SentimentIntensityAnalyzer - -# a seguir, inicializamos o VADER para que possamos usá-lo em nosso script Python -sid = SentimentIntensityAnalyzer() - -# a variável 'message_text' agora contém o texto que iremos analisar. -message_text = '''Like you, I am getting very frustrated with this process. I am genuinely trying to be as reasonable as possible. I am not trying to "hold up" the deal at the last minute. I'm afraid that I am being asked to take a fairly large leap of faith after this company (I don't mean the two of you -- I mean Enron) has screwed me and the people who work for me.''' - -print(message_text) - -# Utilizar método polarity_scores no sid e passar dentro dele o message_text produz um dicionário com pontuações negativas, neutras, positivas e compostas para o texto de entrada -scores = sid.polarity_scores(message_text) - -# Aqui, percorremos as chaves contidas nas pontuações (pos, neu, neg e pontuações compostas) e imprimimos os pares de valores-chave na tela -for key in sorted(scores): - print('{0}: {1}, '.format(key, scores[key]), end='') -``` - -Salve seu arquivo Python. Agora estamos prontos para executar o código. Usando seu método preferido (ou seu Ambiente de Desenvolvimento Integrado ou a linha de comando), execute seu arquivo Python, `sentimento.py`. - -O resultado deve ser semelhante a este: - -```python -Like you, I am getting very frustrated with this process. 
I am genuinely trying to be as reasonable as possible. I am not trying to "hold up" the deal at the last minute. I'm afraid that I am being asked to take a fairly large leap of faith after this company (I don't mean the two of you -- I mean Enron) has screwed me and the people who work for me. - -compound: -0.3804, neg: 0.093, neu: 0.836, pos: 0.071, -``` -
    -Lembre-se de usar três aspas simples para envolver a string *message_text* acima. Se você usar aspas duplas, a string terminará mais cedo devido às aspas dentro do texto. -
    - -O VADER coleta e pontua palavras e características negativas, neutras e positivas (e é responsável por fatores como negação ao longo do caminho). Os valores “neg”, “neu” e “pos” descrevem a fração das pontuações ponderadas que se enquadram em cada categoria. VADER também soma todas as pontuações ponderadas para calcular um valor “composto” normalizado entre -1 e 1; este valor tenta descrever o efeito geral de todo o texto de fortemente negativo (-1) a fortemente positivo (1). Neste caso, a análise com VADER descreve a passagem como ligeiramente a moderadamente negativa (-0,3804). Podemos pensar nesse valor como uma estimativa da impressão geral de um leitor médio ao considerar o e-mail como um todo, apesar de alguma ambiguidade e ambivalência ao longo do caminho. - -Ao ler o texto, estaria inclinado a concordar com essa avaliação geral. O valor de saída de -0,3804 é negativo, mas não fortemente negativo. Os pesquisadores podem desejar definir um limite mínimo para positividade ou negatividade antes de declarar um texto definitivamente positivo ou negativo - por exemplo, a documentação oficial do VADER sugere um limite de -0,5 e 0,5, que este trecho específico não alcançaria (em outras palavras , este texto é negativo, mas não extremamente negativo). - -O que isso implica, para você, sobre a maneira como esse sentimento pode ser expresso em um contexto de e-mail profissional? Como você definiria seus valores limite quando o texto expressa emoções de maneira mais sutil ou cortês? Você acha que a análise de sentimento é uma ferramenta apropriada para nossa análise exploratória de dados? - -Desafio: tente substituir o conteúdo de *message_text* pelas seguintes cadeias de caracteres e execute novamente o programa. Não se esqueça de cercar cada texto com três aspas simples ao atribuí-lo à variável *message_text* (como em: *message_text* = ''' algumas palavras '''). 
Antes de executar o programa, tente adivinhar o resultado da análise de sentimento: positivo ou negativo? Quão positivo ou negativo? - -Texto 1: - -``` -Looks great. I think we should have a least 1 or 2 real time traders in Calgary. -``` - -Texto 2: - -``` -I think we are making great progress on the systems side. I would like to -set a deadline of November 10th to have a plan on all North American projects -(I'm ok if fundementals groups are excluded) that is signed off on by -commercial, Sally's world, and Beth's world. When I say signed off I mean -that I want signitures on a piece of paper that everyone is onside with the -plan for each project. If you don't agree don't sign. If certain projects -(ie. the gas plan) are not done yet then lay out a timeframe that the plan -will be complete. I want much more in the way of specifics about objectives -and timeframe. - -Thanks for everyone's hard work on this. -``` - -Experimente uma terceira vez com algum texto de uma de suas próprias fontes de pesquisa. Que resultados você obteve para cada um? Você concorda com os resultados? - -# Determine o escopo apropriado para e-mail - -Quando analisado por meio da ferramenta de análise de sentimento VADER, o texto produz um conjunto de pontuações positivas, neutras e negativas, que são então agregadas e dimensionadas como uma "pontuação composta". Embora seja útil saber em teoria, como esse método pode ser aplicado aos dados no exemplo da Enron - isto é, uma coleção de dados de e-mail e metadados? E o que isso pode nos dizer sobre as emoções, relacionamentos e mudanças ao longo do tempo dos funcionários da Enron? - -Nesta seção, apresentaremos a você o processo de seleção do escopo de análise para nossa ferramenta de análise de sentimento. 
Considere os seguintes dados brutos pertencentes a um e-mail de 3 de outubro de 2000 escrito por Jeffrey Shankman, então presidente de mercados globais da Enron (Quinn, 2006): - -``` -Message-ID: <3764632.1075857565248.JavaMail.evans@thyme> -Date: Mon, 23 Oct 2000 09:14:00 -0700 (PDT) -From: jeffrey.shankman@enron.com -To: john.nowlan@enron.com, don.schroeder@enron.com, david.botchlett@enron.com, - chris.mahoney@enron.com, ross.koller@enron.com -Subject: -Mime-Version: 1.0 -Content-Type: text/plain; charset=us-ascii -Content-Transfer-Encoding: 7bit -X-From: Jeffrey A Shankman -X-To: John L Nowlan, Don Schroeder, David J Botchlett, Chris Mahoney, Ross Koller -X-cc: -X-bcc: -X-Folder: \Jeffrey_Shankman_Jun2001\Notes Folders\Sent -X-Origin: Shankman-J -X-FileName: jshankm.nsf - -It seems to me we are in the middle of no man's land with respect to the -following: Opec production speculation, Mid east crisis and renewed -tensions, US elections and what looks like a slowing economy (?), and no -real weather anywhere in the world. I think it would be most prudent to play -the markets from a very flat price position and try to day trade more -aggressively. I have no intentions of outguessing Mr. Greenspan, the US. -electorate, the Opec ministers and their new important roles, The Israeli and -Palestinian leaders, and somewhat importantly, Mother Nature. Given that, -and that we cannot afford to lose any more money, and that Var seems to be a -problem, let's be as flat as possible. I'm ok with spread risk (not front to -backs, but commodity spreads). - - -The morning meetings are not inspiring, and I don't have a real feel for -everyone's passion with respect to the markets. As such, I'd like to ask -John N. to run the morning meetings on Mon. and Wed. - - -Thanks. Jeff -``` - -No texto da mensagem do e-mail, Shankman traça uma estratégia corporativa para avançar no que ele percebe como um contexto geopolítico ambíguo. 
A mensagem descreve uma série de situações difíceis, bem como exasperação ("As reuniões matinais não são inspiradoras") e incerteza ("Não tenho um sentimento real de paixão de todos"). Ao mesmo tempo, Shankman descreve um conjunto de etapas de ação junto com pedidos educados ("Eu gostaria de pedir ...") e expressões de gratidão ("Obrigado"). - -Antes de prosseguirmos, pare um minuto para refletir sobre a mensagem. Como você acha que um leitor típico descreveria a intensidade emocional deste e-mail? Considerando o que você sabe agora sobre VADER, que proporção de positividade, negatividade e neutralidade você espera que a ferramenta de análise de sentimento encontre na mensagem? Finalmente, o que você acha que a pontuação composta irá sugerir sobre o efeito geral na mensagem? - -Como discutimos acima, a análise de sentimento não fornece uma saída objetiva, mas sim indicadores de orientação que refletem nossa escolha e calibração de ferramentas analíticas. Talvez o elemento mais importante da calibração seja selecionar o escopo do texto que está sendo analisado, ou seja, quanto de uma mensagem colocamos na ferramenta de uma vez. Em nosso caso, podemos determinar o escopo da análise decidindo entre analisar a mensagem inteira como uma única unidade ou, em vez disso, dividir a mensagem em unidades menores como frases e analisar cada uma separadamente. - -Primeiro, vamos considerar uma abordagem no nível da mensagem, na qual analisamos a mensagem como um único bloco: - -```python -# Continue com o mesmo código da seção anterior, mas substitua a variável *message_text* pelo novo texto do e-mail: - -message_text = '''It seems to me we are in the middle of no man's land with respect to the following: Opec production speculation, Mid east crisis and renewed tensions, US elections and what looks like a slowing economy (?), and no real weather anywhere in the world. 
I think it would be most prudent to play the markets from a very flat price position and try to day trade more aggressively. I have no intentions of outguessing Mr. Greenspan, the US. electorate, the Opec ministers and their new important roles, The Israeli and Palestinian leaders, and somewhat importantly, Mother Nature. Given that, and that we cannot afford to lose any more money, and that Var seems to be a problem, let's be as flat as possible. I'm ok with spread risk (not front to backs, but commodity spreads). The morning meetings are not inspiring, and I don't have a real feel for everyone's passion with respect to the markets. As such, I'd like to ask John N. to run the morning meetings on Mon. and Wed. Thanks. Jeff''' - -``` - -Substitua `sentimento.py` pelo código acima, salve-o e execute-o. A saída deve ser semelhante a esta: - -```python -It seems to me we are in the middle of no man's land with respect to the following: Opec production speculation, Mid east crisis and renewed tensions, US elections and what looks like a slowing economy (?), and no real weather anywhere in the world. I think it would be most prudent to play the markets from a very flat price position and try to day trade more aggressively. I have no intentions of outguessing Mr. Greenspan, the US. electorate, the Opec ministers and their new important roles, The Israeli and Palestinian leaders, and somewhat importantly, Mother Nature. Given that, and that we cannot afford to lose any more money, and that Var seems to be a problem, let's be as flat as possible. I'm ok with spread risk (not front to backs, but commodity spreads). The morning meetings are not inspiring, and I don't have a real feel for everyone's passion with respect to the markets. As such, I'd like to ask John N. to run the morning meetings on Mon. and Wed. Thanks. 
Jeff -compound: 0.889, neg: 0.096, neu: 0.765, pos: 0.14, -``` - -Aqui você pode ver que, ao analisar o e-mail como um todo, VADER retorna valores que sugerem que a mensagem é principalmente neutra (neu: 0,765), mas que mais recursos parecem ser positivos (pos: 0,14) em vez de negativos (0,096). VADER calcula uma pontuação geral de sentimento de 0,889 para a mensagem (em uma escala de -1 a 1), o que sugere um efeito fortemente positivo para a mensagem como um todo. - -Isso atendeu às suas expectativas? Se não, por que você acha que o VADER encontrou mais características positivas do que negativas? - -No nível da entidade da mensagem, não há como destacar sentimentos particularmente positivos ou negativos na mensagem. Essa perda de detalhes pode ser irrelevante ou pode ser vital ao conduzir uma análise exploratória. Isso depende das necessidades de pesquisa de seu estudo. Por exemplo, identificar frases negativas em e-mails de outra forma adequados pode ser especialmente importante ao procurar explosões emocionais ou trocas abusivas que podem ocorrer muito raramente, mas revelam algo essencial sobre a natureza de um relacionamento. Se quisermos capturar esse nível de nuance, precisamos de um método para passar da análise do nível da mensagem para a análise do sentimento. - -Felizmente, o NLTK oferece uma coleção de ferramentas para dividir o texto em componentes menores. Os tokenizadores dividem as sequências de texto em pedaços menores, como frases. Alguns podem ainda dividir uma frase em partes específicas do discurso, como o substantivo, adjetivo e assim por diante. No nosso caso, usaremos o tokenizer english.pickle do NLTK para dividir os parágrafos em sentenças. - -Agora podemos reescrever o script de análise de sentimento para analisar cada frase separadamente: - -```python -# Abaixo está o código de análise de sentimento reescrito para uma análise por frase -# observe o novo módulo -- word_tokenize! 
-import nltk.data -from nltk.sentiment.vader import SentimentIntensityAnalyzer -from nltk import sentiment -from nltk import word_tokenize - -# Em seguida, inicializamos VADER para utilizá-lo em nosso script Python -sid = SentimentIntensityAnalyzer() - -# Vamos também incializar nossa função 'english.pickle' e atribuir a ela um nome curto - -tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') - -message_text = '''It seems to me we are in the middle of no man's land with respect to the following: Opec production speculation, Mid east crisis and renewed tensions, US elections and what looks like a slowing economy (?), and no real weather anywhere in the world. I think it would be most prudent to play the markets from a very flat price position and try to day trade more aggressively. I have no intentions of outguessing Mr. Greenspan, the US. electorate, the Opec ministers and their new important roles, The Israeli and Palestinian leaders, and somewhat importantly, Mother Nature. Given that, and that we cannot afford to lose any more money, and that Var seems to be a problem, let's be as flat as possible. I'm ok with spread risk (not front to backs, but commodity spreads). The morning meetings are not inspiring, and I don't have a real feel for everyone's passion with respect to the markets. As such, I'd like to ask John N. to run the morning meetings on Mon. and Wed. Thanks. Jeff''' - -# O método de tokenização quebra o parágrafo em uma lista de frases (strings). Neste exemplo, observe que o tokenizer se confunde pela falta de espaçamento após o ponto final e acaba por quebrar as frases de forma equivocada. Como podemos consertar isso? - -sentences = tokenizer.tokenize(message_text) - -# Vamos adicionar um passo para percorrer a lista de frases, calcular e imprimir a pontuação de polaridade para cada uma. 
- -for sentence in sentences: - print(sentence) - scores = sid.polarity_scores(sentence) - for key in sorted(scores): - print('{0}: {1}, '.format(key, scores[key]), end='') - print() -``` - - -O resultado deve ser semelhante a este: - -```python -It seems to me we are in the middle of no man's land with respect to the following: Opec production speculation, Mid east crisis and renewed tensions, US elections and what looks like a slowing economy (? -compound: -0.5267, neg: 0.197, neu: 0.68, pos: 0.123, -), and no real weather anywhere in the world. -compound: -0.296, neg: 0.216, neu: 0.784, pos: 0.0, -I think it would be most prudent to play the markets from a very flat price position and try to day trade more aggressively. -compound: 0.0183, neg: 0.103, neu: 0.792, pos: 0.105, -I have no intentions of outguessing Mr. Greenspan, the US. -compound: -0.296, neg: 0.216, neu: 0.784, pos: 0.0, -electorate, the Opec ministers and their new important roles, The Israeli and Palestinian leaders, and somewhat importantly, Mother Nature. -compound: 0.4228, neg: 0.0, neu: 0.817, pos: 0.183, -Given that, and that we cannot afford to lose any more money, and that Var seems to be a problem, let's be as flat as possible. -compound: -0.1134, neg: 0.097, neu: 0.823, pos: 0.081, -I'm ok with spread risk (not front to backs, but commodity spreads). -compound: -0.0129, neg: 0.2, neu: 0.679, pos: 0.121, -The morning meetings are not inspiring, and I don't have a real feel for everyone's passion with respect to the markets. -compound: 0.5815, neg: 0.095, neu: 0.655, pos: 0.25, -As such, I'd like to ask John N. to run the morning meetings on Mon. -compound: 0.3612, neg: 0.0, neu: 0.848, pos: 0.152, -and Wed. -compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, -Thanks. -compound: 0.4404, neg: 0.0, neu: 0.0, pos: 1.0, -Jeff -compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, -``` - -Aqui, você notará uma visalização muito mais detalhada do sentimento neste e-mail. 
O VADER identifica com sucesso sentenças moderadas a fortemente negativas no e-mail, especialmente as principais descrições de crises. A análise no nível da frase permite que você identifique frases e tópicos específicos nos extremos do sentimento, o que pode ser útil mais tarde. - -Mas, mesmo nesse nível, o VADER também comete vários erros. A frase que começa com “As reuniões matinais não são inspiradoras” resulta em uma pontuação surpreendentemente positiva - talvez por causa de uma leitura incorreta dos termos “paixão” e “respeito”. - - Observe também que o ponto de interrogação no início do e-mail e o ponto de abreviação após *Mon* (Segunda-feira: *seg.*) próximo ao final fazem com que o tokenizador english.pickle quebre as frases por engano. Este é um risco constante de pontuação informal e complexa no texto. - -O que você nota sobre a distribuição dos scores de sentimento? Como você poderia coletá-los de uma maneira que o ajude a entender melhor seus dados e as questões de pesquisa de seu interesse? (Sinta-se à vontade para experimentar diferentes tipos de texto na variável *message_text* para ver como a ferramenta responde a diferentes tipos de construções de linguagem). O código que você acabou de escrever pode ser reaproveitado para qualquer texto. - -# Agradecimentos - -Meus sinceros agradecimentos a Justin Joque, Bibliotecário de Visualização da Biblioteca da Universidade de Michigan e do Digital Projects Studio, pelo apoio na formulação das ideias e abordagem por trás desta lição. Muito obrigado também a Adam Crymble, que forneceu diversas ideias e apoio durante todo o processo editorial. E obrigado a Anandi Silva Knuppel e Puteri Zarina Megat Khalid por seus comentários atenciosos. - -# Referências - -Barton, D., & Hall, N. (Eds.). (2000). Letter writing as a social practice (Vol. 9). John Benjamins Publishing. - -Hardin, J., Sarkis, G., & Urc, P. C. (2015). Network Analysis with the Enron Email Corpus. Journal of Statistics Education, 23:2. 
https://doi.org/10.1080/10691898.2015.11889734 - -Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014. https://www.aaai.org/ocs/index.php/ICWSM/ICWSM14/paper/viewPaper/8109 - -Klimt, B., & Yang, Y. (2004, July). Introducing the Enron Corpus. In CEAS. https://bklimt.com/papers/2004_klimt_ceas.pdf - -Klimt, B., & Yang, Y. (2004). The Enron corpus: A new dataset for email classification research. Machine learning: ECML 2004, 217-226. https://bklimt.com/papers/2004_klimt_ecml.pdf - -Tukey, J.W. (1977). Exploratory Data Analysis. Addison-Wesley Publishing Company - -Quinn, J. (2006, November 14). Ex-Enron man goes back into energy. Retrieved January 10, 2018, from http://www.telegraph.co.uk/finance/2950645/Ex-Enron-man-goes-back-into-energy.html +--- +title: Análise de sentimento para exploração de dados +layout: lesson +slug: analise-sentimento-exploracao-dados +date: 2018-01-15 +translation_date: 2021-06-14 +authors: +- Zoë Wilkinson Saldaña +reviewers: +- Anandi Silva Knuppel +- Puteri Zarina Megat Khalid +editors: +- Adam Crymble +translator: +- Caio Mello +translation-editor: +- Josir Cardoso Gomes +translation-reviewer: +- Bruno Ponne +- Ian Araujo +original: sentiment-analysis +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/375 +difficulty: 2 +activity: analyzing +topics: [distant-reading] +abstract: "Nesta lição, você aprenderá a conduzir uma 'análise de sentimento' em textos e a interpretar os resultados. Esta é uma forma de análise exploratória de dados baseada no processamento de linguagem natural (PLN). Você aprenderá a instalar todos os softwares apropriados e a construir um programa reutilizável que pode ser aplicado aos seus próprios textos." 
+avatar_alt: Um homem sorridente e um homem rabugento +doi: 10.46430/phpt0017 +--- + +{% include toc.html %} + + +# Objetivos da lição + +Esta lição usa a análise de sentimento como base para uma análise exploratória de dados de um grande corpus textual. Portanto, é indicada para leitores com alguma experiência prévia em programação utilizando Python. Caso não tenha experiência com Python ou programação, a autora recomenda trabalhar nas primeiras lições da série “Introdução ao Python”. Ao final desta lição, você terá o conhecimento necessário para: + +* Elaborar questões de pesquisa que usem Processamento de Linguagem Natural (PLN) em um corpus textual. +* Utilizar Python e o Natural Language Processing Toolkit (NLTK) para gerar medidas de sentimento para um texto. +* Avaliar criticamente os resultados da análise de sentimento e ajustar os parâmetros e a metodologia conforme necessário. +* Identificar as próximas etapas para continuar o aprendizado sobre análise exploratória de dados e abordagens programáticas para dados qualitativos. + +Nota do tradutor: Devido à falta de uma biblioteca de código que funcione bem com os textos em português, optamos por manter os textos dos exercícios na língua original. + +## O que é análise exploratória de dados? + +A análise exploratória de dados é um conjunto de estratégias que trazem à tona características importantes num conjunto de dados que normalmente não são facilmente identificadas por meio da leitura tradicional. Com os insights da análise exploratória de dados em mãos, os pesquisadores podem tomar decisões mais adequadas ao selecionar um método ou abordagem para sua questão de pesquisa, ou até mesmo, identificar novas questões. + +Em 1977, o matemático John Tukey descreveu a análise exploratória de dados como uma forma de trabalho de detetive, sem a qual, os estudiosos muitas vezes perderiam descobertas interessantes, porém menos óbvias: + +> “A menos que o detetive encontre pistas, o juiz ou júri não terá como julgar. 
Caso a análise exploratória de dados não revele indícios, geralmente quantitativos, é provável que se considere não haver nada a ser comprovado.” (Tukey 1977: 3, tradução livre) + +## Explorando Texto com Análise de Sentimento + +Quando confrontado com um corpus promissor, porém muito grande, como o pesquisador pode encontrar aquilo de mais importante, que pode levar às descobertas de pesquisa mais interessantes? + +O Processamento de Linguagem Natural (PLN) abrange uma ampla gama de técnicas que se baseiam na aplicação de métodos analíticos computacionais ao conteúdo textual, fornecendo meios de categorizar e quantificar o texto. Essas abordagens de PLN, que incluem análise de sentimento, podem ajudar os pesquisadores a explorar seus textos. Nas palavras de Tukey, podem ajudar o pesquisador a encontrar “pistas” sobre seus textos e “indícios” de que pode valer a pena investigar algo mais a fundo. + +Nesta lição, vamos nos concentrar numa ferramenta do kit de ferramentas do PLN: a análise de sentimento. A análise de sentimento busca quantificar a intensidade emocional de palavras e frases num texto. Algumas ferramentas de análise de sentimento levam em consideração, inclusive, o peso emocional de sinais linguísticos como a pontuação ou mesmo os emojis. As ferramentas de análise de sentimento geralmente processam uma unidade de texto (uma frase, um parágrafo, um livro, etc.) e produzem pontuações (“scores”, em inglês) ou classificações quantitativas para indicar se o algoritmo considera que aquele texto transmite emoções positivas ou negativas. Algumas ferramentas também podem quantificar o *grau de positividade* ou o *grau de negatividade* num texto. Combinada com outros métodos de PLN, como modelagem de tópicos (“topic modelling”, em inglês), a análise de sentimento fornece meios de caracterizar as emoções expressas sobre diferentes tópicos de uma conversa.
Quando usada em conjunto com a análise de rede, pode lançar luz sobre as maneiras como os indivíduos interagem uns com os outros. Um pesquisador interessado em interações sobre um evento político pode usar a análise de sentimento para estudar como os indivíduos descrevem aquele evento nas redes sociais. Com os dados certos para inserir na ferramenta, pode ser possível fazer comparações regionais ou entender como diferentes grupos demográficos veem o evento de forma diferente. Como a ferramenta pode processar muitos dados sequencialmente, é até possível analisar o sentimento em centenas de milhares ou até milhões de eventos discursivos. + +Para começar, esta lição fornece uma introdução à análise de sentimento tanto prática quanto crítica. Como qualquer ferramenta computacional, a análise de sentimento tem uma série de limitações e vieses que os pesquisadores devem levar em consideração. Os pesquisadores devem ser especialmente cautelosos ao fazer afirmações empíricas com base nos resultados da análise de sentimento. Você poderá ser melhor atendido usando a análise de sentimento em situações provisórias e exploratórias, como meio de orientar o processo de pesquisa. Ao manejar essas ferramentas com ceticismo e eficácia, é possível realizar um trabalho de detetive bastante notável. + +## Análise de grandes coleções de correspondência textual + +Correspondências escritas como cartas, e-mails, registros de bate-papo, tweets e históricos de mensagens de texto podem fornecer aos pesquisadores uma visão inestimável de seus autores. Os textos geralmente são ricos em emoções e informações que não estão disponibilizadas em nenhum outro lugar. Um pesquisador pode aprender sobre as opiniões que as pessoas, objetos de seu estudo, tiveram sobre vários tópicos ou sobre determinados eventos. Também poderia ser possível aprender sobre os relacionamentos que os indivíduos desenvolveram e mantiveram em organizações ou redes complexas.
+ +Embora metodologias como etnografia, leitura “manual” e análise do discurso ajudem os pesquisadores a analisar a correspondência histórica, esses métodos trazem desafios significativos quando o número de textos cresce de dezenas ou centenas para milhares ou milhões. A análise textual computacional fornece um conjunto de métodos para tornar visíveis as tendências, dinâmicas e relacionamentos que podem estar ocultos para o leitor humano por problemas de escala. Além disso, muitos métodos de computação produzem descobertas que podem ser expressas quantitativamente e que podem subsequentemente permitir que o pesquisador realize modelagem estatística, visualização de informações e aprendizado de máquina (Machine Learning) para fazer outras análises. + +## Estudo de caso: corpus de e-mails da Enron + +Este tutorial usa a correspondência de e-mail da falida empresa americana de energia Enron. A Enron ocultou uma ampla variedade de práticas contábeis ilegais até que uma investigação federal em 2001 a levou à falência. Na época, o Escândalo Enron foi o maior colapso de uma empresa de capital aberto da história. Em 2001, a empresa começou a mostrar sinais de problemas financeiros que não se alinhavam com as divulgações financeiras da empresa até aquele momento. As ações da Enron negociadas em bolsa caíram de US$ 90,75 em meados de 2000 para menos de um dólar em novembro de 2001, o que levou os acionistas a processar a empresa. Uma investigação subsequente da Comissão de Valores Mobiliários dos Estados Unidos (SEC) revelou que os executivos da Enron cometeram fraude e negligência contábil em grande escala. A Enron declarou falência em dezembro daquele ano. Nos anos que se seguiram, vários executivos enfrentaram condenações criminais por sua participação no escândalo. 
Para os pesquisadores, o Escândalo Enron resultou na criação de um dos maiores (e mais infames) corpus de texto por correspondência já coletados: + +> “Um dos escândalos corporativos mais infames das últimas décadas deixou curiosamente em seu rastro um dos conjuntos de dados mais valiosos disponíveis publicamente. No final de 2001, o encobrimento de fraude contábil da Enron Corporation levou à falência da gigante da energia. A Federal Energy Regulatory Commission requereu todos os registros de e-mail da Enron como parte da investigação que se seguiu. Nos dois anos seguintes, a comissão divulgou, escondeu e depois divulgou novamente o corpus de e-mail para o público após excluir e-mails que continham informações pessoais, como números de previdência social. O corpus da Enron contém e-mails cujos assuntos variam de planejamento de férias de fim de semana a tópicos de discussão de estratégia política, e continua sendo o único grande exemplo de conjuntos de dados de e-mail do mundo real disponíveis para pesquisa”. (Hardin, Sarkis e Urc, 2015) + +Quando o conjunto de dados de e-mail da Enron - organizado e editado - foi lançado em 2004, os pesquisadores descobriram uma oportunidade sem precedentes: acesso direto à maneira espontânea e sem censura como os funcionários de uma empresa condenada se comunicavam. De repente, os pesquisadores tiveram acesso a como as pessoas se comunicam no trabalho em uma escala sem precedentes. Isso era importante para pesquisadores interessados no caso especial do escândalo e colapso da Enron, mas também para pesquisadores interessados em um amplo espectro de questões sobre a comunicação cotidiana no trabalho. + +Na década seguinte, centenas de novos estudos surgiram a partir desses e-mails, realizados em diversos campos como teoria das redes sociais, comunidade e detecção de anomalias, gênero e comunicação dentro das organizações, mudança de comportamento durante uma crise organizacional, insularidade e formação de comunidade.
O uso da teoria das redes sociais nas humanidades oferece algumas possibilidades fascinantes, mas não é tão simples. + +Além da grande quantidade de mensagens incluídas (o corpus contém mais de 600.000 mensagens), o corpus de e-mails da Enron também inclui os metadados necessários para que os pesquisadores realizem uma série de questões de pesquisa. Assim como a presença de envelopes com endereços legíveis do remetente e do destinatário seria um excelente trunfo para pesquisadores de correspondências de cartas históricas, a presença de endereços de e-mail do remetente e do destinatário permite que os pesquisadores associem os e-mails a determinados indivíduos conhecidos dentro da corporação. Como alguns indivíduos tinham vários endereços de e-mail, ou mais de um indivíduo pode ter compartilhado o mesmo endereço, os metadados não são de uso muito óbvio, mas são potencialmente elucidativos. O restante do tutorial explicará como aplicar e interpretar a análise de sentimento de e-mails neste corpus. + +# Usando Python com o Natural Language Toolkit (NLTK) + +
    +Programando pela primeira vez? Esta lição é destinada a iniciantes, mas pode ser conveniente revisar outras lições de Python no Programming Historian. No entanto, observe que, embora muitas lições usem o Python versão 2, esta lição requer o Python versão 3. As instruções de instalação do Python 3 serão apresentadas a seguir. +
    + +Neste tutorial, Python será usado junto com algumas ferramentas do Natural Language Toolkit (NLTK) para gerar indicadores de sentimento a partir de transcrições de e-mail. Para fazer isso, você primeiro aprenderá como carregar os dados textuais no Python, selecionar as ferramentas de PLN apropriadas para análise de sentimento e escrever um algoritmo que calcula pontuações de sentimento para um determinado texto. Também exploraremos como ajustar seu algoritmo para melhor atender a seu objetivo de pesquisa. Ao final, você irá arquivar seu algoritmo de solução de problemas como um pacote de código conhecido como *função*, que poderá ser reutilizado e reaproveitado (inclusive na parte 2 deste tutorial). + +## Instalação + +Para continuar, as seguintes instalações serão necessárias: + +* Python 3 (preferivelmente 3.5 ou superior) - [Instruções para baixar e instalar Python](https://wiki.python.org/moin/BeginnersGuide/Download) +* NLTK (3.2.5 ou superior) - [Instruções para baixar e instalar NLTK](https://www.nltk.org/install.html) + +## Primeiros passos com NLTK + +O Natural Language Toolkit (NLTK) é uma coleção de ferramentas Python reutilizáveis (também conhecida como uma biblioteca Python) que ajuda os pesquisadores a aplicar um conjunto de métodos computacionais a textos. As ferramentas variam desde métodos que ajudam a quebrar o texto em pedaços menores, alguns que identificam se uma palavra pertence a um determinado idioma, até aqueles textos de amostra que os pesquisadores podem usar para fins de treinamento e desenvolvimento (como o texto completo de *Moby Dick*). + +Se você precisar de ajuda para baixar e instalar o módulo para [Python 3](https://www.python.org/download/releases/3.0/), dê uma olhada na lição Instalando Módulos Python com pip de [Fred Gibbs](/en/lessons/installing-python-modules-pip) (em inglês). 
+ +Em nosso caso, usaremos duas ferramentas NLTK em particular: + +* A ferramenta ["Análise de sentimento VADER"](https://www.nltk.org/_modules/nltk/sentiment/vader.html) (que gera pontuações de sentimento positivas, negativas e neutras para uma determinada entrada) +* A ferramenta de tokenização ‘word_tokenize’ (divide um texto grande em uma sequência de unidades menores, como frases ou palavras) + +Para usar VADER e word_tokenize, primeiro precisamos baixar e instalar alguns dados extras para NLTK. O NLTK é um kit de ferramentas muito grande e várias de suas ferramentas requerem uma segunda etapa de download para reunir a coleção de dados necessária (geralmente léxicos codificados) para funcionar corretamente. + +Para instalar a análise de sentimento e o tokenizador de palavras que usaremos neste tutorial, escreva um novo script em Python com as três linhas a seguir: + +```python +import nltk +nltk.download('vader_lexicon') +nltk.download('punkt') +``` +Você pode salvar este arquivo como `“installation.py”`. Se você não tiver certeza de como salvar e executar scripts em Python, reveja o tutorial sobre como configurar um 'Ambiente de Desenvolvimento Integrado' usando Python, substituindo o comando '%(python) %f' por '%(python3) %f' quando você chegar a essa parte no tutorial. + +* Configurando um ambiente de desenvolvimento integrado para Python no [Windows](/pt/licoes/instalacao-windows). +* Configurando um ambiente de desenvolvimento integrado para Python no [Mac](/pt/licoes/instalacao-mac). +* Configurando um ambiente de desenvolvimento integrado para Python no [Linux](/pt/licoes/instalacao-linux). + + Se você sabe como executar scripts Python, execute o arquivo usando Python 3. + + [VADER](https://www.nltk.org/_modules/nltk/sentiment/vader.html) (Valence Aware Dictionary and sEntiment Reasoner) é uma ferramenta de atribuição de intensidade de sentimento acrescentada ao NLTK em 2014. 
Ao contrário de outras técnicas que exigem treinamento em textos parecidos antes do uso, o VADER está pronto para ser usado sem qualquer configuração especial. O VADER é o único que faz distinções refinadas entre vários graus de positividade e negatividade. Por exemplo, VADER pontua “conforto” como moderadamente positivo e “euforia” como extremamente positivo. Ele também tenta capturar e pontuar características textuais comuns em texto online informal, como letras maiúsculas, pontos de exclamação e emoticons, conforme mostrado na tabela abaixo: + + {% include figure.html filename="analise-sentimento1.png" caption="Vader captura pequenas gradações de entusiasmo. (Hutto e Gilbert, 2014). **Versão do tradutor**. Acesse a original [aqui](/en/lessons/sentiment-analysis)" %} + + Como qualquer ferramenta de análise de texto, o VADER deve ser avaliado com criticidade e de forma contextualizada. O VADER foi desenvolvido em meados da década de 2010 principalmente para analisar microblogs em inglês e sites de rede social (especialmente o Twitter). Esse tipo de texto tende a ser muito mais informal do que o e-mail profissional, e contém padrões de linguagem e de uso de recursos que diferem dos padrões de 1999-2002 quando os e-mails da Enron foram escritos. No entanto, VADER também foi desenvolvido como uma ferramenta de análise de sentimento de propósito geral, e o estudo inicial dos autores mostra que ele se compara favoravelmente com ferramentas que foram treinadas para domínios específicos, usam léxicos especializados ou técnicas de aprendizado de máquina com muitos recursos (Hutto e Gilbert, 2014 ). A sensibilidade da ferramenta em relação aos graus de afeto se mostrou útil para descrever as sutilezas das emoções expressas nos e-mails profissionais - como pesquisadores, podemos estar especialmente interessados ​​em capturar os momentos em que a emoção surge em um texto formal. 
No entanto, a análise de sentimento continua se dedicando a encontrar soluções para capturar sentimentos complexos como ironia, sarcasmo e zombaria, quando o leitor médio seria capaz de fazer a distinção entre o texto literal e seu significado pretendido. + + Embora o VADER seja uma boa ferramenta de uso geral para textos contemporâneos e históricos em inglês, a ferramenta fornece apenas suporte nativo parcial para textos em outras línguas (detecta emojis / maiúsculas / etc., mas não a escolha de palavras). No entanto, os desenvolvedores incentivam os usuários a usar a tradução automática para pré-processar textos que não sejam em inglês e, em seguida, inserir os resultados no VADER. O "VADER demo" inclui um código para enviar o texto de entrada automaticamente para o serviço web ‘My Memory Translation Service’, (leitores avançados podem encontrar no [Github](https://github.com/cjhutto/vaderSentiment/blob/master/vaderSentiment/vaderSentiment.py) a partir da linha 554 - no momento da escrita deste artigo). A implementação deste método de tradução é mais indicada para usuários intermediários de Python. Você pode aprender mais sobre o estado da arte da análise de sentimento multilíngue (que infelizmente quase sempre requer uma etapa de tradução) em ["Análise de sentimento multilíngue: o estado da arte e comparação independente de técnicas"](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4981629/), de Kia Dashtipour, et al (2016). + + +## Calculando Sentimento para um Parágrafo + +Leia o seguinte trecho: + +>“Like you, I am getting very frustrated with this process. I am genuinely trying to be as reasonable as possible. I am not trying to “hold up” the deal at the last minute. 
I’m afraid that I am being asked to take a fairly large leap of faith after this company (I don’t mean the two of you – I mean Enron) has screwed me and the people who work for me.” + +Este é o primeiro parágrafo do e-mail de janeiro de 2002 de Timothy Belden para Louise Kitchen e John Lavorato sobre o “Acordo de Contratos de Trabalho”. Belden dirigiu os Serviços de Energia da Enron e mais tarde seria condenado por conspiração a fim de aumentar os custos de energia na Califórnia, o que levou a uma crise energética em todo o estado. + +Apesar do sentimento de frustração e ansiedade que você pode deduzir do parágrafo como um todo, observe a ambivalência das frases específicas dentro do parágrafo. Algumas parecem expressar esforços de boa fé, por exemplo: “Não estou tentando ‘atrasar’ o negócio” e “genuinamente tentando”. E, no entanto, há declarações negativas ainda mais fortes sobre "ficar frustrado", "Receio" e "esta empresa [...] ferrou comigo e com as pessoas que trabalham para mim". + +Vamos calcular as pontuações de sentimento para este e-mail usando o VADER para ter uma ideia do que a ferramenta pode fazer. Para começar, crie um novo diretório de trabalho (pasta) em seu computador chamado `“sentimento”` em algum lugar onde você possa encontrá-lo. Dentro dessa pasta, crie um novo arquivo de texto e salve-o como `“sentimento.py”`. É aqui que escreveremos o código para esta tarefa. + +Primeiro, temos que dizer ao Python onde o código NLTK para a análise de sentimento VADER está localizado. No início do nosso arquivo, importaremos o código do VADER: + +```python +# primeiro, importamos os módulos relevantes da biblioteca NLTK +from nltk.sentiment.vader import SentimentIntensityAnalyzer +``` + +Também devemos habilitar o Python para usar este código com nosso conjunto particular de código. 
Embora tenhamos todas as instruções de que precisamos na biblioteca NLTK, o Python gosta de agrupar essas instruções em um único `objeto` (nossa ferramenta de Análise de Sentimentos) que nosso programa pode acessar. *SentimentIntensityAnalyzer* é uma `classe`, que é um “modelo” que instrui o Python a construir um `objeto` com um conjunto especial de `funções` e `variáveis`. No nosso caso, queremos construir um único `objeto`: nosso analisador de sentimento, que segue este “modelo”. Para fazer isso, executamos *SentimentIntensityAnalyzer( )* e atribuímos a saída - nosso novo analisador de sentimento - a uma variável, que chamaremos de *‘sid’*. + +```python +# em seguida, inicializamos o VADER para que possamos usá-lo em nosso script Python +sid = SentimentIntensityAnalyzer() +``` + +Fazendo isso, fornecemos à nossa nova variável *sid* todos os recursos do código de análise de sentimento VADER. Assim, *sid* se tornou nossa ferramenta de análise de sentimento, mas com um nome mais curto. + +Em seguida, precisamos armazenar o texto que queremos analisar em um lugar que o *sid* possa acessar. Em Python, podemos armazenar uma única sequência de texto como uma variável de `string` (Nota do tradutor: Optamos por manter a palavra 'string' como no original em inglês para facilitar o entendimento de seu uso mais comum em códigos ['str']). + +```python +# a variável 'message_text' agora contém o texto que iremos analisar. +message_text = '''Like you, I am getting very frustrated with this process. I am genuinely trying to be as reasonable as possible. I am not trying to "hold up" the deal at the last minute. I'm afraid that I am being asked to take a fairly large leap of faith after this company (I don't mean the two of you -- I mean Enron) has screwed me and the people who work for me.''' +``` + +Como este texto inclui aspas e apóstrofos, é necessário circundar todo o texto com três aspas (“”” ou ’’’). 
Isso significa que quaisquer aspas e apóstrofos no texto serão reconhecidos como tal. Essa abordagem também mantém qualquer espaçamento que nosso texto já inclua. + +Agora você está pronto para processar o texto. + +Para fazer isso, o texto *(message_text)* deve ser inserido na ferramenta *(sid)* e o programa deve ser executado. Estamos interessados na "pontuação de polaridade" do analisador de sentimento, que nos dá uma pontuação positiva ou negativa. Este recurso é integrado ao VADER e pode ser solicitado sob demanda. + +Queremos ter certeza de capturar a saída de sid.polarity_scores () atribuindo-a a uma variável que chamaremos de *scores*: + +```python +print(message_text) + +# Utilizar método polarity_scores no sid e passar dentro dele o message_text produz um dicionário com pontuações negativas, neutras, positivas e compostas para o texto de entrada +scores = sid.polarity_scores(message_text) +``` + +Quando você executa este código, os resultados da análise de sentimento agora são armazenados no `dicionário` de *pontuação* (scores). Um dicionário, muito parecido com o tipo que você usa para pesquisar a definição de palavras, é uma variável que armazena informações conhecidas como 'valores' que são acessíveis dando ao programa a 'chave' para a entrada que você deseja ler. Isso significa que um dicionário como *scores* pode armazenar muitos `pares de valores-chave`. Para solicitar os dados, você só precisa conhecer as `chaves`. Mas não sabemos as `chaves`. Felizmente, Python nos dará uma lista de todas as `chaves`, classificadas em ordem alfabética, se usarmos a função `sorted(scores)`. + +Para imprimir cada `chave` e `valor` armazenado no dicionário, precisamos de um `for loop`, que aplica o mesmo código sequencialmente a todas as `chaves` do dicionário. 
+ +Aqui está o código para imprimir cada par de `valores-chave` dentro da variável de pontuação (scores): + +```python +# Aqui, percorremos as chaves contidas nas pontuações (pos, neu, neg e pontuações compostas) e imprimimos os pares de valores-chave na tela +for key in sorted(scores): + print('{0}: {1}, '.format(key, scores[key]), end='') +``` + +Aqui está todo o código em um único programa: + +```python +# primeiro, importamos os módulos relevantes da biblioteca NLTK +from nltk.sentiment.vader import SentimentIntensityAnalyzer + +# a seguir, inicializamos o VADER para que possamos usá-lo em nosso script Python +sid = SentimentIntensityAnalyzer() + +# a variável 'message_text' agora contém o texto que iremos analisar. +message_text = '''Like you, I am getting very frustrated with this process. I am genuinely trying to be as reasonable as possible. I am not trying to "hold up" the deal at the last minute. I'm afraid that I am being asked to take a fairly large leap of faith after this company (I don't mean the two of you -- I mean Enron) has screwed me and the people who work for me.''' + +print(message_text) + +# Utilizar método polarity_scores no sid e passar dentro dele o message_text produz um dicionário com pontuações negativas, neutras, positivas e compostas para o texto de entrada +scores = sid.polarity_scores(message_text) + +# Aqui, percorremos as chaves contidas nas pontuações (pos, neu, neg e pontuações compostas) e imprimimos os pares de valores-chave na tela +for key in sorted(scores): + print('{0}: {1}, '.format(key, scores[key]), end='') +``` + +Salve seu arquivo Python. Agora estamos prontos para executar o código. Usando seu método preferido (ou seu Ambiente de Desenvolvimento Integrado ou a linha de comando), execute seu arquivo Python, `sentimento.py`. + +O resultado deve ser semelhante a este: + +```python +Like you, I am getting very frustrated with this process. 
I am genuinely trying to be as reasonable as possible. I am not trying to "hold up" the deal at the last minute. I'm afraid that I am being asked to take a fairly large leap of faith after this company (I don't mean the two of you -- I mean Enron) has screwed me and the people who work for me. + +compound: -0.3804, neg: 0.093, neu: 0.836, pos: 0.071, +``` +
    +Lembre-se de usar três aspas simples para envolver a string *message_text* acima. Se você usar aspas duplas, a string terminará mais cedo devido às aspas dentro do texto. +
    + +O VADER coleta e pontua palavras e características negativas, neutras e positivas (e leva em conta fatores como negação ao longo do caminho). Os valores “neg”, “neu” e “pos” descrevem a fração das pontuações ponderadas que se enquadram em cada categoria. VADER também soma todas as pontuações ponderadas para calcular um valor “composto” normalizado entre -1 e 1; este valor tenta descrever o efeito geral de todo o texto de fortemente negativo (-1) a fortemente positivo (1). Neste caso, a análise com VADER descreve a passagem como ligeiramente a moderadamente negativa (-0,3804). Podemos pensar nesse valor como uma estimativa da impressão geral de um leitor médio ao considerar o e-mail como um todo, apesar de alguma ambiguidade e ambivalência ao longo do caminho. + +Ao ler o texto, estaria inclinado a concordar com essa avaliação geral. O valor de saída de -0,3804 é negativo, mas não fortemente negativo. Os pesquisadores podem desejar definir um limite mínimo para positividade ou negatividade antes de declarar um texto definitivamente positivo ou negativo - por exemplo, a documentação oficial do VADER sugere um limite de -0,5 e 0,5, que este trecho específico não alcançaria (em outras palavras, este texto é negativo, mas não extremamente negativo). + +O que isso implica, para você, sobre a maneira como esse sentimento pode ser expresso em um contexto de e-mail profissional? Como você definiria seus valores limite quando o texto expressa emoções de maneira mais sutil ou cortês? Você acha que a análise de sentimento é uma ferramenta apropriada para nossa análise exploratória de dados? + +Desafio: tente substituir o conteúdo de *message_text* pelas seguintes cadeias de caracteres e execute novamente o programa. Não se esqueça de cercar cada texto com três aspas simples ao atribuí-lo à variável *message_text* (como em: *message_text* = ''' algumas palavras '''). 
Antes de executar o programa, tente adivinhar o resultado da análise de sentimento: positivo ou negativo? Quão positivo ou negativo? + +Texto 1: + +``` +Looks great. I think we should have a least 1 or 2 real time traders in Calgary. +``` + +Texto 2: + +``` +I think we are making great progress on the systems side. I would like to +set a deadline of November 10th to have a plan on all North American projects +(I'm ok if fundementals groups are excluded) that is signed off on by +commercial, Sally's world, and Beth's world. When I say signed off I mean +that I want signitures on a piece of paper that everyone is onside with the +plan for each project. If you don't agree don't sign. If certain projects +(ie. the gas plan) are not done yet then lay out a timeframe that the plan +will be complete. I want much more in the way of specifics about objectives +and timeframe. + +Thanks for everyone's hard work on this. +``` + +Experimente uma terceira vez com algum texto de uma de suas próprias fontes de pesquisa. Que resultados você obteve para cada um? Você concorda com os resultados? + +# Determine o escopo apropriado para e-mail + +Quando analisado por meio da ferramenta de análise de sentimento VADER, o texto produz um conjunto de pontuações positivas, neutras e negativas, que são então agregadas e dimensionadas como uma "pontuação composta". Embora seja útil saber em teoria, como esse método pode ser aplicado aos dados no exemplo da Enron - isto é, uma coleção de dados de e-mail e metadados? E o que isso pode nos dizer sobre as emoções, relacionamentos e mudanças ao longo do tempo dos funcionários da Enron? + +Nesta seção, apresentaremos a você o processo de seleção do escopo de análise para nossa ferramenta de análise de sentimento. 
Considere os seguintes dados brutos pertencentes a um e-mail de 23 de outubro de 2000 escrito por Jeffrey Shankman, então presidente de mercados globais da Enron (Quinn, 2006): + +``` +Message-ID: <3764632.1075857565248.JavaMail.evans@thyme> +Date: Mon, 23 Oct 2000 09:14:00 -0700 (PDT) +From: jeffrey.shankman@enron.com +To: john.nowlan@enron.com, don.schroeder@enron.com, david.botchlett@enron.com, + chris.mahoney@enron.com, ross.koller@enron.com +Subject: +Mime-Version: 1.0 +Content-Type: text/plain; charset=us-ascii +Content-Transfer-Encoding: 7bit +X-From: Jeffrey A Shankman +X-To: John L Nowlan, Don Schroeder, David J Botchlett, Chris Mahoney, Ross Koller +X-cc: +X-bcc: +X-Folder: \Jeffrey_Shankman_Jun2001\Notes Folders\Sent +X-Origin: Shankman-J +X-FileName: jshankm.nsf + +It seems to me we are in the middle of no man's land with respect to the +following: Opec production speculation, Mid east crisis and renewed +tensions, US elections and what looks like a slowing economy (?), and no +real weather anywhere in the world. I think it would be most prudent to play +the markets from a very flat price position and try to day trade more +aggressively. I have no intentions of outguessing Mr. Greenspan, the US. +electorate, the Opec ministers and their new important roles, The Israeli and +Palestinian leaders, and somewhat importantly, Mother Nature. Given that, +and that we cannot afford to lose any more money, and that Var seems to be a +problem, let's be as flat as possible. I'm ok with spread risk (not front to +backs, but commodity spreads). + + +The morning meetings are not inspiring, and I don't have a real feel for +everyone's passion with respect to the markets. As such, I'd like to ask +John N. to run the morning meetings on Mon. and Wed. + + +Thanks. Jeff +``` + +No texto da mensagem do e-mail, Shankman traça uma estratégia corporativa para avançar no que ele percebe como um contexto geopolítico ambíguo. 
A mensagem descreve uma série de situações difíceis, bem como exasperação ("As reuniões matinais não são inspiradoras") e incerteza ("Não tenho um sentimento real de paixão de todos"). Ao mesmo tempo, Shankman descreve um conjunto de etapas de ação junto com pedidos educados ("Eu gostaria de pedir ...") e expressões de gratidão ("Obrigado"). + +Antes de prosseguirmos, pare um minuto para refletir sobre a mensagem. Como você acha que um leitor típico descreveria a intensidade emocional deste e-mail? Considerando o que você sabe agora sobre VADER, que proporção de positividade, negatividade e neutralidade você espera que a ferramenta de análise de sentimento encontre na mensagem? Finalmente, o que você acha que a pontuação composta irá sugerir sobre o efeito geral na mensagem? + +Como discutimos acima, a análise de sentimento não fornece uma saída objetiva, mas sim indicadores de orientação que refletem nossa escolha e calibração de ferramentas analíticas. Talvez o elemento mais importante da calibração seja selecionar o escopo do texto que está sendo analisado, ou seja, quanto de uma mensagem colocamos na ferramenta de uma vez. Em nosso caso, podemos determinar o escopo da análise decidindo entre analisar a mensagem inteira como uma única unidade ou, em vez disso, dividir a mensagem em unidades menores como frases e analisar cada uma separadamente. + +Primeiro, vamos considerar uma abordagem no nível da mensagem, na qual analisamos a mensagem como um único bloco: + +```python +# Continue com o mesmo código da seção anterior, mas substitua a variável *message_text* pelo novo texto do e-mail: + +message_text = '''It seems to me we are in the middle of no man's land with respect to the following: Opec production speculation, Mid east crisis and renewed tensions, US elections and what looks like a slowing economy (?), and no real weather anywhere in the world. 
I think it would be most prudent to play the markets from a very flat price position and try to day trade more aggressively. I have no intentions of outguessing Mr. Greenspan, the US. electorate, the Opec ministers and their new important roles, The Israeli and Palestinian leaders, and somewhat importantly, Mother Nature. Given that, and that we cannot afford to lose any more money, and that Var seems to be a problem, let's be as flat as possible. I'm ok with spread risk (not front to backs, but commodity spreads). The morning meetings are not inspiring, and I don't have a real feel for everyone's passion with respect to the markets. As such, I'd like to ask John N. to run the morning meetings on Mon. and Wed. Thanks. Jeff''' + +``` + +Substitua `sentimento.py` pelo código acima, salve-o e execute-o. A saída deve ser semelhante a esta: + +```python +It seems to me we are in the middle of no man's land with respect to the following: Opec production speculation, Mid east crisis and renewed tensions, US elections and what looks like a slowing economy (?), and no real weather anywhere in the world. I think it would be most prudent to play the markets from a very flat price position and try to day trade more aggressively. I have no intentions of outguessing Mr. Greenspan, the US. electorate, the Opec ministers and their new important roles, The Israeli and Palestinian leaders, and somewhat importantly, Mother Nature. Given that, and that we cannot afford to lose any more money, and that Var seems to be a problem, let's be as flat as possible. I'm ok with spread risk (not front to backs, but commodity spreads). The morning meetings are not inspiring, and I don't have a real feel for everyone's passion with respect to the markets. As such, I'd like to ask John N. to run the morning meetings on Mon. and Wed. Thanks. 
Jeff +compound: 0.889, neg: 0.096, neu: 0.765, pos: 0.14, +``` + +Aqui você pode ver que, ao analisar o e-mail como um todo, VADER retorna valores que sugerem que a mensagem é principalmente neutra (neu: 0,765), mas que mais recursos parecem ser positivos (pos: 0,14) em vez de negativos (neg: 0,096). VADER calcula uma pontuação geral de sentimento de 0,889 para a mensagem (em uma escala de -1 a 1), o que sugere um efeito fortemente positivo para a mensagem como um todo. + +Isso atendeu às suas expectativas? Se não, por que você acha que o VADER encontrou mais características positivas do que negativas? + +No nível da entidade da mensagem, não há como destacar sentimentos particularmente positivos ou negativos na mensagem. Essa perda de detalhes pode ser irrelevante ou pode ser vital ao conduzir uma análise exploratória. Isso depende das necessidades de pesquisa de seu estudo. Por exemplo, identificar frases negativas em e-mails de outra forma adequados pode ser especialmente importante ao procurar explosões emocionais ou trocas abusivas que podem ocorrer muito raramente, mas revelam algo essencial sobre a natureza de um relacionamento. Se quisermos capturar esse nível de nuance, precisamos de um método para passar da análise do nível da mensagem para a análise no nível da frase. + +Felizmente, o NLTK oferece uma coleção de ferramentas para dividir o texto em componentes menores. Os tokenizadores dividem as sequências de texto em pedaços menores, como frases. Alguns podem ainda dividir uma frase em partes específicas do discurso, como o substantivo, adjetivo e assim por diante. No nosso caso, usaremos o tokenizer english.pickle do NLTK para dividir os parágrafos em sentenças. + +Agora podemos reescrever o script de análise de sentimento para analisar cada frase separadamente: + +```python +# Abaixo está o código de análise de sentimento reescrito para uma análise por frase +# observe o novo módulo -- word_tokenize! 
+import nltk.data +from nltk.sentiment.vader import SentimentIntensityAnalyzer +from nltk import sentiment +from nltk import word_tokenize + +# Em seguida, inicializamos VADER para utilizá-lo em nosso script Python +sid = SentimentIntensityAnalyzer() + +# Vamos também incializar nossa função 'english.pickle' e atribuir a ela um nome curto + +tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') + +message_text = '''It seems to me we are in the middle of no man's land with respect to the following: Opec production speculation, Mid east crisis and renewed tensions, US elections and what looks like a slowing economy (?), and no real weather anywhere in the world. I think it would be most prudent to play the markets from a very flat price position and try to day trade more aggressively. I have no intentions of outguessing Mr. Greenspan, the US. electorate, the Opec ministers and their new important roles, The Israeli and Palestinian leaders, and somewhat importantly, Mother Nature. Given that, and that we cannot afford to lose any more money, and that Var seems to be a problem, let's be as flat as possible. I'm ok with spread risk (not front to backs, but commodity spreads). The morning meetings are not inspiring, and I don't have a real feel for everyone's passion with respect to the markets. As such, I'd like to ask John N. to run the morning meetings on Mon. and Wed. Thanks. Jeff''' + +# O método de tokenização quebra o parágrafo em uma lista de frases (strings). Neste exemplo, observe que o tokenizer se confunde pela falta de espaçamento após o ponto final e acaba por quebrar as frases de forma equivocada. Como podemos consertar isso? + +sentences = tokenizer.tokenize(message_text) + +# Vamos adicionar um passo para percorrer a lista de frases, calcular e imprimir a pontuação de polaridade para cada uma. 
+ +for sentence in sentences: + print(sentence) + scores = sid.polarity_scores(sentence) + for key in sorted(scores): + print('{0}: {1}, '.format(key, scores[key]), end='') + print() +``` + + +O resultado deve ser semelhante a este: + +```python +It seems to me we are in the middle of no man's land with respect to the following: Opec production speculation, Mid east crisis and renewed tensions, US elections and what looks like a slowing economy (? +compound: -0.5267, neg: 0.197, neu: 0.68, pos: 0.123, +), and no real weather anywhere in the world. +compound: -0.296, neg: 0.216, neu: 0.784, pos: 0.0, +I think it would be most prudent to play the markets from a very flat price position and try to day trade more aggressively. +compound: 0.0183, neg: 0.103, neu: 0.792, pos: 0.105, +I have no intentions of outguessing Mr. Greenspan, the US. +compound: -0.296, neg: 0.216, neu: 0.784, pos: 0.0, +electorate, the Opec ministers and their new important roles, The Israeli and Palestinian leaders, and somewhat importantly, Mother Nature. +compound: 0.4228, neg: 0.0, neu: 0.817, pos: 0.183, +Given that, and that we cannot afford to lose any more money, and that Var seems to be a problem, let's be as flat as possible. +compound: -0.1134, neg: 0.097, neu: 0.823, pos: 0.081, +I'm ok with spread risk (not front to backs, but commodity spreads). +compound: -0.0129, neg: 0.2, neu: 0.679, pos: 0.121, +The morning meetings are not inspiring, and I don't have a real feel for everyone's passion with respect to the markets. +compound: 0.5815, neg: 0.095, neu: 0.655, pos: 0.25, +As such, I'd like to ask John N. to run the morning meetings on Mon. +compound: 0.3612, neg: 0.0, neu: 0.848, pos: 0.152, +and Wed. +compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, +Thanks. +compound: 0.4404, neg: 0.0, neu: 0.0, pos: 1.0, +Jeff +compound: 0.0, neg: 0.0, neu: 1.0, pos: 0.0, +``` + +Aqui, você notará uma visualização muito mais detalhada do sentimento neste e-mail. 
O VADER identifica com sucesso sentenças moderadas a fortemente negativas no e-mail, especialmente as principais descrições de crises. A análise no nível da frase permite que você identifique frases e tópicos específicos nos extremos do sentimento, o que pode ser útil mais tarde. + +Mas, mesmo nesse nível, o VADER também comete vários erros. A frase que começa com “As reuniões matinais não são inspiradoras” resulta em uma pontuação surpreendentemente positiva - talvez por causa de uma leitura incorreta dos termos “paixão” e “respeito”. + + Observe também que o ponto de interrogação no início do e-mail e o ponto de abreviação após *Mon* (Segunda-feira: *seg.*) próximo ao final fazem com que o tokenizador english.pickle quebre as frases por engano. Este é um risco constante de pontuação informal e complexa no texto. + +O que você nota sobre a distribuição dos scores de sentimento? Como você poderia coletá-los de uma maneira que o ajude a entender melhor seus dados e as questões de pesquisa de seu interesse? (Sinta-se à vontade para experimentar diferentes tipos de texto na variável *message_text* para ver como a ferramenta responde a diferentes tipos de construções de linguagem). O código que você acabou de escrever pode ser reaproveitado para qualquer texto. + +# Agradecimentos + +Meus sinceros agradecimentos a Justin Joque, Bibliotecário de Visualização da Biblioteca da Universidade de Michigan e do Digital Projects Studio, pelo apoio na formulação das ideias e abordagem por trás desta lição. Muito obrigado também a Adam Crymble, que forneceu diversas ideias e apoio durante todo o processo editorial. E obrigado a Anandi Silva Knuppel e Puteri Zarina Megat Khalid por seus comentários atenciosos. + +# Referências + +Barton, D., & Hall, N. (Eds.). (2000). Letter writing as a social practice (Vol. 9). John Benjamins Publishing. + +Hardin, J., Sarkis, G., & Urc, P. C. (2015). Network Analysis with the Enron Email Corpus. Journal of Statistics Education, 23:2. 
https://doi.org/10.1080/10691898.2015.11889734 + +Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for Sentiment Analysis of Social Media Text. Eighth International Conference on Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014. https://www.aaai.org/ocs/index.php/ICWSM/ICWSM14/paper/viewPaper/8109 + +Klimt, B., & Yang, Y. (2004, July). Introducing the Enron Corpus. In CEAS. https://bklimt.com/papers/2004_klimt_ceas.pdf + +Klimt, B., & Yang, Y. (2004). The Enron corpus: A new dataset for email classification research. Machine learning: ECML 2004, 217-226. https://bklimt.com/papers/2004_klimt_ecml.pdf + +Tukey, J.W. (1977). Exploratory Data Analysis. Addison-Wesley Publishing Company + +Quinn, J. (2006, November 14). Ex-Enron man goes back into energy. Retrieved January 10, 2018, from https://www.telegraph.co.uk/finance/2950645/Ex-Enron-man-goes-back-into-energy.html diff --git a/pt/licoes/autoria-sustentavel-texto-simples-pandoc-markdown.md b/pt/licoes/autoria-sustentavel-texto-simples-pandoc-markdown.md index 39efa08d17..5aa0dd93e9 100644 --- a/pt/licoes/autoria-sustentavel-texto-simples-pandoc-markdown.md +++ b/pt/licoes/autoria-sustentavel-texto-simples-pandoc-markdown.md @@ -1,352 +1,352 @@ ---- -title: Autoria Sustentável em Texto Simples usando Pandoc e Markdown -layout: lesson -collection: lessons -slug: autoria-sustentavel-texto-simples-pandoc-markdown -date: 2014-03-19 -translation_date: 2022-11-27 -authors: -- Dennis Tenen -- Grant Wythoff -lesson-testers: -- Pao-Chuan Ma -tested-date: 2021-06-10 -editors: -- Fred Gibbs -translator: -- Gabriela Kucuruza -translation-editor: -- Jimmy Medeiros -translation-reviewer: -- Daniel Bonatto Seco -- André Salvo -difficulty: 2 -activity: sustaining -topics: [website, data-management] -abstract: "Neste tutorial, você aprenderá primeiro o básico do Markdown - uma sintaxe de marcação fácil de ler e escrever para texto simples - bem como Pandoc, uma ferramenta de linha de comando 
que converte texto simples em vários tipos de ficheiros formatados: PDF, docx, HTML, LaTeX, apresentação de slides e muito mais." -exclude_from_check: - - reviewers - - review-ticket -original: sustainable-authorship-in-plain-text-using-pandoc-and-markdown -avatar_alt: Um homem trabalhando numa mesa de desenho -doi: 10.46430/phpt0036 ---- - -{% include toc.html %} - -{% include figure.html filename="lexoriter.jpg" caption="" %} - -## Objetivos - -Neste tutorial, você aprenderá primeiro o básico do Markdown - uma sintaxe de marcação fácil de ler e de escrever para texto simples - assim como o [Pandoc](https://pandoc.org/), uma ferramenta de linha de comando que converte texto simples em vários tipos de ficheiro belamente formatados: PDF, docx, HTML, LaTeX, apresentações de slides e muito mais.[^1] Com o Pandoc como sua ferramenta de composição digital, você pode usar a sintaxe Markdown para adicionar figuras, bibliografia, formatação e alterar facilmente os estilos de citação de Chicago para MLA (por exemplo), todos usando texto simples. - -Este tutorial não pressupõe nenhum conhecimento técnico prévio, mas aumenta com a experiência, uma vez que vamos sugerir técnicas mais avançadas ao final de cada seção. Elas estão claramente marcadas e podem ser revisitadas após alguma prática e experimentação. - -Ao invés de seguir esse tutorial de maneira mecânica, recomendamos que se esforce para entender as soluções oferecidas aqui como uma _metodologia_, que pode precisar de adaptações para se adequar ao seu ambiente e fluxo de trabalho. A instalação das ferramentas necessárias apresenta talvez a maior barreira à participação. Tenha tempo e paciência suficientes para instalar tudo corretamente, ou faça isso com um/a colega que tenha uma configuração semelhante e ajudem-se mutuamente. 
Consulte a seção [Recursos Úteis](/pt/licoes/autoria-sustentavel-texto-simples-pandoc-markdown#recursos-uteis) abaixo se ficar preso.[^2] - -## Filosofia -Escrever, armazenar e recuperar documentos são atividades centrais para o fluxo de trabalho de pesquisa das humanidades. Mesmo assim, muitos autores baseiam suas práticas em ferramentas e formatos proprietários que, às vezes, ficam aquém dos requisitos mais básicos da escrita acadêmica. Talvez possa se lembrar de certa frustração com a fragilidade de notas de rodapé, bibliografias, figuras e rascunhos de livros escritos em Microsoft Word ou Google Docs. No entanto, a maioria dos periódicos ainda insiste em submissões no formato .docx. - -Mais do que causar frustração pessoal, essa dependência de ferramentas e de formatos proprietários tem implicações negativas de longo prazo para a comunidade acadêmica. Em tal ambiente, os periódicos devem terceirizar a composição, alienando os autores dos contextos materiais de publicação e adicionando outras barreiras desnecessárias à circulação irrestrita do conhecimento.[^3] - -Quando se usa MS Word, Google Docs ou Open Office para escrever documentos, o que se vê não é o que se obtém. Embaixo da camada visível de palavras, frases e parágrafos, encontra-se uma complicada camada de código compreensível apenas para as máquinas. Por causa dessa camada oculta, os ficheiros .docx e .pdf dependem de ferramentas proprietárias para serem visualizados corretamente. Esses documentos são difíceis de pesquisar, imprimir e converter em outros formatos de ficheiros. - -Além disso, o tempo gasto formatando documentos em MS Word ou Open Office é perdido, pois toda essa formatação é removida pelo editor durante a submissão. Tanto os autores quanto os editores se beneficiariam da troca de ficheiros com formatação mínima, deixando a composição tipográfica para o estágio final de composição do processo de publicação. - -Aqui é onde o Markdown brilha. 
Markdown é uma sintaxe para marcar explicitamente elementos semânticos dentro de um documento, não em alguma camada oculta. A ideia é identificar as unidades que são significativas para humanos, como títulos, seções, subseções, notas de rodapé e ilustrações. No mínimo, os seus ficheiros sempre permanecerão compreensíveis **para você**, mesmo se o editor de texto que estiver usando parar de funcionar ou "sair do mercado". - -Escrever dessa forma libera o autor da ferramenta. Markdown pode ser escrito em qualquer editor de texto simples e oferece um rico ecossistema de software que pode renderizar o texto em documentos belamente formatados. Por esta razão, o Markdown está atualmente passando por um período de crescimento, não apenas como meio para escrever artigos acadêmicos, mas como uma convenção para edição online em geral. - -Os editores de texto simples de uso geral populares incluem [Atom](https://atom.io/) (todas as plataformas) e [Notepad ++](https://notepad-plus-plus.org/) (somente para Windows). - -É importante entender que o Markdown é apenas uma convenção. Os ficheiros Markdown são armazenados como texto simples, aumentando ainda mais a flexibilidade do formato. Ficheiros de texto simples existem desde a máquina de escrever eletrônica. A longevidade deste padrão torna, de modo inerente, o texto simples mais sustentável e estável do que os formatos proprietários. Enquanto os ficheiros produzidos até dez anos atrás no Microsoft Word e no Apple Pages, podem causar problemas significativos quando abertos nas versões mais recentes, ainda é possível abrir um ficheiro escrito em qualquer editor de texto simples “morto” nas últimas décadas: AlphaPlus, Perfect Writer, Text Wizard, Spellbinder, WordStar ou o favorito de Isaac Asimov, SCRIPSIT 2.0 , feito por Radio Shack. Escrever em texto simples garante que seus ficheiros permanecerão legíveis daqui a dez, quinze, vinte anos. 
Neste tutorial, descrevemos um fluxo de trabalho que libera o pesquisador de softwares proprietários de processamento de texto e de formatos de ficheiro frágeis. - -Agora é possível escrever uma ampla variedade de documentos em um formato - artigos, postagens de blog, wikis, programas de estudos e cartas de recomendação - usando o mesmo conjunto de ferramentas e técnicas para pesquisar, descobrir, fazer backup e distribuir nossos materiais. Suas notas, entradas de blog, documentação de código e wikis podem ser criados no Markdown. Cada vez mais, muitas plataformas como WordPress, Reddit e GitHub suportam a autoria Markdown nativamente. A longo prazo, sua pesquisa se beneficiará desses fluxos de trabalho unificados, tornando mais fácil salvar, pesquisar, compartilhar e organizar seus materiais. - -## Princípios - -Inspirados pelas melhores práticas em uma variedade de disciplinas, nós fomos guiados pelos seguintes princípios: - -1. _Sustentabilidade_. O texto simples garante tanto transparência, como atende aos padrões de preservação de longo prazo. O Word pode seguir o caminho do [Word Perfect](https://pt.wikipedia.org/wiki/WordPerfect) no futuro, mas o texto simples sempre permanecerá fácil de ler, catalogar, extrair e transformar. Além disso, o texto simples permite um controle fácil e poderoso do versionamento do documento, o que é útil na colaboração e na organização de rascunhos. Seus ficheiros de texto simples estarão acessíveis em telefones celulares, tablets ou, talvez, em um terminal de baixa potência em alguma biblioteca remota. O texto simples é compatível com versões anteriores e à prova de futuro. Qualquer que seja o software ou hardware que vier a seguir, ele será capaz de entender os seus ficheiros de texto simples. -2. _Preferência por formatos legíveis por humanos_. Quando escrevemos no Word ou no Google Docs, o que vemos não é o que obtemos. 
O ficheiro .doc contém uma formatação oculta de caracteres gerados automaticamente, criando uma camada de composição tipográfica ofuscada que é difícil para o usuário solucionar. Algo tão simples como colar uma imagem ou texto do navegador pode ter efeitos imprevisíveis na formatação do seu documento. -3. _Separação entre forma e conteúdo_. Escrever e formatar ao mesmo tempo é distrativo. A ideia é escrever primeiro e formatar depois, o mais próximo possível da hora da publicação. Uma tarefa como mudar da formatação Chicago para MLA deve ser simples. Os editores de periódicos que desejam economizar tempo na formatação desnecessária e na edição de cópias devem ser capazes de fornecer aos seus autores um modelo de formatação que cuida dos detalhes da composição tipográfica. -4. _Apoio ao aparato acadêmico_. O fluxo de trabalho precisa lidar com notas de rodapé, figuras, caracteres internacionais e bibliografias com elegância. -5. _Independência de plataforma_. Na medida em que os vetores de publicação se multiplicam, precisamos ser capazes de gerar uma multiplicidade de formatos, incluindo projeção de slides, impressão, web e celular. Idealmente, gostaríamos de poder gerar os formatos mais comuns sem quebrar as dependências bibliográficas. Nosso fluxo de trabalho também precisa ser portátil - seria bom poder copiar uma pasta para um pen drive e saber que ela contém tudo o que é necessário para publicação de estudos. Escrever em texto simples significa que é possível facilmente compartilhar, editar e arquivar seus documentos em praticamente qualquer ambiente. Por exemplo, um programa escrito em Markdown pode ser salvo como PDF, impresso como um folheto e convertido em HTML para a web, tudo a partir do mesmo ficheiro. Tanto os documentos da web quanto os impressos devem ser publicados da mesma fonte e ter aparência semelhante, preservando o layout lógico do material. - -Markdown e LaTeX cumprem todos esses requisitos. 
Nós escolhemos Markdown (e não LaTeX) porque ele oferece a sintaxe mais leve e organizada (por isso, _mark down_) e porque quando unido com Pandoc, permite maior flexibilidade nas saídas (incluindo ficheiros .docs e .tex).[^4] - -## Requisitos de Software - -Nós omitimos propositalmente alguns dos detalhes menores vinculados à plataforma ou ao sistema operacional de instalação do software listado abaixo. Por exemplo, não faz sentido fornecer instruções de instalação para o LaTeX, quando as instruções online para o seu sistema operacional serão sempre mais atuais e completas. Da mesma forma, o processo de instalação do Pandoc é melhor explorado pesquisando por “instalar o Pandoc” no Google, com o provável primeiro resultado sendo a página inicial do Pandoc. - - - **Editor de texto simples**. Entrar no mundo de edição de texto simples expande dramaticamente as suas escolhas de ferramentas inovadoras de autoria. Pesquise online por "editor de texto markdown" e experimente as opções. Não importa qual for usada, contanto que seja explicitamente um editor de texto simples, como Atom e Notepad++. Lembre-se de que nós não estamos presos a ferramenta, é possível trocar de editor a qualquer momento. - - **Terminal de linha de comando**. Trabalhar na "linha de comando" equivale a escrever comandos no terminal. Em um Mac, apenas pesquise por "Terminal". No Windows, use o [PowerShell](https://pt.wikipedia.org/wiki/PowerShell). Usuários de Linux provavelmente já devem estar familiarizados com seus terminais. Nós iremos cobrir o básico de como procurar e usar a linha de comando abaixo. - - **Pandoc**. Instruções de instalação detalhadas e para plataformas específicas estão disponíveis no [site do Pandoc](https://pandoc.org/installing.html). _A instalação do Pandoc na sua máquina é crucial para esse tutorial_, então tome o seu tempo navegando pelas instruções. O Pandoc foi criado e é mantido por John MacFarlane, Professor de Filosofia na Universidade da Califórnia, Berkeley. 
Isso é a humanidade computacional em sua melhor expressão e servirá como o motor de nosso fluxo de trabalho. Com o Pandoc, será possível compilar texto e bibliografia em documentos belamente formatados e flexíveis. Depois de seguir as instruções de instalação, verifique se o Pandoc está instalado digitando `pandoc --version` na linha de comando. Presumimos que a sua versão seja ao menos a versão 1.12.3, lançada em janeiro de 2014. - -Os próximos dois softwares são recomendados, mas não requisitados para realizar esse tutorial. - -* **Zotero ou Endnote**. Softwares de referência bibliográfica como Zotero e Endnote são ferramentas indispensáveis para organizar e formatar citações em um artigo de pesquisa. Esses programas podem exportar suas bibliotecas como um ficheiro BibTeX (sobre o qual você aprenderá mais no Caso 2 a seguir). Este ficheiro, por si só um documento de texto simples formatado com todas as suas citações, permitirá que você cite referências de forma rápida e fácil usando `@tags`. Deve-se notar que também é possível digitar todas as suas referências bibliográficas à mão, usando [nossa bibliografia](https://github.com/dh-notes/pandoc-workflow/blob/master/pandoctut.bib) como modelo. -* **LaTeX**. Instruções de instalação detalhadas e específicas da plataforma estão disponíveis no [site do Pandoc](https://pandoc.org/installing.html). Embora o LaTeX não seja abordado neste tutorial, ele é usado pelo Pandoc para a criação de .pdf. Usuários avançados frequentemente irão converter para LaTeX diretamente para ter um controle mais minucioso sobre a composição do .pdf. Os iniciantes podem querer pular esta etapa. Caso contrário, digite`latex -v` para ver se o LaTeX está instalado corretamente (você receberá um erro se não estiver e algumas informações sobre a versão, se estiver). - -## Básico do Markdown - -O Markdown é uma convenção para estruturar os seus documentos de texto simples semanticamente. 
A ideia é identificar estruturas lógicas no seu documento (títulos, seções, subseções, notas de rodapé, etc.), marcá-las com caracteres discretos e então "compilar" o texto resultante com um interpretador de composição tipográfica que formatará o documento consistentemente, de acordo com um estilo específico. - -As convenções de Markdown vêm em várias “versões” projetadas para uso em contextos específicos, como blogs, wikis ou repositórios de código. O do Markdown usado pelo [Pandoc](https://pandoc.org/MANUAL.html#pandocs-markdown) é voltado para uso acadêmico. Suas convenções são descritas na página Markdown do Pandoc. Suas convenções incluem o bloco “[YAML](https://pandoc.org/MANUAL.html#extension-yaml_metadata_block)”, que contém alguns metadados úteis. - -Vamos agora criar um documento simples no Markdown. Abra um editor de texto simples de sua escolha e comece a digitar. Deve ser assim: - -``` ---- -title: Fluxo de Trabalho em Texto Simples -author: Gabriela Domingues -date: 20 de janeiro de 2014 -fontfamily: times ---- -``` - -A versão do Markdown usada pelo Pandoc armazena cada um dos valores acima, e "imprime-os" na localização apropriada do seu documento de saída quando o documento estiver pronto para a composição tipográfica. Aprenderemos mais tarde a adicionar outros campos mais poderosos ao bloco "YAML". Por enquanto, vamos fingir que estamos escrevendo um artigo que contém três seções, cada uma subdividida em duas subseções. Deixe uma linha em branco após os três últimos traços no bloco "YAML" e cole o seguinte: - -``` - -# Seção 1 - -## Subseção 1.1 -Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. 
- -O parágrafo seguinte deve começar sem recuo: - -## Subseção 1.2 -Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. - -# Seção 2 - -## Subseção 2.1 -``` - -Vá em frente e escreva um texto simulado também. Espaços em branco são significativos em Markdown: não recue os seus parágrafos. Ao invés disso, separe parágrafos usando uma linha vazia. Linhas vazias também devem preceder os cabeçalhos das seções. - -Use asteriscos para adicionar ênfases em negrito ou em itálico, assim: `*itálico*` e `**negrito**`. Nós devemos também adicionar um link e uma nota de rodapé no nosso texto para cobrir os componentes básicos de um artigo médio. Digite: - -``` -Uma frase que precisa de uma nota.[^1] - -[^1]: Essa é a minha primeira nota de rodapé! E um [link](https://www.eff.org/). -``` - -Quando o texto do link e o endereço são iguais, é mais rápido escrever `<www.eff.org>` ao invés de `[www.eff.org](www.eff.org)`. - -Vamos salvar nosso ficheiro antes de avançar. Crie a nova pasta que irá armazenar esse projeto. É provável que tenha algum sistema de organização de seus documentos, projetos, ilustrações e bibliografias, mas geralmente, o seu documento, e as suas ilustrações e bibliografia estão em pastas diferentes, o que os torna mais difíceis de achar. Nosso objetivo é criar uma única pasta para cada projeto, com todos os materiais relevantes incluídos. A regra geral é um projeto, um artigo, uma pasta. Nomeie seu ficheiro como `main.md`, onde “md” significa markdown. - -Depois que seu ficheiro for salvo, vamos adicionar uma ilustração. Copie uma imagem (qualquer imagem pequena) para a sua pasta e adicione o seguinte em algum lugar no corpo do texto: `![legenda da imagem](sua_imagem.jpg)`. - -Nesse ponto, o seu `main.md` deve parecer com o que está abaixo. 
É possível baixar esse exemplo de ficheiro teste.md [aqui](/assets/autoria-sustentavel-texto-simples-pandoc-markdown/teste.md). - -``` ---- -title: Fluxo de trabalho de texto simples -author: Gabriela Domingues -date: 20 de Janeiro de 2014 ---- - -# Seção 1 - -## Subseção 1.1 - -Lorem *ipsum* dolor sit amet, **consectetur** adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. - -## Subseção 1.2 - -Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. - -O próximo parágrafo deve começar assim. Não dê recuo. - -# Seção 2 - -## Subseção 2.1 - -![legenda da imagem](sua_imagem.jpg) - -## Subseção 2.2 - -Uma frase que precisa de uma nota.[^1] - -[^1]: Essa é a minha primeira nota de rodapé! E um [link](https://www.eff.org/). -``` -Como faremos em breve, esse ficheiro de texto simples pode ser renderizado em um belo PDF: - -{% include figure.html filename="autoria-sustentavel-texto-simples-pandoc-markdown-01.png" alt="Imagem representando o ficheiro MarkDown e a respectiva versão em Word produzida com o Pandoc" caption="Exemplo de captura de tela de Word renderizado no Pandoc" %} - -Se quiser ter uma ideia de como esse tipo de marcação será interpretado como formatação HTML, experimente esse [espaço de teste online](https://daringfireball.net/projects/markdown/dingus) e brinque com vários tipos de sintaxe. Lembre-se de que certos elementos do Markdown com o sabor do Pandoc (como o bloco de título e as notas de rodapé) não funcionarão neste formulário da web, que aceita apenas o básico. 
- -Neste ponto, gaste algum tempo explorando alguns dos outros recursos do Markdown, como citações (referenciadas pelo símbolo `>`), listas de marcadores que começam com `*` ou `-`, quebras de linha textuais que começam com `|` (útil para poesia), tabelas e algumas das outras funções listadas na página de marcação do Pandoc. - -Preste bastante atenção em espaços vazios e no fluxo dos parágrafos. A documentação coloca sucintamente quando define um parágrafo como "uma ou mais linhas de texto seguida por uma ou mais linhas vazias.". Note que "linhas novas são tratadas como espaços" e que "se precisa de uma quebra de linha forte, coloque dois ou mais espaços no final de uma linha." A melhor maneira de entender o que isso significa é experimentar livremente. Use o modo de visualização do seu editor ou apenas execute o Pandoc para ver os resultados dos seus experimentos. - -Acima de tudo, evite a vontade de formatar. Lembre-se de que estamos identificando unidades semânticas: seções, subseções, ênfases, notas de rodapé e figuras. Mesmo *itálico* e **negrito** em Markdown não são realmente marcos de formatação, mas indicam diferentes níveis de *ênfase*. A formatação acontecerá depois, quando souber o lugar e os requisitos da publicação. - -Existem programas que permitem que se veja uma pré-visualização em tempo real da saída do Markdown enquanto se edita o ficheiro de texto simples, que nós detalhamos abaixo na seção de Recursos Úteis. Poucos deles suportam, entretanto, notas de rodapé, figuras e bibliografias. Para aproveitar o Pandoc ao máximo, nós recomendamos que use ficheiros de texto simples armazenados localmente, no seu computador. - -## Entrando em contato com a linha de comando do seu computador - -Antes de começarmos a publicar o nosso ficheiro `main.md` em outros formatos, nós precisamos nos orientar sobre como trabalhar com a linha de comando usando o programa de terminal do seu computador, que é o único (e melhor) modo de usar o Pandoc. 
- -A linha de comando é um lugar amigável, uma vez que se acostuma com ela. Se já estiver familiarizado com o uso da linha de comando, sinta-se à vontade para pular esta seção. Para outros, é importante entender que ser capaz de usar seu programa de terminal diretamente permitirá que se use uma ampla gama de poderosas ferramentas de pesquisa que não poderiam ser usadas de outra forma, e podem servir como base para um trabalho mais avançado. Para os fins deste tutorial, é preciso aprender apenas alguns comandos muito simples. - -Primeiro, abra uma janela de linha de comando. Se você estiver usando o macOS, abra o aplicativo Terminal no diretório ‘Aplicativos / Utilitários’. No Windows, recomendamos que use o PowerShell ou, para uma solução mais robusta, instale o subsistema do Windows para Linux e use o terminal que vem com sua distribuição favorita do Linux. Para obter uma excelente introdução à linha de comando, consulte [“Introdução à linha de comando Bash” (em inglês)](/en/lessons/intro-to-bash), de Ian Milligan e James Baker. - -No terminal, deve-se ver uma janela de texto e um prompt que parece com isso: `nome-do-computador:~nome-do-usuário$`. O til (`~`) indica qual é o diretório do usuário, e é possível escrever `$ cd ~` em qualquer ponto para retornar para o seu diretório de usuário. Não escreva o cifrão, ele apenas simboliza o prompt de comando no seu terminal, indicando que se digite algo no terminal (em oposição a digitar algo no seu documento); lembre-se de apertar "Enter" após todo comando. - -É bem provável que a sua pasta "Documentos" esteja localizada aqui. Digite `$ pwd` (= _print working directory_, exibe o diretório de trabalho) e aperte "Enter" para exibir o nome do diretório atual. Use `$ pwd` sempre que se sentir perdido. - -O comando `$ ls` (= _list_, listar) simplesmente lista os ficheiros no diretório atual. 
Enfim, pode usar `$ cd` (= _change directory_, mudar diretório) assim: `$ cd NOME_DIRETÓRIO` (em que `NOME_DIRETÓRIO` é o nome do diretório que se quer acessar). Use `$ cd ..` para mover automaticamente um nível para cima na estrutura de diretórios (o diretório-pai do diretório em que se está). Uma vez que começar a digitar o nome do diretório, use a tecla Tab para completar automaticamente o texto - particularmente útil para nomes de diretório longos ou nomes de diretórios que contenham espaços.[^5] - -Esses três comandos de terminal: `pwd`, `ls` e `cd` são tudo o que é preciso para esse tutorial. Pratique-os por alguns minutos para navegar pela sua pasta de documentos e pense na forma em que os seus ficheiros estão organizados. Se quiser, acompanhe seu gerenciador gráfico regular de ficheiros para se manter informado. - -## Usando Pandoc para converter Markdown em um documento do MS Word - -Nós estamos agora prontos para formatar! Abra a sua janela de terminal, use o `$ pwd` e `$ cd NOME_DIRETÓRIO` para navegar até a pasta correta para o seu projeto. Chegando lá, digite `$ ls` no terminal para listar os ficheiros. Se encontrar o seu ficheiro .md e suas imagens, está no lugar certo. Para converter o .md em um .docx escreva: - -``` - $ pandoc main.md -o main.docx -``` -Abra o ficheiro no MS Word e confira os resultados. Alternativamente, se usa o Open- ou LibreOffice, escreva: -``` - $ pandoc main.md -o project.odt -``` -Se não estiver acostumado com a linha de comando, imagine ler o comando acima como se fosse algo como: "Pandoc, crie um ficheiro MS Word a partir do meu ficheiro Markdown". A parte `-o` é uma "bandeira", que nesse caso diz algo como "ao invés de eu lhe dizer explicitamente os formatos de ficheiro de origem e destino, apenas tente adivinhar olhando para a extensão do ficheiro" ou simplesmente "output (saída)". Muitas opções estão disponíveis através desses sinalizadores no Pandoc. 
É possível ver a lista completa no [site do Pandoc](https://pandoc.org/) ou digitando `$ man pandoc` no terminal. - -Tente rodar o comando: -``` - pandoc main.md -o projeto.html -``` -Agora navegue de volta para o diretório do seu projeto. O que aconteceu? - -Usuários mais avançados que tem o LaTeX instalado podem querer experimentar convertendo o Markdown em .tex ou ficheiros .pdf especialmente formatados. Uma vez que o LaTeX estiver instalado, um ficheiro PDF belamente formatado pode ser criado usando a mesma estrutura de comando: -``` - pandoc main.md -o main.pdf -``` - -
    - Se este comando falhar, você pode precisar adicionar um componente que forneça ao pandoc o caminho completo para o motor LaTeX que deseja usar, especificando onde está armazenado. A localização variará se você estiver trabalhando em Mac, Windows ou Linux. Os leitores são aconselhados a verificar o caminho correto para o motor LaTeX em seu sistema e seguir as instruções de instalação atuais (em inglês). -
    - Se o seu documento estiver escrito em outros idiomas que não o inglês, você provavelmente precisará usar o mecanismo XeLaTeX em vez do LaTeX simples para conversão .pdf: -``` - pandoc main.md --pdf-engine=xelatex -o main.pdf -``` -Tenha certeza de que o seu editor de texto suporta a codificação UTF-8. Quando usar XeLaTeX para conversão em .pdf, ao invés do atributo `fontfamily` no "YAML" para mudar fontes, especifique o atributo `mainfont` para produzir algo como isto: -``` - --- - title: Fluxo de Trabalho de Texto Simples - author: Gabriela Domingues - date: 20 de janeiro de 2014 - mainfont: times - --- -``` - -Por exemplo, estilos de fontes podem ser passados para o Pandoc na forma de `pandoc main.md --mainfont=times -o destino.pdf`. Nós preferimos, entretanto, usar as opções de cabeçalho do "YAML" sempre que possível, uma vez que os comandos são mais fáceis de lembrar. Usar uma ferramenta de controle de versão como o Git preservará as mudanças "YAML", onde o que é digitado no terminal é mais efêmero. Consulte a seção de Templates (Modelos) no manual do Pandoc (`man pandoc`) para a lista de variáveis do "YAML" disponíveis. - -## Trabalhando com Bibliografias - -Nesta seção, adicionaremos uma bibliografia ao nosso documento e, em seguida, converteremos os formatos de Chicago para MLA. - -Se não estiver usando um gerenciador de referência como Endnote ou Zotero, use. Preferimos o Zotero porque, como o Pandoc, foi criado pela comunidade acadêmica e, como outros projetos de código aberto, é lançado sob a GNU, General Public License. O mais importante para nós é que o seu gerenciador de referência deve ter a capacidade de gerar bibliografias em formato de texto simples, para manter o alinhamento com nosso princípio “tudo em texto simples”. Vá em frente e abra um gerenciador de referência de sua escolha e adicione algumas entradas de amostra. Quando estiver pronto, encontre a opção de exportar sua bibliografia no formato BibTeX (.bib). 
Salve o ficheiro .bib no diretório do projeto e dê a ele um título razoável como “projeto.bib”. - -A ideia geral é manter as suas fontes organizadas sob um banco de dados bibliográfico centralizado, enquanto geramos ficheiros .bib menores e mais específicos que devem ficar no mesmo diretório que o seu projeto. Vá em frente e abra o seu ficheiro .bib com o editor de texto simples que escolher.[^6] - -O seu ficheiro .bib deve conter múltiplas entradas que se parecem com esta: - - @article{fyfe_digital_2011, - title = {Digital Pedagogy Unplugged}, - volume = {5}, - url = {http://digitalhumanities.org/dhq/vol/5/3/000106/000106.html}, - number = {3}, - urldate = {2013-09-28}, - author = {Fyfe, Paul}, - year = {2011}, - file = {fyfe_digital_pedagogy_unplugged_2011.pdf} - - -Raramente será necessário editá-las manualmente (embora seja possível). Na maioria dos casos, simplesmente o ficheiro .bib será exportado do Zotero ou de um gerenciador de referências semelhante. Reserve um momento para se orientar aqui. Cada entrada consiste em um tipo de documento, “artigo” em nosso caso, um identificador exclusivo (fyfe_digital_2011) e os metadados relevantes sobre título, volume, autor e assim por diante. O que mais nos interessa é o ID exclusivo que segue imediatamente a chave na primeira linha de cada entrada. O ID único é o que nos permite conectar a bibliografia ao documento principal. Deixe este ficheiro aberto por enquanto e volte para o seu ficheiro `main.md`. - -Edite a nota de rodapé na primeira linha do seu ficheiro `main.md` para se parecer com algo como os seguintes exemplos, em que o `@nome_título_data` pode ser substituído por um dos IDs únicos do seu ficheiro `projeto.bib`. - -* `Uma referência formatada como esta será renderizada apropriadamente como citação no estilo em linha - ou nota de rodapé [@nome_título_data, 67].`[^7] -* `Para citações entre aspas, coloque a vírgula fora das marcas de citação [@nome_título_data, 67]. 
` - -Uma vez que rodarmos o Markdown através do Pandoc, "@fyfe_digital_2011" será expandido em uma citação completa no estilo que desejar. É possível usar a sintaxe da `@citação` como preferir: em linha com o seu texto ou em notas de rodapé. Para gerar a bibliografia simplesmente inclua uma seção chamada `# Bibliografia` no fim do documento. - -Agora, retorne para o seu cabeçalho de metadados no topo do seu documento .md, e especifique o ficheiro de bibliografia a ser usado, assim: - -``` ---- -title: Fluxo de Trabalho de Texto Simples -author: Gabriela Domingues -date: 20 de janeiro de 2014 -bibliography: projeto.bib ---- -``` -Isso diz ao Pandoc para procurar pela bibliografia no ficheiro `projeto.bib`, sob o mesmo diretório que o seu `main.md`. Vamos ver se funciona. Salve o ficheiro, mude para a janela do terminal e execute: - -``` -$ pandoc main.md --filter pandoc-citeproc -o main.docx - -``` -O filtro “pandoc-citeproc” analisará quaisquer tags de citação encontradas em seu documento. O resultado deve ser um ficheiro MS Word formatado. Se tiver o LaTeX instalado, converta para .pdf usando a mesma sintaxe para resultados mais bonitos. Não se preocupe se as coisas não estiverem exatamente como prefere - lembre-se de que fará o ajuste refinado da formatação de uma vez mais tarde, o mais próximo possível da data da publicação. Por enquanto, estamos apenas criando rascunhos baseados em padrões razoáveis. - -## Mudando estilos de citação - -O estilo de citação padrão no Pandoc é [Chicago Author-date](https://www.chicagomanualofstyle.org/tools_citationguide/citation-guide-2.html). Podemos especificar um estilo diferente usando a folha de estilo, escrita na “Linguagem de Estilo de Citação” (outra convenção de texto simples, neste caso para descrever estilos de citação) e denotada pela extensão de ficheiro .csl. Felizmente, o projeto CSL mantém um repositório de estilos de citação comuns, alguns até personalizados para periódicos específicos. 
Visite http://editor.citationstyles.org/about/ para localizar o ficheiro .csl para Modern Language Association (Associação de Linguagem Moderna), baixe `modern-language-association.csl` e salve no diretório do projeto como `mla.csl`. Agora precisamos dizer ao Pandoc para usar a folha de estilo MLA em vez do padrão Chicago. Fazemos isso atualizando o cabeçalho YAML: - -``` ---- -title: Fluxo de trabalho de Texto Simples -author: Gabriela Domingues -date: 20 de janeiro de 2014 -bibliography: projeto.bib -csl: mla.csl ---- -``` - -Então repita o comando Pandoc para carregar seu ficheiro markdown em seu formato de destino (.pdf ou .docx): - -``` -$ pandoc main.md --filter pandoc-citeproc -o main.pdf -``` -Traduza o comando para o Português enquanto digita. Na minha cabeça, eu traduzo o comando acima em algo como: "Pandoc, pegue o meu ficheiro markdown, aplique o filtro de citação sobre ele e retorne um ficheiro PDF". Quanto ficar mais familiarizado com as páginas de estilo de citação, considere adicionar os seus ficheiros .csl customizados para periódicos do seu campo no ficheiro como um serviço para a comunidade. - -## Resumo - -Agora, você deve ser capaz de escrever artigos em Markdown, criar rascunhos em múltiplos formatos, adicionar bibliografias e facilmente mudar os estilos de citação. Um último olhar no diretório do projeto revelará vários ficheiros de origem: o ficheiro `main.md`, o ficheiro `projeto.bib`, o ficheiro `mla.csl` e algumas imagens. Além dos ficheiros origens, deve haver alguns ficheiros "destino" que criamos ao longo desse tutorial: `main.docx` ou `main.pdf`. A sua pasta deve se parecer com isso: - -``` -Pandoc-tutorial/ - main.md - projeto.bib - mla.csl - imagem.jpg - main.docx -``` - -Trate seus ficheiros de origem como uma versão autorizada de seu texto e seus ficheiros de destino como “impressões” descartáveis que podem ser geradas facilmente com o Pandoc em tempo real. Todas as revisões devem ser feitas no `main.md`. 
O ficheiro `main.docx` está lá para formatação e limpeza em estágio final. Se, por exemplo, o periódico requisitar manuscritos com espaçamento duplo, é possível rapidamente colocar o espaçamento duplo no Open Office ou Microsoft Word. Mas não gaste muito tempo formatando. Lembre-se, tudo é retirado quando o seu manuscrito vai para a impressão. O tempo gasto em formatação desnecessária pode ser usado melhorando a prosa do seu rascunho. - -## Recursos úteis - -Se tiver problemas, não há lugar melhor para começar a procurar ajuda do que o [site do Pandoc](https://pandoc.org/) de John MacFarlane e a [lista de e-mails](https://groups.google.com/forum/#!forum/pandoc-discuss) associados. Pelo menos dois sites do tipo “Pergunta e Resposta” podem responder a perguntas no Pandoc: [Stack Overflow](https://stackoverflow.com/questions/tagged/pandoc) e [Digital Humanities Q&A](https://web.archive.org/web/20190203062832/http://digitalhumanities.org/answers/). As perguntas também podem ser feitas ao vivo, no Freenode IRC, canal #Pandoc, frequentado por um amigável grupo de regulares. Conforme aprender mais sobre o Pandoc, também pode explorar um de seus recursos mais poderosos: [filtros](https://github.com/jgm/pandoc/wiki/Pandoc-Filters). - -Embora nossa sugestão seja começar com um editor simples, muitas (mais de 70, de acordo com [esta postagem do blog](https://web.archive.org/web/20140120195538/http://mashable.com/2013/06/24/markdown-tools/) outras alternativas específicas do Markdown para o MS Word estão disponíveis online, e muitas vezes sem custo. Dos autônomos, gostamos de [Write Monkey](https://web.archive.org/web/20260327163157/http://writemonkey.com/) e [Sublime Text](https://www.sublimetext.com/). Várias plataformas baseadas na web surgiram recentemente que fornecem interfaces gráficas elegantes para escrita colaborativa e controle de versão usando Markdown. 
Algumas delas são: [prose.io](http://prose.io/), [Authorea](https://www.authorea.com/), [Draft](http://www.draftin.com/) e [StackEdit](https://stackedit.io/). - -Mas o ecossistema não é limitado a editores. [Gitit](http://gitit.net/) e [Ikiwiki](https://github.com/dubiousjim/pandoc-iki) suportam autoria em Markdown com Pandoc como analisador. Podemos incluir nesta lista uma série de ferramentas que geram páginas da Web estáticas e rápidas, [Yst](https://github.com/jgm/yst), [Jekyll](https://github.com/fauno/jekyll-pandoc-multiple-formats), [Hakyll](http://jaspervdj.be/hakyll/) e o [script de shell bash](https://github.com/wcaleb/website) do historiador Caleb McDaniel. - -Por fim, plataformas de publicação completas estão se formando ao redor do uso de Markdown. O Markdown na plataforma de marketplace [Leanpub](https://leanpub.com/) pode ser uma alternativa interessante ao modelo de publicação tradicional. E nós mesmos estamos experimentando o design de periódicos acadêmicos com base no GitHub e [readthedocs.org](https://readthedocs.org/) (ferramentas geralmente usadas para documentação técnica). - - -### Notas -[^1]: Não se preocupe se não entender essa terminologia ainda! -[^2]: Os ficheiros fonte para essa documentação podem ser [baixados no GitHub](https://github.com/dh-notes/pandoc-workflow). Use a opção "raw" quando visualizar no GitHub para ver o Markdown fonte. Os autores gostariam de agradecer a Alex Gil e seus colegas do Digital Humanities Center de Columbia e aos participantes do openLab no Studio na biblioteca Butler por testar o código deste tutorial em uma variedade de plataformas. -[^3]: Veja a excelente discussão de Charlie Stross sobre esse tópico em [Porque Microsoft Word Deve Morrer (em inglês)](http://www.antipope.org/charlie/blog-static/2013/10/why-microsoft-word-must-die.html). -[^4]: Não existem boas soluções para chegar diretamente no MS Word a partir do LaTeX. 
-[^5]: É uma boa ideia criar o hábito de não usar espaços em nomes de pastas ou ficheiros. Traços ou sublinhados ao invés de espaços nos nomes de seus ficheiros garantem uma duradoura compatibilidade entre plataformas. -[^6]: Note que a extensão .bib pode estar "registrada" no Zotero no seu sistema operacional. Isso significa que quando se clica em um ficheiro .bib é provável que se chame o Zotero para abri-lo, enquanto nós queremos abrir com o editor de texto. Eventualmente, pode querer associar a extensão .bib ao seu editor de texto, -[^7]: Agradeço a [@njbart](https://github.com/njbart) pela correção. Em resposta a nossa sugestão original, `Algumas frases precisam de citação.^[@fyfe_digital_2011 argumenta isso também.]`, [ele escreve](https://github.com/programminghistorian/jekyll/issues/46#issue-45559983): “Isso não é recomendado, pois evita que se alterne facilmente entre os estilos de nota de rodapé e data do autor. É melhor usar o [corrigido] (sem circunflexo, sem ponto final entre colchetes e a pontuação final da frase do texto após os colchetes; com estilos de notas de rodapé, o pandoc ajusta automaticamente a posição da pontuação final). 
” +--- +title: Autoria Sustentável em Texto Simples usando Pandoc e Markdown +layout: lesson +collection: lessons +slug: autoria-sustentavel-texto-simples-pandoc-markdown +date: 2014-03-19 +translation_date: 2022-11-27 +authors: +- Dennis Tenen +- Grant Wythoff +lesson-testers: +- Pao-Chuan Ma +tested-date: 2021-06-10 +editors: +- Fred Gibbs +translator: +- Gabriela Kucuruza +translation-editor: +- Jimmy Medeiros +translation-reviewer: +- Daniel Bonatto Seco +- André Salvo +difficulty: 2 +activity: sustaining +topics: [website, data-management] +abstract: "Neste tutorial, você aprenderá primeiro o básico do Markdown - uma sintaxe de marcação fácil de ler e escrever para texto simples - bem como Pandoc, uma ferramenta de linha de comando que converte texto simples em vários tipos de ficheiros formatados: PDF, docx, HTML, LaTeX, apresentação de slides e muito mais." +exclude_from_check: + - reviewers + - review-ticket +original: sustainable-authorship-in-plain-text-using-pandoc-and-markdown +avatar_alt: Um homem trabalhando numa mesa de desenho +doi: 10.46430/phpt0036 +--- + +{% include toc.html %} + +{% include figure.html filename="lexoriter.jpg" caption="" %} + +## Objetivos + +Neste tutorial, você aprenderá primeiro o básico do Markdown - uma sintaxe de marcação fácil de ler e de escrever para texto simples - assim como o [Pandoc](https://pandoc.org/), uma ferramenta de linha de comando que converte texto simples em vários tipos de ficheiro belamente formatados: PDF, docx, HTML, LaTeX, apresentações de slides e muito mais.[^1] Com o Pandoc como sua ferramenta de composição digital, você pode usar a sintaxe Markdown para adicionar figuras, bibliografia, formatação e alterar facilmente os estilos de citação de Chicago para MLA (por exemplo), todos usando texto simples. + +Este tutorial não pressupõe nenhum conhecimento técnico prévio, mas aumenta com a experiência, uma vez que vamos sugerir técnicas mais avançadas ao final de cada seção. 
Elas estão claramente marcadas e podem ser revisitadas após alguma prática e experimentação. + +Ao invés de seguir esse tutorial de maneira mecânica, recomendamos que se esforce para entender as soluções oferecidas aqui como uma _metodologia_, que pode precisar de adaptações para se adequar ao seu ambiente e fluxo de trabalho. A instalação das ferramentas necessárias apresenta talvez a maior barreira à participação. Tenha tempo e paciência suficientes para instalar tudo corretamente, ou faça isso com um/a colega que tenha uma configuração semelhante e ajudem-se mutuamente. Consulte a seção [Recursos Úteis](/pt/licoes/autoria-sustentavel-texto-simples-pandoc-markdown#recursos-úteis) abaixo se ficar preso.[^2] + +## Filosofia +Escrever, armazenar e recuperar documentos são atividades centrais para o fluxo de trabalho de pesquisa das humanidades. Mesmo assim, muitos autores baseiam suas práticas em ferramentas e formatos proprietários que, às vezes, ficam aquém dos requisitos mais básicos da escrita acadêmica. Talvez possa se lembrar de certa frustração com a fragilidade de notas de rodapé, bibliografias, figuras e rascunhos de livros escritos em Microsoft Word ou Google Docs. No entanto, a maioria dos periódicos ainda insiste em submissões no formato .docx. + +Mais do que causar frustração pessoal, essa dependência de ferramentas e de formatos proprietários tem implicações negativas de longo prazo para a comunidade acadêmica. Em tal ambiente, os periódicos devem terceirizar a composição, alienando os autores dos contextos materiais de publicação e adicionando outras barreiras desnecessárias à circulação irrestrita do conhecimento.[^3] + +Quando se usa MS Word, Google Docs ou Open Office para escrever documentos, o que se vê não é o que se obtém. Embaixo da camada visível de palavras, frases e parágrafos, encontra-se uma complicada camada de código compreensível apenas para as máquinas. 
Por causa dessa camada oculta, os ficheiros .docx e .pdf dependem de ferramentas proprietárias para serem visualizados corretamente. Esses documentos são difíceis de pesquisar, imprimir e converter em outros formatos de ficheiros. + +Além disso, o tempo gasto formatando documentos em MS Word ou Open Office é perdido, pois toda essa formatação é removida pelo editor durante a submissão. Tanto os autores quanto os editores se beneficiariam da troca de ficheiros com formatação mínima, deixando a composição tipográfica para o estágio final de composição do processo de publicação. + +Aqui é onde o Markdown brilha. Markdown é uma sintaxe para marcar explicitamente elementos semânticos dentro de um documento, não em alguma camada oculta. A ideia é identificar as unidades que são significativas para humanos, como títulos, seções, subseções, notas de rodapé e ilustrações. No mínimo, os seus ficheiros sempre permanecerão compreensíveis **para você**, mesmo se o editor de texto que estiver usando parar de funcionar ou "sair do mercado". + +Escrever dessa forma libera o autor da ferramenta. Markdown pode ser escrito em qualquer editor de texto simples e oferece um rico ecossistema de software que pode renderizar o texto em documentos belamente formatados. Por esta razão, o Markdown está atualmente passando por um período de crescimento, não apenas como meio para escrever artigos acadêmicos, mas como uma convenção para edição online em geral. + +Os editores de texto simples de uso geral populares incluem [Atom](https://atom.io/) (todas as plataformas) e [Notepad ++](https://notepad-plus-plus.org/) (somente para Windows). + +É importante entender que o Markdown é apenas uma convenção. Os ficheiros Markdown são armazenados como texto simples, aumentando ainda mais a flexibilidade do formato. Ficheiros de texto simples existem desde a máquina de escrever eletrônica. 
A longevidade deste padrão torna, de modo inerente, o texto simples mais sustentável e estável do que os formatos proprietários. Enquanto os ficheiros produzidos até dez anos atrás no Microsoft Word e no Apple Pages podem causar problemas significativos quando abertos nas versões mais recentes, ainda é possível abrir um ficheiro escrito em qualquer editor de texto simples “morto” nas últimas décadas: AlphaPlus, Perfect Writer, Text Wizard, Spellbinder, WordStar ou o favorito de Isaac Asimov, SCRIPSIT 2.0, feito por Radio Shack. Escrever em texto simples garante que seus ficheiros permanecerão legíveis daqui a dez, quinze, vinte anos. Neste tutorial, descrevemos um fluxo de trabalho que libera o pesquisador de softwares proprietários de processamento de texto e de formatos de ficheiro frágeis. + +Agora é possível escrever uma ampla variedade de documentos em um formato - artigos, postagens de blog, wikis, programas de estudos e cartas de recomendação - usando o mesmo conjunto de ferramentas e técnicas para pesquisar, descobrir, fazer backup e distribuir nossos materiais. Suas notas, entradas de blog, documentação de código e wikis podem ser criados no Markdown. Cada vez mais, muitas plataformas como WordPress, Reddit e GitHub suportam a autoria Markdown nativamente. A longo prazo, sua pesquisa se beneficiará desses fluxos de trabalho unificados, tornando mais fácil salvar, pesquisar, compartilhar e organizar seus materiais. + +## Princípios + +Inspirados pelas melhores práticas em uma variedade de disciplinas, nós fomos guiados pelos seguintes princípios: + +1. _Sustentabilidade_. O texto simples tanto garante transparência como atende aos padrões de preservação de longo prazo. O Word pode seguir o caminho do [Word Perfect](https://pt.wikipedia.org/wiki/WordPerfect) no futuro, mas o texto simples sempre permanecerá fácil de ler, catalogar, extrair e transformar.
Além disso, o texto simples permite um controle fácil e poderoso do versionamento do documento, o que é útil na colaboração e na organização de rascunhos. Seus ficheiros de texto simples estarão acessíveis em telefones celulares, tablets ou, talvez, em um terminal de baixa potência em alguma biblioteca remota. O texto simples é compatível com versões anteriores e à prova de futuro. Qualquer que seja o software ou hardware que vier a seguir, ele será capaz de entender os seus ficheiros de texto simples. +2. _Preferência por formatos legíveis por humanos_. Quando escrevemos no Word ou no Google Docs, o que vemos não é o que obtemos. O ficheiro .doc contém uma formatação oculta de caracteres gerados automaticamente, criando uma camada de composição tipográfica ofuscada que é difícil para o usuário solucionar. Algo tão simples como colar uma imagem ou texto do navegador pode ter efeitos imprevisíveis na formatação do seu documento. +3. _Separação entre forma e conteúdo_. Escrever e formatar ao mesmo tempo é distrativo. A ideia é escrever primeiro e formatar depois, o mais próximo possível da hora da publicação. Uma tarefa como mudar da formatação Chicago para MLA deve ser simples. Os editores de periódicos que desejam economizar tempo na formatação desnecessária e na edição de cópias devem ser capazes de fornecer aos seus autores um modelo de formatação que cuida dos detalhes da composição tipográfica. +4. _Apoio ao aparato acadêmico_. O fluxo de trabalho precisa lidar com notas de rodapé, figuras, caracteres internacionais e bibliografias com elegância. +5. _Independência de plataforma_. À medida que os vetores de publicação se multiplicam, precisamos ser capazes de gerar uma multiplicidade de formatos, incluindo projeção de slides, impressão, web e celular. Idealmente, gostaríamos de poder gerar os formatos mais comuns sem quebrar as dependências bibliográficas.
Nosso fluxo de trabalho também precisa ser portátil - seria bom poder copiar uma pasta para um pen drive e saber que ela contém tudo o que é necessário para publicação de estudos. Escrever em texto simples significa que é possível facilmente compartilhar, editar e arquivar seus documentos em praticamente qualquer ambiente. Por exemplo, um programa escrito em Markdown pode ser salvo como PDF, impresso como um folheto e convertido em HTML para a web, tudo a partir do mesmo ficheiro. Tanto os documentos da web quanto os impressos devem ser publicados da mesma fonte e ter aparência semelhante, preservando o layout lógico do material. + +Markdown e LaTeX cumprem todos esses requisitos. Nós escolhemos Markdown (e não LaTeX) porque ele oferece a sintaxe mais leve e organizada (por isso, _mark down_) e porque, quando unido com o Pandoc, permite maior flexibilidade nas saídas (incluindo ficheiros .docx e .tex).[^4] + +## Requisitos de Software + +Nós omitimos propositalmente alguns dos detalhes menores vinculados à plataforma ou ao sistema operacional de instalação do software listado abaixo. Por exemplo, não faz sentido fornecer instruções de instalação para o LaTeX, quando as instruções online para o seu sistema operacional serão sempre mais atuais e completas. Da mesma forma, o processo de instalação do Pandoc é melhor explorado pesquisando por “instalar o Pandoc” no Google, com o provável primeiro resultado sendo a página inicial do Pandoc. + + - **Editor de texto simples**. Entrar no mundo de edição de texto simples expande dramaticamente as suas escolhas de ferramentas inovadoras de autoria. Pesquise online por "editor de texto markdown" e experimente as opções. Não importa qual for usada, contanto que seja explicitamente um editor de texto simples, como Atom e Notepad++. Lembre-se de que nós não estamos presos à ferramenta: é possível trocar de editor a qualquer momento. + - **Terminal de linha de comando**.
Trabalhar na "linha de comando" equivale a escrever comandos no terminal. Em um Mac, apenas pesquise por "Terminal". No Windows, use o [PowerShell](https://pt.wikipedia.org/wiki/PowerShell). Usuários de Linux provavelmente já devem estar familiarizados com seus terminais. Nós iremos cobrir o básico de como procurar e usar a linha de comando abaixo. + - **Pandoc**. Instruções de instalação detalhadas e para plataformas específicas estão disponíveis no [site do Pandoc](https://pandoc.org/installing.html). _A instalação do Pandoc na sua máquina é crucial para esse tutorial_, então tome o seu tempo navegando pelas instruções. O Pandoc foi criado e é mantido por John MacFarlane, Professor de Filosofia na Universidade da Califórnia, Berkeley. Isso é a humanidade computacional em sua melhor expressão e servirá como o motor de nosso fluxo de trabalho. Com o Pandoc, será possível compilar texto e bibliografia em documentos belamente formatados e flexíveis. Depois de seguir as instruções de instalação, verifique se o Pandoc está instalado digitando `pandoc --version` na linha de comando. Presumimos que a sua versão seja ao menos a versão 1.12.3, lançada em janeiro de 2014. + +Os próximos dois softwares são recomendados, mas não requisitados para realizar esse tutorial. + +* **Zotero ou Endnote**. Softwares de referência bibliográfica como Zotero e Endnote são ferramentas indispensáveis para organizar e formatar citações em um artigo de pesquisa. Esses programas podem exportar suas bibliotecas como um ficheiro BibTeX (sobre o qual você aprenderá mais no Caso 2 a seguir). Este ficheiro, por si só um documento de texto simples formatado com todas as suas citações, permitirá que você cite referências de forma rápida e fácil usando `@tags`. Deve-se notar que também é possível digitar todas as suas referências bibliográficas à mão, usando [nossa bibliografia](https://github.com/dh-notes/pandoc-workflow/blob/master/pandoctut.bib) como modelo. +* **LaTeX**. 
Instruções de instalação detalhadas e específicas da plataforma estão disponíveis no [site do Pandoc](https://pandoc.org/installing.html). Embora o LaTeX não seja abordado neste tutorial, ele é usado pelo Pandoc para a criação de .pdf. Usuários avançados frequentemente irão converter para LaTeX diretamente para ter um controle mais minucioso sobre a composição do .pdf. Os iniciantes podem querer pular esta etapa. Caso contrário, digite `latex -v` para ver se o LaTeX está instalado corretamente (você receberá um erro se não estiver e algumas informações sobre a versão, se estiver). + +## Básico do Markdown + +O Markdown é uma convenção para estruturar os seus documentos de texto simples semanticamente. A ideia é identificar estruturas lógicas no seu documento (títulos, seções, subseções, notas de rodapé, etc.), marcá-las com caracteres discretos e então "compilar" o texto resultante com um interpretador de composição tipográfica que formatará o documento consistentemente, de acordo com um estilo específico. + +As convenções de Markdown vêm em várias “versões” projetadas para uso em contextos específicos, como blogs, wikis ou repositórios de código. A versão do Markdown usada pelo [Pandoc](https://pandoc.org/MANUAL.html#pandocs-markdown) é voltada para uso acadêmico. Suas convenções são descritas na página Markdown do Pandoc. Suas convenções incluem o bloco “[YAML](https://pandoc.org/MANUAL.html#extension-yaml_metadata_block)”, que contém alguns metadados úteis. + +Vamos agora criar um documento simples no Markdown. Abra um editor de texto simples de sua escolha e comece a digitar. Deve ser assim: + +``` +--- +title: Fluxo de Trabalho em Texto Simples +author: Gabriela Domingues +date: 20 de janeiro de 2014 +fontfamily: times +--- +``` + +A versão do Markdown usada pelo Pandoc armazena cada um dos valores acima, e "imprime-os" na localização apropriada do seu documento de saída quando o documento estiver pronto para a composição tipográfica.
Aprenderemos mais tarde a adicionar outros campos mais poderosos ao bloco "YAML". Por enquanto, vamos fingir que estamos escrevendo um artigo que contém três seções, cada uma subdividida em duas subseções. Deixe uma linha em branco após os três últimos traços no bloco "YAML" e cole o seguinte: + +``` + +# Seção 1 + +## Subseção 1.1 +Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. + +O parágrafo seguinte deve começar sem recuo: + +## Subseção 1.2 +Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. + +# Seção 2 + +## Subseção 2.1 +``` + +Vá em frente e escreva um texto simulado também. Espaços em branco são significativos em Markdown: não recue os seus parágrafos. Ao invés disso, separe parágrafos usando uma linha vazia. Linhas vazias também devem preceder os cabeçalhos das seções. + +Use asteriscos para adicionar ênfases em negrito ou em itálico, assim: `*itálico*` e `**negrito**`. Nós devemos também adicionar um link e uma nota de rodapé no nosso texto para cobrir os componentes básicos de um artigo médio. Digite: + +``` +Uma frase que precisa de uma nota.[^1] + +[^1]: Essa é a minha primeira nota de rodapé! E um [link](https://www.eff.org/). +``` + +Quando o texto do link e o endereço são iguais, é mais rápido escrever `<www.eff.org>` ao invés de `[www.eff.org](www.eff.org)`. + +Vamos salvar nosso ficheiro antes de avançar. Crie a nova pasta que irá armazenar esse projeto. É provável que tenha algum sistema de organização de seus documentos, projetos, ilustrações e bibliografias, mas, geralmente, o seu documento e as suas ilustrações e bibliografia estão em pastas diferentes, o que os torna mais difíceis de achar.
Nosso objetivo é criar uma única pasta para cada projeto, com todos os materiais relevantes incluídos. A regra geral é um projeto, um artigo, uma pasta. Nomeie seu ficheiro como `main.md`, onde “md” significa markdown. + +Depois que seu ficheiro for salvo, vamos adicionar uma ilustração. Copie uma imagem (qualquer imagem pequena) para a sua pasta e adicione o seguinte em algum lugar no corpo do texto: `![legenda da imagem](sua_imagem.jpg)`. + +Nesse ponto, o seu `main.md` deve parecer com o que está abaixo. É possível baixar esse exemplo de ficheiro teste.md [aqui](/assets/autoria-sustentavel-texto-simples-pandoc-markdown/teste.md). + +``` +--- +title: Fluxo de trabalho de texto simples +author: Gabriela Domingues +date: 20 de Janeiro de 2014 +--- + +# Seção 1 + +## Subseção 1.1 + +Lorem *ipsum* dolor sit amet, **consectetur** adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. + +## Subseção 1.2 + +Sed ut perspiciatis unde omnis iste natus error sit voluptatem accusantium doloremque laudantium, totam rem aperiam, eaque ipsa quae ab illo inventore veritatis et quasi architecto beatae vitae dicta sunt explicabo. + +O próximo parágrafo deve começar assim. Não dê recuo. + +# Seção 2 + +## Subseção 2.1 + +![legenda da imagem](sua_imagem.jpg) + +## Subseção 2.2 + +Uma frase que precisa de uma nota.[^1] + +[^1]: Essa é a minha primeira nota de rodapé! E um [link](https://www.eff.org/). 
+``` +Como faremos em breve, esse ficheiro de texto simples pode ser renderizado em um belo PDF: + +{% include figure.html filename="autoria-sustentavel-texto-simples-pandoc-markdown-01.png" alt="Imagem representando o ficheiro MarkDown e a respectiva versão em Word produzida com o Pandoc" caption="Exemplo de captura de tela de Word renderizado no Pandoc" %} + +Se quiser ter uma ideia de como esse tipo de marcação será interpretado como formatação HTML, experimente esse [espaço de teste online](https://daringfireball.net/projects/markdown/dingus) e brinque com vários tipos de sintaxe. Lembre-se de que certos elementos do Markdown com o sabor do Pandoc (como o bloco de título e as notas de rodapé) não funcionarão neste formulário da web, que aceita apenas o básico. + +Neste ponto, gaste algum tempo explorando alguns dos outros recursos do Markdown, como citações (referenciadas pelo símbolo `>`), listas de marcadores que começam com `*` ou `-`, quebras de linha textuais que começam com `|` (útil para poesia), tabelas e algumas das outras funções listadas na página de marcação do Pandoc. + +Preste bastante atenção em espaços vazios e no fluxo dos parágrafos. A documentação coloca sucintamente quando define um parágrafo como "uma ou mais linhas de texto seguida por uma ou mais linhas vazias.". Note que "linhas novas são tratadas como espaços" e que "se precisa de uma quebra de linha forte, coloque dois ou mais espaços no final de uma linha." A melhor maneira de entender o que isso significa é experimentar livremente. Use o modo de visualização do seu editor ou apenas execute o Pandoc para ver os resultados dos seus experimentos. + +Acima de tudo, evite a vontade de formatar. Lembre-se de que estamos identificando unidades semânticas: seções, subseções, ênfases, notas de rodapé e figuras. Mesmo *itálico* e **negrito** em Markdown não são realmente marcos de formatação, mas indicam diferentes níveis de *ênfase*. 
A formatação acontecerá depois, quando souber o lugar e os requisitos da publicação. + +Existem programas que permitem que se veja uma pré-visualização em tempo real da saída do Markdown enquanto se edita o ficheiro de texto simples, que nós detalhamos abaixo na seção de Recursos Úteis. Poucos deles suportam, entretanto, notas de rodapé, figuras e bibliografias. Para aproveitar o Pandoc ao máximo, nós recomendamos que use ficheiros de texto simples armazenados localmente, no seu computador. + +## Entrando em contato com a linha de comandos do seu computador + +Antes de começarmos a publicar o nosso ficheiro `main.md` em outros formatos, nós precisamos nos orientar sobre como trabalhar com a linha de comando usando o programa de terminal do seu computador, que é o único (e melhor) modo de usar o Pandoc. + +A linha de comando é um lugar amigável, uma vez que se acostuma com ela. Se já estiver familiarizado com o uso da linha de comando, sinta-se à vontade para pular esta seção. Para outros, é importante entender que ser capaz de usar seu programa de terminal diretamente permitirá que se use uma ampla gama de poderosas ferramentas de pesquisa que não poderiam ser usadas de outra forma, e podem servir como base para um trabalho mais avançado. Para os fins deste tutorial, é preciso aprender apenas alguns comandos muito simples. + +Primeiro, abra uma janela de linha de comando. Se você estiver usando o macOS, abra o aplicativo Terminal no diretório ‘Aplicativos / Utilitários’. No Windows, recomendamos que use o PowerShell ou, para uma solução mais robusta, instale o subsistema do Windows para Linux e use o terminal que vem com sua distribuição favorita do Linux. Para obter uma excelente introdução à linha de comando, consulte [“Introdução à linha de comando Bash” (em inglês)](/en/lessons/intro-to-bash), de Ian Milligan e James Baker. + +No terminal, deve-se ver uma janela de texto e um prompt que parece com isso: `nome-do-computador:~nome-do-usuário$`.
O til (`~`) indica qual é o diretório do usuário, e é possível escrever `$ cd ~` em qualquer ponto para retornar para o seu diretório de usuário. Não escreva o cifrão, ele apenas simboliza o prompt de comando no seu terminal, indicando que se digite algo no terminal (em oposição a digitar algo no seu documento); lembre-se de apertar "Enter" após todo comando. + +É bem provável que a sua pasta "Documentos" esteja localizada aqui. Digite `$ pwd` (= _print working directory_, exibe o diretório de trabalho) e aperte "Enter" para exibir o nome do diretório atual. Use `$ pwd` sempre que se sentir perdido. + +O comando `$ ls` (= _list_, listar) simplesmente lista os ficheiros no diretório atual. Enfim, pode usar `$ cd` (= _change directory,_ mudar diretório) assim: `$ cd NOME_DIRETÓRIO` (em que `NOME_DIRETÓRIO` é o nome do diretório que se quer acessar). Use `$ cd ..` para mover automaticamente um nível para cima na estrutura de diretórios (o diretório-pai do diretório em que se está). Uma vez que começar a digitar o nome do diretório, use a tecla Tab para completar automaticamente o texto - particularmente útil para nomes de diretório longos ou nomes de diretórios que contenham espaços.[^5] + +Esses três comandos de terminal: `pwd`, `ls` e `cd` são tudo o que é preciso para esse tutorial. Pratique-os por alguns minutos para navegar pela sua pasta de documentos e pense na forma em que os seus ficheiros estão organizados. Se quiser, acompanhe no seu gerenciador gráfico de ficheiros habitual para se manter informado. + +## Usando Pandoc para converter Markdown em um documento do MS Word + +Nós estamos agora prontos para formatar! Abra a sua janela de terminal, use o `$ pwd` e `$ cd NOME_DIRETÓRIO` para navegar até a pasta correta para o seu projeto. Chegando lá, digite `$ ls` no terminal para listar os ficheiros. Se encontrar o seu ficheiro .md e suas imagens, está no lugar certo.
Para converter o .md em um .docx escreva: + +``` + $ pandoc main.md -o main.docx +``` +Abra o ficheiro no MS Word e confira os resultados. Alternativamente, se usa o Open- ou LibreOffice, escreva: +``` + $ pandoc main.md -o project.odt +``` +Se não estiver acostumado com a linha de comando, imagine ler o comando acima como se fosse algo como: "Pandoc, crie um ficheiro MS Word a partir do meu ficheiro Markdown". A parte `-o` é uma "bandeira", que nesse caso diz algo como "ao invés de eu lhe dizer explicitamente os formatos de ficheiro de origem e destino, apenas tente adivinhar olhando para a extensão do ficheiro" ou simplesmente "output (saída)". Muitas opções estão disponíveis através desses sinalizadores no Pandoc. É possível ver a lista completa no [site do Pandoc](https://pandoc.org/) ou digitando `$ man pandoc` no terminal. + +Tente rodar o comando: +``` + pandoc main.md -o projeto.html +``` +Agora navegue de volta para o diretório do seu projeto. O que aconteceu? + +Usuários mais avançados que tem o LaTeX instalado podem querer experimentar convertendo o Markdown em .tex ou ficheiros .pdf especialmente formatados. Uma vez que o LaTeX estiver instalado, um ficheiro PDF belamente formatado pode ser criado usando a mesma estrutura de comando: +``` + pandoc main.md -o main.pdf +``` + +
    + Se este comando falhar, você pode precisar adicionar um componente que forneça ao pandoc o caminho completo para o motor LaTeX que deseja usar, especificando onde está armazenado. A localização variará se você estiver trabalhando em Mac, Windows ou Linux. Os leitores são aconselhados a verificar o caminho correto para o motor LaTeX em seu sistema e seguir as instruções de instalação atuais (em inglês). +
    + +Se o seu documento estiver escrito em outros idiomas que não o inglês, você provavelmente precisará usar o mecanismo XeLaTeX em vez do LaTeX simples para conversão .pdf: +``` + pandoc main.md --pdf-engine=xelatex -o main.pdf +``` +Tenha certeza de que o seu editor de texto suporta a codificação UTF-8. Quando usar XeLaTeX para conversão em .pdf, ao invés do atributo `fontfamily` no "YAML" para mudar fontes, especifique o atributo `mainfont` para produzir algo como isto: +``` + --- + title: Fluxo de Trabalho de Texto Simples + author: Gabriela Domingues + date: 20 de janeiro de 2014 + mainfont: times + --- +``` + +Por exemplo, estilos de fontes podem ser passados para o Pandoc na forma de `pandoc main.md --mainfont=times -o destino.pdf`. Nós preferimos, entretanto, usar as opções de cabeçalho do "YAML" sempre que possível, uma vez que os comandos são mais fáceis de lembrar. Usar uma ferramenta de controle de versão como o Git preservará as mudanças "YAML", onde o que é digitado no terminal é mais efêmero. Consulte a seção de Templates (Modelos) no manual do Pandoc (`man pandoc`) para a lista de variáveis do "YAML" disponíveis. + +## Trabalhando com Bibliografias + +Nesta seção, adicionaremos uma bibliografia ao nosso documento e, em seguida, converteremos os formatos de Chicago para MLA. + +Se não estiver usando um gerenciador de referência como Endnote ou Zotero, use. Preferimos o Zotero porque, como o Pandoc, foi criado pela comunidade acadêmica e, como outros projetos de código aberto, é lançado sob a GNU, General Public License. O mais importante para nós é que o seu gerenciador de referência deve ter a capacidade de gerar bibliografias em formato de texto simples, para manter o alinhamento com nosso princípio “tudo em texto simples”. Vá em frente e abra um gerenciador de referência de sua escolha e adicione algumas entradas de amostra. Quando estiver pronto, encontre a opção de exportar sua bibliografia no formato BibTeX (.bib). 
Salve o ficheiro .bib no diretório do projeto e dê a ele um título razoável como “projeto.bib”. + +A ideia geral é manter as suas fontes organizadas sob um banco de dados bibliográfico centralizado, enquanto geramos ficheiros .bib menores e mais específicos que devem ficar no mesmo diretório que o seu projeto. Vá em frente e abra o seu ficheiro .bib com o editor de texto simples que escolher.[^6] + +O seu ficheiro .bib deve conter múltiplas entradas que se parecem com esta: + + @article{fyfe_digital_2011, + title = {Digital Pedagogy Unplugged}, + volume = {5}, + url = {https://digitalhumanities.org/dhq/vol/5/3/000106/000106.html}, + number = {3}, + urldate = {2013-09-28}, + author = {Fyfe, Paul}, + year = {2011}, + file = {fyfe_digital_pedagogy_unplugged_2011.pdf} + + +Raramente será necessário editá-las manualmente (embora seja possível). Na maioria dos casos, simplesmente o ficheiro .bib será exportado do Zotero ou de um gerenciador de referências semelhante. Reserve um momento para se orientar aqui. Cada entrada consiste em um tipo de documento, “artigo” em nosso caso, um identificador exclusivo (fyfe_digital_2011) e os metadados relevantes sobre título, volume, autor e assim por diante. O que mais nos interessa é o ID exclusivo que segue imediatamente a chave na primeira linha de cada entrada. O ID único é o que nos permite conectar a bibliografia ao documento principal. Deixe este ficheiro aberto por enquanto e volte para o seu ficheiro `main.md`. + +Edite a nota de rodapé na primeira linha do seu ficheiro `main.md` para se parecer com algo como os seguintes exemplos, em que o `@nome_título_data` pode ser substituído por um dos IDs únicos do seu ficheiro `projeto.bib`. + +* `Uma referência formatada como esta será renderizada apropriadamente como citação no estilo em linha - ou nota de rodapé [@nome_título_data, 67].`[^7] +* `Para citações entre aspas, coloque a vírgula fora das marcas de citação [@nome_título_data, 67]. 
` + +Uma vez que rodarmos o Markdown através do Pandoc, "@fyfe_digital_2011" será expandido em uma citação completa no estilo que desejar. É possível usar a sintaxe da `@citação` como preferir: em linha com o seu texto ou em notas de rodapé. Para gerar a bibliografia simplesmente inclua uma seção chamada `# Bibliografia` no fim do documento. + +Agora, retorne para o seu cabeçalho de metadados no topo do seu documento .md, e especifique o ficheiro de bibliografia a ser usado, assim: + +``` +--- +title: Fluxo de Trabalho de Texto Simples +author: Gabriela Domingues +date: 20 de janeiro de 2014 +bibliography: projeto.bib +--- +``` +Isso diz ao Pandoc para procurar pela bibliografia no ficheiro `projeto.bib`, sob o mesmo diretório que o seu `main.md`. Vamos ver se funciona. Salve o ficheiro, mude para a janela do terminal e execute: + +``` +$ pandoc main.md --filter pandoc-citeproc -o main.docx + +``` +O filtro “pandoc-citeproc” analisará quaisquer tags de citação encontradas em seu documento. O resultado deve ser um ficheiro MS Word formatado. Se tiver o LaTeX instalado, converta para .pdf usando a mesma sintaxe para resultados mais bonitos. Não se preocupe se as coisas não estiverem exatamente como prefere - lembre-se de que fará o ajuste refinado da formatação de uma vez mais tarde, o mais próximo possível da data da publicação. Por enquanto, estamos apenas criando rascunhos baseados em padrões razoáveis. + +## Mudando estilos de citação + +O estilo de citação padrão no Pandoc é [Chicago Author-date](https://www.chicagomanualofstyle.org/tools_citationguide/citation-guide-2.html). Podemos especificar um estilo diferente usando a folha de estilo, escrita na “Linguagem de Estilo de Citação” (outra convenção de texto simples, neste caso para descrever estilos de citação) e denotada pela extensão de ficheiro .csl. Felizmente, o projeto CSL mantém um repositório de estilos de citação comuns, alguns até personalizados para periódicos específicos. 
Visite https://editor.citationstyles.org/about/ para localizar o ficheiro .csl para Modern Language Association (Associação de Linguagem Moderna), baixe `modern-language-association.csl` e salve no diretório do projeto como `mla.csl`. Agora precisamos dizer ao Pandoc para usar a folha de estilo MLA em vez do padrão Chicago. Fazemos isso atualizando o cabeçalho YAML: + +``` +--- +title: Fluxo de trabalho de Texto Simples +author: Gabriela Domingues +date: 20 de janeiro de 2014 +bibliography: projeto.bib +csl: mla.csl +--- +``` + +Então repita o comando Pandoc para carregar seu ficheiro markdown em seu formato de destino (.pdf ou .docx): + +``` +$ pandoc main.md --filter pandoc-citeproc -o main.pdf +``` +Traduza o comando para o Português enquanto digita. Na minha cabeça, eu traduzo o comando acima em algo como: "Pandoc, pegue o meu ficheiro markdown, aplique o filtro de citação sobre ele e retorne um ficheiro PDF". Quando ficar mais familiarizado com as páginas de estilo de citação, considere adicionar os seus ficheiros .csl customizados para periódicos do seu campo no ficheiro como um serviço para a comunidade. + +## Resumo + +Agora, você deve ser capaz de escrever artigos em Markdown, criar rascunhos em múltiplos formatos, adicionar bibliografias e facilmente mudar os estilos de citação. Um último olhar no diretório do projeto revelará vários ficheiros de origem: o ficheiro `main.md`, o ficheiro `projeto.bib`, o ficheiro `mla.csl` e algumas imagens. Além dos ficheiros de origem, deve haver alguns ficheiros "destino" que criamos ao longo desse tutorial: `main.docx` ou `main.pdf`. A sua pasta deve se parecer com isso: + +``` +Pandoc-tutorial/ + main.md + projeto.bib + mla.csl + imagem.jpg + main.docx +``` + +Trate seus ficheiros de origem como uma versão autorizada de seu texto e seus ficheiros de destino como “impressões” descartáveis que podem ser geradas facilmente com o Pandoc em tempo real. Todas as revisões devem ser feitas no `main.md`. 
O ficheiro `main.docx` está lá para formatação e limpeza em estágio final. Se, por exemplo, o periódico requisitar manuscritos com espaçamento duplo, é possível rapidamente colocar o espaçamento duplo no Open Office ou Microsoft Word. Mas não gaste muito tempo formatando. Lembre-se, tudo é retirado quando o seu manuscrito vai para a impressão. O tempo gasto em formatação desnecessária pode ser usado melhorando a prosa do seu rascunho. + +## Recursos úteis + +Se tiver problemas, não há lugar melhor para começar a procurar ajuda do que o [site do Pandoc](https://pandoc.org/) de John MacFarlane e a [lista de e-mails](https://groups.google.com/forum/#!forum/pandoc-discuss) associados. Pelo menos dois sites do tipo “Pergunta e Resposta” podem responder a perguntas no Pandoc: [Stack Overflow](https://stackoverflow.com/questions/tagged/pandoc) e [Digital Humanities Q&A](https://web.archive.org/web/20190203062832/https://digitalhumanities.org/answers/). As perguntas também podem ser feitas ao vivo, no Freenode IRC, canal #Pandoc, frequentado por um amigável grupo de regulares. Conforme aprender mais sobre o Pandoc, também pode explorar um de seus recursos mais poderosos: [filtros](https://github.com/jgm/pandoc/wiki/Pandoc-Filters). + +Embora nossa sugestão seja começar com um editor simples, muitas (mais de 70, de acordo com [esta postagem do blog](https://web.archive.org/web/20140120195538/http://mashable.com/2013/06/24/markdown-tools/)) outras alternativas específicas do Markdown para o MS Word estão disponíveis online, e muitas vezes sem custo. Dos autônomos, gostamos de [Write Monkey](https://web.archive.org/web/20260327163157/http://writemonkey.com/) e [Sublime Text](https://www.sublimetext.com/). Várias plataformas baseadas na web surgiram recentemente que fornecem interfaces gráficas elegantes para escrita colaborativa e controle de versão usando Markdown. 
Algumas delas são: [prose.io](https://prose.io/), [Authorea](https://www.authorea.com/), [Draft](https://www.draftin.com/) e [StackEdit](https://stackedit.io/). + +Mas o ecossistema não é limitado a editores. [Gitit](https://gitit.net/) e [Ikiwiki](https://github.com/dubiousjim/pandoc-iki) suportam autoria em Markdown com Pandoc como analisador. Podemos incluir nesta lista uma série de ferramentas que geram páginas da Web estáticas e rápidas, [Yst](https://github.com/jgm/yst), [Jekyll](https://github.com/fauno/jekyll-pandoc-multiple-formats), [Hakyll](https://jaspervdj.be/hakyll/) e o [script de shell bash](https://github.com/wcaleb/website) do historiador Caleb McDaniel. + +Por fim, plataformas de publicação completas estão se formando ao redor do uso de Markdown. O Markdown na plataforma de marketplace [Leanpub](https://leanpub.com/) pode ser uma alternativa interessante ao modelo de publicação tradicional. E nós mesmos estamos experimentando o design de periódicos acadêmicos com base no GitHub e [readthedocs.org](https://readthedocs.org/) (ferramentas geralmente usadas para documentação técnica). + + +### Notas +[^1]: Não se preocupe se não entender essa terminologia ainda! +[^2]: Os ficheiros fonte para essa documentação podem ser [baixados no GitHub](https://github.com/dh-notes/pandoc-workflow). Use a opção "raw" quando visualizar no GitHub para ver o Markdown fonte. Os autores gostariam de agradecer a Alex Gil e seus colegas do Digital Humanities Center de Columbia e aos participantes do openLab no Studio na biblioteca Butler por testar o código deste tutorial em uma variedade de plataformas. +[^3]: Veja a excelente discussão de Charlie Stross sobre esse tópico em [Porque Microsoft Word Deve Morrer (em inglês)](https://www.antipope.org/charlie/blog-static/2013/10/why-microsoft-word-must-die.html). +[^4]: Não existem boas soluções para chegar diretamente no MS Word a partir do LaTeX. 
+[^5]: É uma boa ideia criar o hábito de não usar espaços em nomes de pastas ou ficheiros. Traços ou sublinhados ao invés de espaços nos nomes de seus ficheiros garantem uma duradoura compatibilidade entre plataformas. +[^6]: Note que a extensão .bib pode estar "registrada" no Zotero no seu sistema operacional. Isso significa que quando se clica em um ficheiro .bib é provável que se chame o Zotero para abri-lo, enquanto nós queremos abrir com o editor de texto. Eventualmente, pode querer associar a extensão .bib ao seu editor de texto. +[^7]: Agradeço a [@njbart](https://github.com/njbart) pela correção. Em resposta a nossa sugestão original, `Algumas frases precisam de citação.^[@fyfe_digital_2011 argumenta isso também.]`, [ele escreve](https://github.com/programminghistorian/jekyll/issues/46#issue-45559983): “Isso não é recomendado, pois evita que se alterne facilmente entre os estilos de nota de rodapé e data do autor. É melhor usar o [corrigido] (sem circunflexo, sem ponto final entre colchetes e a pontuação final da frase do texto após os colchetes; com estilos de notas de rodapé, o pandoc ajusta automaticamente a posição da pontuação final). 
” diff --git a/pt/licoes/camadas-vetoriais-qgis.md b/pt/licoes/camadas-vetoriais-qgis.md index 304861daed..a0d6575332 100644 --- a/pt/licoes/camadas-vetoriais-qgis.md +++ b/pt/licoes/camadas-vetoriais-qgis.md @@ -1,247 +1,247 @@ ---- -title: Criar novas camadas vetoriais com o QGIS 2.0 -layout: lesson -slug: camadas-vetoriais-qgis -date: 2013-12-13 -translation_date: 2021-03-30 -authors: -- Jim Clifford -- Josh MacFadyen -- Daniel Macfarlane -reviewers: -- Finn Arne Jørgensen -- Peter Webster -- Abby Schreiber -editors: -- Adam Crymble -translator: -- Rafael Laguardia -translation-editor: -- Joana Vieira Paulino -translation-reviewer: -- Luis Ferla -- Ana Alcântara -difficulty: 2 -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/365 -activity: presenting -topics: [mapping, data-visualization] -abstract: "Nesta lição, aprenderá como criar camadas vetoriais com base em mapas históricos digitalizados." -original: vector-layers-qgis -avatar_alt: Mapa de ruas da cidade -doi: 10.46430/phpt0009 ---- - -{% include toc.html %} - - - - - -## Objetivos da lição - -Nesta lição, aprenderá como criar camadas vetoriais com base em mapas históricos digitalizados. [Na introdução ao Google Maps e Google Earth](/en/lessons/googlemaps-googleearth) (em inglês), usou camadas vetoriais e criou atributos no Google Earth. Faremos o mesmo nesta lição, embora num nível mais avançado, usando o software QGIS. - -As camadas vetoriais (ou shapefiles) são, junto com as camadas raster, um dos dois tipos básicos de estruturas de armazenamento de dados. As camadas vetoriais usam as três feições1 básicas do SIG (Sistema de Informações Geográficas) - pontos, linhas e polígonos - para representar aspectos do mundo real em formato digital. Pontos podem ser usados para representar locais específicos, como cidades, edifícios, eventos, etc. 
(a escala do seu mapa determinará o que você representa como um ponto - no mapa de uma província, uma cidade seria um ponto, enquanto no mapa de uma cidade, um edifício pode ser um ponto). Linhas podem representar estradas, rios, canais, ferrovias, etc. Polígonos (formas fechadas) são usados para representar objetos mais complexos, como os limites de um lago, país, divisão administrativa ou eleitoral, etc. (novamente, a escala afetará sua escolha - grandes edifícios num mapa de pormenor de uma cidade podem ser melhor representados como polígonos do que como pontos). - -Nesta lição, criará shapefiles (que são um formato de armazenamento de dados vetoriais) para representar o desenvolvimento histórico de comunidades e estradas na Ilha Prince Edward. Cada shapefile pode ser criado como um dos três tipos de feições: ponto, linha, polígono (embora essas feições não possam ser misturadas num shapefile). Cada feição que cria num shapefile possui um conjunto correspondente de atributos, que são armazenados numa tabela de atributos. Criará feições e aprenderá como modificá-las, o que envolve não apenas a criação visual dos três tipos de feições, mas também a modificação de seus atributos. Para fazer isso, usaremos os ficheiros da lição [instalar o QGIS 2.0 e adicionaremos camadas](/en/lessons/qgis-layers) (em inglês) referentes à Ilha Prince Edward. - -## Começando - -Comece por descarregar o [mapa PEI_Holland](/assets/vector-layers-qgis/PEI_HollandMap1798_compLZW.tif) para a pasta do projeto. - -Abra o ficheiro que você salvou no final da lição [instalar o QGIS 2.0 e adicionar camadas](/en/lessons/qgis-layers) (em inglês). Deve ter as seguintes camadas na aba Camadas: - -- PEI\_placenames -- PEI\_highway -- PEI HYDRONETWORK -- 1935 inventory\_region -- coastline\_polygon -- PEI-CumminsMap1927 - -Desmarque todas essas camadas, exceto 'PEI_placenames', 'coastline_polygon' e 'PEI_CumminsMap1927'. 
- -{% include figure.html filename="pei1.png" caption="Figura 1" %} - -Agora vamos adicionar um segundo mapa histórico como uma camada raster. - -{% include figure.html filename="pei2.png" caption="Figura 2" %} - -- Em Camada na barra de ferramentas, escolha Adicionar Camada Raster (alternativamente, o mesmo ícone que vê ao lado de 'Adicionar Camada Raster' também pode ser selecionado na barra de ferramentas) -- Encontre o ficheiro que descarregou intitulado 'PEI_HollandMap1798' -- Ser-lhe-á solicitado que defina o sistema de coordenadas desta camada. Na caixa de filtro, pesquise por '2291' e, na caixa abaixo, selecione 'NAD83 (CSRS98) / Prince Edward Isl. Stereographic' -- Se não lhe for solicitado que defina o sistema de coordenadas da camada, será necessário alterar uma configuração. Clique em 'Configurações' e, em seguida, em 'Opções'. Clique em 'CRS' no menu à direita e escolha 'Solicitar CRS' a partir das opções abaixo. 'Quando uma nova camada é criada, ou quando uma camada é carregada sem CRS'. Clique 'OK'. Remova a camada 'PEI_HollandMap1798' (clique com o botão direito sobre ela e clique em Remover) e tente adicioná-la novamente. Desta vez, deve-lhe ser solicitado que forneça um 'CRS' e pode selecionar a opção 'NAD83' (veja acima). - -{% include figure.html filename="Figura3.jpg" caption="Figura 3" %} - -Nas etapas anteriores, selecionou e desmarcou camadas na janela 'Camadas' marcando e desmarcando as caixas ao lado delas. Essas camadas são organizadas em ordem decrescente de visibilidade. Ou seja, a camada superior é a camada superior da janela do visualizador (desde que esteja selecionada). Pode arrastar as camadas para cima e para baixo na janela de camadas para alterar a ordem em que ficarão visíveis na janela de visualização. A camada raster 'litoral_polygon' não está visível no momento porque está abaixo das camadas 'PEI_HollandMap1798' e 'PEI_Cummins1927'. Em geral, é melhor manter as camadas vetoriais acima das camadas raster. 
- -Desmarque 'PEI_Cummins1927' para que a única camada restante seja 'PEI_HollandMap1798'. Observe que o mapa aparece torto na tela; isso ocorre porque já foi georreferenciado pelos redatores da lição para coincidir com as camadas vetoriais de SIG. Saiba mais sobre georreferenciamento em [georreferenciamento no QGIS 2.0](/en/lessons/georeferencing-qgis) (em inglês). - -{% include figure.html filename="pei4.png" caption="Figura 4" %} - -Agora criaremos um shapefile de pontos, que é uma camada vetorial. Clique em 'Camada' -> 'Nova' -> 'Nova Camada Shapefile' - -- Alternativamente, pode selecionar o ícone 'Nova camada Shapefile' no topo da janela da barra de ferramentas QGIS - -{% include figure.html filename="Figura5.jpg" caption="Figura 5" %} - -Depois de selecionar 'Nova Camada Shapefile', aparece uma janela intitulada 'Nova Camada Vetorial' - -- Na categoria 'Tipo', 'ponto' já está selecionado. Clique no botão 'Especificar CRS' e selecione 'NAD83 (CSRS98) / Prince Edward Isl. Estereográfico (EPSG: 2291)' e, em seguida, clique em OK (para obter informações sobre como [entender e selecionar a zona UTM](https://perma.cc/TA7Z-V3SZ)). - -{% include figure.html filename="Figura6.jpg" caption="Figura 6" %} - -Retornando à janela 'Nova Camada vetorial', iremos criar alguns atributos. 
Para criar o primeiro atributo: - -- Em 'Novo atributo', no campo ao lado de 'Nome', digite 'Nome_Assentamento' (observe que ao trabalhar em bancos de dados não pode usar espaços vazios nos nomes, por isso a convenção é usar sublinhados em seus lugares) -- Clique em 'Adicionar' à lista de atributos - -Agora vamos criar um segundo atributo: - -- Em 'Novo Atributo', no campo ao lado de 'Nome', digite 'Ano' -- Desta vez, vamos mudar o 'Tipo' para 'Número Inteiro' -- Clique em 'Adicionar à lista de atributos' - -Para o terceiro atributo: - -- Sob Novo atributo, no campo ao lado de Nome, digite 'Ano_Final' (o SIG nem sempre é ideal para lidar com mudanças ao longo do tempo, então em alguns casos é importante ter um campo para identificar aproximadamente quando algo deixou de existir) -- Mude o 'Tipo' novamente para 'Número Inteiro' -- Clique em Adicionar à lista de atributos - -{% include figure.html filename="Figura7.jpg" caption="Figura 7" %} - -- Ao concluir essas três etapas, termine de criar esse shapefile clicando em OK na parte inferior direita da janela 'Nova Camada Vetorial'. Um 'pop-up' irá surgir, nomeie-o de 'Assentamentos' e salve-o com os seus outros ficheiros SIG. - -Observe que uma camada chamada 'Assentamentos' agora aparece na janela 'Camadas'. Reposicione-a acima das camadas raster. - -{% include figure.html filename="Figura8.jpg" caption="Figura 8" %} - -Desmarque todas as camadas, exceto 'Assentamentos'. A janela de visualização agora está em branco, pois não criaámos nenhum dado. Agora criaremos novos dados do 'PEI_CumminsMap1927' e do 'PEI_HollandMap 1798' para mostrar o aumento da ocupação entre o final do século XVIII e o início do século XX. - -- Nós começaremos com o mapa mais recente e, portanto, geralmente mais preciso. Selecione novamente (ou seja, marque as caixas ao lado) 'coast_polygon' e 'PEI_CumminsMap1927'. 
-- Na janela de visualização, aumente o 'Zoom' em 'Charlottetown' (dica: 'Charlottetown' fica perto do meio da ilha no lado sul, na confluência de três rios). -- Selecione a camada de 'Assentamentos' na janela 'Camadas'. -- Na barra de menu, selecione 'Alternar Edição'. - -{% include figure.html filename="pei9.png" caption="Figura 9" %} - -- Depois de selecionar 'Alternar Edição', os botões de edição ficarão disponíveis à direita na barra de menus. Selecione o botão de feição com 'três pontos'. - -{% include figure.html filename="pei10.png" caption="Figura 10" %} - -- O cursor aparece agora como uma cruz - aponte a cruz para 'Charlottetown' (se por acaso não conhecer a geografia do 'PEI', pode ter ajuda adicionando a camada 'PEI_nomes de local'), mantendo-a dentro da linha costeira atual e clique (a digitalização é sempre um compromisso entre precisão e funcionalidade; dependendo da qualidade do mapa original e da digitalização, para a maioria das aplicações históricas, a precisão extrema não é necessária). -- Uma janela de atributos aparecerá. Deixe o campo 'id' em branco (no momento da escrita, o QGIS criará dois campos 'id' e este é desnecessário). No campo 'Assentamento', digite 'Charlottetown'. No campo 'Ano', digite '1764'. Clique em 'OK'. -Vamos agora repetir as etapas que realizámos com 'Charlottetown' para 'Montague', 'Summerside' e 'Cavendish' (novamente, pode encontrar esses locais adicionando as camadas 'PEI_nomes de local'). Encontre 'Montague' no mapa, selecione o botão de feição com 'três pontos' e clique em Montague no mapa. Quando a janela 'Atributos' aparecer, insira 'Montague' e '1732' nos campos apropriados. Repita para 'Summerside (1876)' e 'Cavendish (1790)'. - -{% include figure.html filename="Figura11.jpg" caption="Figura 11" %} - -Na janela 'Camadas', desmarque 'PEI_CumminsMap1927' e selecione 'PEI_HollandMap1798'. Agora vamos identificar dois assentamentos ('Princetown' e 'Havre-St-Pierre') que já não existem. 
- -- Para localizar 'Princetown', procure 'Richmond Bay' e 'Cape Aylebsury' (na costa norte a oeste de 'Cavendish'), aqui você encontrará 'Princetown' (sombreado) perto da fronteira entre o amarelo e o azul. - -- Se consultar a [entrada da Wikipedia](https://pt.wikipedia.org/wiki/Ilha_do_Pr%C3%ADncipe_Eduardo) desta cidade, notará que por causa de um porto raso, 'Princetown' não se tornou um assentamento importante. Foi renomeado em 1947 e, posteriormente, rebaixado para uma aldeia. Por esse motivo, incluiremos 1947 como a data final para este assentamento. - -- Com o cursor do mouse (em formato de cruz), clique em 'Princetown'. Na 'tabela de atributos' que aparece, coloque 'Princetown' no campo 'Assentamento', coloque '1764' no campo 'Ano' e coloque '1947' em 'Ano_Final'. Clique 'OK'. - -{% include figure.html filename="Figura12.jpg" caption="Figura 12" %} - -- Clique no ícone 'Salvar edições' na barra de menu (fica entre 'Alternar' e 'Adicione Feição'). - -- Clique duas vezes na camada de 'Assentamentos' na janela 'Camadas', escolha a guia 'Etiquetas' na parte superior da janela seguinte. Clique na caixa ao lado de 'Mostrar etiquetas'. Em Campo contendo rótulo, selecione 'Ano' (se necessário), altere o tamanho da fonte para 18,0, altere 'Posicionamento para Acima à esquerda' e clique em 'OK'. - -Na costa norte do 'lote 39', entre 'Britain's Pond' e 'St. Peters Bay', colocaremos agora um ponto para a localização de uma aldeia há muito perdida chamada 'Havre-St-Pierre'. - -- 'Havre-St-Pierre' foi o primeiro assentamento acadiano da ilha, mas está desabitado desde a deportação dos acadianos em 1758. - -- Com o cursor do mouse (em formato de cruz), clique em 'Havre-St. Pierre'. Na 'tabela de Atributos' que aparece, coloque 'Havre-St-Pierre' no campo 'Assentamento', coloque '1720' no campo 'Ano' e '1758' em 'Ano_Final'. Clique 'OK'. - -{% include figure.html filename="pei13.png" caption="Figura 13" %} - -Agora vamos criar outra camada vetorial: um vetor linha. 
Clique em 'Camada' -> 'Nova' -> 'Nova Camada Shapefile'. A janela 'Nova Camada Vetorial' aparecerá (na categoria 'Tipo', no topo, selecione 'Linha') - -- Clique no botão 'Especificar CRS' e selecione 'NAD83 (CSRS98) / Prince Edward Isl. Estereográfico (EPSG: 2291)' e clique em 'OK'. -- Em 'Novo atributo', no campo ao lado de 'Nome', digite 'Nome_Estrada'. -- Clique em 'Adicionar campos à lista'. - -Crie um segundo atributo: - -- Em 'Novo atributo', no campo ao lado de 'Nome', digite 'Ano'. -- Mude o 'Tipo' para 'Número Inteiro'. -- Clique em 'Adicionar à lista de Atributos'. -- Para terminar de criar este ficheiro, clique em 'OK' na parte inferior direita da janela 'Nova Camada Vetorial'. Uma tela para 'salvar' aparece - chame-a de 'estradas' e salve-a com seus outros ficheiros SIG. - -Vamos agora traçar as estradas do 'mapa de 1798' para que possamos compará-las com as estradas atuais. Certifique-se de ter as camadas 'PEI_Holland1798' e 'Assentamentos' marcadas na janela de 'Camadas'. Selecione a camada 'estradas' na janela de 'camadas', selecione 'Alternar Edição' na barra de ferramentas superior e selecione 'Adicionar Feição'. - -{% include figure.html filename="pei14.png" caption="Figura 14" %} - -- Primeiro trace a estrada de 'Charlottetown' a 'Princetown'. Clique em 'Charlottetown' e depois clique repetidamente em pontos ao longo da estrada para 'Princetown' e verá a linha a ser criada. Repita até chegar a 'Princetown' e clique com o botão direito. Na janela 'Atributos' - estrada que aparece, no campo 'Nome', insira 'para Princetown' e no campo 'Ano' insira '1798'. Clique em 'OK'. - -{% include figure.html filename="pei15.png" caption="Figura 15" %} - -- Repita esta etapa para mais 3 a 4 estradas encontradas no 'PEI_HollandMap1798'. - -- Clique em 'Salvar mudanças' e, em seguida, clique em 'Alternar Edição' para desligá-lo. - -Desmarque 'PEI_HollandMap1798' na janela 'Camadas' e selecione o mapa 'PEI_highway'. 
Compare as estradas representadas no mapa 'PEI_highway' (as linhas vermelhas pontilhadas) com as estradas que você acabou de traçar. - -{% include figure.html filename="pei16.png" caption="Figura 16" %} - -- Podemos ver que algumas dessas estradas correspondem exatamente às estradas atuais, enquanto outras não correspondem de forma alguma. Seriam necessárias mais pesquisas históricas para determinar se isso ocorre simplesmente porque o mapa da Holanda não representa suficientemente as estradas na época, ou se as estradas mudaram consideravelmente desde então. - -Agora crie um terceiro tipo de camada vetorial: um vetor poligonal. Clique em 'Camada' -> 'Nova' -> 'Nova Camada Vetorial'. A janela 'Nova Camada Vetorial' aparecerá - na categoria 'Tipo', no topo, selecione 'Polígono'. - -- Clique no botão 'Selecione o SRC' e selecione 'NAD83 (CSRS98) / Prince Edward Isl. Estereográfico (EPSG: 2291)' e clique em 'OK'. -- Em 'Novo Atributo', no campo ao lado de 'Nome', digite 'nome_lote' no campo ao lado de 'Ano'. -- Clique em 'Adicionar campos à lista'. - -Crie um segundo atributo: - -- Em 'Novo atributo', no campo ao lado de 'Nome', digite 'Ano'. -- Mude o 'Tipo' para 'Número Inteiro'. -- Clique em 'Adicionar à lista de Atributos'. - -{% include figure.html filename="Figura17.jpg" caption="Figura 17" %} - -Comece criando um polígono para o 'Lote 66', que é o único lote retangular na ilha. - -- Clique em 'Alternar Edição' na barra de ferramentas superior e, em seguida, clique em 'Adicionar Feição'. -- Clique nos quatro cantos do 'lote 66' e você verá um polígono criado. -- Clique com o botão direito no canto final e uma janela de 'Atributos' aparecerá. Adicione '66' ao campo 'nome_lote' e adicione '1764' (o ano em que esses lotes foram inventariados) ao campo 'Ano'. - -{% include figure.html filename="Figura18.jpg" caption="Figura 18" %} - -Agora vamos rastrear o 'Lote 38', que fica a oeste de 'Havre-St-Pierre'. 
Certifique-se de que há uma marca de seleção na caixa ao lado da camada 'PEI_HollandMap1798' na janela 'Camadas'. - -Clique em 'Alternar Edição' na barra de ferramentas superior e, em seguida, clique em 'Adicionar Feição'. - -Trace o contorno do 'Lote 38', que é mais difícil por causa da linha costeira, com a maior precisão possível. Para mostrar a feição 'Ajuste', queremos que trace ao longo da costa atual (o 'ajuste' é uma operação de edição automática que ajusta a feição que você desenhou para coincidir ou alinhar exatamente com as coordenadas e forma de outra feição próxima). - -- Selecione 'Configurações'-> 'Opções de Ajuste'. - -{% include figure.html filename="Figura19.jpg" caption="Figura 19" %} - -- Uma janela de 'opções de ajuste' irá abrir: clique na caixa ao lado de 'coast_polygon', para a categoria 'Modo' selecione 'vértice e segmento', para 'Tolerância' selecione '10.0', e para 'Unidades' selecione 'pixels'. Clique 'OK'. -- -{% include figure.html filename="Figura20.jpg" caption="Figura 20" %} - -Certifique-se de que a camada de 'lotes' esteja selecionada na janela 'Camadas' e selecione 'Adicionar feição' na barra de ferramentas. - -- Com o cursor, clique nos dois cantos inferiores do polígono, assim como fez com o 'lote 38'. Na linha costeira, você notará que tem uma coleção de linhas para traçar ao redor do 'Savage Harbour'. É aqui que os recursos de aderência se tornam úteis. Enquanto traçar a linha ao longo da costa atual, sua precisão aumentará significativamente, encaixando os 'cliques' diretamente no topo da linha existente. Quanto mais 'cliques' você fizer, mais preciso será, mas tenha em mente que, para muitos fins de SIGH (SIG histórico), obter extrema precisão às vezes produz retornos decrescentes. - -{% include figure.html filename="pei21.png" caption="Figura 21" %} - -Quando terminar de traçar e criar o polígono, selecione e desmarque as várias 'camadas' que criou, comparando e vendo quais relações pode deduzir. 
-No Google Earth, havia limitações nos tipos de 'feições', 'atributos' e dados fornecidos, e o Google Earth fez grande parte do trabalho por si. Isso é bom quando está aprendendo ou deseja criar mapas rapidamente. A vantagem de usar o software QGIS para criar novas camadas vetoriais é a liberdade e controle sobre os tipos de dados que se pode usar e as 'feições' e 'atributos' que se podem criar. Assim, é possível criar mapas personalizados e ir muito além do que pode ser alcançado no Google Earth ou no Google Maps Engine Lite. Viu isso em primeira mão com as camadas vetoriais de pontos, linhas e polígonos que aprendeu a criar nesta lição. Se tiver dados sobre, por exemplo, registros de saúde pública no século XVIII, pode criar uma nova camada mostrando a distribuição de surtos de febre tifoide e ver se há correlações com estradas e assentamentos principais. Além disso, o software SIG permite não apenas representar e apresentar dados espaciais de maneiras mais sofisticadas, mas também analisar e criar novos dados que não seriam possíveis de outra forma. - -**Aprendeu como criar camadas vetoriais. Certifique-se de salvar seu trabalho!** - -1 É possível identificar a palavra 'feição', em traduções no QGIS BR, ao referir os três tipos de 'formas' ou 'geometrias' usadas nas camadas vetoriais dos SIG. Mas, isto cria uma diferença entre as versões do QGIS BR e QGIS PT. 
- -*Esta lição é parte do [Geospatial Historian][].* - - [Intro to Google Maps and Google Earth]: /lessons/googlemaps-googleearth - [Installing QGIS 2.0 and Adding Layers]: /lessons/qgis-layers - [PEI_Holland map]: /assets/vector-layers-qgis/PEI_HollandMap1798_compLZW.tif - [Georeferencing in QGIS 2.0]: /lessons/georeferencing-qgis - [Wikipedia entry]: http://en.wikipedia.org/wiki/Prince_Royalty,_Prince_Edward_Island - [Geospatial Historian]: http://geospatialhistorian.wordpress.com/ +--- +title: Criar novas camadas vetoriais com o QGIS 2.0 +layout: lesson +slug: camadas-vetoriais-qgis +date: 2013-12-13 +translation_date: 2021-03-30 +authors: +- Jim Clifford +- Josh MacFadyen +- Daniel Macfarlane +reviewers: +- Finn Arne Jørgensen +- Peter Webster +- Abby Schreiber +editors: +- Adam Crymble +translator: +- Rafael Laguardia +translation-editor: +- Joana Vieira Paulino +translation-reviewer: +- Luis Ferla +- Ana Alcântara +difficulty: 2 +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/365 +activity: presenting +topics: [mapping, data-visualization] +abstract: "Nesta lição, aprenderá como criar camadas vetoriais com base em mapas históricos digitalizados." +original: vector-layers-qgis +avatar_alt: Mapa de ruas da cidade +doi: 10.46430/phpt0009 +--- + +{% include toc.html %} + + + + + +## Objetivos da lição + +Nesta lição, aprenderá como criar camadas vetoriais com base em mapas históricos digitalizados. [Na introdução ao Google Maps e Google Earth](/en/lessons/googlemaps-googleearth) (em inglês), usou camadas vetoriais e criou atributos no Google Earth. Faremos o mesmo nesta lição, embora num nível mais avançado, usando o software QGIS. + +As camadas vetoriais (ou shapefiles) são, junto com as camadas raster, um dos dois tipos básicos de estruturas de armazenamento de dados. 
As camadas vetoriais usam as três feições1 básicas do SIG (Sistema de Informações Geográficas) - pontos, linhas e polígonos - para representar aspectos do mundo real em formato digital. Pontos podem ser usados para representar locais específicos, como cidades, edifícios, eventos, etc. (a escala do seu mapa determinará o que você representa como um ponto - no mapa de uma província, uma cidade seria um ponto, enquanto no mapa de uma cidade, um edifício pode ser um ponto). Linhas podem representar estradas, rios, canais, ferrovias, etc. Polígonos (formas fechadas) são usados para representar objetos mais complexos, como os limites de um lago, país, divisão administrativa ou eleitoral, etc. (novamente, a escala afetará sua escolha - grandes edifícios num mapa de pormenor de uma cidade podem ser melhor representados como polígonos do que como pontos). + +Nesta lição, criará shapefiles (que são um formato de armazenamento de dados vetoriais) para representar o desenvolvimento histórico de comunidades e estradas na Ilha Prince Edward. Cada shapefile pode ser criado como um dos três tipos de feições: ponto, linha, polígono (embora essas feições não possam ser misturadas num shapefile). Cada feição que cria num shapefile possui um conjunto correspondente de atributos, que são armazenados numa tabela de atributos. Criará feições e aprenderá como modificá-las, o que envolve não apenas a criação visual dos três tipos de feições, mas também a modificação de seus atributos. Para fazer isso, usaremos os ficheiros da lição [instalar o QGIS 2.0 e adicionaremos camadas](/en/lessons/qgis-layers) (em inglês) referentes à Ilha Prince Edward. + +## Começando + +Comece por descarregar o [mapa PEI_Holland](/assets/vector-layers-qgis/PEI_HollandMap1798_compLZW.tif) para a pasta do projeto. + +Abra o ficheiro que você salvou no final da lição [instalar o QGIS 2.0 e adicionar camadas](/en/lessons/qgis-layers) (em inglês). 
Deve ter as seguintes camadas na aba Camadas: + +- PEI\_placenames +- PEI\_highway +- PEI HYDRONETWORK +- 1935 inventory\_region +- coastline\_polygon +- PEI-CumminsMap1927 + +Desmarque todas essas camadas, exceto 'PEI_placenames', 'coastline_polygon' e 'PEI_CumminsMap1927'. + +{% include figure.html filename="pei1.png" caption="Figura 1" %} + +Agora vamos adicionar um segundo mapa histórico como uma camada raster. + +{% include figure.html filename="pei2.png" caption="Figura 2" %} + +- Em Camada na barra de ferramentas, escolha Adicionar Camada Raster (alternativamente, o mesmo ícone que vê ao lado de 'Adicionar Camada Raster' também pode ser selecionado na barra de ferramentas) +- Encontre o ficheiro que descarregou intitulado 'PEI_HollandMap1798' +- Ser-lhe-á solicitado que defina o sistema de coordenadas desta camada. Na caixa de filtro, pesquise por '2291' e, na caixa abaixo, selecione 'NAD83 (CSRS98) / Prince Edward Isl. Stereographic' +- Se não lhe for solicitado que defina o sistema de coordenadas da camada, será necessário alterar uma configuração. Clique em 'Configurações' e, em seguida, em 'Opções'. Clique em 'CRS' no menu à direita e escolha 'Solicitar CRS' a partir das opções abaixo. 'Quando uma nova camada é criada, ou quando uma camada é carregada sem CRS'. Clique 'OK'. Remova a camada 'PEI_HollandMap1798' (clique com o botão direito sobre ela e clique em Remover) e tente adicioná-la novamente. Desta vez, deve-lhe ser solicitado que forneça um 'CRS' e pode selecionar a opção 'NAD83' (veja acima). + +{% include figure.html filename="Figura3.jpg" caption="Figura 3" %} + +Nas etapas anteriores, selecionou e desmarcou camadas na janela 'Camadas' marcando e desmarcando as caixas ao lado delas. Essas camadas são organizadas em ordem decrescente de visibilidade. Ou seja, a camada superior é a camada superior da janela do visualizador (desde que esteja selecionada). 
Pode arrastar as camadas para cima e para baixo na janela de camadas para alterar a ordem em que ficarão visíveis na janela de visualização. A camada 'coastline_polygon' não está visível no momento porque está abaixo das camadas 'PEI_HollandMap1798' e 'PEI_Cummins1927'. Em geral, é melhor manter as camadas vetoriais acima das camadas raster. + +Desmarque 'PEI_Cummins1927' para que a única camada restante seja 'PEI_HollandMap1798'. Observe que o mapa aparece torto na tela; isso ocorre porque já foi georreferenciado pelos redatores da lição para coincidir com as camadas vetoriais de SIG. Saiba mais sobre georreferenciamento em [georreferenciamento no QGIS 2.0](/en/lessons/georeferencing-qgis) (em inglês). + +{% include figure.html filename="pei4.png" caption="Figura 4" %} + +Agora criaremos um shapefile de pontos, que é uma camada vetorial. Clique em 'Camada' -> 'Nova' -> 'Nova Camada Shapefile' + +- Alternativamente, pode selecionar o ícone 'Nova camada Shapefile' no topo da janela da barra de ferramentas QGIS + +{% include figure.html filename="Figura5.jpg" caption="Figura 5" %} + +Depois de selecionar 'Nova Camada Shapefile', aparece uma janela intitulada 'Nova Camada Vetorial' + +- Na categoria 'Tipo', 'ponto' já está selecionado. Clique no botão 'Especificar CRS' e selecione 'NAD83 (CSRS98) / Prince Edward Isl. Estereográfico (EPSG: 2291)' e, em seguida, clique em OK (para obter informações sobre como [entender e selecionar a zona UTM](https://perma.cc/TA7Z-V3SZ)). + +{% include figure.html filename="Figura6.jpg" caption="Figura 6" %} + +Retornando à janela 'Nova Camada vetorial', iremos criar alguns atributos.
Para criar o primeiro atributo: + +- Em 'Novo atributo', no campo ao lado de 'Nome', digite 'Nome_Assentamento' (observe que ao trabalhar em bancos de dados não pode usar espaços vazios nos nomes, por isso a convenção é usar sublinhados em seus lugares) +- Clique em 'Adicionar' à lista de atributos + +Agora vamos criar um segundo atributo: + +- Em 'Novo Atributo', no campo ao lado de 'Nome', digite 'Ano' +- Desta vez, vamos mudar o 'Tipo' para 'Número Inteiro' +- Clique em 'Adicionar à lista de atributos' + +Para o terceiro atributo: + +- Sob Novo atributo, no campo ao lado de Nome, digite 'Ano_Final' (o SIG nem sempre é ideal para lidar com mudanças ao longo do tempo, então em alguns casos é importante ter um campo para identificar aproximadamente quando algo deixou de existir) +- Mude o 'Tipo' novamente para 'Número Inteiro' +- Clique em Adicionar à lista de atributos + +{% include figure.html filename="Figura7.jpg" caption="Figura 7" %} + +- Ao concluir essas três etapas, termine de criar esse shapefile clicando em OK na parte inferior direita da janela 'Nova Camada Vetorial'. Um 'pop-up' irá surgir, nomeie-o de 'Assentamentos' e salve-o com os seus outros ficheiros SIG. + +Observe que uma camada chamada 'Assentamentos' agora aparece na janela 'Camadas'. Reposicione-a acima das camadas raster. + +{% include figure.html filename="Figura8.jpg" caption="Figura 8" %} + +Desmarque todas as camadas, exceto 'Assentamentos'. A janela de visualização agora está em branco, pois não criámos nenhum dado. Agora criaremos novos dados do 'PEI_CumminsMap1927' e do 'PEI_HollandMap1798' para mostrar o aumento da ocupação entre o final do século XVIII e o início do século XX. + +- Nós começaremos com o mapa mais recente e, portanto, geralmente mais preciso. Selecione novamente (ou seja, marque as caixas ao lado) 'coast_polygon' e 'PEI_CumminsMap1927'.
+- Na janela de visualização, aumente o 'Zoom' em 'Charlottetown' (dica: 'Charlottetown' fica perto do meio da ilha no lado sul, na confluência de três rios). +- Selecione a camada de 'Assentamentos' na janela 'Camadas'. +- Na barra de menu, selecione 'Alternar Edição'. + +{% include figure.html filename="pei9.png" caption="Figura 9" %} + +- Depois de selecionar 'Alternar Edição', os botões de edição ficarão disponíveis à direita na barra de menus. Selecione o botão de feição com 'três pontos'. + +{% include figure.html filename="pei10.png" caption="Figura 10" %} + +- O cursor aparece agora como uma cruz - aponte a cruz para 'Charlottetown' (se por acaso não conhecer a geografia do 'PEI', pode ter ajuda adicionando a camada 'PEI_nomes de local'), mantendo-a dentro da linha costeira atual e clique (a digitalização é sempre um compromisso entre precisão e funcionalidade; dependendo da qualidade do mapa original e da digitalização, para a maioria das aplicações históricas, a precisão extrema não é necessária). +- Uma janela de atributos aparecerá. Deixe o campo 'id' em branco (no momento da escrita, o QGIS criará dois campos 'id' e este é desnecessário). No campo 'Assentamento', digite 'Charlottetown'. No campo 'Ano', digite '1764'. Clique em 'OK'. +Vamos agora repetir as etapas que realizámos com 'Charlottetown' para 'Montague', 'Summerside' e 'Cavendish' (novamente, pode encontrar esses locais adicionando as camadas 'PEI_nomes de local'). Encontre 'Montague' no mapa, selecione o botão de feição com 'três pontos' e clique em Montague no mapa. Quando a janela 'Atributos' aparecer, insira 'Montague' e '1732' nos campos apropriados. Repita para 'Summerside (1876)' e 'Cavendish (1790)'. + +{% include figure.html filename="Figura11.jpg" caption="Figura 11" %} + +Na janela 'Camadas', desmarque 'PEI_CumminsMap1927' e selecione 'PEI_HollandMap1798'. Agora vamos identificar dois assentamentos ('Princetown' e 'Havre-St-Pierre') que já não existem. 
+ +- Para localizar 'Princetown', procure 'Richmond Bay' e 'Cape Aylebsury' (na costa norte a oeste de 'Cavendish'), aqui você encontrará 'Princetown' (sombreado) perto da fronteira entre o amarelo e o azul. + +- Se consultar a [entrada da Wikipedia](https://pt.wikipedia.org/wiki/Ilha_do_Pr%C3%ADncipe_Eduardo) desta cidade, notará que por causa de um porto raso, 'Princetown' não se tornou um assentamento importante. Foi renomeado em 1947 e, posteriormente, rebaixado para uma aldeia. Por esse motivo, incluiremos 1947 como a data final para este assentamento. + +- Com o cursor do mouse (em formato de cruz), clique em 'Princetown'. Na 'tabela de atributos' que aparece, coloque 'Princetown' no campo 'Assentamento', coloque '1764' no campo 'Ano' e coloque '1947' em 'Ano_Final'. Clique 'OK'. + +{% include figure.html filename="Figura12.jpg" caption="Figura 12" %} + +- Clique no ícone 'Salvar edições' na barra de menu (fica entre 'Alternar' e 'Adicione Feição'). + +- Clique duas vezes na camada de 'Assentamentos' na janela 'Camadas', escolha a guia 'Etiquetas' na parte superior da janela seguinte. Clique na caixa ao lado de 'Mostrar etiquetas'. Em Campo contendo rótulo, selecione 'Ano' (se necessário), altere o tamanho da fonte para 18,0, altere 'Posicionamento para Acima à esquerda' e clique em 'OK'. + +Na costa norte do 'lote 39', entre 'Britain's Pond' e 'St. Peters Bay', colocaremos agora um ponto para a localização de uma aldeia há muito perdida chamada 'Havre-St-Pierre'. + +- 'Havre-St-Pierre' foi o primeiro assentamento acadiano da ilha, mas está desabitado desde a deportação dos acadianos em 1758. + +- Com o cursor do mouse (em formato de cruz), clique em 'Havre-St. Pierre'. Na 'tabela de Atributos' que aparece, coloque 'Havre-St-Pierre' no campo 'Assentamento', coloque '1720' no campo 'Ano' e '1758' em 'Ano_Final'. Clique 'OK'. + +{% include figure.html filename="pei13.png" caption="Figura 13" %} + +Agora vamos criar outra camada vetorial: um vetor linha. 
Clique em 'Camada' -> 'Nova' -> 'Nova Camada Shapefile'. A janela 'Nova Camada Vetorial' aparecerá (na categoria 'Tipo', no topo, selecione 'Linha') + +- Clique no botão 'Especificar CRS' e selecione 'NAD83 (CSRS98) / Prince Edward Isl. Estereográfico (EPSG: 2291)' e clique em 'OK'. +- Em 'Novo atributo', no campo ao lado de 'Nome', digite 'Nome_Estrada'. +- Clique em 'Adicionar campos à lista'. + +Crie um segundo atributo: + +- Em 'Novo atributo', no campo ao lado de 'Nome', digite 'Ano'. +- Mude o 'Tipo' para 'Número Inteiro'. +- Clique em 'Adicionar à lista de Atributos'. +- Para terminar de criar este ficheiro, clique em 'OK' na parte inferior direita da janela 'Nova Camada Vetorial'. Uma tela para 'salvar' aparece - chame-a de 'estradas' e salve-a com seus outros ficheiros SIG. + +Vamos agora traçar as estradas do 'mapa de 1798' para que possamos compará-las com as estradas atuais. Certifique-se de ter as camadas 'PEI_Holland1798' e 'Assentamentos' marcadas na janela de 'Camadas'. Selecione a camada 'estradas' na janela de 'camadas', selecione 'Alternar Edição' na barra de ferramentas superior e selecione 'Adicionar Feição'. + +{% include figure.html filename="pei14.png" caption="Figura 14" %} + +- Primeiro trace a estrada de 'Charlottetown' a 'Princetown'. Clique em 'Charlottetown' e depois clique repetidamente em pontos ao longo da estrada para 'Princetown' e verá a linha a ser criada. Repita até chegar a 'Princetown' e clique com o botão direito. Na janela 'Atributos' - estrada que aparece, no campo 'Nome', insira 'para Princetown' e no campo 'Ano' insira '1798'. Clique em 'OK'. + +{% include figure.html filename="pei15.png" caption="Figura 15" %} + +- Repita esta etapa para mais 3 a 4 estradas encontradas no 'PEI_HollandMap1798'. + +- Clique em 'Salvar mudanças' e, em seguida, clique em 'Alternar Edição' para desligá-lo. + +Desmarque 'PEI_HollandMap1798' na janela 'Camadas' e selecione o mapa 'PEI_highway'. 
Compare as estradas representadas no mapa 'PEI_highway' (as linhas vermelhas pontilhadas) com as estradas que você acabou de traçar. + +{% include figure.html filename="pei16.png" caption="Figura 16" %} + +- Podemos ver que algumas dessas estradas correspondem exatamente às estradas atuais, enquanto outras não correspondem de forma alguma. Seriam necessárias mais pesquisas históricas para determinar se isso ocorre simplesmente porque o mapa de Holland não representa suficientemente as estradas na época, ou se as estradas mudaram consideravelmente desde então. + +Agora crie um terceiro tipo de camada vetorial: um vetor poligonal. Clique em 'Camada' -> 'Nova' -> 'Nova Camada Vetorial'. A janela 'Nova Camada Vetorial' aparecerá - na categoria 'Tipo', no topo, selecione 'Polígono'. + +- Clique no botão 'Selecione o SRC' e selecione 'NAD83 (CSRS98) / Prince Edward Isl. Estereográfico (EPSG: 2291)' e clique em 'OK'. +- Em 'Novo Atributo', no campo ao lado de 'Nome', digite 'nome_lote'. +- Clique em 'Adicionar campos à lista'. + +Crie um segundo atributo: + +- Em 'Novo atributo', no campo ao lado de 'Nome', digite 'Ano'. +- Mude o 'Tipo' para 'Número Inteiro'. +- Clique em 'Adicionar à lista de Atributos'. + +{% include figure.html filename="Figura17.jpg" caption="Figura 17" %} + +Comece criando um polígono para o 'Lote 66', que é o único lote retangular na ilha. + +- Clique em 'Alternar Edição' na barra de ferramentas superior e, em seguida, clique em 'Adicionar Feição'. +- Clique nos quatro cantos do 'lote 66' e você verá um polígono criado. +- Clique com o botão direito no canto final e uma janela de 'Atributos' aparecerá. Adicione '66' ao campo 'nome_lote' e adicione '1764' (o ano em que esses lotes foram inventariados) ao campo 'Ano'. + +{% include figure.html filename="Figura18.jpg" caption="Figura 18" %} + +Agora vamos rastrear o 'Lote 38', que fica a oeste de 'Havre-St-Pierre'.
Certifique-se de que há uma marca de seleção na caixa ao lado da camada 'PEI_HollandMap1798' na janela 'Camadas'. + +Clique em 'Alternar Edição' na barra de ferramentas superior e, em seguida, clique em 'Adicionar Feição'. + +Trace o contorno do 'Lote 38', que é mais difícil por causa da linha costeira, com a maior precisão possível. Para mostrar a funcionalidade 'Ajuste', queremos que trace ao longo da costa atual (o 'ajuste' é uma operação de edição automática que ajusta a feição que você desenhou para coincidir ou alinhar exatamente com as coordenadas e forma de outra feição próxima). + +- Selecione 'Configurações' -> 'Opções de Ajuste'. + +{% include figure.html filename="Figura19.jpg" caption="Figura 19" %} + +- Uma janela de 'opções de ajuste' irá abrir: clique na caixa ao lado de 'coast_polygon', para a categoria 'Modo' selecione 'vértice e segmento', para 'Tolerância' selecione '10.0', e para 'Unidades' selecione 'pixels'. Clique 'OK'. + +{% include figure.html filename="Figura20.jpg" caption="Figura 20" %} + +Certifique-se de que a camada de 'lotes' esteja selecionada na janela 'Camadas' e selecione 'Adicionar feição' na barra de ferramentas. + +- Com o cursor, clique nos dois cantos inferiores do polígono, assim como fez com o 'lote 38'. Na linha costeira, você notará que tem uma coleção de linhas para traçar ao redor do 'Savage Harbour'. É aqui que os recursos de aderência se tornam úteis. Enquanto traçar a linha ao longo da costa atual, sua precisão aumentará significativamente, encaixando os 'cliques' diretamente no topo da linha existente. Quanto mais 'cliques' você fizer, mais preciso será, mas tenha em mente que, para muitos fins de SIGH (SIG histórico), obter extrema precisão às vezes produz retornos decrescentes. + +{% include figure.html filename="pei21.png" caption="Figura 21" %} + +Quando terminar de traçar e criar o polígono, selecione e desmarque as várias 'camadas' que criou, comparando e vendo quais relações pode deduzir.
+No Google Earth, havia limitações nos tipos de 'feições', 'atributos' e dados fornecidos, e o Google Earth fez grande parte do trabalho por si. Isso é bom quando está aprendendo ou deseja criar mapas rapidamente. A vantagem de usar o software QGIS para criar novas camadas vetoriais é a liberdade e controle sobre os tipos de dados que se pode usar e as 'feições' e 'atributos' que se podem criar. Assim, é possível criar mapas personalizados e ir muito além do que pode ser alcançado no Google Earth ou no Google Maps Engine Lite. Viu isso em primeira mão com as camadas vetoriais de pontos, linhas e polígonos que aprendeu a criar nesta lição. Se tiver dados sobre, por exemplo, registros de saúde pública no século XVIII, pode criar uma nova camada mostrando a distribuição de surtos de febre tifoide e ver se há correlações com estradas e assentamentos principais. Além disso, o software SIG permite não apenas representar e apresentar dados espaciais de maneiras mais sofisticadas, mas também analisar e criar novos dados que não seriam possíveis de outra forma. + +**Aprendeu como criar camadas vetoriais. Certifique-se de salvar seu trabalho!** + +1 É possível identificar a palavra 'feição', em traduções no QGIS BR, ao referir os três tipos de 'formas' ou 'geometrias' usadas nas camadas vetoriais dos SIG. Mas, isto cria uma diferença entre as versões do QGIS BR e QGIS PT. 
+ +*Esta lição é parte do [Geospatial Historian](https://geospatialhistorian.wordpress.com/).* + +- [Intro to Google Maps and Google Earth](/en/lessons/googlemaps-googleearth) +- [Installing QGIS 2.0 and Adding Layers](/en/lessons/qgis-layers) +- [PEI_Holland map](/assets/vector-layers-qgis/PEI_HollandMap1798_compLZW.tif) +- [Georeferencing in QGIS 2.0](/en/lessons/georeferencing-qgis) +- [Wikipedia entry](https://en.wikipedia.org/wiki/Prince_Royalty,_Prince_Edward_Island) +- [Geospatial Historian](https://geospatialhistorian.wordpress.com/) diff --git a/pt/licoes/contagem-mineracao-dados-investigacao-unix.md b/pt/licoes/contagem-mineracao-dados-investigacao-unix.md index 0539c899fa..5a98fa9000 100644 --- a/pt/licoes/contagem-mineracao-dados-investigacao-unix.md +++ b/pt/licoes/contagem-mineracao-dados-investigacao-unix.md @@ -1,139 +1,139 @@ ---- -title: Contagem e mineração de dados de investigação com Unix -slug: contagem-mineracao-dados-investigacao-unix -layout: lesson -date: 2014-09-20 -translation_date: 2021-12-17 -authors: -- James Baker -- Ian Milligan -reviewers: -- M. H. Beals -- Allison Hegel -editors: -- Adam Crymble -translator: -- Felipe Lamarca -translation-editor: -- Jimmy Medeiros -translation-reviewer: -- Daniel Bonatto Seco -- Ian Araujo -difficulty: 2 -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/440 -activity: transforming -topics: [data-manipulation] -abstract: "Esta lição examinará como dados de investigação, quando organizados de maneira clara e previsível, podem ser contabilizados e minerados utilizando o shell do Unix." -original: research-data-with-unix -avatar_alt: Um diagrama de um mineiro classificando minério com um aparelho -doi: 10.46430/phpt0019 ---- - -{% include toc.html %} - -# Contagem e mineração de dados de investigação com Unix - -## Introdução - -Esta lição examinará como dados de investigação, quando organizados de maneira clara e previsível, podem ser contabilizados e minerados utilizando o shell do Unix.
Esta lição se baseia nas lições "[Preservar seus dados de investigação](/pt/licoes/preservar-os-seus-dados-de-investigacao)" e "[Introduction to the Bash Command Line](/en/lessons/intro-to-bash)" (em inglês). Dependendo do quão confiante estiver no uso do shell do Unix, ela também pode ser usada como uma lição independente ou uma revisão. - -Uma vez acumulados dados de investigação para um projeto, um historiador pode fazer diferentes perguntas aos mesmos dados durante um projeto subsequente. Caso estes dados estejam espalhados em vários ficheiros - uma série de dados tabulares, um conjunto de textos transcritos, uma coleção de imagens - eles podem ser contabilizados e minerados utilizando comandos Unix simples. - -O shell do Unix oferece acesso a uma ampla gama de comandos que podem transformar o modo como você contabiliza e minera dados de investigação. Essa lição irá apresentá-lo a uma série de comandos que usam contagem e mineração de dados tabulares, embora eles apenas arranhem a superfície do que o shell do Unix pode fazer. Ao aprender apenas alguns comandos simples, será capaz de realizar tarefas que são impossíveis no Libre Office Calc, Microsoft Excel ou outros programas de estatística similares. Esses comandos podem facilmente ter o seu uso estendido para dados não-estruturados. - -Essa lição também irá demonstrar que as opções disponíveis para manipulação, contagem e mineração de dados geralmente dependem da quantidade de metadados, ou texto descritivo, contidos nos nomes dos ficheiros dos dados que estiver utilizando, tanto quanto da gama de comandos Unix que aprendeu a utilizar. Portanto, ainda que não seja um pré-requisito do trabalho com o shell do Unix, reservar um momento para estruturar os seus dados de investigação e convenções de nomes de ficheiros de uma maneira consistente e previsível é certamente um passo significativo para aproveitar ao máximo os comandos Unix e ser capaz de contar e minerar os seus dados de investigação. 
Para entender a importância de dedicar um tempo a tornar os seus dados consistentes e previsíveis, além de questões de preservação, consulte: "[Preservar seus dados de investigação](/pt/licoes/preservar-os-seus-dados-de-investigacao)". - -_____ - -## Software e configuração - -Usuários de Windows precisarão instalar o Git Bash. Ele pode ser instalado fazendo o download do instalador mais recente na [página web do git for windows](https://gitforwindows.org/) (em inglês). Instruções de instalação estão disponíveis na [documentação do Git for Windows](https://github.com/git-for-windows/git/wiki/Technical-overview) (em inglês). - -Usuários de OS X e Linux deverão utilizar os próprios terminais para seguir esta lição, como foi discutido em "[Introduction to the Bash Command Line](/en/lessons/intro-to-bash)" (em inglês). - -Esta lição foi revista utilizando o Git Bash 2.34.1 e o sistema operacional Windows 10. Caminhos de ficheiro equivalentes para OS X/Linux foram incluídos sempre que possível. No entanto, como os comandos e flags podem mudar ligeiramente entre os sistemas operacionais OS X/Linux, sugere-se que os usuários verifiquem Deborah S. Ray e Eric J. Ray, "[*Unix and Linux: Visual Quickstart Guide*](https://www.worldcat.org/title/unix-and-linux/oclc/308171076&referer=brief_results)", 4ª edição, que cobre a interoperabilidade em maiores detalhes. - -Os ficheiros utilizados nesta lição estão disponíveis em "[Figshare](https://doi.org/10.6084/m9.figshare.1172094)" (em inglês). Os dados contêm os metadados para artigos de periódicos categorizados em 'History' no banco de dados ESTAR da British Library. Os dados são compartilhados sob isenção dos direitos autorais CC0. - -Faça o download dos ficheiros necessários, salve-os no seu computador e descompacte-os. Caso não tenha um software padrão para lidar com ficheiros .zip, recomendamos [7-zip](http://www.7-zip.org/) (em inglês) para este propósito. 
No Windows, recomendamos descompactar a pasta em sua unidade C: para que os ficheiros estejam em `c:\proghist\`. No entanto, qualquer localização servirá, mas precisará ajustar os seus comandos à medida que for avançando na lição caso use uma localização diferente. No caso de OS X ou Linux, recomendamos de modo similar que descompacte os ficheiros no seu diretório de usuário, de modo que eles apareçam em `/usuario/NOME-DE-USUARIO/proghist/`. Em ambos os casos, isso significa que, ao abrir uma nova janela de terminal, pode simplesmente digitar `cd proghist` para mover para o diretório correto (no Windows, se o comando referido não resultar, poderá ter de digitar `cd C:\proghist` para acessar o diretório). - -_____ - -## Contabilizando ficheiros - -Você começará esta lição contabilizando os conteúdos dos ficheiros utilizando o shell do Unix. O shell do Unix pode ser usado para rapidamente gerar contagens de ficheiros, algo difícil de se conseguir usando interfaces gráficas de usuário (do inglês, *Graphical User Interfaces* - GUI) de suítes padrão de escritório, como o pacote Office, por exemplo. - -Abra o shell do Unix e navegue até o diretório que contém os nossos dados, o subdiretório `data` do diretório `proghist`. Lembre-se: caso não tenha certeza de onde está na sua estrutura de diretórios, digite `pwd` e use o comando `cd` para mover para onde precisa estar. A estrutura de diretórios é um pouco diferente entre OS X/Linux e Windows: no primeiro caso, o diretório está em um formato como `~/usuario/NOME-DE-USUARIO/proghist/data`, e no Windows o formato é do tipo `c:\proghist\data`. - -Digite `ls` e pressione a tecla Enter. Isso exibe uma lista que inclui dois ficheiros e um subdiretório. - -Os ficheiros nesse diretório são a base de dados `2014-01_JA.csv`, que contém os metadados dos artigos de periódico, e um ficheiro contendo a documentação a respeito do `2014-01_JA.csv` chamado `2014-01_JA.txt`. - -O subdiretório é nomeado como `derived_data`. 
Ele contém quatro ficheiros [.tsv](http://en.wikipedia.org/wiki/Tab-separated_values) derivados do `2014-01_JA.csv`. Cada um deles inclui todos os dados em que uma palavra-chave como `africa` ou `america` aparece no campo `Title` do `2014-01_JA.csv`. O diretório `derived_data` também inclui um subdiretório chamado `results`. - -*Nota: Ficheiros [CSV](https://pt.wikipedia.org/wiki/Comma-separated_values) são aqueles nos quais as unidades de dados (ou células) são separadas por vírgula (comma-separated-values) e ficheiros TSV são aqueles nos quais as unidades são separadas por tabulação. Ambos podem ser lidos em editores de texto simples ou em programas de estatística como Libre Office Calc ou Microsoft Excel.* - -Antes de começar a trabalhar com esses ficheiros, deve mover-se para dentro do diretório no qual eles estão armazenados. Navegue até `c:\proghist\data\derived_data` no Windows ou `~/usuario/NOME-DE-USUARIO/proghist/data/derived_data` no OS X/Linux. - -Agora que já está aqui, pode contabilizar o conteúdo dos ficheiros. - -No Unix, o comando `wc` é usado para contar os conteúdos de um ficheiro ou de uma série de ficheiros. Digite `wc -w 2014-01-31_JA_africa.tsv` e pressione a tecla Enter. A flag `-w` combinado com `wc` instrui o computador a exibir no shell uma contagem de palavras e o nome do ficheiro que foi contabilizado. - -Como foi visto no "[Introduction to the Bash Command Line](/en/lessons/intro-to-bash)", flags como `-w` são parte essencial para aproveitar ao máximo o shell do Unix, uma vez que eles oferecem melhor controle sobre os comandos. - -Se a sua investigação está mais interessada no número de entradas (ou linhas) do que no número de palavras, pode utilizar a flag de contagem de linhas. Digite `wc -l 2014-01-31_JA_africa.tsv` e pressione Enter. Combinado com o `wc`, a flag `-l` exibe uma contagem de linhas e o nome do ficheiro que foi contabilizado. - -Finalmente, digite `wc -c 2014-01-31_JA_africa.tsv` e pressione Enter. 
Isso usa a flag `-c` combinado com o comando `wc` para exibir uma contagem de caracteres do `2014-01-31_JA_africa.tsv`. - -*Nota: Usuários de OS X e Linux devem substituir a flag `-c` por `-m`.* - -Com essas três flags, o uso mais simples que um historiador pode fazer do comando `wc` é comparar o formato das fontes no formato digital - por exemplo, a contagem do número de palavras por página de um livro, a distribuição de caracteres por página ao longo de uma coleção de jornais, o comprimento médio das linhas usadas pelos poetas. Também pode utilizar `wc` com uma combinação de curingas / caracteres variáveis (*wildcards*) e flags para construir *queries* mais complexas. Digite `wc -l 2014-01-31_JA_a*.tsv` e pressione Enter. Isso exibe a contagem de linhas para `2014-01-31_JA_africa.tsv` e `2014-01-31_JA_america.tsv`, além da soma das linhas destes ficheiros, oferecendo uma maneira simples de comparar esses dois conjuntos de dados de investigação. Claro, pode ser mais rápido comparar a contagem de linhas desses dois documentos no Libre Office Calc, Microsoft Excel ou outro programa similar. Mas quando desejar comparar a contagem de linhas de dezenas, centenas ou milhares de documentos, o shell do Unix tem uma clara vantagem em velocidade. - -Além disso, à medida que os nossos conjuntos de dados aumentam de tamanho, pode utilizar o shell do Unix para fazer mais do que copiar essas contagens de linha manualmente, com capturas de tela ou com métodos de copiar e colar. Ao utilizar o operador de redirecionamento `>` pode exportar os resultados da sua *query* em um novo ficheiro. Digite `wc -l 2014-01-31_JA_a*.tsv > results/2014-01-31_JA_a_wc.txt` e pressione Enter. Isso executa a mesma *query* anterior, mas, ao invés de exibir os resultados no shell do Unix, ele salva os resultados como `2014-01-31_JA_a_wc.txt`. Ao preceder com `results/`, ele move o ficheiro .txt para o subdiretório `results`. 
Para verificar isso, navegue até ao subdiretório `results`, pressione Enter, digite `ls` e pressione Enter mais uma vez para ver este ficheiro listado em `c:\proghist\data\derived_data\results` no Windows ou `/usuario/NOME-DE-USUARIO/proghist/data/derived_data/results` no OS X/Linux. - -## Minerando ficheiros - -O shell do Unix pode fazer muito mais do que contar palavras, caracteres e linhas de um ficheiro. O comando `grep` (que significa '*global regular expression print*') é usado para buscar *strings* (cadeias de caracteres) específicas ao longo de múltiplos ficheiros. Ele é capaz de fazer isso muito mais rapidamente do que interfaces gráficas de busca oferecidas pela maioria dos sistemas operacionais ou suítes de escritório. Combinado com o operador `>`, o comando `grep` se torna uma ferramenta de investigação poderosa, que pode ser usada para minerar os seus dados em busca de características ou grupos de palavras que aparecem ao longo de múltiplos ficheiros e então exportar esses dados para um novo ficheiro. As únicas limitações aqui são a sua imaginação, o formato dos seus dados e - quando trabalhando com milhares ou milhões de ficheiros - o poder de processamento ao seu dispor. - -Para começar a utilizar o `grep`, primeiro navegue até o diretório `derived_data` (`cd ..`). Aqui digite `grep 1999 *.tsv` e pressione Enter. Essa *query* busca em todos os ficheiros no diretório que se enquadram nos critérios fornecidos (os ficheiros .tsv) por instâncias da *string*, ou cluster de caracteres, '1999'. Em seguida, exibe no shell. - -
    -Há uma grande quantidade de dados a serem exibidos. Então, caso fique entediado, pressione `ctrl+c` para cancelar a ação. Ctrl+c é utilizado para cancelar qualquer processo no shell do Unix. -
    - -Pressione a seta para cima uma vez para voltar à ação mais recente. Altere `grep 1999 *.tsv` para `grep -c 1999 *.tsv` e pressione Enter. O shell irá agora exibir o número de vezes que a *string* '1999' apareceu em cada um dos ficheiros .tsv. Volte à linha anterior novamente, altere para `grep -c 1999 2014-01-31_JA_*.tsv > results/2014-01-31_JA_1999.txt` e pressione Enter. Essa *query* procura instâncias da *string* '1999' em todos os documentos que se adequam aos critérios e as salva em `2014-01-31_JA_1999.txt` no subdiretório `results`. - -*Strings* não precisam ser números. `grep -c revolution 2014-01-31_JA_america.tsv 2014-02-02_JA_britain.tsv`, por exemplo, conta todas as instâncias da *string* `revolution` dentro dos ficheiros definidos e exibe essas contagens no shell. Execute esse comando e o altere para `grep -ci revolution 2014-01-31_JA_america.tsv 2014-02-02_JA_britain.tsv`. Isso repete a *query*, mas imprime um resultado que não diferencia maiúsculas de minúsculas, combinando a flag -i com -c, (incluindo instâncias `revolution` e `Revolution`). Note que a contagem aumentou quase 30 vezes para os títulos de artigos de períodicos que contêm a palavra-chave `revolution`. Como antes, voltar ao comando anterior e adicionar `> results/`, seguido do nome do ficheiro (idealmente no formato .txt), armazenará os resultados em um ficheiro. - -Também pode utilizar o `grep` para criar subconjuntos de dados tabulares. Digite `grep -i revolution 2014-01-31_JA_america.tsv 2014-02-02_JA_britain.tsv > ANO-MES-DIA_JA_america_britain_i_revolution.tsv` (onde `ANO-MES-DIA` é a data em que você está completando esta lição) e pressione Enter. Este comando verifica ambos os ficheiros definidos e exporta todas as linhas contendo `revolution` (sem diferenciar maiúsculas de minúsculas) para o ficheiro .tsv especificado. - -O dado não foi salvo ao diretório `results` porque ele não é estritamente um resultado; é um dado derivado. 
Dependendo do seu projeto de investigação, pode ser mais fácil armazenar isso em outro subdiretório. Por enquanto, dê uma olhada neste ficheiro para verificar o seu conteúdo e, quando estiver satisfeito, delete-o usando o comando `rm`. - -*Nota: O comando `rm` é muito poderoso e deve ser usado com cautela. Por favor, verifique "[Introduction to the Bash Command Line](/en/lessons/intro-to-bash)" (em inglês) para instruções de como utilizar esse comando corretamente.* - -Finalmente, pode usar outra flag, `-v`, para excluir elementos ao usar o comando `grep`. Digite `grep -iv revolution 2014*_JA_a*.tsv > 2014_JA_iv_revolution.csv` e pressione Enter. Essa *query* busca nos ficheiros definidos (três no total) e exporta todas as linhas que não contêm `revolution` ou `Revolution` ao `c:\proghist\data\derived_data\2014_JA_iv_revolution.csv`. - -Note que transformou os dados de um formato para outro - de .tsv para .csv. Frequentemente há uma perda de estrutura dos dados ao realizar essas transformações. Para observar isso, execute `grep -iv revolution 2014*_JA_a*.tsv > 2014_JA_iv_revolution.tsv` e abra os ficheiros .csv e .tsv no Libre Office Calc, Microsoft Excel, ou outro programa similar. Observe as diferenças no delineamento da coluna entre os dois ficheiros. - -*Resumo* - -Agora no shell do Unix você pode: - -- usar o comando `wc` com as flags `-w` e `-l` para contar as palavras e linhas de um ficheiro ou uma série de ficheiros. -- usar o redirecionador ou estrutura `subdiretório/nome-do-ficheiro` para armazenar os resultados em um subdiretório. -- usar o comando `grep` para buscar por instâncias de uma *string*. -- usar `grep` com a flag `-c` para contar instâncias de uma *string*, a flag `-i` para retornar buscas por *strings* ignorando diferenças entre maiúsculas e minúsculas, e a flag `-v` para excluir uma *string* dos resultados. 
-- combinar esses comandos e flags para construir *queries* complexas de uma forma que sugere o potencial de uso do shell do Unix para contabilizar e minerar os seus dados de investigação e projetos de investigação. - -_____ - -#### Conclusão - -Nessa lição aprendeu a executar contagens básicas em ficheiros, realizar *queries* em dados de investigação em busca de *strings* comuns e armazenar resultados e dados derivados. Ainda que essa lição seja restrita ao uso do shell do Unix para contabilizar e minerar dados tabulares, os processos podem facilmente ser estendidos a textos livres. Para isso, recomendamos dois guias escritos por William Turkel: - -- William Turkel, '[Basic Text Analysis with Command Line Tools in Linux](https://web.archive.org/web/20140925220046/http://williamjturkel.net/2013/06/15/basic-text-analysis-with-command-line-tools-in-linux/)' (15 de junho de 2013) -- William Turkel, '[Pattern Matching and Permuted Term Indexing with Command Line Tools in Linux](https://web.archive.org/web/20200925054120/http://williamjturkel.net/2013/06/20/pattern-matching-and-permuted-term-indexing-with-command-line-tools-in-linux/)' (20 de junho de 2013) - -Como essas recomendações sugerem, a presente lição apenas aborda superficialmente o que o ambiente do shell do Unix é capaz de fazer. Espera-se, no entanto, que tenha oferecido uma prova suficiente para estimular uma investigação mais aprofundada e uma prática produtiva. - -Para muitos historiadores, o potencial total dessas ferramentas deve surgir somente ao incorporar essas habilidades em um projeto de investigação real. Uma vez que a sua investigação cresce e, com isso, os seus dados de investigação, ser capaz de manipular, contabilizar e minerar milhares de ficheiros será extremamente útil. 
Caso opte por trabalhar nesta lição e investigar o shell do Unix mais a fundo, descobrirá que mesmo uma grande coleção de ficheiros que não contêm quaisquer elementos de dados alfanuméricos, como ficheiros de imagem, podem ser facilmente classificados, selecionados e consultados em um shell do Unix. +--- +title: Contagem e mineração de dados de investigação com Unix +slug: contagem-mineracao-dados-investigacao-unix +layout: lesson +date: 2014-09-20 +translation_date: 2021-12-17 +authors: +- James Baker +- Ian Milligan +reviewers: +- M. H. Beals +- Allison Hegel +editors: +- Adam Crymble +translator: +- Felipe Lamarca +translation-editor: +- Jimmy Medeiros +translation-reviewer: +- Daniel Bonatto Seco +- Ian Araujo +difficulty: 2 +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/440 +activity: transforming +topics: [data-manipulation] +abstract: "Esta lição examinará como dados de investigação, quando organizados de maneira clara e previsível, podem ser contabilizados e minerados utilizando o shell do Unix." +original: research-data-with-unix +avatar_alt: Um diagrama de um mineiro classificando minério com um aparelho +doi: 10.46430/phpt0019 +--- + +{% include toc.html %} + +# Contagem e mineração de dados de investigação com Unix + +## Introdução + +Esta lição examinará como dados de investigação, quando organizados de maneira clara e previsível, podem ser contabilizados e minerados utilizando o shell do Unix. Esta lição se baseia nas lições "[Preservar seus dados de investigação](/pt/licoes/preservar-os-seus-dados-de-investigacao)" e "[Introduction to the Bash Command Line](/en/lessons/intro-to-bash)" (em inglês). Dependendo do quão confiante estiver no uso do shell do Unix, ela também pode ser usada como uma lição independente ou uma revisão. + +Uma vez acumulados dados de investigação para um projeto, um historiador pode fazer diferentes perguntas aos mesmos dados durante um projeto subsequente. 
Caso estes dados estejam espalhados em vários ficheiros - uma série de dados tabulares, um conjunto de textos transcritos, uma coleção de imagens - eles podem ser contabilizados e minerados utilizando comandos Unix simples. + +O shell do Unix oferece acesso a uma ampla gama de comandos que podem transformar o modo como você contabiliza e minera dados de investigação. Essa lição irá apresentá-lo a uma série de comandos que usam contagem e mineração de dados tabulares, embora eles apenas arranhem a superfície do que o shell do Unix pode fazer. Ao aprender apenas alguns comandos simples, será capaz de realizar tarefas que são impossíveis no Libre Office Calc, Microsoft Excel ou outros programas de estatística similares. Esses comandos podem facilmente ter o seu uso estendido para dados não-estruturados. + +Essa lição também irá demonstrar que as opções disponíveis para manipulação, contagem e mineração de dados geralmente dependem da quantidade de metadados, ou texto descritivo, contidos nos nomes dos ficheiros dos dados que estiver utilizando, tanto quanto da gama de comandos Unix que aprendeu a utilizar. Portanto, ainda que não seja um pré-requisito do trabalho com o shell do Unix, reservar um momento para estruturar os seus dados de investigação e convenções de nomes de ficheiros de uma maneira consistente e previsível é certamente um passo significativo para aproveitar ao máximo os comandos Unix e ser capaz de contar e minerar os seus dados de investigação. Para entender a importância de dedicar um tempo a tornar os seus dados consistentes e previsíveis, além de questões de preservação, consulte: "[Preservar seus dados de investigação](/pt/licoes/preservar-os-seus-dados-de-investigacao)". + +_____ + +## Software e configuração + +Usuários de Windows precisarão instalar o Git Bash. Ele pode ser instalado fazendo o download do instalador mais recente na [página web do git for windows](https://gitforwindows.org/) (em inglês). 
Instruções de instalação estão disponíveis na [documentação do Git for Windows](https://github.com/git-for-windows/git/wiki/Technical-overview) (em inglês). + +Usuários de OS X e Linux deverão utilizar os próprios terminais para seguir esta lição, como foi discutido em "[Introduction to the Bash Command Line](/en/lessons/intro-to-bash)" (em inglês). + +Esta lição foi revista utilizando o Git Bash 2.34.1 e o sistema operacional Windows 10. Caminhos de ficheiro equivalentes para OS X/Linux foram incluídos sempre que possível. No entanto, como os comandos e flags podem mudar ligeiramente entre os sistemas operacionais OS X/Linux, sugere-se que os usuários verifiquem Deborah S. Ray e Eric J. Ray, "[*Unix and Linux: Visual Quickstart Guide*](https://www.worldcat.org/title/unix-and-linux/oclc/308171076&referer=brief_results)", 4ª edição, que cobre a interoperabilidade em maiores detalhes. + +Os ficheiros utilizados nesta lição estão disponíveis em "[Figshare](https://doi.org/10.6084/m9.figshare.1172094)" (em inglês). Os dados contêm os metadados para artigos de periódicos categorizados em 'History' no banco de dados ESTAR da British Library. Os dados são compartilhados sob isenção dos direitos autorais CC0. + +Faça o download dos ficheiros necessários, salve-os no seu computador e descompacte-os. Caso não tenha um software padrão para lidar com ficheiros .zip, recomendamos [7-zip](https://www.7-zip.org/) (em inglês) para este propósito. No Windows, recomendamos descompactar a pasta em sua unidade C: para que os ficheiros estejam em `c:\proghist\`. No entanto, qualquer localização servirá, mas precisará ajustar os seus comandos à medida que for avançando na lição caso use uma localização diferente. No caso de OS X ou Linux, recomendamos de modo similar que descompacte os ficheiros no seu diretório de usuário, de modo que eles apareçam em `/usuario/NOME-DE-USUARIO/proghist/`. 
Em ambos os casos, isso significa que, ao abrir uma nova janela de terminal, pode simplesmente digitar `cd proghist` para mover para o diretório correto (no Windows, se o comando referido não resultar, poderá ter de digitar `cd C:\proghist` para acessar o diretório). + +_____ + +## Contabilizando ficheiros + +Você começará esta lição contabilizando os conteúdos dos ficheiros utilizando o shell do Unix. O shell do Unix pode ser usado para rapidamente gerar contagens de ficheiros, algo difícil de se conseguir usando interfaces gráficas de usuário (do inglês, *Graphical User Interfaces* - GUI) de suítes padrão de escritório, como o pacote Office, por exemplo. + +Abra o shell do Unix e navegue até o diretório que contém os nossos dados, o subdiretório `data` do diretório `proghist`. Lembre-se: caso não tenha certeza de onde está na sua estrutura de diretórios, digite `pwd` e use o comando `cd` para mover para onde precisa estar. A estrutura de diretórios é um pouco diferente entre OS X/Linux e Windows: no primeiro caso, o diretório está em um formato como `~/usuario/NOME-DE-USUARIO/proghist/data`, e no Windows o formato é do tipo `c:\proghist\data`. + +Digite `ls` e pressione a tecla Enter. Isso exibe uma lista que inclui dois ficheiros e um subdiretório. + +Os ficheiros nesse diretório são a base de dados `2014-01_JA.csv`, que contém os metadados dos artigos de periódico, e um ficheiro contendo a documentação a respeito do `2014-01_JA.csv` chamado `2014-01_JA.txt`. + +O subdiretório é nomeado como `derived_data`. Ele contém quatro ficheiros [.tsv](https://en.wikipedia.org/wiki/Tab-separated_values) derivados do `2014-01_JA.csv`. Cada um deles inclui todos os dados em que uma palavra-chave como `africa` ou `america` aparece no campo `Title` do `2014-01_JA.csv`. O diretório `derived_data` também inclui um subdiretório chamado `results`. 
+ +*Nota: Ficheiros [CSV](https://pt.wikipedia.org/wiki/Comma-separated_values) são aqueles nos quais as unidades de dados (ou células) são separadas por vírgula (comma-separated-values) e ficheiros TSV são aqueles nos quais as unidades são separadas por tabulação. Ambos podem ser lidos em editores de texto simples ou em programas de estatística como Libre Office Calc ou Microsoft Excel.* + +Antes de começar a trabalhar com esses ficheiros, deve mover-se para dentro do diretório no qual eles estão armazenados. Navegue até `c:\proghist\data\derived_data` no Windows ou `~/usuario/NOME-DE-USUARIO/proghist/data/derived_data` no OS X/Linux. + +Agora que já está aqui, pode contabilizar o conteúdo dos ficheiros. + +No Unix, o comando `wc` é usado para contar os conteúdos de um ficheiro ou de uma série de ficheiros. Digite `wc -w 2014-01-31_JA_africa.tsv` e pressione a tecla Enter. A flag `-w` combinada com `wc` instrui o computador a exibir no shell uma contagem de palavras e o nome do ficheiro que foi contabilizado. + +Como foi visto no "[Introduction to the Bash Command Line](/en/lessons/intro-to-bash)", flags como `-w` são parte essencial para aproveitar ao máximo o shell do Unix, uma vez que elas oferecem melhor controle sobre os comandos. + +Se a sua investigação está mais interessada no número de entradas (ou linhas) do que no número de palavras, pode utilizar a flag de contagem de linhas. Digite `wc -l 2014-01-31_JA_africa.tsv` e pressione Enter. Combinada com o `wc`, a flag `-l` exibe uma contagem de linhas e o nome do ficheiro que foi contabilizado. + +Finalmente, digite `wc -c 2014-01-31_JA_africa.tsv` e pressione Enter. Isso usa a flag `-c` combinada com o comando `wc` para exibir uma contagem de caracteres do `2014-01-31_JA_africa.tsv`. 
+ +*Nota: Usuários de OS X e Linux devem substituir a flag `-c` por `-m`.* + +Com essas três flags, o uso mais simples que um historiador pode fazer do comando `wc` é comparar o formato das fontes no formato digital - por exemplo, a contagem do número de palavras por página de um livro, a distribuição de caracteres por página ao longo de uma coleção de jornais, o comprimento médio das linhas usadas pelos poetas. Também pode utilizar `wc` com uma combinação de curingas / caracteres variáveis (*wildcards*) e flags para construir *queries* mais complexas. Digite `wc -l 2014-01-31_JA_a*.tsv` e pressione Enter. Isso exibe a contagem de linhas para `2014-01-31_JA_africa.tsv` e `2014-01-31_JA_america.tsv`, além da soma das linhas destes ficheiros, oferecendo uma maneira simples de comparar esses dois conjuntos de dados de investigação. Claro, pode ser mais rápido comparar a contagem de linhas desses dois documentos no Libre Office Calc, Microsoft Excel ou outro programa similar. Mas quando desejar comparar a contagem de linhas de dezenas, centenas ou milhares de documentos, o shell do Unix tem uma clara vantagem em velocidade. + +Além disso, à medida que os nossos conjuntos de dados aumentam de tamanho, pode utilizar o shell do Unix para fazer mais do que copiar essas contagens de linha manualmente, com capturas de tela ou com métodos de copiar e colar. Ao utilizar o operador de redirecionamento `>` pode exportar os resultados da sua *query* em um novo ficheiro. Digite `wc -l 2014-01-31_JA_a*.tsv > results/2014-01-31_JA_a_wc.txt` e pressione Enter. Isso executa a mesma *query* anterior, mas, ao invés de exibir os resultados no shell do Unix, ele salva os resultados como `2014-01-31_JA_a_wc.txt`. Ao preceder com `results/`, ele move o ficheiro .txt para o subdiretório `results`. 
Para verificar isso, navegue até ao subdiretório `results`, pressione Enter, digite `ls` e pressione Enter mais uma vez para ver este ficheiro listado em `c:\proghist\data\derived_data\results` no Windows ou `/usuario/NOME-DE-USUARIO/proghist/data/derived_data/results` no OS X/Linux. + +## Minerando ficheiros + +O shell do Unix pode fazer muito mais do que contar palavras, caracteres e linhas de um ficheiro. O comando `grep` (que significa '*global regular expression print*') é usado para buscar *strings* (cadeias de caracteres) específicas ao longo de múltiplos ficheiros. Ele é capaz de fazer isso muito mais rapidamente do que interfaces gráficas de busca oferecidas pela maioria dos sistemas operacionais ou suítes de escritório. Combinado com o operador `>`, o comando `grep` se torna uma ferramenta de investigação poderosa, que pode ser usada para minerar os seus dados em busca de características ou grupos de palavras que aparecem ao longo de múltiplos ficheiros e então exportar esses dados para um novo ficheiro. As únicas limitações aqui são a sua imaginação, o formato dos seus dados e - quando trabalhando com milhares ou milhões de ficheiros - o poder de processamento ao seu dispor. + +Para começar a utilizar o `grep`, primeiro navegue até o diretório `derived_data` (`cd ..`). Aqui digite `grep 1999 *.tsv` e pressione Enter. Essa *query* busca em todos os ficheiros no diretório que se enquadram nos critérios fornecidos (os ficheiros .tsv) por instâncias da *string*, ou cluster de caracteres, '1999'. Em seguida, exibe no shell. + +
    +Há uma grande quantidade de dados a serem exibidos. Então, caso fique entediado, pressione `ctrl+c` para cancelar a ação. Ctrl+c é utilizado para cancelar qualquer processo no shell do Unix. +
    + +Pressione a seta para cima uma vez para voltar à ação mais recente. Altere `grep 1999 *.tsv` para `grep -c 1999 *.tsv` e pressione Enter. O shell irá agora exibir o número de vezes que a *string* '1999' apareceu em cada um dos ficheiros .tsv. Volte à linha anterior novamente, altere para `grep -c 1999 2014-01-31_JA_*.tsv > results/2014-01-31_JA_1999.txt` e pressione Enter. Essa *query* procura instâncias da *string* '1999' em todos os documentos que se adequam aos critérios e as salva em `2014-01-31_JA_1999.txt` no subdiretório `results`. + +*Strings* não precisam ser números. `grep -c revolution 2014-01-31_JA_america.tsv 2014-02-02_JA_britain.tsv`, por exemplo, conta todas as instâncias da *string* `revolution` dentro dos ficheiros definidos e exibe essas contagens no shell. Execute esse comando e o altere para `grep -ci revolution 2014-01-31_JA_america.tsv 2014-02-02_JA_britain.tsv`. Isso repete a *query*, mas imprime um resultado que não diferencia maiúsculas de minúsculas, combinando a flag `-i` com `-c` (incluindo instâncias `revolution` e `Revolution`). Note que a contagem aumentou quase 30 vezes para os títulos de artigos de periódicos que contêm a palavra-chave `revolution`. Como antes, voltar ao comando anterior e adicionar `> results/`, seguido do nome do ficheiro (idealmente no formato .txt), armazenará os resultados em um ficheiro. + +Também pode utilizar o `grep` para criar subconjuntos de dados tabulares. Digite `grep -i revolution 2014-01-31_JA_america.tsv 2014-02-02_JA_britain.tsv > ANO-MES-DIA_JA_america_britain_i_revolution.tsv` (onde `ANO-MES-DIA` é a data em que você está completando esta lição) e pressione Enter. Este comando verifica ambos os ficheiros definidos e exporta todas as linhas contendo `revolution` (sem diferenciar maiúsculas de minúsculas) para o ficheiro .tsv especificado. + +O dado não foi salvo ao diretório `results` porque ele não é estritamente um resultado; é um dado derivado. 
Dependendo do seu projeto de investigação, pode ser mais fácil armazenar isso em outro subdiretório. Por enquanto, dê uma olhada neste ficheiro para verificar o seu conteúdo e, quando estiver satisfeito, delete-o usando o comando `rm`. + +*Nota: O comando `rm` é muito poderoso e deve ser usado com cautela. Por favor, verifique "[Introduction to the Bash Command Line](/en/lessons/intro-to-bash)" (em inglês) para instruções de como utilizar esse comando corretamente.* + +Finalmente, pode usar outra flag, `-v`, para excluir elementos ao usar o comando `grep`. Digite `grep -iv revolution 2014*_JA_a*.tsv > 2014_JA_iv_revolution.csv` e pressione Enter. Essa *query* busca nos ficheiros definidos (três no total) e exporta todas as linhas que não contêm `revolution` ou `Revolution` ao `c:\proghist\data\derived_data\2014_JA_iv_revolution.csv`. + +Note que transformou os dados de um formato para outro - de .tsv para .csv. Frequentemente há uma perda de estrutura dos dados ao realizar essas transformações. Para observar isso, execute `grep -iv revolution 2014*_JA_a*.tsv > 2014_JA_iv_revolution.tsv` e abra os ficheiros .csv e .tsv no Libre Office Calc, Microsoft Excel, ou outro programa similar. Observe as diferenças no delineamento da coluna entre os dois ficheiros. + +*Resumo* + +Agora no shell do Unix você pode: + +- usar o comando `wc` com as flags `-w` e `-l` para contar as palavras e linhas de um ficheiro ou uma série de ficheiros. +- usar o redirecionador ou estrutura `subdiretório/nome-do-ficheiro` para armazenar os resultados em um subdiretório. +- usar o comando `grep` para buscar por instâncias de uma *string*. +- usar `grep` com a flag `-c` para contar instâncias de uma *string*, a flag `-i` para retornar buscas por *strings* ignorando diferenças entre maiúsculas e minúsculas, e a flag `-v` para excluir uma *string* dos resultados. 
+- combinar esses comandos e flags para construir *queries* complexas de uma forma que sugere o potencial de uso do shell do Unix para contabilizar e minerar os seus dados de investigação e projetos de investigação. + +_____ + +#### Conclusão + +Nessa lição aprendeu a executar contagens básicas em ficheiros, realizar *queries* em dados de investigação em busca de *strings* comuns e armazenar resultados e dados derivados. Ainda que essa lição seja restrita ao uso do shell do Unix para contabilizar e minerar dados tabulares, os processos podem facilmente ser estendidos a textos livres. Para isso, recomendamos dois guias escritos por William Turkel: + +- William Turkel, '[Basic Text Analysis with Command Line Tools in Linux](https://web.archive.org/web/20140925220046/http://williamjturkel.net/2013/06/15/basic-text-analysis-with-command-line-tools-in-linux/)' (15 de junho de 2013) +- William Turkel, '[Pattern Matching and Permuted Term Indexing with Command Line Tools in Linux](https://web.archive.org/web/20200925054120/http://williamjturkel.net/2013/06/20/pattern-matching-and-permuted-term-indexing-with-command-line-tools-in-linux/)' (20 de junho de 2013) + +Como essas recomendações sugerem, a presente lição apenas aborda superficialmente o que o ambiente do shell do Unix é capaz de fazer. Espera-se, no entanto, que tenha oferecido uma prova suficiente para estimular uma investigação mais aprofundada e uma prática produtiva. + +Para muitos historiadores, o potencial total dessas ferramentas deve surgir somente ao incorporar essas habilidades em um projeto de investigação real. Uma vez que a sua investigação cresce e, com isso, os seus dados de investigação, ser capaz de manipular, contabilizar e minerar milhares de ficheiros será extremamente útil. 
Caso opte por trabalhar nesta lição e investigar o shell do Unix mais a fundo, descobrirá que mesmo uma grande coleção de ficheiros que não contêm quaisquer elementos de dados alfanuméricos, como ficheiros de imagem, podem ser facilmente classificados, selecionados e consultados em um shell do Unix. diff --git a/pt/licoes/contar-frequencias-palavras-python.md b/pt/licoes/contar-frequencias-palavras-python.md index 1c9f544cbc..70907d71bc 100644 --- a/pt/licoes/contar-frequencias-palavras-python.md +++ b/pt/licoes/contar-frequencias-palavras-python.md @@ -1,360 +1,360 @@ ---- -title: Contagem de Frequências de Palavras com Python -layout: lesson -slug: contar-frequencias-palavras-python -date: 2012-07-17 -translation_date: 2022-01-13 -authors: -- William J. Turkel -- Adam Crymble -reviewers: -- Jim Clifford -- Frederik Elwert -editors: -- Miriam Posner -translator: -- Felipe Lamarca -translation-editor: -- Jimmy Medeiros -translation-reviewer: -- Ana Carolina Erthal -- Joana Vieira Paulino -difficulty: 2 -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/461 -activity: analyzing -topics: [python] -abstract: "Contar a frequência de palavras específicas de uma lista pode fornecer dados esclarecedores. Esta lição ensinará uma maneira fácil de contar essas frequências com Python." -original: counting-frequencies -avatar_alt: Homem descontente sentado em um tronco cercado por pássaros -doi: 10.46430/phpt0023 ---- - -{% include toc.html %} - -## Objetivos da Lição - -Sua lista agora está limpa o suficiente para que possa começar a analisar seu conteúdo de maneiras significativas. Contar a frequência de palavras específicas de uma lista pode fornecer dados esclarecedores. Python possui uma maneira fácil de contar frequências, mas requer o uso de um novo tipo de variável: o *dicionário*. Antes de começar a trabalhar com um dicionário, considere os processos utilizados para calcular frequências em uma lista. 
- -### Ficheiros Necessários para esta Lição - -- `obo.py` - -Caso não possua esse ficheiro, pode fazer o *download* do ficheiro ([zip][]) que contém todo o código das lições anteriores desta série. - -## Frequências - -Agora desejamos contar a frequência de cada palavra em nossa lista. Já viu que é fácil de processar uma lista utilizando um `for` *loop*. Tente salvar e executar o exemplo a seguir. Lembre-se de que `+=` informa ao programa para acrescentar algo ao final de uma variável existente. - -``` python -# count-list-items-1.py - -wordstring = 'foi o melhor dos tempos foi o pior dos tempos ' -wordstring += 'foi a idade da sabedoria foi a idade da ignorância' -wordlist = wordstring.split() - -wordfreq = [] -for w in wordlist: - wordfreq.append(wordlist.count(w)) - -print("String\n" + wordstring +"\n") -print("Lista\n" + str(wordlist) + "\n") -print("Frequências\n" + str(wordfreq) + "\n") -print("Pares\n" + str(list(zip(wordlist, wordfreq)))) -``` - -Aqui, começamos com uma string e separamo-la em uma lista, como fizemos anteriormente. Depois disso criamos uma lista (inicialmente vazia) chamada `wordfreq`, percorremos cada palavra na `wordlist` e contamos o número de vezes que aquela palavra aparece em toda a lista. Então, adicionamos a contagem de cada palavra à nossa lista `wordfreq`. Utilizando a operação `zip`, somos capazes de combinar a primeira palavra da lista de palavras com o primeiro número na lista de frequências, a segunda palavra e a segunda frequência e assim por diante. Terminamos com uma lista de pares de palavras e frequências. A função `str` converte qualquer objeto numa string para que ele possa ser exibido. 
- -Deve obter algo assim: - -``` python -String -foi o melhor dos tempos foi o pior dos tempos foi a idade da sabedoria foi a idade da ignorância - -Lista -['foi', 'o', 'melhor', 'dos', 'tempos', 'foi', 'o', 'pior', 'dos', 'tempos', 'foi', 'a', 'idade', 'da', 'sabedoria', 'foi', 'a', 'idade', 'da', 'ignorância'] - -Frequências -[4, 2, 1, 2, 2, 4, 2, 1, 2, 2, 4, 2, 2, 2, 1, 4, 2, 2, 2, 1] - -Pares -[('foi', 4), ('o', 2), ('melhor', 1), ('dos', 2), ('tempos', 2), ('foi', 4), ('o', 2), ('pior', 1), ('dos', 2), ('tempos', 2), ('foi', 4), ('a', 2), ('idade', 2), ('da', 2), ('sabedoria', 1), ('foi', 4), ('a', 2), ('idade', 2), ('da', 2), ('ignorância', 1)] -``` - -Valerá a pena estudar o código acima até entendê-lo antes de continuar. - -O Python também inclui uma ferramenta muito conveniente chamada *[list comprehension][]* (ver uma explicação do método de [compreensão de lista](https://pt.wikipedia.org/wiki/Compreens%C3%A3o_de_lista) em português), que pode ser utilizada para fazer o mesmo que um `for` *loop* de maneira mais económica. - -``` python -# count-list-items-1.py - -wordstring = 'foi o melhor dos tempos foi o pior dos tempos ' -wordstring += 'foi a idade da sabedoria foi a idade da ignorância' -wordlist = wordstring.split() - -wordfreq = [wordlist.count(w) for w in wordlist] # uma list comprehension - -print("String\n" + wordstring +"\n") -print("Lista\n" + str(wordlist) + "\n") -print("Frequências\n" + str(wordfreq) + "\n") -print("Pares\n" + str(list(zip(wordlist, wordfreq)))) -``` - -Se estudar esse método de compreensão de lista cuidadosamente, descobrirá que ele faz exatamente o mesmo que o `for` *loop* no exemplo anterior, mas de maneira condensada. Qualquer um dos métodos funcionará bem, então use a versão com a qual se sente mais confortável. - -Em geral é prudente utilizar um código que entenda ao invés de um código que seja executado mais rapidamente. - -Neste ponto, temos uma lista de pares, onde cada par contém uma palavra e sua frequência. 
Essa lista é um pouco redundante. Se 'the' ocorre 500 vezes, então essa lista contém quinhentas cópias do par ('the', 500). Essa lista também está ordenada pelas palavras no texto original, ao invés de listar as palavras na ordem da mais frequente para a menos frequente. Podemos resolver esses problemas convertendo-a em um dicionário, e depois exibindo o dicionário na ordem do item mais comum para o menos comum. - -## Dicionários de Python - -Tanto strings quanto listas são ordenadas sequencialmente, o que significa que pode acessar seus conteúdos utilizando um índice (*index*), um número que começa no 0. Caso tenha uma lista contendo strings, pode utilizar um par de índices para acessar uma string particular na lista, e depois um caractere particular naquela string. Estude os exemplos abaixo: - - -``` python - -s = 'olá mundo' -print(s[0]) --> o - -print(s[1]) --> l - -m = ['olá', 'mundo'] -print(m[0]) --> olá - -print(m[1]) --> mundo - -print(m[0][1]) --> l - -print(m[1][0]) --> m -``` - -Para manter controle sobre as frequências, utilizaremos outro tipo de objeto Python: um dicionário. O dicionário é uma coleção não ordenada de objetos. Isso significa que não pode utilizar índices para recuperar seus elementos. Pode, por outro lado, buscá-los utilizando uma chave, ou *key* no inglês (daí o nome "dicionário"). Estude o exemplo a seguir: - - -``` python - -d = {'mundo': 1, 'olá': 0} -print(d['olá']) --> 0 - -print(d['mundo']) --> 1 - -print(d.keys()) --> dict_keys(['mundo', 'olá']) -``` - -Dicionários podem ser um pouco confusos para um novo programador. Tente pensar neles como um dicionário de idiomas. Caso não saiba (ou não se lembre) como exatamente "*bijection*" difere de "*surjection*", pode buscar pelos dois termos no *Oxford English Dictionary*. 
O mesmo princípio se aplica quando realiza um `print(d['olá'])` exceto pelo fato de que, ao invés de exibir uma definição literária, ele exibe o valor associado à palavra-chave 'olá', conforme definido por você quando criou o dicionário chamado `d`. Nesse caso, esse valor é "0". - -Observe que usa chaves para definir um dicionário, mas colchetes para acessar coisas dentro dele. A operação `keys` retorna uma lista de chaves que estão definidas no dicionário. - -## Pares Palavra-Frequência - -Com base no que temos até agora, queremos uma função que seja capaz de converter uma lista de palavras em um dicionário de pares palavra-frequência. O único comando novo que vamos precisar é `dict`, que faz um dicionário a partir de uma lista de pares. Copie o código a seguir e adicione-o ao módulo `obo.py`: - -``` python -# Dada uma lista de palavras, retorna um dicionário de pares palavra-frequência. - -def wordListToFreqDict(wordlist): - wordfreq = [wordlist.count(p) for p in wordlist] - return dict(list(zip(wordlist,wordfreq))) -``` - -Também vamos querer uma função que seja capaz de ordenar o dicionário de pares palavra-frequência por frequência decrescente. Copie o código a seguir e adicione-o também ao módulo `obo.py`: - - -``` python -# Ordena um dicionário de pares palavra-frequência em ordem decrescente de frequência. - -def sortFreqDict(freqdict): - aux = [(freqdict[key], key) for key in freqdict] - aux.sort() - aux.reverse() - return aux -``` - -Agora podemos escrever um programa que recebe uma URL e retorna pares palavra-frequência para a página web, de acordo com a ordem decrescente de frequência. Copie o programa a seguir no Komodo Edit, armazene-o como `html-to-freq.py` e execute-o. Estude o programa e seu resultado cuidadosamente antes de continuar. 
- - -``` python -#html-to-freq.py - -import urllib.request, urllib.error, urllib.parse, obo - -url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33' - -response = urllib.request.urlopen(url) -html = response.read().decode('UTF-8') -text = obo.stripTags(html).lower() -wordlist = obo.stripNonAlphaNum(text) -dictionary = obo.wordListToFreqDict(wordlist) -sorteddict = obo.sortFreqDict(dictionary) - -for s in sorteddict: print(str(s)) -``` - -## Removendo *Stop Words* - -Quando vemos o resultado do nosso programa `html-to-freq.py`, verificamos que muitas das palavras mais frequentes no texto são palavras funcionais como *the*, *of*, *to* e *and*. - -``` python -(192, 'the') -(105, 'i') -(74, 'to') -(71, 'was') -(67, 'of') -(62, 'in') -(53, 'a') -(52, 'and') -(50, 'you') -(50, 'he') -(40, 'that') -(39, 'his') -(36, 'it') -``` - -Essas palavras são geralmente as mais comuns em qualquer texto de língua inglesa, então elas não nos dizem muito a respeito do julgamento de Bowsey. Em geral, estamos mais interessados em encontrar as palavras que nos auxiliarão a diferenciar esse texto de outros textos sobre assuntos distintos. Desse modo, vamos remover as palavras funcionais comuns. Palavras que são ignoradas dessa forma são conhecidas como _stopwords_[^1]. Utilizaremos a lista a seguir, adaptada de uma publicação *online* por [cientistas da computação em Glasgow][]. Copie-a e adicione-a no início da biblioteca `obo.py` que está construindo. 
- -``` python -stopwords = ['a', 'about', 'above', 'across', 'after', 'afterwards'] -stopwords += ['again', 'against', 'all', 'almost', 'alone', 'along'] -stopwords += ['already', 'also', 'although', 'always', 'am', 'among'] -stopwords += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another'] -stopwords += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere'] -stopwords += ['are', 'around', 'as', 'at', 'back', 'be', 'became'] -stopwords += ['because', 'become', 'becomes', 'becoming', 'been'] -stopwords += ['before', 'beforehand', 'behind', 'being', 'below'] -stopwords += ['beside', 'besides', 'between', 'beyond', 'bill', 'both'] -stopwords += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant'] -stopwords += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de'] -stopwords += ['describe', 'detail', 'did', 'do', 'done', 'down', 'due'] -stopwords += ['during', 'each', 'eg', 'eight', 'either', 'eleven', 'else'] -stopwords += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever'] -stopwords += ['every', 'everyone', 'everything', 'everywhere', 'except'] -stopwords += ['few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first'] -stopwords += ['five', 'for', 'former', 'formerly', 'forty', 'found'] -stopwords += ['four', 'from', 'front', 'full', 'further', 'get', 'give'] -stopwords += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her'] -stopwords += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers'] -stopwords += ['herself', 'him', 'himself', 'his', 'how', 'however'] -stopwords += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed'] -stopwords += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep'] -stopwords += ['last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made'] -stopwords += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine'] -stopwords += ['more', 'moreover', 'most', 'mostly', 'move', 'much'] -stopwords += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never'] -stopwords += ['nevertheless', 'next', 
'nine', 'no', 'nobody', 'none'] -stopwords += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of'] -stopwords += ['off', 'often', 'on','once', 'one', 'only', 'onto', 'or'] -stopwords += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves'] -stopwords += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please'] -stopwords += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed'] -stopwords += ['seeming', 'seems', 'serious', 'several', 'she', 'should'] -stopwords += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so'] -stopwords += ['some', 'somehow', 'someone', 'something', 'sometime'] -stopwords += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take'] -stopwords += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves'] -stopwords += ['then', 'thence', 'there', 'thereafter', 'thereby'] -stopwords += ['therefore', 'therein', 'thereupon', 'these', 'they'] -stopwords += ['thick', 'thin', 'third', 'this', 'those', 'though', 'three'] -stopwords += ['three', 'through', 'throughout', 'thru', 'thus', 'to'] -stopwords += ['together', 'too', 'top', 'toward', 'towards', 'twelve'] -stopwords += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon'] -stopwords += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what'] -stopwords += ['whatever', 'when', 'whence', 'whenever', 'where'] -stopwords += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon'] -stopwords += ['wherever', 'whether', 'which', 'while', 'whither', 'who'] -stopwords += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with'] -stopwords += ['within', 'without', 'would', 'yet', 'you', 'your'] -stopwords += ['yours', 'yourself', 'yourselves'] -``` - -Agora, livrar-se das *stop words* em uma lista é fácil: basta usar outra *list comprehension*. 
Adicione também essa função ao módulo `obo.py`: - -``` python -# Dada uma lista de palavras, remove qualquer uma que esteja em uma lista de stop words - -def removeStopwords(wordlist, stopwords): - return [w for w in wordlist if w not in stopwords] -``` - -## Juntando Tudo - -Agora temos tudo o que precisamos para determinar frequências de palavras para páginas web. Copie o código a seguir no Komodo Edit, armazene-o como `html-to-freq-2.py` e execute-o: - - -``` python -# html-to-freq-2.py - -import urllib.request, urllib.error, urllib.parse -import obo - -url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33' - -response = urllib.request.urlopen(url) -html = response.read().decode('UTF-8') -text = obo.stripTags(html).lower() -fullwordlist = obo.stripNonAlphaNum(text) -wordlist = obo.removeStopwords(fullwordlist, obo.stopwords) -dictionary = obo.wordListToFreqDict(wordlist) -sorteddict = obo.sortFreqDict(dictionary) - -for s in sorteddict: print(str(s)) -``` - -Se tudo correu bem, sua saída deve-se parecer com isto: - -``` python -(25, 'house') -(20, 'yes') -(20, 'prisoner') -(19, 'mr') -(17, 'man') -(15, 'akerman') -(14, 'mob') -(13, 'black') -(12, 'night') -(11, 'saw') -(9, 'went') -(9, 'sworn') -(9, 'room') -(9, 'pair') -(9, 'know') -(9, 'face') -(8, 'time') -(8, 'thing') -(8, 'june') -(8, 'believe') -... -``` - -## Leituras Sugeridas - -Lutz, Learning Python - -- Ch. 9: Tuples, Files, and Everything Else -- Ch. 11: Assignment, Expressions, and print -- Ch. 12: if Tests -- Ch. 13: while and for Loops - -Pilgrim, Diving into Python - -- Ch. 7: [Regular Expressions][] - -## Sincronização de Código - -Para acompanhar lições futuras, é importante ter os ficheiros e programas corretos no seu diretório “programming-historian”. No final de cada lição, é possível fazer o *download* do ficheiro zip “programming-historian” para garantir que possui o código correto. 
- -- programming-historian-5 ([zip sync][]) - - [list comprehension]: http://docs.python.org/tutorial/datastructures.html#list-comprehensions - [cientistas da computação em Glasgow]: http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words - [Regular Expressions]: https://web.archive.org/web/20180416143856/http://www.diveintopython.net/regular_expressions/index.html - [zip]: https://programminghistorian.org/assets/python-lessons4.zip - [zip sync]: https://programminghistorian.org/assets/python-lessons5.zip - [^1]: Na língua portuguesa, palavras similares seriam "e", "de", "da", "do", "um", "uma", dentre outras, a depender de cada caso. +--- +title: Contagem de Frequências de Palavras com Python +layout: lesson +slug: contar-frequencias-palavras-python +date: 2012-07-17 +translation_date: 2022-01-13 +authors: +- William J. Turkel +- Adam Crymble +reviewers: +- Jim Clifford +- Frederik Elwert +editors: +- Miriam Posner +translator: +- Felipe Lamarca +translation-editor: +- Jimmy Medeiros +translation-reviewer: +- Ana Carolina Erthal +- Joana Vieira Paulino +difficulty: 2 +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/461 +activity: analyzing +topics: [python] +abstract: "Contar a frequência de palavras específicas de uma lista pode fornecer dados esclarecedores. Esta lição ensinará uma maneira fácil de contar essas frequências com Python." +original: counting-frequencies +avatar_alt: Homem descontente sentado em um tronco cercado por pássaros +doi: 10.46430/phpt0023 +--- + +{% include toc.html %} + +## Objetivos da Lição + +Sua lista agora está limpa o suficiente para que possa começar a analisar seu conteúdo de maneiras significativas. Contar a frequência de palavras específicas de uma lista pode fornecer dados esclarecedores. Python possui uma maneira fácil de contar frequências, mas requer o uso de um novo tipo de variável: o *dicionário*. 
Antes de começar a trabalhar com um dicionário, considere os processos utilizados para calcular frequências em uma lista. + +### Ficheiros Necessários para esta Lição + +- `obo.py` + +Caso não possua esse ficheiro, pode fazer o *download* do ficheiro ([zip][]) que contém todo o código das lições anteriores desta série. + +## Frequências + +Agora desejamos contar a frequência de cada palavra em nossa lista. Já viu que é fácil de processar uma lista utilizando um `for` *loop*. Tente salvar e executar o exemplo a seguir. Lembre-se de que `+=` informa ao programa para acrescentar algo ao final de uma variável existente. + +``` python +# count-list-items-1.py + +wordstring = 'foi o melhor dos tempos foi o pior dos tempos ' +wordstring += 'foi a idade da sabedoria foi a idade da ignorância' +wordlist = wordstring.split() + +wordfreq = [] +for w in wordlist: + wordfreq.append(wordlist.count(w)) + +print("String\n" + wordstring +"\n") +print("Lista\n" + str(wordlist) + "\n") +print("Frequências\n" + str(wordfreq) + "\n") +print("Pares\n" + str(list(zip(wordlist, wordfreq)))) +``` + +Aqui, começamos com uma string e separamo-la em uma lista, como fizemos anteriormente. Depois disso criamos uma lista (inicialmente vazia) chamada `wordfreq`, percorremos cada palavra na `wordlist` e contamos o número de vezes que aquela palavra aparece em toda a lista. Então, adicionamos a contagem de cada palavra à nossa lista `wordfreq`. Utilizando a operação `zip`, somos capazes de combinar a primeira palavra da lista de palavras com o primeiro número na lista de frequências, a segunda palavra e a segunda frequência e assim por diante. Terminamos com uma lista de pares de palavras e frequências. A função `str` converte qualquer objeto numa string para que ele possa ser exibido. 
+ +Deve obter algo assim: + +``` python +String +foi o melhor dos tempos foi o pior dos tempos foi a idade da sabedoria foi a idade da ignorância + +Lista +['foi', 'o', 'melhor', 'dos', 'tempos', 'foi', 'o', 'pior', 'dos', 'tempos', 'foi', 'a', 'idade', 'da', 'sabedoria', 'foi', 'a', 'idade', 'da', 'ignorância'] + +Frequências +[4, 2, 1, 2, 2, 4, 2, 1, 2, 2, 4, 2, 2, 2, 1, 4, 2, 2, 2, 1] + +Pares +[('foi', 4), ('o', 2), ('melhor', 1), ('dos', 2), ('tempos', 2), ('foi', 4), ('o', 2), ('pior', 1), ('dos', 2), ('tempos', 2), ('foi', 4), ('a', 2), ('idade', 2), ('da', 2), ('sabedoria', 1), ('foi', 4), ('a', 2), ('idade', 2), ('da', 2), ('ignorância', 1)] +``` + +Valerá a pena estudar o código acima até entendê-lo antes de continuar. + +O Python também inclui uma ferramenta muito conveniente chamada *[list comprehension][]* (ver uma explicação do método de [compreensão de lista](https://pt.wikipedia.org/wiki/Compreens%C3%A3o_de_lista) em português), que pode ser utilizada para fazer o mesmo que um `for` *loop* de maneira mais económica. + +``` python +# count-list-items-1.py + +wordstring = 'foi o melhor dos tempos foi o pior dos tempos ' +wordstring += 'foi a idade da sabedoria foi a idade da ignorância' +wordlist = wordstring.split() + +wordfreq = [wordlist.count(w) for w in wordlist] # uma list comprehension + +print("String\n" + wordstring +"\n") +print("Lista\n" + str(wordlist) + "\n") +print("Frequências\n" + str(wordfreq) + "\n") +print("Pares\n" + str(list(zip(wordlist, wordfreq)))) +``` + +Se estudar esse método de compreensão de lista cuidadosamente, descobrirá que ele faz exatamente o mesmo que o `for` *loop* no exemplo anterior, mas de maneira condensada. Qualquer um dos métodos funcionará bem, então use a versão com a qual se sente mais confortável. + +Em geral é prudente utilizar um código que entenda ao invés de um código que seja executado mais rapidamente. + +Neste ponto, temos uma lista de pares, onde cada par contém uma palavra e sua frequência. 
Essa lista é um pouco redundante. Se 'the' ocorre 500 vezes, então essa lista contém quinhentas cópias do par ('the', 500). Essa lista também está ordenada pelas palavras no texto original, ao invés de listar as palavras na ordem da mais frequente para a menos frequente. Podemos resolver esses problemas convertendo-a em um dicionário, e depois exibindo o dicionário na ordem do item mais comum para o menos comum. + +## Dicionários de Python + +Tanto strings quanto listas são ordenadas sequencialmente, o que significa que pode acessar seus conteúdos utilizando um índice (*index*), um número que começa no 0. Caso tenha uma lista contendo strings, pode utilizar um par de índices para acessar uma string particular na lista, e depois um caractere particular naquela string. Estude os exemplos abaixo: + + +``` python + +s = 'olá mundo' +print(s[0]) +-> o + +print(s[1]) +-> l + +m = ['olá', 'mundo'] +print(m[0]) +-> olá + +print(m[1]) +-> mundo + +print(m[0][1]) +-> l + +print(m[1][0]) +-> m +``` + +Para manter controle sobre as frequências, utilizaremos outro tipo de objeto Python: um dicionário. O dicionário é uma coleção não ordenada de objetos. Isso significa que não pode utilizar índices para recuperar seus elementos. Pode, por outro lado, buscá-los utilizando uma chave, ou *key* no inglês (daí o nome "dicionário"). Estude o exemplo a seguir: + + +``` python + +d = {'mundo': 1, 'olá': 0} +print(d['olá']) +-> 0 + +print(d['mundo']) +-> 1 + +print(d.keys()) +-> dict_keys(['mundo', 'olá']) +``` + +Dicionários podem ser um pouco confusos para um novo programador. Tente pensar neles como um dicionário de idiomas. Caso não saiba (ou não se lembre) como exatamente "*bijection*" difere de "*surjection*", pode buscar pelos dois termos no *Oxford English Dictionary*. 
O mesmo princípio se aplica quando realiza um `print(d['olá'])` exceto pelo fato de que, ao invés de exibir uma definição literária, ele exibe o valor associado à palavra-chave 'olá', conforme definido por você quando criou o dicionário chamado `d`. Nesse caso, esse valor é "0". + +Observe que usa chaves para definir um dicionário, mas colchetes para acessar coisas dentro dele. A operação `keys` retorna as chaves que estão definidas no dicionário (em Python 3, na forma de um objeto `dict_keys`, como no exemplo acima). + +## Pares Palavra-Frequência + +Com base no que temos até agora, queremos uma função que seja capaz de converter uma lista de palavras em um dicionário de pares palavra-frequência. O único comando novo de que vamos precisar é `dict`, que faz um dicionário a partir de uma lista de pares. Copie o código a seguir e adicione-o ao módulo `obo.py`: + +``` python +# Dada uma lista de palavras, retorna um dicionário de pares palavra-frequência. + +def wordListToFreqDict(wordlist): + wordfreq = [wordlist.count(p) for p in wordlist] + return dict(list(zip(wordlist,wordfreq))) +``` + +Também vamos querer uma função que seja capaz de ordenar o dicionário de pares palavra-frequência por frequência decrescente. Copie o código a seguir e adicione-o também ao módulo `obo.py`: + + +``` python +# Ordena um dicionário de pares palavra-frequência em ordem decrescente de frequência. + +def sortFreqDict(freqdict): + aux = [(freqdict[key], key) for key in freqdict] + aux.sort() + aux.reverse() + return aux +``` + +Agora podemos escrever um programa que recebe uma URL e retorna pares palavra-frequência para a página web, de acordo com a ordem decrescente de frequência. Copie o programa a seguir no Komodo Edit, armazene-o como `html-to-freq.py` e execute-o. Estude o programa e seu resultado cuidadosamente antes de continuar.
+ + +``` python +#html-to-freq.py + +import urllib.request, urllib.error, urllib.parse, obo + +url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33' + +response = urllib.request.urlopen(url) +html = response.read().decode('UTF-8') +text = obo.stripTags(html).lower() +wordlist = obo.stripNonAlphaNum(text) +dictionary = obo.wordListToFreqDict(wordlist) +sorteddict = obo.sortFreqDict(dictionary) + +for s in sorteddict: print(str(s)) +``` + +## Removendo *Stop Words* + +Quando vemos o resultado do nosso programa `html-to-freq.py`, verificamos que muitas das palavras mais frequentes no texto são palavras funcionais como *the*, *of*, *to* e *and*. + +``` python +(192, 'the') +(105, 'i') +(74, 'to') +(71, 'was') +(67, 'of') +(62, 'in') +(53, 'a') +(52, 'and') +(50, 'you') +(50, 'he') +(40, 'that') +(39, 'his') +(36, 'it') +``` + +Essas palavras são geralmente as mais comuns em qualquer texto de língua inglesa, então elas não nos dizem muito a respeito do julgamento de Bowsey. Em geral, estamos mais interessados em encontrar as palavras que nos auxiliarão a diferenciar esse texto de outros textos sobre assuntos distintos. Desse modo, vamos remover as palavras funcionais comuns. Palavras que são ignoradas dessa forma são conhecidas como _stopwords_[^1]. Utilizaremos a lista a seguir, adaptada de uma publicação *online* por [cientistas da computação em Glasgow][]. Copie-a e adicione-a no início da biblioteca `obo.py` que está construindo. 
+ +``` python +stopwords = ['a', 'about', 'above', 'across', 'after', 'afterwards'] +stopwords += ['again', 'against', 'all', 'almost', 'alone', 'along'] +stopwords += ['already', 'also', 'although', 'always', 'am', 'among'] +stopwords += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another'] +stopwords += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere'] +stopwords += ['are', 'around', 'as', 'at', 'back', 'be', 'became'] +stopwords += ['because', 'become', 'becomes', 'becoming', 'been'] +stopwords += ['before', 'beforehand', 'behind', 'being', 'below'] +stopwords += ['beside', 'besides', 'between', 'beyond', 'bill', 'both'] +stopwords += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant'] +stopwords += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de'] +stopwords += ['describe', 'detail', 'did', 'do', 'done', 'down', 'due'] +stopwords += ['during', 'each', 'eg', 'eight', 'either', 'eleven', 'else'] +stopwords += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever'] +stopwords += ['every', 'everyone', 'everything', 'everywhere', 'except'] +stopwords += ['few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first'] +stopwords += ['five', 'for', 'former', 'formerly', 'forty', 'found'] +stopwords += ['four', 'from', 'front', 'full', 'further', 'get', 'give'] +stopwords += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her'] +stopwords += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers'] +stopwords += ['herself', 'him', 'himself', 'his', 'how', 'however'] +stopwords += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed'] +stopwords += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep'] +stopwords += ['last', 'latter', 'latterly', 'least', 'less', 'ltd', 'made'] +stopwords += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine'] +stopwords += ['more', 'moreover', 'most', 'mostly', 'move', 'much'] +stopwords += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never'] +stopwords += ['nevertheless', 'next', 
'nine', 'no', 'nobody', 'none'] +stopwords += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of'] +stopwords += ['off', 'often', 'on','once', 'one', 'only', 'onto', 'or'] +stopwords += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves'] +stopwords += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please'] +stopwords += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed'] +stopwords += ['seeming', 'seems', 'serious', 'several', 'she', 'should'] +stopwords += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so'] +stopwords += ['some', 'somehow', 'someone', 'something', 'sometime'] +stopwords += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take'] +stopwords += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves'] +stopwords += ['then', 'thence', 'there', 'thereafter', 'thereby'] +stopwords += ['therefore', 'therein', 'thereupon', 'these', 'they'] +stopwords += ['thick', 'thin', 'third', 'this', 'those', 'though', 'three'] +stopwords += ['three', 'through', 'throughout', 'thru', 'thus', 'to'] +stopwords += ['together', 'too', 'top', 'toward', 'towards', 'twelve'] +stopwords += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon'] +stopwords += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what'] +stopwords += ['whatever', 'when', 'whence', 'whenever', 'where'] +stopwords += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon'] +stopwords += ['wherever', 'whether', 'which', 'while', 'whither', 'who'] +stopwords += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with'] +stopwords += ['within', 'without', 'would', 'yet', 'you', 'your'] +stopwords += ['yours', 'yourself', 'yourselves'] +``` + +Agora, livrar-se das *stop words* em uma lista é fácil: basta usar outra *list comprehension*. 
Adicione também essa função ao módulo `obo.py`: + +``` python +# Dada uma lista de palavras, remove qualquer uma que esteja em uma lista de stop words + +def removeStopwords(wordlist, stopwords): + return [w for w in wordlist if w not in stopwords] +``` + +## Juntando Tudo + +Agora temos tudo o que precisamos para determinar frequências de palavras para páginas web. Copie o código a seguir no Komodo Edit, armazene-o como `html-to-freq-2.py` e execute-o: + + +``` python +# html-to-freq-2.py + +import urllib.request, urllib.error, urllib.parse +import obo + +url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33' + +response = urllib.request.urlopen(url) +html = response.read().decode('UTF-8') +text = obo.stripTags(html).lower() +fullwordlist = obo.stripNonAlphaNum(text) +wordlist = obo.removeStopwords(fullwordlist, obo.stopwords) +dictionary = obo.wordListToFreqDict(wordlist) +sorteddict = obo.sortFreqDict(dictionary) + +for s in sorteddict: print(str(s)) +``` + +Se tudo correu bem, sua saída deve se parecer com isto: + +``` python +(25, 'house') +(20, 'yes') +(20, 'prisoner') +(19, 'mr') +(17, 'man') +(15, 'akerman') +(14, 'mob') +(13, 'black') +(12, 'night') +(11, 'saw') +(9, 'went') +(9, 'sworn') +(9, 'room') +(9, 'pair') +(9, 'know') +(9, 'face') +(8, 'time') +(8, 'thing') +(8, 'june') +(8, 'believe') +... +``` + +## Leituras Sugeridas + +Lutz, Learning Python + +- Ch. 9: Tuples, Files, and Everything Else +- Ch. 11: Assignment, Expressions, and print +- Ch. 12: if Tests +- Ch. 13: while and for Loops + +Pilgrim, Dive Into Python + +- Ch. 7: [Regular Expressions][] + +## Sincronização de Código + +Para acompanhar lições futuras, é importante ter os ficheiros e programas corretos no seu diretório “programming-historian”. No final de cada lição, é possível fazer o *download* do ficheiro zip “programming-historian” para garantir que possui o código correto.
+ +- programming-historian-5 ([zip sync][]) + + [list comprehension]: https://docs.python.org/tutorial/datastructures.html#list-comprehensions + [cientistas da computação em Glasgow]: https://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words + [Regular Expressions]: https://web.archive.org/web/20180416143856/https://www.diveintopython.net/regular_expressions/index.html + [zip]: https://programminghistorian.org/assets/python-lessons4.zip + [zip sync]: https://programminghistorian.org/assets/python-lessons5.zip + [^1]: Na língua portuguesa, palavras similares seriam "e", "de", "da", "do", "um", "uma", dentre outras, a depender de cada caso. diff --git a/pt/licoes/criacao-visualizacao-ficheiros-html-python.md b/pt/licoes/criacao-visualizacao-ficheiros-html-python.md index 37e40a4839..533a14c56b 100644 --- a/pt/licoes/criacao-visualizacao-ficheiros-html-python.md +++ b/pt/licoes/criacao-visualizacao-ficheiros-html-python.md @@ -1,146 +1,146 @@ ---- -title: Criação e Visualização de Ficheiros HTML com Python -layout: lesson -slug: criacao-visualizacao-ficheiros-html-python -date: 2012-07-17 -translation_date: 2022-10-31 -authors: -- William J. Turkel -- Adam Crymble -reviewers: -- Jim Clifford -editors: -- Miriam Posner -translator: -- Felipe Lamarca -translation-editor: -- Jimmy Medeiros -translation-reviewer: -- Gabriela Kucuruza -- Ana Carolina Erthal -difficulty: 2 -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/462 -activity: presenting -topics: [python, website] -abstract: "Com esta lição aprenderá a criar ficheiros HTML com scripts Python e a usar o Python para abrir automaticamente um ficheiro HTML no Firefox." -original: creating-and-viewing-html-files-with-python -avatar_alt: Criança desenhando numa tábua -doi: 10.46430/phpt0030 ---- - -{% include toc.html %} - -## Objetivos da Lição - -Esta lição usa o Python para criar e visualizar um ficheiro HTML. 
Se escrever programas que produzem HTML, pode utilizar qualquer navegador para ver os seus resultados. Isso é especialmente conveniente se o seu programa cria automaticamente hiperlinks ou entidades gráficas, como gráficos e diagramas. - -Aqui irá aprender como criar ficheiros HTML com scripts Python e como utilizar o Python para abrir um ficheiro HTML automaticamente no Firefox. - -## Ficheiros Necessários para esta Lição - -- `obo.py` - -Caso não possua esses ficheiros da lição anterior, pode fazer o *download* do programming-historian-5, um [ficheiro zip da lição anterior](/assets/python-lessons5.zip). - -## Criando HTML com Python - -Até aqui, aprendemos como usar o Python para fazer o *download* de fontes *online* e extrair informação delas de forma automática. Lembre-se de que o nosso objetivo final é incorporar perfeitamente a programação em nossa prática de investigação. Em linha com este objetivo, nesta lição e na próxima aprenderemos como apresentar dados de volta à forma de HTML. Isso possui algumas vantagens. Primeiro, ao armazenar a informação no nosso disco rígido como um ficheiro HTML, podemos abri-lo com o Firefox e usar o [Zotero](https://www.zotero.org/), por exemplo, para indexar e fazer anotações posteriormente. Segundo, há uma ampla gama de opções de visualização para HTML que podemos usar mais tarde. - -Caso ainda não tenha feito o [tutorial de HTML do W3 Schools](http://www.w3schools.com/html/default.asp), reserve alguns minutos para fazê-lo antes de continuar. Criaremos um documento HTML usando Python, então será necessário saber o que é um documento HTML! - -## "Olá mundo" em HTML usando Python - -Uma das ideias mais poderosas na ciência da computação é que um ficheiro que parece conter código sob uma perspectiva pode ser visto como dados sob outra. É possível, em outras palavras, escrever programas que manipulam outros programas. O que faremos a seguir é criar um ficheiro HTML que diz "Olá mundo!" usando Python.
Faremos isso armazenando *tags* HTML em uma string multilinha de Python e guardando os conteúdos em um novo ficheiro. Esse ficheiro será armazenado com uma extensão `.html` ao invés de uma extensão `.txt`. - -Tipicamente um ficheiro HTML começa com uma [declaração do tipo de documento](http://www.w3schools.com/tags/tag_doctype.asp). Vimos isso ao escrever um programa HTML "Olá mundo!" em uma lição anterior. Para facilitar a leitura do nosso código, omitiremos o `doctype` neste exemplo. Lembre-se de que uma string multilinha é criada colocando o texto entre três aspas (veja abaixo): - -``` python -# write-html.py - -f = open('helloworld.html','w') - -message = """ - -

    <html><head></head><body><p>Olá mundo!</p></body></html>

    -""" - -f.write(message) -f.close() -``` - -Salve o programa acima como `write-html.py` e execute-o. Use `Ficheiro -> Abrir` (ou `Arquivo -> Abrir`, na versão brasileira) no editor de texto de sua escolha para abrir `helloworld.html` para verificar que seu programa de fato criou o ficheiro. O conteúdo deve se parecer com isto: - -{% include figure.html filename="hello-world-html.png" caption="Fonte HTML gerada pelo programa Python" %} - -Agora vá para o seu navegador Firefox e escolha `Ficheiro -> Nova Guia` (ou `Arquivo -> Nova aba`, na versão brasileira), vá para a guia e escolha `Ficheiro -> Abrir Ficheiro` (ou `Arquivo -> Abrir arquivo`, na versão brasileira). Selecione `helloworld.html`. Deve agora ser capaz de ver a sua mensagem no navegador. Reserve um momento para pensar sobre isso: agora tem a habilidade de escrever um programa que pode criar uma página web automaticamente. Não há razão pela qual não possa escrever um programa para criar automaticamente um *site* inteiro, caso deseje. - -
    - Por questões de versionamento, é possível que o seu navegador Firefox não possua a opção de abrir um ficheiro manualmente na guia. Nesse caso, procure pelo ficheiro HTML no seu diretório, clique nele com o botão direito e selecione a opção de abri-lo com o navegador Firefox. -
    - -## Usando o Python para Controlar o Firefox - -Nós criamos um ficheiro HTML automaticamente, mas depois precisamos deixar o nosso editor, ir para o Firefox e abrir o ficheiro em uma nova guia. Não seria melhor incluir essa etapa final no nosso programa Python? Digite ou copie o código abaixo e armazene-o como `write-html-2.py`. Quando executá-lo, ele deve criar o seu ficheiro HTML e depois abri-lo automaticamente numa nova guia do Firefox. Maravilha! - -### Instruções para Mac - -Usuários de Mac precisarão especificar a localização precisa do ficheiro `.html` nos seus computadores. Para fazer isso, localize a pasta `programming-historian` que criou para fazer esses tutoriais, clique com o botão direito nela e selecione "Obter Informações" (ou "*Get Info*"). - -Pode então recortar e colar a localização do ficheiro listado depois de "Onde:" (ou "*Where:*") e se certificar de incluir uma barra final (/) para que o computador saiba que deseja algo dentro desse diretório (e não o diretório em si). - - -``` python -# write-html-2-mac.py -import webbrowser - -f = open('helloworld.html','w') - -message = """ - -

    Olá mundo!

    -""" - -f.write(message) -f.close() - -#Altere o caminho para refletir a localização do ficheiro -filename = 'file:///Users/username/Desktop/programming-historian/' + 'helloworld.html' -webbrowser.open_new_tab(filename) -``` - -Caso receba um erro "Ficheiro não encontrado" (ou "*File not found*"), significa que não mudou o caminho para o ficheiro corretamente. - -### Instruções para Windows - -``` python -# write-html-2-windows.py - -import webbrowser - -f = open('helloworld.html','w') - -message = """ - -

    Olá mundo!

    -""" - -f.write(message) -f.close() - -webbrowser.open_new_tab('helloworld.html') -``` - -\*\*\* - -No final, não só escreveu um programa Python que pode criar um HTML simples, mas também controlou o seu navegador Firefox utilizando Python. Na próxima lição, focaremos em apresentar os dados que coletamos na forma de um ficheiro HTML. - -## Leituras Sugeridas - -- Lutz, Learning Python - - Re-read and review Chs. 1-17 - -## Sincronização de Código - -Para acompanhar lições futuras, é importante ter os ficheiros e programas corretos no seu diretório “programming-historian”. No final de cada lição, é possível fazer o *download* do ficheiro zip “programming-historian” para garantir que possui o código correto. Caso esteja acompanhando com a versão para Mac / Linux, deve ter que abrir o ficheiro `obo.py` e mudar "file:///Users/username/Desktop/programming-historian/" para o caminho até o diretório no seu próprio computador. - -- [python-lessons6.zip](/assets/python-lessons6.zip) +--- +title: Criação e Visualização de Ficheiros HTML com Python +layout: lesson +slug: criacao-visualizacao-ficheiros-html-python +date: 2012-07-17 +translation_date: 2022-10-31 +authors: +- William J. Turkel +- Adam Crymble +reviewers: +- Jim Clifford +editors: +- Miriam Posner +translator: +- Felipe Lamarca +translation-editor: +- Jimmy Medeiros +translation-reviewer: +- Gabriela Kucuruza +- Ana Carolina Erthal +difficulty: 2 +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/462 +activity: presenting +topics: [python, website] +abstract: "Com esta lição aprenderá a criar ficheiros HTML com scripts Python e a usar o Python para abrir automaticamente um ficheiro HTML no Firefox." +original: creating-and-viewing-html-files-with-python +avatar_alt: Criança desenhando numa tábua +doi: 10.46430/phpt0030 +--- + +{% include toc.html %} + +## Objetivos da Lição + +Esta lição usa o Python para criar e visualizar um ficheiro HTML. 
Se escrever programas que produzem HTML, pode utilizar qualquer navegador para ver os seus resultados. Isso é especialmente conveniente se o seu programa cria automaticamente hiperlinks ou entidades gráficas, como gráficos e diagramas. + +Aqui irá aprender como criar ficheiros HTML com scripts Python e como utilizar o Python para abrir um ficheiro HTML automaticamente no Firefox. + +## Ficheiros Necessários para esta Lição + +- `obo.py` + +Caso não possua esses ficheiros da lição anterior, pode fazer o *download* do programming-historian-5, um [ficheiro zip da lição anterior](/assets/python-lessons5.zip). + +## Criando HTML com Python + +Até aqui, aprendemos como usar o Python para fazer o *download* de fontes *online* e extrair informação delas de forma automática. Lembre-se de que o nosso objetivo final é incorporar perfeitamente a programação em nossa prática de investigação. Em linha com este objetivo, nesta lição e na próxima aprenderemos como apresentar dados de volta à forma de HTML. Isso possui algumas vantagens. Primeiro, ao armazenar a informação no nosso disco rígido como um ficheiro HTML, podemos abri-lo com o Firefox e usar o [Zotero](https://www.zotero.org/), por exemplo, para indexar e fazer anotações posteriormente. Segundo, há uma ampla gama de opções de visualização para HTML que podemos usar mais tarde. + +Caso ainda não tenha feito o [tutorial de HTML do W3 Schools](https://www.w3schools.com/html/default.asp), reserve alguns minutos para fazê-lo antes de continuar. Criaremos um documento HTML usando Python, então será preciso saber o que é um documento HTML! + +## "Olá mundo" em HTML usando Python + +Uma das ideias mais poderosas na ciência da computação é que um ficheiro que parece conter código sob uma perspectiva pode ser visto como dados sob outra. É possível, em outras palavras, escrever programas que manipulam outros programas. O que faremos a seguir é criar um ficheiro HTML que diz "Olá mundo!" usando Python.
Faremos isso armazenando *tags* HTML em uma string multilinha de Python e guardando os conteúdos em um novo ficheiro. Esse ficheiro será armazenado com uma extensão `.html` ao invés de uma extensão `.txt`. + +Tipicamente um ficheiro HTML começa com uma [declaração do tipo de documento](https://www.w3schools.com/tags/tag_doctype.asp). Vimos isso ao escrever um programa HTML "Olá mundo!" em uma lição anterior. Para facilitar a leitura do nosso código, omitiremos o `doctype` neste exemplo. Lembre-se de que uma string multilinha é criada colocando o texto entre três aspas (veja abaixo): + +``` python +# write-html.py + +f = open('helloworld.html','w') + +message = """ + +

    Olá mundo!

    +""" + +f.write(message) +f.close() +``` + +Salve o programa acima como `write-html.py` e execute-o. Use `Ficheiro -> Abrir` (ou `Arquivo -> Abrir`, na versão brasileira) no editor de texto de sua escolha para abrir `helloworld.html` para verificar que seu programa de fato criou o ficheiro. O conteúdo deve se parecer com isto: + +{% include figure.html filename="hello-world-html.png" caption="Fonte HTML gerada pelo programa Python" %} + +Agora vá para o seu navegador Firefox e escolha `Ficheiro -> Nova Guia` (ou `Arquivo -> Nova aba`, na versão brasileira), vá para a guia e escolha `Ficheiro -> Abrir Ficheiro` (ou `Arquivo -> Abrir arquivo`, na versão brasileira). Selecione `helloworld.html`. Deve agora ser capaz de ver a sua mensagem no navegador. Reserve um momento para pensar sobre isso: agora tem a habilidade de escrever um programa que pode criar uma página web automaticamente. Não há razão pela qual não possa escrever um programa para criar automaticamente um *site* inteiro, caso deseje. + +
    + Por questões de versionamento, é possível que o seu navegador Firefox não possua a opção de abrir um ficheiro manualmente na guia. Nesse caso, procure pelo ficheiro HTML no seu diretório, clique nele com o botão direito e selecione a opção de abri-lo com o navegador Firefox. +
    + +## Usando o Python para Controlar o Firefox + +Nós criamos um ficheiro HTML automaticamente, mas depois precisamos deixar o nosso editor, ir para o Firefox e abrir o ficheiro em uma nova guia. Não seria melhor incluir essa etapa final no nosso programa Python? Digite ou copie o código abaixo e armazene-o como `write-html-2.py`. Quando executá-lo, ele deve criar o seu ficheiro HTML e depois abri-lo automaticamente numa nova guia do Firefox. Maravilha! + +### Instruções para Mac + +Usuários de Mac precisarão especificar a localização precisa do ficheiro `.html` nos seus computadores. Para fazer isso, localize a pasta `programming-historian` que criou para fazer esses tutoriais, clique com o botão direito nela e selecione "Obter Informações" (ou "*Get Info*"). + +Pode então recortar e colar a localização do ficheiro listado depois de "Onde:" (ou "*Where:*") e se certificar de incluir uma barra final (/) para que o computador saiba que deseja algo dentro desse diretório (e não o diretório em si). + + +``` python +# write-html-2-mac.py +import webbrowser + +f = open('helloworld.html','w') + +message = """ + +

    Olá mundo!

    +""" + +f.write(message) +f.close() + +#Altere o caminho para refletir a localização do ficheiro +filename = 'file:///Users/username/Desktop/programming-historian/' + 'helloworld.html' +webbrowser.open_new_tab(filename) +``` + +Caso receba um erro "Ficheiro não encontrado" (ou "*File not found*"), significa que não mudou o caminho para o ficheiro corretamente. + +### Instruções para Windows + +``` python +# write-html-2-windows.py + +import webbrowser + +f = open('helloworld.html','w') + +message = """ + +

    Olá mundo!

    +""" + +f.write(message) +f.close() + +webbrowser.open_new_tab('helloworld.html') +``` + +\*\*\* + +No final, não só escreveu um programa Python que pode criar um HTML simples, mas também controlou o seu navegador Firefox utilizando Python. Na próxima lição, focaremos em apresentar os dados que coletamos na forma de um ficheiro HTML. + +## Leituras Sugeridas + +- Lutz, Learning Python + - Re-read and review Chs. 1-17 + +## Sincronização de Código + +Para acompanhar lições futuras, é importante ter os ficheiros e programas corretos no seu diretório “programming-historian”. No final de cada lição, é possível fazer o *download* do ficheiro zip “programming-historian” para garantir que possui o código correto. Caso esteja acompanhando com a versão para Mac / Linux, deve ter que abrir o ficheiro `obo.py` e mudar "file:///Users/username/Desktop/programming-historian/" para o caminho até o diretório no seu próprio computador. + +- [python-lessons6.zip](/assets/python-lessons6.zip) diff --git a/pt/licoes/criar-exposicao-omeka.md b/pt/licoes/criar-exposicao-omeka.md index 8bda21e4bb..150e885312 100644 --- a/pt/licoes/criar-exposicao-omeka.md +++ b/pt/licoes/criar-exposicao-omeka.md @@ -45,7 +45,7 @@ Agora que [adicionamos itens no seu site do Omeka](/en/lessons/up-and-running-wi A coleção é uma lista de objetos. Uma exposição é um _tour_ guiado pelos seus itens com textos descritivos e _layouts_ customizados. Para criar uma, clique na aba **Exposições** e depois em **Adicionar uma exposição**. Preencha o formulário na parte superior da página. Um **slug** é um nome da sua exposição, legível por uma máquina, que fará parte do seu URL. 
-Nota da tradução: para configurar o Omeka.net internamente para a língua portuguesa, siga as instruções disponíveis neste [tutorial.](/pt/licoes/introducao-omeka-net#configurar-o-seu-site-para--portugu%C3%AAs-nota-da-tradu%C3%A7%C3%A3o) +Nota da tradução: para configurar o Omeka.net internamente para a língua portuguesa, siga as instruções disponíveis neste [tutorial.](/pt/licoes/introducao-omeka-net#configurar-o-seu-site-para--português-nota-da-tradução) ## Adicionar uma página diff --git a/pt/licoes/download-automatico-wget.md b/pt/licoes/download-automatico-wget.md index c6cabf717d..31a2e9d3d1 100644 --- a/pt/licoes/download-automatico-wget.md +++ b/pt/licoes/download-automatico-wget.md @@ -142,7 +142,7 @@ Neste ponto, a instalação do wget já deve estar concluída satisfatoriamente Se, por alguma razão, não conseguir instalar o pacote de gerenciamento, poderá simplesmente fazer o download do wget em separado. Esta opção é aplicável se utiliza um pacote de gerenciamento diferente (tal como Mac Ports) ou se deseja manter a infraestrutura num padrão mínimo. Siga as mesmas instruções novamente para instalar o xcode e o conjunto de ferramentas de linha de comando (Command Line Tools). -A seguir, faça o download de uma versão não compilada do wget no [website do GNU](http://www.gnu.org/software/wget/) (Eu escolhi fazer o dowload do ficheiro `wget-1.13.tar.gz`, disponível tanto no link [HTTP](http://ftp.gnu.org/gnu/wget/) como na página de downloads do [FTP](ftp://ftp.gnu.org/gnu/wget/), descompacte-o (clicando duas vezes sobre o ficheiro) no seu diretório 'home' (em um Mac, este será o `/User` directory – por exemplo, meu nome de usuário é ianmilligan e aparece próximo ao ícone de uma casa no meu localizador), e depois abra o Terminal. Para este tutorial, a versão do download é o `wget-1.13`. 
+A seguir, faça o download de uma versão não compilada do wget no [website do GNU](https://www.gnu.org/software/wget/) (Eu escolhi fazer o download do ficheiro `wget-1.13.tar.gz`, disponível tanto no link [HTTP](https://ftp.gnu.org/gnu/wget/) como na página de downloads do [FTP](ftp://ftp.gnu.org/gnu/wget/)), descompacte-o (clicando duas vezes sobre o ficheiro) no seu diretório 'home' (em um Mac, este será o diretório `/User` – por exemplo, meu nome de usuário é ianmilligan e aparece próximo ao ícone de uma casa no meu localizador), e depois abra o Terminal. Para este tutorial, a versão do download é o `wget-1.13`. Primeiramente, é preciso se direcionar para o diretório onde se encontram os ficheiros wget. No terminal, digite: @@ -188,7 +188,7 @@ De agora em diante, os usuários de todas as três plataformas estão em sintoni A documentação completa para wget pode ser encontrada na página [manual GNU wget](https://perma.cc/67JQ-TSB5). -Tome-se um exemplo de conjunto de dados. Digamos que queira fazer o download de todos os artigos hospedados no website [ActiveHistory.ca](https://perma.cc/KK9H-4XKL). Eles estão localizados em [http://activehistory.ca/papers/](https://perma.cc/CL79-ZN93); o que indica que eles estão todos contidos no diretório `/papers/`: por exemplo, o nono artigo publicado no website é o [http://activehistory.ca/papers/historypaper-9/](https://perma.cc/KF6E-8XZM). Pense nesta estrutura da mesma maneira que os diretórios do seu computador: se tiver uma pasta intitulada `/História/`, ela provavelmente conterá vários ficheiros. +Tome-se um exemplo de conjunto de dados. Digamos que queira fazer o download de todos os artigos hospedados no website [ActiveHistory.ca](https://perma.cc/KK9H-4XKL).
Eles estão localizados em [https://activehistory.ca/papers/](https://perma.cc/CL79-ZN93); o que indica que eles estão todos contidos no diretório `/papers/`: por exemplo, o nono artigo publicado no website é o [https://activehistory.ca/papers/historypaper-9/](https://perma.cc/KF6E-8XZM). Pense nesta estrutura da mesma maneira que os diretórios do seu computador: se tiver uma pasta intitulada `/História/`, ela provavelmente conterá vários ficheiros. A mesma estrutura é válida para websites, e é utilizada esta lógica para informar ao computador quais ficheiros deseja-se fazer download. @@ -226,7 +226,7 @@ index.html [ <=> ] 65,60K --.-KB/s em 0,04s 2023-08-08 15:58:54 (1,83 MB/s) - ‘index.html’ salvo [67178] ``` -O que fez foi apenas o download da primeira página do [http://activehistory.ca/papers/](https://perma.cc/CL79-ZN93), a página de index dos artigos, para seu novo diretório. Se abri-la, verá o texto principal da página principal (homepage) do ActiveHistory.ca. Então, num piscar de olhos, já fizemos o download de algo rapidamente. +O que fez foi apenas o download da primeira página do [https://activehistory.ca/papers/](https://perma.cc/CL79-ZN93), a página de index dos artigos, para seu novo diretório. Se abri-la, verá o texto principal da página principal (homepage) do ActiveHistory.ca. Então, num piscar de olhos, já fizemos o download de algo rapidamente. No entanto, o objetivo é fazer o download de todos os artigos. Para isto é preciso incluir alguns poucos comandos no wget. @@ -240,7 +240,7 @@ No exemplo anterior, o componente [URL] informa ao programa para onde ele deve i -r -A recuperação recursiva é a parte mais importante do wget. Isto significa que o programa, ao iniciar, segue os links do website e também faz o download dos mesmos. 
Desta forma, por exemplo, o [http://activehistory.ca/papers/](https://perma.cc/CL79-ZN93) possui um link para o [http://activehistory.ca/papers/historypaper-9/](https://perma.cc/KF6E-8XZM), assim, ele fará o download deste também, ao utilizar a recuperação recursiva. Contudo, ele também seguirá quaisquer outros links: se houver um link para [http://uwo.ca](https://perma.cc/W7LH-SRTQ) em algum local daquela página, ele o seguirá e também fará o download. Por padrão, `-r` direciona o wget a até cinco websites após o primeiro. Isto consiste em seguir links até um limite de cinco cliques após o primeiro website. Desta maneira, funcionará de maneira bastante indiscriminada. Então precisamos de mais comandos: +A recuperação recursiva é a parte mais importante do wget. Isto significa que o programa, ao iniciar, segue os links do website e também faz o download dos mesmos. Desta forma, por exemplo, o [https://activehistory.ca/papers/](https://perma.cc/CL79-ZN93) possui um link para o [https://activehistory.ca/papers/historypaper-9/](https://perma.cc/KF6E-8XZM), assim, ele fará o download deste também, ao utilizar a recuperação recursiva. Contudo, ele também seguirá quaisquer outros links: se houver um link para [https://uwo.ca](https://perma.cc/W7LH-SRTQ) em algum local daquela página, ele o seguirá e também fará o download. Por padrão, `-r` direciona o wget a até cinco websites após o primeiro. Isto consiste em seguir links até um limite de cinco cliques após o primeiro website. Desta maneira, funcionará de maneira bastante indiscriminada. Então precisamos de mais comandos: ``` bash --no-parent @@ -248,7 +248,7 @@ A recuperação recursiva é a parte mais importante do wget. Isto significa que (O travessão duplo indica o texto completo de um comando. Todos os comandos também possuem uma versão abreviada que pode se iniciar com a utilização de `-np`). -Isto é importante. Significa que o wget deve seguir os links, mas não além do último diretório pai. 
No caso, implica dizer que ele não avançará a lugar nenhum que não seja parte da hierarquia do [http://activehistory.ca/papers/](https://perma.cc/CL79-ZN93). Se o endereço web for muito longo como `http://niche-canada.org/projects/events/new-events/not-yet-happened-events/`, ele encontra ficheiros apenas na pasta `/not-yet-happened-events/`. Este é um comando essencial para delinear sua pesquisa. +Isto é importante. Significa que o wget deve seguir os links, mas não além do último diretório pai. No caso, implica dizer que ele não avançará a lugar nenhum que não seja parte da hierarquia do [https://activehistory.ca/papers/](https://perma.cc/CL79-ZN93). Se o endereço web for muito longo como `http://niche-canada.org/projects/events/new-events/not-yet-happened-events/`, ele encontra ficheiros apenas na pasta `/not-yet-happened-events/`. Este é um comando essencial para delinear sua pesquisa. Aqui está uma representação gráfica: diff --git a/pt/licoes/download-multiplos-registros-query-strings.md b/pt/licoes/download-multiplos-registros-query-strings.md index dd4d26e209..a22b93d672 100644 --- a/pt/licoes/download-multiplos-registros-query-strings.md +++ b/pt/licoes/download-multiplos-registros-query-strings.md @@ -1,778 +1,778 @@ ---- -title: Download de Múltiplos Registros usando Query Strings -layout: lesson -collection: lessons -slug: download-multiplos-registros-query-strings -date: 2012-11-11 -translation_date: 2022-11-25 -authors: -- Adam Crymble -reviewers: -- Luke Bergmann -- Sharon Howard -- Frederik Elwert -editors: -- Fred Gibbs -translator: -- Felipe Lamarca -translation-editor: -- Jimmy Medeiros -translation-reviewer: -- André Salvo -- Aracele Torres -difficulty: 2 -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/465 -activity: acquiring -topics: [web-scraping] -abstract: "Fazer o download de um único registro de um website é fácil, mas fazer o download de vários registros de uma vez - uma necessidade cada vez mais frequente 
para um historiador - é muito mais eficiente usando uma linguagem de programação como o Python. Nessa lição, escreveremos um programa que fará o download de uma série de registros do Old Bailey Online usando critérios de busca personalizados e irá armazená-los num diretório no nosso computador." -redirect_from: /licoes/download-de-multiplos-registros-usando-query-strings -original: downloading-multiple-records-using-query-strings -avatar_alt: Figuras trabalhando numa mina, empurrando carrinhos -doi: 10.46430/phpt0034 ---- - -{% include toc.html %} - -
    -O site do Old Bailey Online foi recentemente atualizado. Infelizmente, devido às diversas mudanças, muitos (se não todos) os elementos do site de exemplo usado nesta lição não funcionarão conforme descrito. No entanto, as metodologias ensinadas por esta lição permanecem relevantes e podem ser adaptadas pelos leitores para um site de exemplo diferente. Estamos trabalhando na adaptação da lição para o novo site do Old Bailey Online, mas ainda não temos cronograma preciso de quando a lição será atualizada. [Abril de 2024] -
    - -## Objetivos do Módulo - -Fazer o *download* de um único registro de um website é fácil, mas fazer o *download* de vários registros de uma vez - uma necessidade cada vez mais frequente para um historiador - é muito mais eficiente usando uma linguagem de programação como o Python. Nesta lição, escreveremos um programa que fará o *download* de uma série de registros do *[Old Bailey Online](http://www.oldbaileyonline.org/)* usando critérios de investigação personalizados e irá armazená-los num diretório no nosso computador. Esse processo envolve interpretar e manipular *Query Strings* de URL. Nesse caso, o tutorial buscará fazer o *download* de fontes que contenham referências a afrodescendentes que foram publicadas no *Old Bailey Proceedings* entre 1700 e 1750. - -
    -Os exemplos nessa lição incluem linguagem histórica racializada que os leitores podem achar ofensiva. O autor não tolera o uso dessa linguagem, mas tentou usá-la no seu contexto histórico, reconhecendo que, de outra forma, é impossível encontrar os materiais desejados do estudo de caso. Qualquer pessoa que ensine com este material é aconselhada a adotar uma abordagem sensível em relação à linguagem e a aplicar as boas práticas ao ensinar sobre raça. O autor recomenda os muitos recursos do Teaching Tolerance; Peggy McIntosh, ‘White Privilege: Unpacking the Invisible Knapsack’, Peace and Freedom Magazine, (1989), 10-12; Binyavanga Wainaina, ‘How to Write About Africa’, Granta (92): 2006. -
    - -## Para Quem isso é Útil? - -Automatizar o processo de *download* de registros de uma base de dados *online* será útil para qualquer um que trabalhe com fontes históricas armazenadas *online* de forma ordenada e acessível e que deseje salvar cópias dessas fontes no seu próprio computador. É particularmente útil para alguém que deseja fazer o *download* de vários registros específicos, em vez de apenas um punhado. Caso deseje fazer o *download* de *todos* ou da *maioria* dos registros de uma base de dados em particular, pode achar o tutorial de Ian Milligan sobre [Automated Downloading with WGET](/en/lessons/automated-downloading-with-wget) mais adequado. - -O presente tutorial permitirá que faça *download* de forma isolada e discriminada de registros específicos que atendam às suas necessidades. Fazer o *download* de múltiplas fontes de forma automática economiza um tempo considerável. O que faz com as fontes baixadas depende dos seus objetivos de investigação. Pode desejar criar visualizações ou realizar uma série de métodos de análise de dados, ou simplesmente reformatá-las para facilitar a navegação. Ou pode desejar apenas manter uma cópia de *backup* para poder acessá-las sem acesso à internet. - -Essa lição é voltada para usuários de Python com nível intermediário. Caso ainda não tenha tentado as lições do [Básico de Programação em Python](/pt/licoes/introducao-instalacao-python), pode achá-las um ponto de partida útil. - -## Aplicando nosso Conhecimento Histórico - -Nesta lição, estamos tentando criar o nosso próprio corpus de casos relacionados com pessoas afrodescendentes. A partir do [caso de Benjamin Bowsey](http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33) no *Old Bailey* em 1780, podemos notar que "*black*" pode ser uma palavra-chave útil para usarmos para localizar outros casos envolvendo réus de ascendência africana. 
No entanto, quando buscamos por *black* no *website* do *Old Bailey*, percebemos que esta palavra às vezes se refere a outros usos: *black horses* ou *black cloth*. A tarefa de desambiguar esse uso da linguagem terá que esperar por outra lição. Por enquanto, vamos nos voltar para casos mais fáceis. Como historiadores, provavelmente, podemos pensar em palavras-chave de termos historicamente racializados relacionados com afrodescendentes as quais valeria a pena buscar. A infame "*n-word*", é claro, não é útil, já que esse termo não era comumente utilizado até meados do século XIX. Outras expressões racializadas como "*negro*" e "*mulatto*" são, porém, muito mais relevantes para o início do século XVIII. Essas palavras-chave são menos ambíguas do que "*black*" e são muito mais propensas a serem referências imediatas a pessoas no nosso público-alvo. Se testarmos esses dois termos em buscas separadas simples no *Old Bailey website*, temos resultados como nessa captura de tela: - -{% include figure.html filename="SearchResultsNegro.png" caption="Resultados de investigação para 'negro' no *Old Bailey Online*" %} - -{% include figure.html filename="SearchResultsMulatto.png" caption="Resultados de investigação para 'mulatto' no *Old Bailey Online*" %} - -Depois de examinar estes resultados de busca, parece evidente que são referências a pessoas e não a cavalos, panos ou qualquer outra coisa que seja preta. Desejamos fazer o *download* de todas para usar na nossa análise. Poderíamos, é claro, fazer o *download* de uma por uma manualmente. Mas vamos encontrar uma maneira programática de automatizar essa tarefa. - -## A Investigação Avançada no OBO - -As ferramentas de pesquisa de cada *site* funcionam de maneira diferente. Embora as pesquisas funcionem de forma semelhante, as complexidades das pesquisas numa base de dados podem não ser totalmente óbvias. 
Portanto, é importante pensar criticamente sobre as opções de busca de uma base de dados e, quando disponível, ler a documentação fornecida pelo *website*. Investigadores de história prudentes sempre interrogam suas fontes; os procedimentos por trás das suas caixas de pesquisa devem receber a mesma atenção. O [formulário de busca avançada](http://www.oldbaileyonline.org/forms/formMain.jsp) do *Old Bailey Online* permite refinar as suas buscas com base em dez campos diferentes, incluindo palavras-chave simples, um intervalo de datas e um tipo de crime. Como as ferramentas de busca de cada *website* são diferentes, vale sempre a pena reservar um momento ou dois para testar e ler a respeito das opções de investigação disponíveis. Uma vez que já fizemos buscas simples por "*negro*" e "*mulatto*", sabemos que haverá resultados. No entanto, vamos usar a busca avançada para limitar os nossos resultados aos registros publicados no *Old Bailey Proceedings* que dizem respeito a julgamentos apenas de 1700 até 1750. É claro que pode alterá-lo para o que desejar, mas isso tornará o exemplo mais simples de ser acompanhado. Faça a busca mostrada na imagem abaixo. Certifique-se de que marcou o botão "*Advanced*" e incluiu as *wildcards* `*` para incluir entradas pluralizadas ou com um "e" extra no final. - -{% include figure.html filename="AdvancedSearchExample.png" caption="Exemplo de Busca Avançada no *Old Bailey*" %} - -Execute a busca e depois clique no *link* "*Calculate Total*" para ver quantas entradas existem. Agora temos 13 resultados (caso tenha um número diferente, volte e certifique-se de que copiou o exemplo acima da forma exata). O que queremos fazer neste ponto é o *download* de todos esses ficheiros de julgamento e analizá-los mais profundamente. Mais uma vez, para apenas 13 registros, também pode fazer o *download* de cada registro manualmente. 
Mas à medida que mais e mais dados são disponibilizados *online*, torna-se mais comum a necessidade de baixar 1.300 ou até 130.000 registros, caso no qual o *download* individual dos registros se torna impraticável e entender como automatizar o processo se torna muito valioso. Para automatizar o processo, precisamos de dar um passo atrás e lembrar como as URLs de busca são criadas no *Old Bailey website*, um método comum para muitas bases de dados *online* e *websites*. - -## Entendendo *Queries* de URL - -Observe a URL produzida com a última página de resultado de busca. Ela deve se parecer com isso: - -``` -https://www.oldbaileyonline.org/search.jsp?gen=1&form=searchHomePage&_divs_fulltext=mulatto*+negro*&kwparse=advanced&_divs_div0Type_div1Type=sessionsPaper_trialAccount&fromYear=1700&fromMonth=00&toYear=1750&toMonth=99&start=0&count=0 -``` - -Vimos sobre URLs em [Noções básicas de páginas web e HTML](/pt/licoes/nocoes-basicas-paginas-web-html), mas isso parece muito mais complexo. Ainda que mais longo, *não* é verdadeiramente muito mais complexo. Mas é mais fácil de entender observando como os nossos critérios de busca são representados na URL. - -``` -https://www.oldbaileyonline.org/search.jsp -?gen=1 -&form=searchHomePage -&_divs_fulltext=mulatto*+negro* -&kwparse=advanced -&_divs_div0Type_div1Type=sessionsPaper_trialAccount -&fromYear=1700 -&fromMonth=00 -&toYear=1750 -&toMonth=99 -&start=0 -&count=0 -``` - -Nessa visão, vemos com mais clareza as 12 informações importantes que precisamos para realizar a nossa busca (uma por linha). Na primeira há a URL base do *Old Bailey website*, seguida por uma query "?" (não se preocupe com o *bit* `gen=1`; os desenvolvedores do *Old Bailey Online* dizem que ele não faz nada) e uma série de 10 pares *nome/valor* unidos por caracteres `&`. Juntos, esses 10 pares de nome/valor compõem a *query string* (expressão de busca), que informa ao mecanismo de busca quais variáveis usar em etapas específicas da investigação. 
Observe que cada par nome/valor contém um nome de variável: `toYear` e, em seguida, atribui a essa variável um valor: `1750`. Isso funciona exatamente da mesma forma que os *Argumentos de Função*, passando certas informações para variáveis específicas. Nesse caso, a variável mais importante é `_divs_fulltext=`, para a qual foi dado o valor: - -``` -mulatto*+negro* -``` - -Esta contém o termo que digitamos na caixa de busca. O programa adicionou automaticamente um sinal de soma `+` no lugar de um espaço em branco (URLs não podem conter espaçamentos); dito de outro modo, isso é exatamente o que pedimos que o *site* do *Old Bailey* encontrasse. As outras variáveis carregam valores que nós também definimos. `fromYear` e `toYear` contém o nosso intervalo de datas. Já que nenhum ano possui 99 meses, como sugerido na variável `toMonth`, podemos assumir que esse seja o modo através do qual o algoritmo garante que todos os registros daquele ano são incluídos. Não há regras difíceis ou rápidas para descobrir o que cada variável faz, porque a pessoa que criou o site as nomeou. Muitas vezes pode fazer uma suposição razoável. Todos os campos de busca possíveis na página de busca avançada possuem os seus próprios pares nome/valor. Caso deseje descobrir o nome da variável de modo a que possa utilizá-la, faça uma nova busca e certifique-se de colocar um valor no campo no qual está interessado. Após submeter a sua busca, verá o seu valor e o nome associado a ele como parte da URL da página dos resultados de busca. Com o *Old Bailey Online*, assim como com noutros *websites*, o formulário de busca (avançada ou não) ajuda, essencialmente, a construir URLs que informam à base de dados o que está buscando. Se puder entender como os campos de busca estão representados no URL - o que geralmente é algo bem direto -, então torna-se relativamente simples construir esses URLs programaticamente e automatizar o processo de *download* de registros. 
- -Agora tente alterar o `start=0` para `start=10` e pressione `enter`. Deve agora ter os resultados 11-13. A variável `start` informa ao *website* qual a entrada que deve ser mostrada no início da lista de resultados de busca. Nós devemos ser capazes de utilizar esse conhecimento para criar uma série de URLs que nos permitirão fazer o *download* de todos os 13 ficheiros. Vamos nos voltar para isso agora. - -## Fazendo o *Download* de Ficheiros Sistematicamente - -Na lição [Download de Páginas Web com Python](/pt/licoes/download-paginas-web-python), aprendemos que o Python pode fazer o *download* de uma página web desde que tenhamos a URL. Naquela lição, usamos a URL para fazer o *download* da transcrição do julgamento de Benjamin Bowsey. Nesse caso, estamos tentando fazer o *download* de múltiplas transcrições de julgamentos que atendem aos critérios de busca descritos acima sem precisar executar o programa repetidamente. Ao invés disso, queremos um programa que faça o *download* de tudo de uma vez. Neste ponto, temos a URL para a página de resultados de busca que contém as 10 primeiras entradas na nossa investigação. Também sabemos que ao mudarmos o valor de `start` na URL, podemos sequencialmente chamar cada uma das páginas de resultados de busca e finalmente recuperar todos os ficheiros de julgamento que elas possuem. É claro que os resultados de busca não nos oferecem os ficheiros do julgamento em si, mas apenas *links* para eles. Então precisamos de extrair esses *links* para os registros subjacentes dos resultados de busca. No *Old Bailey Online website*, as URLs para os registros individuais (os ficheiros de transcrição de julgamento) podem ser encontrados como *links* na página de resultados de busca. Sabemos que todas as transcrições de julgamento possuem um id de julgamento que assume a forma: "t" seguido por, pelo menos, 8 números (ex.: t17800628-33). Ao buscar *links* que contenham esse padrão, podemos identificar URLs de transcrição de julgamento. 
Como em lições anteriores, vamos desenvolver um algoritmo de modo a que possamos começar a enfrentar esse problema de uma maneira que o computador possa lidar. Parece que a tarefa pode ser realizada em 4 passos. Precisaremos: - -- Gerar as URLs para cada página de resultados de busca incrementando a variável `start` numa quantidade fixa um número apropriado de vezes. -- Fazer o *download* de cada página de resultados de busca como um ficheiro HTML. -- Extrair os URLs de cada transcrição de julgamento (usando o ID do julgamento como descrito acima) de cada ficheiro HTML de resultados de busca. -- Percorrer essas URLs extraídas para baixar cada transcrição de julgamento e salvá-las num diretório no nosso computador. - -Perceberá que isso é razoavelmente similar às tarefas que realizamos em [Download de Páginas Web com Python](/pt/licoes/download-paginas-web-python) e [De HTML para Lista de Palavras (parte 2)](/pt/licoes/HTML-lista-palavras-2). Primeiro, fazemos o *download* e, então, analisamos as informações que procuramos. E, nesse caso, fazemos mais alguns *downloads*. - -## Fazendo o *Download* das Páginas de Resultados de Busca - -Primeiro, precisamos de gerar as URLs para fazer o download de cada página de resultados de busca. Já temos a primeira usando o formulário do próprio *website*. - -``` -https://www.oldbaileyonline.org/search.jsp?gen=1&form=searchHomePage&_divs_fulltext=mulatto*+negro*&kwparse=advanced&_divs_div0Type_div1Type=sessionsPaper_trialAccount&fromYear=1700&fromMonth=00&toYear=1750&toMonth=99&start=0&count=0 -``` - -Poderíamos escrever essa URL duas vezes e alterar a variável `start` para obter todas as 13 entradas, mas vamos escrever um programa que funcionaria independentemente de quantas páginas de resultados de busca ou registros precisássemos de fazer *download*, não importando o que decidíssemos investigar. 
Estude esse código e, depois, adicione essa função ao seu módulo chamado `obo.py` (crie um ficheiro com esse nome e armazene-o no diretório onde deseja trabalhar). Os comentários no código destinam-se a ajudá-lo a decifrar as várias partes. - -``` python -# obo.py -def getSearchResults(query, kwparse, fromYear, fromMonth, toYear, toMonth): - - import urllib.request - - startValue = 0 - - # cada parte do URL. Dividido para facilitar a leitura - url = 'https://www.oldbaileyonline.org/search.jsp?gen=1&form=searchHomePage&_divs_fulltext=' - url += query - url += '&kwparse=' + kwparse - url += '&_divs_div0Type_div1Type=sessionsPaper_trialAccount' - url += '&fromYear=' + fromYear - url += '&fromMonth=' + fromMonth - url += '&toYear=' + toYear - url += '&toMonth=' + toMonth - url += '&start=' + str(startValue) - url += '&count=0' - - # faz o download da página e armazena o resultado - response = urllib.request.urlopen(url) - webContent = response.read().decode('UTF-8') - filename = 'search-result' - f = open(filename + ".html", 'w') - f.write(webContent) - f.close -``` - -Nessa função, separamos os vários componentes da *Query String* e usamos Argumentos de Função para que a função possa ser reutilizada além dos nossos objetivos específicos atuais. Quando chamarmos por essa função, substituiremos os argumentos pelos valores que desejamos buscar. Depois, fazemos o *download* das páginas dos resultados de busca de maneira similiar a como foi feito em [Download de Páginas Web com Python](/pt/licoes/download-paginas-web-python). Agora, crie um novo ficheiro: `download-searches.py` e copie o código a seguir dentro dele. Observe: os valores que passamos como argumentos são exatamente os mesmos dos utilizados no exemplo acima. Sinta-se livre para testá-los para receber resultados diferentes ou ver como funcionam. 
- -``` python -#download-searches.py -import obo - -query = 'mulatto*+negro*' - -obo.getSearchResults(query, "advanced", "1700", "00", "1750", "99") -``` - -Quando executar esse código, deve encontrar um novo ficheiro: `search-result.html` no seu `diretório programming-historian` contendo a primeira página dos resultados de busca da sua investigação. Certifique-se de que o *download* foi realizado apropriadamente e apague o ficheiro. Vamos adaptar o nosso programa para fazer o *download* da outra página contendo as outras 3 entradas ao mesmo tempo, assim queremos ter certeza que obteremos as duas. Vamos refinar a nossa função `getSearchResults` adicionando outro argumento de função chamado `entries`, de modo a que possamos dizer ao programa quantas páginas de resultados de busca precisamos fazer o *download*. Usaremos o valor das entradas e matemática simples para determinar quantas páginas de resultado de busca existem. Isso é algo bastante direto uma vez que sabemos que há dez transcrições de julgamento listadas por página. Podemos calcular o número de páginas de resultados de busca dividindo o valor das entradas por 10. Armazenaremos esse resultado na variável chamada `pageCount`. Ela se parecerá com isso: - -``` python -# determina quantos ficheiros precisam ser baixados -pageCount = entries / 10 -``` - -No entanto, em casos em que o número de entradas não é um múltiplo de 10, isso resultará num número decimal. Pode testá-lo executando esse código no seu Terminal (Mac & Linux) / Linha de Comandos Python (Windows) e exibindo o valor mantido em `pageCount`. (Observe que, daqui em diante, usaremos a palavra Terminal para referir esse programa). - -``` python -entries = 13 -pageCount = entries / 10 -print(pageCount) --> 1.3 -``` - -Sabemos que a contagem do número de página deve ser 2 (uma página contendo as entradas 1-10 e uma página contendo as entradas 11-13). Uma vez que sempre queremos o maior inteiro mais próximo, podemos arredondar o resultado da divisão. 
- -``` python -# determina quantos ficheiros precisam ser baixados -import math -pageCount = entries / 10 -pageCount = math.ceil(pageCount) -``` - -Se adicionarmos isso à nossa função `getSearchResults` abaixo da linha `startValue = 0`, agora o código é capaz de calcular o número de páginas cujo *download* precisa de ser realizado. No entanto, nesta etapa ele irá fazer somente o *download* da primeira página, já que informamos à seção de *download* da função para executar somente uma vez. Para corrigir isso, podemos adicionar o código de *download* a um `for` *loop* que fará o *download* uma vez para cada número na variável `pageCount`. Caso ele leia 1, fará o *download* uma vez; caso ele leia 5, fará o *download* cinco vezes e assim por diante. Imediatamente após o cálculo de `pageCount` que acabou de escrever, adicione a linha a seguir e indente tudo até `f.close` (inclusive) com um espaçamento adicional de modo que tudo fique dentro do `for` *loop*: - -``` python -for pages in range(1, pageCount+1): - print(pages) -``` - -Uma vez que isso é um `for` *loop*, todo o código que desejamos executar repetidamente também precisa de ser indentado. Pode-se certificar de que fez isso corretamente verificando o código finalizado no exemplo abaixo. Esse *loop* aproveita a função [range](https://docs.python.org/3/tutorial/controlflow.html#the-range-function) do Python. Para entender esse `for` *loop* é melhor, provavelmente, pensar em `pageCount` igual a 2 como no exemplo. Portanto, essas duas linhas de código significam: comece a executar com um valor de *loop* inicial 1 e, a cada vez que executar, adicione uma unidade a esse valor. Quando o valor do *loop* é o mesmo de `pageCount`, executa mais uma vez e para. Isso é particularmente valioso porque significa que podemos dizer ao nosso programa para executar exatamente uma vez para cada página de resultados de busca e oferece uma nova habilidade flexível para controlar quantas vezes um `for` *loop* é executado. 
Caso deseje praticar essa nova e poderosa maneira de escrever *loops*, pode abrir o seu Terminal e brincar. - -``` python -pageCount = 2 -for pages in range(1, pageCount+1): - print(pages) - --> 1 --> 2 -``` - -Antes de adicionar todo esse código à nossa função `getSearchResults`, temos que fazer dois ajustes finais. No final do `for` *loop* (mas ainda dentro do *loop*) e depois que o nosso código de *download* for executado, precisamos de mudar nossa variável `startValue`, que é usada na construção da URL da página que desejamos fazer o *download*. Se nos esquecermos de fazer isso, o nosso programa fará repetidamente o *download* da primeira página de resultados de busca, já que não estamos verdadeiramente mudando nada na URL inicial. A variável `startValue`, como discutido acima, é o que controla em que página de resultados de busca desejamos fazer o *download*. Portanto, podemos solicitar a próxima página de resultados de busca incrementando o valor de `startValue` em 10 unidades depois que o *download* inicial for concluído. Caso não tenha certeza de onde adicionar essa linha, pode espiar adiante o código finalizado no exemplo abaixo. - -Finalmente, queremos garantir que os nomes dos ficheiros que fizemos o *download* são diferentes entre si. De outro modo, cada *download* será armazenado em cima do *download* anterior, deixando apenas um único ficheiro de resultados de busca. Para resolver isso, podemos ajustar os conteúdos da variável `filename` para incluir o valor armazenado em `startValue` de modo que a cada vez que fizermos o *download* de uma nova página, ela receba um nome diferente. Já que a variável `startValue` é um inteiro, precisaremos de convertê-la para uma string antes de adicioná-la à variável `filename`. 
Ajuste a linha no seu programa que pertence à variável `filename` para ficar assim: - -``` python -filename = 'search-result' + str(startValue) -``` -Agora deve ser capaz de adicionar essas novas linhas de código à sua função `getSearchResults`. Lembre-se de que fizemos as adições a seguir: - -- Adicionar `entries` como um argumento de função adicional logo depois de `toMonth` -- Calcular o número de páginas de resultados de pesquisa e adicionar isso imediatamente após a linha que começa com `startValue = 0` (antes de construirmos a URL e começarmos o *download*) -- Imediatamente após isso, adicione um `for` *loop* que informará ao programa para executar uma vez para cada página de resultados de busca, e indentar o resto do código de modo a que ele esteja dentro do novo *loop* -- A última linha no `for` *loop* deve agora incrementar o valor da variável `startValue` a cada vez que o *loop* é executado -- Ajustar a variável `filename` existente de modo que a cada vez que for feito o *download* de uma página de resultados de busca ela forneça um nome único ao ficheiro. - -A função finalizada no seu ficheiro `obo.py` deve-se parecer com isso: - -``` python -# cria URLs para páginas de resultados de busca e armazena os ficheiros. -def getSearchResults(query, kwparse, fromYear, fromMonth, toYear, toMonth, entries): - - import urllib.request, math - - startValue = 0 - - # isso é novo! determina quantos ficheiros precisam ser baixados. - pageCount = entries / 10 - pageCount = math.ceil(pageCount) - - # essa linha é nova! - for pages in range(1, pageCount +1): - - # cada parte do URL. Dividido para facilitar a leitura. 
- url = 'https://www.oldbaileyonline.org/search.jsp?gen=1&form=searchHomePage&_divs_fulltext=' - url += query - url += '&kwparse=' + kwparse - url += '&_divs_div0Type_div1Type=sessionsPaper_trialAccount' - url += '&fromYear=' + fromYear - url += '&fromMonth=' + fromMonth - url += '&toYear=' + toYear - url += '&toMonth=' + toMonth - url += '&start=' + str(startValue) - url += '&count=0' - - # faz o download da página e salva o resultado. - response = urllib.request.urlopen(url) - webContent = response.read().decode('UTF-8') - filename = 'search-result' + str(startValue) - f = open(filename + ".html", 'w') - f.write(webContent) - f.close - - # essa linha é nova! - startValue = startValue + 10 -``` - -Para executar essa nova função, adicione o argumento extra ao `download-searches.py` e execute o programa novamente: - -``` python -#download-searches.py -import obo - -query = 'mulatto*+negro*' - -obo.getSearchResults(query, "advanced", "1700", "00", "1750", "99", 13) -``` - -Ótimo! Agora temos as duas páginas de resultados de busca, chamadas `search-result0.html` e `search-result10.html`. Mas antes de seguirmos para o próximo passo do algoritmo, vamos cuidar de algumas "tarefas de organização". O nosso diretório `programming-historian` rapidamente se tornará difícil de controlar se fizermos o *download* de múltiplas páginas de resultados de busca e transcrições de julgamento. Vamos fazer com que o Python crie um novo diretório nomeado a partir dos nossos termos de busca. - -Desejamos adicionar essa nova funcionalidade em `getSearchResults`, de modo que os *downloads* das nossas páginas de resultados de busca sejam direcionadas a diretórios com o mesmo nome da nossa *query* de busca. Isso manterá o nosso diretório `programming-historian` mais organizado. Para fazê-lo, criaremos um novo diretório usando a biblioteca `os`, abreviação de "*operating system*" (sistema operacional). 
Essa biblioteca contém uma função chamada `makedirs` que, não surpreendentemente, cria um novo diretório. Pode testar usando o Terminal: - - -``` python -import os - -query = "meuNovoDiretório" -if not os.path.exists(query): - os.makedirs(query) -``` - -Esse programa irá verificar se o seu computador já possui um diretório com esse nome. Caso não possua, agora deve possuir um diretório chamado `meuNovoDiretório` no seu computador. Num Mac provavelmente está localizado no seu diretório `/Users/username/`, e no Windows deve ser capaz de encontrá-lo no diretório `Python` no seu computador, o mesmo no qual abriu o programa da linha de comandos. Se isso funcionou, pode deletar o diretório do seu disco rígido, já que isso foi só uma prática. Uma vez que desejamos criar um novo diretório nomeado a partir da *query* que inserimos no *Old Bailey Online website*, vamos usar diretamente esse argumento de função `query` da função `getSearchResults`. Para fazer isso, importe a biblioteca `os` após as outras e, depois, adicione o código que acabou de escrever imediatamente abaixo. A sua função `getSearchResults` deve agora se parecer com isso: - -``` python -# cria URLs para páginas de resultados de busca e armazena os ficheiros. -def getSearchResults(query, kwparse, fromYear, fromMonth, toYear, toMonth, entries): - - import urllib.request, math, os - - # Essa linha é nova! Cria um novo diretório. - if not os.path.exists(query): - os.makedirs(query) - - startValue = 0 - - # Determina quantos ficheiros precisam ser baixados. - pageCount = entries / 10 - pageCount = math.ceil(pageCount) - - for pages in range(1, pageCount +1): - - # cada parte do URL. Dividido para facilitar a leitura. 
- url = 'https://www.oldbaileyonline.org/search.jsp?gen=1&form=searchHomePage&_divs_fulltext=' - url += query - url += '&kwparse=' + kwparse - url += '&_divs_div0Type_div1Type=sessionsPaper_trialAccount' - url += '&fromYear=' + fromYear - url += '&fromMonth=' + fromMonth - url += '&toYear=' + toYear - url += '&toMonth=' + toMonth - url += '&start=' + str(startValue) - url += '&count=0' - - # faz o download da página e salva o resultado. - response = urllib.request.urlopen(url) - webContent = response.read().decode('UTF-8') - - # armazena o resultado num novo diretório. - filename = 'search-result' + str(startValue) - - f = open(filename + ".html", 'w') - f.write(webContent) - f.close - - startValue = startValue + 10 -``` - -O último passo para essa função é garantir que, quando salvarmos as nossas páginas de resultados de busca, as armazenaremos nesse novo diretório. Para fazer isso, podemos fazer um pequeno ajuste à variável `filename` de modo a que o ficheiro termine no lugar certo. Há muitas formas de o fazer e a mais fácil é simplesmente adicionar o nome do novo diretório mais uma barra no nome do ficheiro: - -``` python -filename = query + '/' + 'search-result' + str(startValue) -``` - -Caso o seu computador esteja executando o Windows, precisará de uma barra invertida em vez da barra do exemplo acima. Adicione a linha acima à sua função `getSearchResults` no lugar da descrição atual do `filename`. - -Se estiver executando o Windows, é provável que o seu programa `downloadSearches.py` falhe quando o executar porque está tentando criar um diretório com um \* nele. O Windows não gosta disso. Para resolver esse problema podemos usar [expressões regulares](https://docs.python.org/3/library/re.html) para remover qualquer caractere não compatível com o Windows. Usamos expressões regulares anteriormente em [Contagem de Frequências de Palavras com Python](/pt/licoes/contar-frequencias-palavras-python). 
Para remover caracteres não-alfanuméricos da *query*, primeiro importe a biblioteca de expressões regulares imediatamente após importar a biblioteca `os` e, depois, use a função `re.sub()` para criar uma nova string chamada `cleanQuery` que contém apenas caracteres alfanuméricos. Depois precisará de substituir `cleanQuery` como a variável usada nas declarações de `os.path.exists()`, `os.makedirs()` e `filename`. - -``` python -import urllib.request, math, os, re -cleanQuery = re.sub(r'\W+', '', query) -if not os.path.exists(cleanQuery): - os.makedirs(cleanQuery) - -... - -filename = cleanQuery + '/' + 'search-result' + str(startValue) -``` - -A versão final da sua função deve-se parecer com isso: - -``` python -# cria URLs para páginas de resultados de busca e armazena os ficheiros. -def getSearchResults(query, kwparse, fromYear, fromMonth, toYear, toMonth, entries): - - import urllib.request, math, os, re - - cleanQuery = re.sub(r'\W+', '', query) - if not os.path.exists(cleanQuery): - os.makedirs(cleanQuery) - - startValue = 0 - - # Determina quantos ficheiros precisam ser baixados - pageCount = entries / 10 - pageCount = math.ceil(pageCount) - - for pages in range(1, pageCount +1): - - # cada parte do URL. Dividido para facilitar a leitura. - url = 'https://www.oldbaileyonline.org/search.jsp?gen=1&form=searchHomePage&_divs_fulltext=' - url += query - url += '&kwparse=' + kwparse - url += '&_divs_div0Type_div1Type=sessionsPaper_trialAccount' - url += '&fromYear=' + fromYear - url += '&fromMonth=' + fromMonth - url += '&toYear=' + toYear - url += '&toMonth=' + toMonth - url += '&start=' + str(startValue) - url += '&count=0' - - # faz o download da página e salva o resultado. 
- response = urllib.request.urlopen(url) - webContent = response.read().decode('UTF-8') - filename = cleanQuery + '/' + 'search-result' + str(startValue) - f = open(filename + ".html", 'w') - f.write(webContent) - f.close - - startValue = startValue + 10 -``` - -Dessa vez dizemos ao programa para fazer o *download* dos julgamentos e armazená-los num novo diretório ao invés do nosso diretório `programming-historian`. Execute o programa `download-searches.py` mais uma vez para se certificar de que ele funcionou e que entendeu como armazenar os ficheiros num diretório particular usando Python. - -### Fazendo o *Download* das Entradas de Julgamento Individuais - -A este ponto, criamos uma função que é capaz de fazer o *download* de todos os ficheiros HTML de resultados de busca a partir do website *Old Bailey Online* para uma busca avançada que definimos e desenvolvemos de forma programática. Agora o próximo passo do algoritmo: extrair as URLs de cada transcrição de julgamento dos ficheiros HTML de resultados de busca. Nas lições que precedem esta (ex.: [Download de Páginas Web com Python](/pt/licoes/download-paginas-web-python)), trabalhamos com as versões para exibição das transcrições dos julgamentos e continuaremos a fazer isso. Sabemos que a versão de exibição do julgamento de Benjamin Bowsey está localizada na URL: - -``` -http://www.oldbaileyonline.org/print.jsp?div=t17800628-33 -``` - -Da mesma forma que alterar as *query strings* nas URLs gera resultados de busca diferentes, alterar a URL dos registros de julgamento - no caso, substituir um ID de julgamento por outro - nos fará obter a transcrição para aquele novo julgamento. Isso significa que, para encontrar e fazer o *download* dos 13 ficheiros que buscamos, tudo o que precisamos são esses IDs de julgamento. Uma vez que sabemos que essas páginas de resultados de busca geralmente contém um *link* para as páginas descritas, há uma boa chance de que consigamos encontrar esses *links* integrados ao código HTML. 
Se formos capazes de raspar essa informação das páginas de resultados de busca em que fizemos *download*, podemos então usar essa informação para gerar uma URL que nos permitirá fazer o *download* de cada transcrição de julgamento. Essa é uma técnica que irá utilizar para a maioria das páginas de resultados de busca, não só o *Old Bailey Online*! Para fazer isso, primeiro precisamos encontrar onde os IDs de julgamento estão no código HTML dos ficheiros que fizemos o *download* e, depois, determinar uma maneira de isolá-los consistentemente usando código de modo a que, independentemente de qual página de resultado de busca fizermos o *download*, sejamos capazes de encontrar as transcrições de julgamento. Primeiro, abra `search-result0.html` no Komodo Edit e dê uma olhada na lista de julgamentos. A primeira entrada começa com "Anne Smith", então pode usar o recurso `find` no Komodo Edit para pular imediatamente para o lugar certo. Observe que o nome de Anne faz parte de um *link*: - -``` -browse.jsp?id=t17160113-18&div=t17160113-18&terms=mulatto*_negro*#highlight -``` - -Perfeito, o *link* contém o ID do julgamento! Percorra as entradas restantes e verá que isso é verdade em todos os casos. Para nossa sorte, o *site* é bem formatado e parece que cada *link* começa com `browse.jsp?id=` seguido pelo ID do julgamento e termina com um `&`, no caso de Anne: `browse.jsp?id=t17160113-18&`. Podemos escrever algumas linhas de código que sejam capazes de isolar esses IDs. Veja a função a seguir. Essa função também usa a biblioteca `os`, nesse caso para listar todos os ficheiros localizados no diretório criado na seção anterior. A biblioteca `os` possui uma gama de funções úteis que imitam os tipos de tarefas que esperaria ser capaz de fazer com o seu mouse no Mac Finder ou Windows, como abrir, fechar, criar, deletar e mover ficheiros e diretórios, e é uma boa biblioteca a ser dominada - ou pelo menos para se familiarizar. 
- -``` python -def getIndivTrials(query): - import os, re - - cleanQuery = re.sub(r'\W+', '', query) - searchResults = os.listdir(cleanQuery) - - print(searchResults) -``` - -Crie e execute um novo programa chamado `extract-trial-ids.py` com o código a seguir. Certifique-se de inserir o mesmo valor nos argumentos da *query* como fez no exemplo anterior: - -``` python -import obo - -obo.getIndivTrials("mulatto*+negro*") -``` - -Se tudo correu bem, deve ver uma lista contendo o nome de todos os ficheiros no seu novo diretório `mulattonegro`, que a essa altura devem ser as duas páginas de resultados de busca. Certifique-se de que isso funcionou antes de prosseguir. Uma vez que armazenamos todas as páginas de resultados de busca com um nome de ficheiro que inclui `search-result`, agora desejamos abrir todos os ficheiros cujo nome contenha `search-result` e extrair todos os IDs de julgamento encontrados neles. Nesse caso sabemos que temos 2, mas desejamos que o nosso código seja o mais reutilizável possível (com razão, é claro!). Restringir essa ação a ficheiros denominados `search-result` significará que este programa funcionará como pretendido, mesmo que o diretório contenha muitos outros ficheiros não relacionados, já que o programa ignorará qualquer coisa com nome diferente. - -Adicione o código a seguir à sua função `getIndivTrials()`, que verificará se cada ficheiro contém `search-result` no seu nome. Em caso verdadeiro, o ficheiro será aberto e o conteúdo será salvo na variável chamada `text`. Essa variável `text` será analisada na busca por um ID de julgamento, que sabemos que sempre segue `browse.jsp?id=`. Se e quando o ID de julgamento for encontrado, ele será armazenado numa lista e exibido na Saída de Comando, que nos deixa com todas as informações que precisamos para então escrever o programa que fará o *download* dos julgamentos desejados. 
- -``` python -def getIndivTrials(query): - import os, re - - cleanQuery = re.sub(r'\W+', '', query) - searchResults = os.listdir(cleanQuery) - - urls = [] - - # encontra as páginas de resultados de busca. - for files in searchResults: - if files.find("search-result") != -1: - f = open(cleanQuery + "/" + files, 'r') - text = f.read().split(" ") - f.close() - - # busca os IDs de julgamento. - for words in text: - if words.find("browse.jsp?id=") != -1: - # isola o ID - urls.append(words[words.find("id=") +3: words.find("&")]) - - print(urls) -``` - -Essa última linha do `for` *loop* pode parecer confusa, mas certifique-se de que entendeu antes de seguir em frente. A variável `words` é verificada para saber se contém os caracteres `id=` (sem aspas), que obviamente se referem a um ID específico de transcrição de julgamento. Caso contenha, usamos o método de string `slice` para capturar apenas o trecho entre `id=` e `&` e o adicionamos à lista de url. Se soubéssemos as posições exatas dos índices dessa substring, poderíamos ter usado esses valores numéricos no lugar. No entanto, ao utilizar o método de string `find()`, criamos um programa muito mais flexível. O código a seguir faz exatamente a mesma coisa que essa última linha, mas de maneira menos condensada: - -``` python -idStart = words.find("id=") + 3 -idEnd = words.find("&") -trialID = words[idStart: idEnd] - -urls.append(trialID) -``` - -Ao executar novamente o programa `extract-trial-ids.py`, deve ver uma lista de todos os IDs de julgamento. Podemos adicionar algumas linhas extra para transformá-los em URLs propriamente ditos e fazer o *download* de toda a lista para o nosso novo diretório. Também vamos usar a biblioteca `time` para pausar o nosso programa por 3 segundos entre cada *download* - uma técnica chamada *throttling* (em português, estrangulamento). 
É considerada uma boa forma de não sobrecarregar o servidor de alguém com muitas solicitações por segundo; e o pequeno retardamento torna mais fácil que todos esses ficheiros sejam, de fato, baixados ao invés de ocorrer um [time out](https://en.wikipedia.org/wiki/Timeout_(computing)). Adicione o código a seguir ao final da sua função `getIndivTrials()`. Esse código vai gerar uma URL para cada página individualmente, fará o *download* da página no seu computador, irá colocá-lo no seu diretório, armazenar o ficheiro e pausar por 3 segundos antes de continuar para o próximo julgamento. Todo esse trabalho está contido num `for` *loop* e será executado uma vez para cada julgamento na sua lista de urls. - - -``` python -def getIndivTrials(query): - #... - import urllib.request, time - - # importa funções python built-in para criar caminhos de ficheiro. - from os.path import join as pjoin - - for items in urls: - # gera a URL. - url = "http://www.oldbaileyonline.org/print.jsp?div=" + items - - # faz o download da página. - response = urllib.request.urlopen(url) - webContent = response.read().decode('UTF-8') - - # cria o nome do ficheiro e coloca-o no novo diretório. - filename = items + '.html' - filePath = pjoin(cleanQuery, filename) - - # armazena o ficheiro. - f = open(filePath, 'w') - f.write(webContent) - f.close - - # pausa por 3 segundos. - time.sleep(3) -``` - -Se unirmos tudo numa única função, ela deve-se parecer com isso (note que adicionamos todas as chamadas por `import` no início para manter as coisas claras): - -``` python -def getIndivTrials(query): - import os, re, urllib.request, time - - # importa funções python built-in para criar caminhos de ficheiro. - from os.path import join as pjoin - - cleanQuery = re.sub(r'\W+', '', query) - searchResults = os.listdir(cleanQuery) - - urls = [] - - # encontra páginas de resultados de busca. 
- for files in searchResults: - if files.find("search-result") != -1: - f = open(cleanQuery + "/" + files, 'r') - text = f.read().split(" ") - f.close() - - # busca por IDs de julgamento. - for words in text: - if words.find("browse.jsp?id=") != -1: - # isola o id - urls.append(words[words.find("id=") +3: words.find("&")]) - - # novo daqui em diante! - for items in urls: - # gera o URL - url = "http://www.oldbaileyonline.org/print.jsp?div=" + items - - # faz o download da página. - response = urllib.request.urlopen(url) - webContent = response.read().decode('UTF-8') - - # cria o nome do ficheiro e coloca-o no novo diretório. - filename = items + '.html' - filePath = pjoin(cleanQuery, filename) - - # armazena o ficheiro. - f = open(filePath, 'w') - f.write(webContent) - f.close - - # pausa por 3 segundos. - time.sleep(3) -``` - -Vamos adicionar a mesma pausa de três segundos à nossa função `getSearchResults` para ser amigável aos *servers* do *Old Bailey Online*: - -``` python -# cria URLs para páginas de resultados de busca e armazena os ficheiros. -def getSearchResults(query, kwparse, fromYear, fromMonth, toYear, toMonth, entries): - - import urllib.request, math, os, re, time - - cleanQuery = re.sub(r'\W+', '', query) - if not os.path.exists(cleanQuery): - os.makedirs(cleanQuery) - - startValue = 0 - - # Determina quantos ficheiros precisam de ser baixados. - pageCount = entries / 10 - pageCount = math.ceil(pageCount) - - for pages in range(1, pageCount +1): - - # cada parte da URL. Dividida para facilitar a leitura. - url = 'https://www.oldbaileyonline.org/search.jsp?gen=1&form=searchHomePage&_divs_fulltext=' - url += query - url += '&kwparse=' + kwparse - url += '&_divs_div0Type_div1Type=sessionsPaper_trialAccount' - url += '&fromYear=' + fromYear - url += '&fromMonth=' + fromMonth - url += '&toYear=' + toYear - url += '&toMonth=' + toMonth - url += '&start=' + str(startValue) - url += '&count=0' - - # faz o download da página e armazena o resultado. 
- response = urllib.request.urlopen(url) - webContent = response.read().decode('UTF-8') - filename = cleanQuery + '/' + 'search-result' + str(startValue) - f = open(filename + ".html", 'w') - f.write(webContent) - f.close - - startValue = startValue + 10 - - # pausa por 3 segundos. - time.sleep(3) -``` - -Finalmente, chame a função no programa `download-searches.py`: - -``` python -#download-searches.py -import obo - -query = 'mulatto*+negro*' - -obo.getSearchResults(query, "advanced", "1700", "00", "1750", "99", 13) - -obo.getIndivTrials(query) -``` - -Agora criou um programa que é capaz de fazer a solicitação e o *download* de ficheiros do *Old Bailey website*, baseado em parâmetros de busca que definiu, tudo sem visitar o *site*! - -### No Caso de um Ficheiro Não Ser Baixado - -Verifique se o *download* dos treze ficheiros foi realizado corretamente. Se esse for o caso, ótimo! No entanto, há a possibilidade de que esse programa tenha parado no meio do caminho. Isso porque o nosso programa, ao ser executado na nossa máquina, depende de dois fatores além do nosso controle imediato: a velocidade da internet e a o tempo de resposta do *server* do *Old Bailey Online* naquele momento. Uma coisa é pedir que o Python faça o *download* de um único ficheiro, mas quando começamos a solicitar um ficheiro a cada três segundos, há grandes chances de ocorrer um *time out* no *server* ou que ele falhe em nos enviar o ficheiro que estamos buscando. - -Se estivermos usando um navegador *web* para fazer essas solicitações, eventualmente receberíamos uma mensagem de que "a conexão expirou" ou algo do tipo. Todos nós vemos isso de tempos em tempos. No entanto, o nosso programa não foi desenvolvido para lidar ou retransmitir essas mensagens de erro, então só perceberá o problema quando o programa não tiver retornado o número esperado de ficheiros ou simplesmente não fizer nada. 
Para evitar frustrações e incertezas, queremos um sistema à prova de falha no nosso programa, que tentará baixar cada julgamento. Se por alguma razão ele falhar, apontaremos o problema e passaremos para o próximo julgamento. - -Para fazer isso, utilizaremos os mecanismos para lidar com erros do Python, [try / except](https://docs.python.org/3/tutorial/errors.html), bem como uma nova biblioteca: `socket`. `Try` e `Except` são muito parecidos com um `if / else` *statement*. Quando solicita que o Python `try` (em português, tente) algo, ele tentará executar o código; caso o código falhe em alcançar o que definiu, ele executará o código em `except` (em português, exceção). Isso é frequentemente usado ao lidar com erros, conhecido como “error handling”. Podemos usá-lo a nosso favor dizendo ao programa para tentar fazer o *download* de uma página. Caso o programa falhe, solicitaremos que ele nos informe qual ficheiro falhou e depois prossiga. Para fazer isso precisamos de usar a biblioteca `socket`, que nos permitirá definir um limite de tempo para um *download* antes de seguir em frente. Isso envolve alterar a função `getIndivTrials`. - -Primeiro, precisamos de carregar a biblioteca `socket`, o que deve ser feito da mesma forma que todas as outras importações de biblioteca. Depois, precisamos de importar a biblioteca `urllib.error`, que nos permite lidar com erros de *download*. Também precisamos de definir o tamanho do *timeout* padrão do *socket* - por quanto tempo desejamos tentar fazer o *download* de uma página antes de desistirmos. Isso deve entrar imediatamente após o comentário que começa com `# faz o download da página`: - - -``` python - import os, re, urllib.request, urllib.error, time, socket - - #... - # faz o download da página. - socket.setdefaulttimeout(10) -``` - -Então, precisamos de uma nova lista de Python que armazenará todas as urls cujo *download* falhou. 
Vamos chamá-la de `failedAttempts` e pode inserí-la imediatamente após as instruções de importação: - - -``` python -failedAttempts = [] -``` - -Finalmente, podemos adicionar o `try / except` *statement* de forma muito similar a como um `if / else` *statement* seria adicionado. Nesse caso, vamos colocar todo o código desenvolvido para fazer o *download* e armazenar os julgamentos no `try` *statement*, e no `except` *statement* vamos dizer ao programa o que desejamos que ele faça caso falhe. Aqui, vamos adicionar a url cujo *download* falhou à nossa nova lista, `failedAttempts`: - -``` python -#... - - socket.setdefaulttimeout(10) - - try: - response = urllib2.urlopen(url) - webContent = response.read().decode('UTF-8') - - # cria o nome de ficheiro e coloca-o no novo diretório "trials". - filename = items + '.html' - filePath = pjoin(newDir, filename) - - # armazena o ficheiro. - f = open(filePath, 'w') - f.write(webContent) - f.close - except urllib.error.URLError: - failedAttempts.append(url) -``` - -Finalmente, diremos ao programa para exibir os conteúdos da lista na Saída de Comando de modo que saibamos quais ficheiros falharam no *download*. Isso deve ser adicionado nas linhas finais da função: - -``` python -print("failed to download: " + str(failedAttempts)) -``` - -Agora ao executarmos o programa, caso haja algum problema no *download* de um ficheiro específico, receberá uma mensagem na janela de Saída de Comando do Komodo Edit. Essa mensagem irá conter quaisquer URLs dos ficheiros que falharam no *download*. Caso haja apenas um ou dois, provavelmente é mais fácil simplesmente visitar as páginas manualmente e usar o recurso de "Salvar Como" do seu navegador. Caso se esteja sentindo aventureiro, poderia modificar o programa para automaticamente fazer o *download* dos ficheiros faltantes. 
A versão final das suas funções `getSearchResults()` e `getIndivTrials()` deve-se parecer com isso: - -``` python -# cria URLs para páginas de resultados de busca e armazena os ficheiros. -def getSearchResults(query, kwparse, fromYear, fromMonth, toYear, toMonth, entries): - - import urllib.request, math, os, re, time - - cleanQuery = re.sub(r'\W+', '', query) - if not os.path.exists(cleanQuery): - os.makedirs(cleanQuery) - - startValue = 0 - - # determina quantos ficheiros precisam de ser baixados. - pageCount = entries / 10 - pageCount = math.ceil(pageCount) - - for pages in range(1, pageCount +1): - - # cada parte da URL. Dividida para facilitar a leitura. - url = 'https://www.oldbaileyonline.org/search.jsp?gen=1&form=searchHomePage&_divs_fulltext=' - url += query - url += '&kwparse=' + kwparse - url += '&_divs_div0Type_div1Type=sessionsPaper_trialAccount' - url += '&fromYear=' + fromYear - url += '&fromMonth=' + fromMonth - url += '&toYear=' + toYear - url += '&toMonth=' + toMonth - url += '&start=' + str(startValue) - url += '&count=0' - - # faz o download da página e salva o resultado. - response = urllib.request.urlopen(url) - webContent = response.read().decode('UTF-8') - filename = cleanQuery + '/' + 'search-result' + str(startValue) - f = open(filename + ".html", 'w') - f.write(webContent) - f.close - - startValue = startValue + 10 - - # pausa por 3 segundos. - time.sleep(3) - -def getIndivTrials(query): - import os, re, urllib.request, urllib.error, time, socket - - failedAttempts = [] - - # importa funções python built-in para criar caminhos de ficheiro. - from os.path import join as pjoin - - cleanQuery = re.sub(r'\W+', '', query) - searchResults = os.listdir(cleanQuery) - - urls = [] - - # encontra páginas de resultados de busca. - for files in searchResults: - if files.find("search-result") != -1: - f = open(cleanQuery + "/" + files, 'r') - text = f.read().split(" ") - f.close() - - # busca por IDs de julgamento. 
- for words in text: - if words.find("browse.jsp?id=") != -1: - #isolate the id - urls.append(words[words.find("id=") +3: words.find("&")]) - - for items in urls: - # gera a URL. - url = "http://www.oldbaileyonline.org/print.jsp?div=" + items - - # faz o download da página. - socket.setdefaulttimeout(10) - try: - response = urllib.request.urlopen(url) - webContent = response.read().decode('UTF-8') - - # cria o nome do ficheiro e coloca-o no novo diretório. - filename = items + '.html' - filePath = pjoin(cleanQuery, filename) - - # armazena o ficheiro. - f = open(filePath, 'w') - f.write(webContent) - f.close - except urllib.error.URLError: - failedAttempts.append(url) - - # pausa por 3 segundos. - time.sleep(3) - - print("failed to download: " + str(failedAttempts)) -``` - -## Leituras Adicionais - -Para usuários mais avançados, ou para se tornar um usuário mais avançado, pode achar que vale a pena ler sobre como alcançar esse mesmo processo usando Interfaces de Programação de Aplicações (API). Geralmente, um *website* com uma API dá instruções de como solicitar certos documentos. É um processo bastante similar ao que acabamos de fazer interpretando a *Query String* de URL, mas sem o trabalho de investigação adicional necessário para decifrar o que cada variável faz. Caso esteja interessado no *Old Bailey Online*, recentemente liberaram uma API e a documentação pode ajudar bastante: - -- Old Bailey Online API () -- Melhor maneira de criar um diretório para gravação de ficheiros, se ele não existir, usando Python? 
() +--- +title: Download de Múltiplos Registros usando Query Strings +layout: lesson +collection: lessons +slug: download-multiplos-registros-query-strings +date: 2012-11-11 +translation_date: 2022-11-25 +authors: +- Adam Crymble +reviewers: +- Luke Bergmann +- Sharon Howard +- Frederik Elwert +editors: +- Fred Gibbs +translator: +- Felipe Lamarca +translation-editor: +- Jimmy Medeiros +translation-reviewer: +- André Salvo +- Aracele Torres +difficulty: 2 +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/465 +activity: acquiring +topics: [web-scraping] +abstract: "Fazer o download de um único registro de um website é fácil, mas fazer o download de vários registros de uma vez - uma necessidade cada vez mais frequente para um historiador - é muito mais eficiente usando uma linguagem de programação como o Python. Nessa lição, escreveremos um programa que fará o download de uma série de registros do Old Bailey Online usando critérios de busca personalizados e irá armazená-los num diretório no nosso computador." +redirect_from: /licoes/download-de-multiplos-registros-usando-query-strings/ +original: downloading-multiple-records-using-query-strings +avatar_alt: Figuras trabalhando numa mina, empurrando carrinhos +doi: 10.46430/phpt0034 +--- + +{% include toc.html %} + +
    +O site do Old Bailey Online foi recentemente atualizado. Infelizmente, devido às diversas mudanças, muitos (se não todos) os elementos do site de exemplo usado nesta lição não funcionarão conforme descrito. No entanto, as metodologias ensinadas por esta lição permanecem relevantes e podem ser adaptadas pelos leitores para um site de exemplo diferente. Estamos trabalhando na adaptação da lição para o novo site do Old Bailey Online, mas ainda não temos cronograma preciso de quando a lição será atualizada. [Abril de 2024] +
    + +## Objetivos do Módulo + +Fazer o *download* de um único registro de um website é fácil, mas fazer o *download* de vários registros de uma vez - uma necessidade cada vez mais frequente para um historiador - é muito mais eficiente usando uma linguagem de programação como o Python. Nesta lição, escreveremos um programa que fará o *download* de uma série de registros do *[Old Bailey Online](https://www.oldbaileyonline.org/)* usando critérios de investigação personalizados e irá armazená-los num diretório no nosso computador. Esse processo envolve interpretar e manipular *Query Strings* de URL. Nesse caso, o tutorial buscará fazer o *download* de fontes que contenham referências a afrodescendentes que foram publicadas no *Old Bailey Proceedings* entre 1700 e 1750. + +
    +Os exemplos nessa lição incluem linguagem histórica racializada que os leitores podem achar ofensiva. O autor não tolera o uso dessa linguagem, mas tentou usá-la no seu contexto histórico, reconhecendo que, de outra forma, é impossível encontrar os materiais desejados do estudo de caso. Qualquer pessoa que ensine com este material é aconselhada a adotar uma abordagem sensível em relação à linguagem e a aplicar as boas práticas ao ensinar sobre raça. O autor recomenda os muitos recursos do Teaching Tolerance; Peggy McIntosh, ‘White Privilege: Unpacking the Invisible Knapsack’, Peace and Freedom Magazine, (1989), 10-12; Binyavanga Wainaina, ‘How to Write About Africa’, Granta (92): 2006. +
    + +## Para Quem isso é Útil? + +Automatizar o processo de *download* de registros de uma base de dados *online* será útil para qualquer um que trabalhe com fontes históricas armazenadas *online* de forma ordenada e acessível e que deseje salvar cópias dessas fontes no seu próprio computador. É particularmente útil para alguém que deseja fazer o *download* de vários registros específicos, em vez de apenas um punhado. Caso deseje fazer o *download* de *todos* ou da *maioria* dos registros de uma base de dados em particular, pode achar o tutorial de Ian Milligan sobre [Automated Downloading with WGET](/en/lessons/automated-downloading-with-wget) mais adequado. + +O presente tutorial permitirá que faça *download* de forma isolada e discriminada de registros específicos que atendam às suas necessidades. Fazer o *download* de múltiplas fontes de forma automática economiza um tempo considerável. O que faz com as fontes baixadas depende dos seus objetivos de investigação. Pode desejar criar visualizações ou realizar uma série de métodos de análise de dados, ou simplesmente reformatá-las para facilitar a navegação. Ou pode desejar apenas manter uma cópia de *backup* para poder acessá-las sem acesso à internet. + +Essa lição é voltada para usuários de Python com nível intermediário. Caso ainda não tenha tentado as lições do [Básico de Programação em Python](/pt/licoes/introducao-instalacao-python), pode achá-las um ponto de partida útil. + +## Aplicando nosso Conhecimento Histórico + +Nesta lição, estamos tentando criar o nosso próprio corpus de casos relacionados com pessoas afrodescendentes. A partir do [caso de Benjamin Bowsey](https://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33) no *Old Bailey* em 1780, podemos notar que "*black*" pode ser uma palavra-chave útil para usarmos para localizar outros casos envolvendo réus de ascendência africana. 
No entanto, quando buscamos por *black* no *website* do *Old Bailey*, percebemos que esta palavra às vezes se refere a outros usos: *black horses* ou *black cloth*. A tarefa de desambiguar esse uso da linguagem terá que esperar por outra lição. Por enquanto, vamos nos voltar para casos mais fáceis. Como historiadores, provavelmente, podemos pensar em palavras-chave de termos historicamente racializados relacionados com afrodescendentes as quais valeria a pena buscar. A infame "*n-word*", é claro, não é útil, já que esse termo não era comumente utilizado até meados do século XIX. Outras expressões racializadas como "*negro*" e "*mulatto*" são, porém, muito mais relevantes para o início do século XVIII. Essas palavras-chave são menos ambíguas do que "*black*" e são muito mais propensas a serem referências imediatas a pessoas no nosso público-alvo. Se testarmos esses dois termos em buscas separadas simples no *Old Bailey website*, temos resultados como nessa captura de tela: + +{% include figure.html filename="SearchResultsNegro.png" caption="Resultados de investigação para 'negro' no *Old Bailey Online*" %} + +{% include figure.html filename="SearchResultsMulatto.png" caption="Resultados de investigação para 'mulatto' no *Old Bailey Online*" %} + +Depois de examinar estes resultados de busca, parece evidente que são referências a pessoas e não a cavalos, panos ou qualquer outra coisa que seja preta. Desejamos fazer o *download* de todas para usar na nossa análise. Poderíamos, é claro, fazer o *download* de uma por uma manualmente. Mas vamos encontrar uma maneira programática de automatizar essa tarefa. + +## A Investigação Avançada no OBO + +As ferramentas de pesquisa de cada *site* funcionam de maneira diferente. Embora as pesquisas funcionem de forma semelhante, as complexidades das pesquisas numa base de dados podem não ser totalmente óbvias. 
Portanto, é importante pensar criticamente sobre as opções de busca de uma base de dados e, quando disponível, ler a documentação fornecida pelo *website*. Investigadores de história prudentes sempre interrogam suas fontes; os procedimentos por trás das suas caixas de pesquisa devem receber a mesma atenção. O [formulário de busca avançada](https://www.oldbaileyonline.org/forms/formMain.jsp) do *Old Bailey Online* permite refinar as suas buscas com base em dez campos diferentes, incluindo palavras-chave simples, um intervalo de datas e um tipo de crime. Como as ferramentas de busca de cada *website* são diferentes, vale sempre a pena reservar um momento ou dois para testar e ler a respeito das opções de investigação disponíveis. Uma vez que já fizemos buscas simples por "*negro*" e "*mulatto*", sabemos que haverá resultados. No entanto, vamos usar a busca avançada para limitar os nossos resultados aos registros publicados no *Old Bailey Proceedings* que dizem respeito a julgamentos apenas de 1700 até 1750. É claro que pode alterá-lo para o que desejar, mas isso tornará o exemplo mais simples de ser acompanhado. Faça a busca mostrada na imagem abaixo. Certifique-se de que marcou o botão "*Advanced*" e incluiu as *wildcards* `*` para incluir entradas pluralizadas ou com um "e" extra no final. + +{% include figure.html filename="AdvancedSearchExample.png" caption="Exemplo de Busca Avançada no *Old Bailey*" %} + +Execute a busca e depois clique no *link* "*Calculate Total*" para ver quantas entradas existem. Agora temos 13 resultados (caso tenha um número diferente, volte e certifique-se de que copiou o exemplo acima da forma exata). O que queremos fazer neste ponto é o *download* de todos esses ficheiros de julgamento e analisá-los mais profundamente. Mais uma vez, para apenas 13 registros, também pode fazer o *download* de cada registro manualmente.
Mas à medida que mais e mais dados são disponibilizados *online*, torna-se mais comum a necessidade de baixar 1.300 ou até 130.000 registros, caso no qual o *download* individual dos registros se torna impraticável e entender como automatizar o processo se torna muito valioso. Para automatizar o processo, precisamos de dar um passo atrás e lembrar como as URLs de busca são criadas no *Old Bailey website*, um método comum para muitas bases de dados *online* e *websites*. + +## Entendendo *Queries* de URL + +Observe a URL produzida com a última página de resultado de busca. Ela deve se parecer com isso: + +``` +https://www.oldbaileyonline.org/search.jsp?gen=1&form=searchHomePage&_divs_fulltext=mulatto*+negro*&kwparse=advanced&_divs_div0Type_div1Type=sessionsPaper_trialAccount&fromYear=1700&fromMonth=00&toYear=1750&toMonth=99&start=0&count=0 +``` + +Vimos sobre URLs em [Noções básicas de páginas web e HTML](/pt/licoes/nocoes-basicas-paginas-web-html), mas isso parece muito mais complexo. Ainda que mais longo, *não* é verdadeiramente muito mais complexo. Mas é mais fácil de entender observando como os nossos critérios de busca são representados na URL. + +``` +https://www.oldbaileyonline.org/search.jsp +?gen=1 +&form=searchHomePage +&_divs_fulltext=mulatto*+negro* +&kwparse=advanced +&_divs_div0Type_div1Type=sessionsPaper_trialAccount +&fromYear=1700 +&fromMonth=00 +&toYear=1750 +&toMonth=99 +&start=0 +&count=0 +``` + +Nessa visão, vemos com mais clareza as 12 informações importantes que precisamos para realizar a nossa busca (uma por linha). Na primeira há a URL base do *Old Bailey website*, seguida por uma query "?" (não se preocupe com o *bit* `gen=1`; os desenvolvedores do *Old Bailey Online* dizem que ele não faz nada) e uma série de 10 pares *nome/valor* unidos por caracteres `&`. Juntos, esses 10 pares de nome/valor compõem a *query string* (expressão de busca), que informa ao mecanismo de busca quais variáveis usar em etapas específicas da investigação. 
Observe que cada par nome/valor contém um nome de variável: `toYear` e, em seguida, atribui a essa variável um valor: `1750`. Isso funciona exatamente da mesma forma que os *Argumentos de Função*, passando certas informações para variáveis específicas. Nesse caso, a variável mais importante é `_divs_fulltext=`, para a qual foi dado o valor: + +``` +mulatto*+negro* +``` + +Esta contém o termo que digitamos na caixa de busca. O programa adicionou automaticamente um sinal de soma `+` no lugar de um espaço em branco (URLs não podem conter espaçamentos); dito de outro modo, isso é exatamente o que pedimos que o *site* do *Old Bailey* encontrasse. As outras variáveis carregam valores que nós também definimos. `fromYear` e `toYear` contêm o nosso intervalo de datas. Já que nenhum ano possui 99 meses, como sugerido na variável `toMonth`, podemos assumir que esse seja o modo através do qual o algoritmo garante que todos os registros daquele ano são incluídos. Não há regras fixas para descobrir o que cada variável faz, porque a pessoa que criou o site as nomeou. Muitas vezes pode fazer uma suposição razoável. Todos os campos de busca possíveis na página de busca avançada possuem os seus próprios pares nome/valor. Caso deseje descobrir o nome da variável de modo a que possa utilizá-la, faça uma nova busca e certifique-se de colocar um valor no campo no qual está interessado. Após submeter a sua busca, verá o seu valor e o nome associado a ele como parte da URL da página dos resultados de busca. Com o *Old Bailey Online*, assim como noutros *websites*, o formulário de busca (avançada ou não) ajuda, essencialmente, a construir URLs que informam à base de dados o que está buscando. Se puder entender como os campos de busca estão representados no URL - o que geralmente é algo bem direto -, então torna-se relativamente simples construir esses URLs programaticamente e automatizar o processo de *download* de registros.
+ +Agora tente alterar o `start=0` para `start=10` e pressione `enter`. Deve agora ter os resultados 11-13. A variável `start` informa ao *website* qual a entrada que deve ser mostrada no início da lista de resultados de busca. Nós devemos ser capazes de utilizar esse conhecimento para criar uma série de URLs que nos permitirão fazer o *download* de todos os 13 ficheiros. Vamos nos voltar para isso agora. + +## Fazendo o *Download* de Ficheiros Sistematicamente + +Na lição [Download de Páginas Web com Python](/pt/licoes/download-paginas-web-python), aprendemos que o Python pode fazer o *download* de uma página web desde que tenhamos a URL. Naquela lição, usamos a URL para fazer o *download* da transcrição do julgamento de Benjamin Bowsey. Nesse caso, estamos tentando fazer o *download* de múltiplas transcrições de julgamentos que atendem aos critérios de busca descritos acima sem precisar executar o programa repetidamente. Ao invés disso, queremos um programa que faça o *download* de tudo de uma vez. Neste ponto, temos a URL para a página de resultados de busca que contém as 10 primeiras entradas na nossa investigação. Também sabemos que ao mudarmos o valor de `start` na URL, podemos sequencialmente chamar cada uma das páginas de resultados de busca e finalmente recuperar todos os ficheiros de julgamento que elas possuem. É claro que os resultados de busca não nos oferecem os ficheiros do julgamento em si, mas apenas *links* para eles. Então precisamos de extrair esses *links* para os registros subjacentes dos resultados de busca. No *Old Bailey Online website*, as URLs para os registros individuais (os ficheiros de transcrição de julgamento) podem ser encontrados como *links* na página de resultados de busca. Sabemos que todas as transcrições de julgamento possuem um id de julgamento que assume a forma: "t" seguido por, pelo menos, 8 números (ex.: t17800628-33). Ao buscar *links* que contenham esse padrão, podemos identificar URLs de transcrição de julgamento. 
Como em lições anteriores, vamos desenvolver um algoritmo de modo a que possamos começar a enfrentar esse problema de uma maneira que o computador possa lidar. Parece que a tarefa pode ser realizada em 4 passos. Precisaremos: + +- Gerar as URLs para cada página de resultados de busca incrementando a variável `start` numa quantidade fixa um número apropriado de vezes. +- Fazer o *download* de cada página de resultados de busca como um ficheiro HTML. +- Extrair as URLs de cada transcrição de julgamento (usando o ID do julgamento como descrito acima) de cada ficheiro HTML de resultados de busca. +- Percorrer essas URLs extraídas para baixar cada transcrição de julgamento e salvá-las num diretório no nosso computador. + +Perceberá que isso é razoavelmente similar às tarefas que realizamos em [Download de Páginas Web com Python](/pt/licoes/download-paginas-web-python) e [De HTML para Lista de Palavras (parte 2)](/pt/licoes/HTML-lista-palavras-2). Primeiro, fazemos o *download* e, então, analisamos as informações que procuramos. E, nesse caso, fazemos mais alguns *downloads*. + +## Fazendo o *Download* das Páginas de Resultados de Busca + +Primeiro, precisamos de gerar as URLs para fazer o download de cada página de resultados de busca. Já temos a primeira usando o formulário do próprio *website*. + +``` +https://www.oldbaileyonline.org/search.jsp?gen=1&form=searchHomePage&_divs_fulltext=mulatto*+negro*&kwparse=advanced&_divs_div0Type_div1Type=sessionsPaper_trialAccount&fromYear=1700&fromMonth=00&toYear=1750&toMonth=99&start=0&count=0 +``` + +Poderíamos escrever essa URL duas vezes e alterar a variável `start` para obter todas as 13 entradas, mas vamos escrever um programa que funcionaria independentemente de quantas páginas de resultados de busca ou registros precisássemos de fazer *download*, não importando o que decidíssemos investigar.
Estude esse código e, depois, adicione essa função ao seu módulo chamado `obo.py` (crie um ficheiro com esse nome e armazene-o no diretório onde deseja trabalhar). Os comentários no código destinam-se a ajudá-lo a decifrar as várias partes. + +``` python +# obo.py +def getSearchResults(query, kwparse, fromYear, fromMonth, toYear, toMonth): + + import urllib.request + + startValue = 0 + + # cada parte do URL. Dividido para facilitar a leitura + url = 'https://www.oldbaileyonline.org/search.jsp?gen=1&form=searchHomePage&_divs_fulltext=' + url += query + url += '&kwparse=' + kwparse + url += '&_divs_div0Type_div1Type=sessionsPaper_trialAccount' + url += '&fromYear=' + fromYear + url += '&fromMonth=' + fromMonth + url += '&toYear=' + toYear + url += '&toMonth=' + toMonth + url += '&start=' + str(startValue) + url += '&count=0' + + # faz o download da página e armazena o resultado + response = urllib.request.urlopen(url) + webContent = response.read().decode('UTF-8') + filename = 'search-result' + f = open(filename + ".html", 'w') + f.write(webContent) + f.close +``` + +Nessa função, separamos os vários componentes da *Query String* e usamos Argumentos de Função para que a função possa ser reutilizada além dos nossos objetivos específicos atuais. Quando chamarmos por essa função, substituiremos os argumentos pelos valores que desejamos buscar. Depois, fazemos o *download* das páginas dos resultados de busca de maneira similar a como foi feito em [Download de Páginas Web com Python](/pt/licoes/download-paginas-web-python). Agora, crie um novo ficheiro: `download-searches.py` e copie o código a seguir dentro dele. Observe: os valores que passamos como argumentos são exatamente os mesmos dos utilizados no exemplo acima. Sinta-se livre para testá-los para receber resultados diferentes ou ver como funcionam.
+ +``` python +#download-searches.py +import obo + +query = 'mulatto*+negro*' + +obo.getSearchResults(query, "advanced", "1700", "00", "1750", "99") +``` + +Quando executar esse código, deve encontrar um novo ficheiro: `search-result.html` no seu diretório `programming-historian` contendo a primeira página dos resultados de busca da sua investigação. Certifique-se de que o *download* foi realizado apropriadamente e apague o ficheiro. Vamos adaptar o nosso programa para fazer o *download* da outra página contendo as outras 3 entradas ao mesmo tempo, assim queremos ter certeza de que obteremos as duas. Vamos refinar a nossa função `getSearchResults` adicionando outro argumento de função chamado `entries`, de modo a que possamos dizer ao programa quantas páginas de resultados de busca precisamos fazer o *download*. Usaremos o valor das entradas e matemática simples para determinar quantas páginas de resultado de busca existem. Isso é algo bastante direto uma vez que sabemos que há dez transcrições de julgamento listadas por página. Podemos calcular o número de páginas de resultados de busca dividindo o valor das entradas por 10. Armazenaremos esse resultado na variável chamada `pageCount`. Ela se parecerá com isso: + +``` python +# determina quantos ficheiros precisam ser baixados +pageCount = entries / 10 +``` + +No entanto, em casos em que o número de entradas não é um múltiplo de 10, isso resultará num número decimal. Pode testá-lo executando esse código no seu Terminal (Mac & Linux) / Linha de Comandos Python (Windows) e exibindo o valor mantido em `pageCount`. (Observe que, daqui em diante, usaremos a palavra Terminal para referir esse programa). + +``` python +entries = 13 +pageCount = entries / 10 +print(pageCount) +-> 1.3 +``` + +Sabemos que a contagem do número de página deve ser 2 (uma página contendo as entradas 1-10 e uma página contendo as entradas 11-13). Uma vez que sempre queremos o maior inteiro mais próximo, podemos arredondar para cima o resultado da divisão.
+ +``` python +# determina quantos ficheiros precisam ser baixados +import math +pageCount = entries / 10 +pageCount = math.ceil(pageCount) +``` + +Se adicionarmos isso à nossa função `getSearchResults` abaixo da linha `startValue=0`, agora o código é capaz de calcular o número de páginas cujo *download* precisa de ser realizado. No entanto, nesta etapa ele irá fazer somente o *download* da primeira página, já que informamos à seção de *download* da função para executar somente uma vez. Para corrigir isso, podemos adicionar o código de *download* a um `for` *loop* que fará o *download* uma vez para cada número na variável `pageCount`. Caso ele leia 1, fará o *download* uma vez; caso ele leia 5, fará o *download* cinco vezes e assim por diante. Imediatamente após o cálculo de `pageCount` que acabou de escrever, adicione a linha a seguir e indente tudo até `f.close` com um espaçamento adicional de modo que tudo fique dentro do `for` *loop*: + +``` python +for pages in range(1, pageCount+1): + print(pages) +``` + +Uma vez que isso é um `for` *loop*, todo o código que desejamos executar repetidamente também precisa de ser indentado. Pode-se certificar de que fez isso corretamente verificando o código finalizado no exemplo abaixo. Esse *loop* aproveita a função [range](https://docs.python.org/3/tutorial/controlflow.html#the-range-function) do Python. Para entender esse `for` *loop* é melhor, provavelmente, pensar em `pageCount` igual a 2 como no exemplo. Portanto, essas duas linhas de código significam: comece a executar com um valor de *loop* inicial 1 e, a cada vez que executar, adicione uma unidade a esse valor. Quando o valor do *loop* é o mesmo de `pageCount`, executa mais uma vez e para. Isso é particularmente valioso porque significa que podemos dizer ao nosso programa para executar exatamente uma vez para cada página de resultados de busca e oferece uma nova habilidade flexível para controlar quantas vezes um `for` *loop* é executado.
Caso deseje praticar essa nova e poderosa maneira de escrever *loops*, pode abrir o seu Terminal e brincar. + +``` python +pageCount = 2 +for pages in range(1, pageCount+1): + print(pages) + +-> 1 +-> 2 +``` + +Antes de adicionar todo esse código à nossa função `getSearchResults`, temos que fazer dois ajustes finais. No final do `for` *loop* (mas ainda dentro do *loop*) e depois que o nosso código de *download* for executado, precisamos de mudar nossa variável `startValue`, que é usada na construção da URL da página que desejamos fazer o *download*. Se nos esquecermos de fazer isso, o nosso programa fará repetidamente o *download* da primeira página de resultados de busca, já que não estamos verdadeiramente mudando nada na URL inicial. A variável `startValue`, como discutido acima, é o que controla em que página de resultados de busca desejamos fazer o *download*. Portanto, podemos solicitar a próxima página de resultados de busca incrementando o valor de `startValue` em 10 unidades depois que o *download* inicial for concluído. Caso não tenha certeza de onde adicionar essa linha, pode espiar adiante o código finalizado no exemplo abaixo. + +Finalmente, queremos garantir que os nomes dos ficheiros que fizemos o *download* são diferentes entre si. De outro modo, cada *download* será armazenado em cima do *download* anterior, deixando apenas um único ficheiro de resultados de busca. Para resolver isso, podemos ajustar os conteúdos da variável `filename` para incluir o valor armazenado em `startValue` de modo que a cada vez que fizermos o *download* de uma nova página, ela recebe um nome diferente. Já que a variável `startValue` é um inteiro, precisaremos de convertê-la para uma string antes de adicioná-la à variável `filename`.
Ajuste a linha no seu programa que pertence à variável `filename` para ficar assim: + +``` python +filename = 'search-result' + str(startValue) +``` +Agora deve ser capaz de adicionar essas novas linhas de código à sua função `getSearchResults`. Lembre-se de que fizemos as adições a seguir: + +- Adicionar `entries` como um argumento de função adicional logo depois de `toMonth` +- Calcular o número de páginas de resultados de pesquisa e adicionar isso imediatamente após a linha que começa com `startValue = 0` (antes de construirmos a URL e começarmos o *download*) +- Imediatamente após isso, adicione um `for` *loop* que informará ao programa para executar uma vez para cada página de resultados de busca, e indentar o resto do código de modo a que ele esteja dentro do novo *loop* +- A última linha no `for` *loop* deve agora incrementar o valor da variável `startValue` a cada vez que o *loop* é executado +- Ajustar a variável `filename` existente de modo que a cada vez que for feito o *download* de uma página de resultados de busca ela forneça um nome único ao ficheiro. + +A função finalizada no seu ficheiro `obo.py` deve-se parecer com isso: + +``` python +# cria URLs para páginas de resultados de busca e armazena os ficheiros. +def getSearchResults(query, kwparse, fromYear, fromMonth, toYear, toMonth, entries): + + import urllib.request, math + + startValue = 0 + + # isso é novo! determina quantos ficheiros precisam ser baixados. + pageCount = entries / 10 + pageCount = math.ceil(pageCount) + + # essa linha é nova! + for pages in range(1, pageCount +1): + + # cada parte do URL. Dividido para facilitar a leitura. 
+ url = 'https://www.oldbaileyonline.org/search.jsp?gen=1&form=searchHomePage&_divs_fulltext=' + url += query + url += '&kwparse=' + kwparse + url += '&_divs_div0Type_div1Type=sessionsPaper_trialAccount' + url += '&fromYear=' + fromYear + url += '&fromMonth=' + fromMonth + url += '&toYear=' + toYear + url += '&toMonth=' + toMonth + url += '&start=' + str(startValue) + url += '&count=0' + + # faz o download da página e salva o resultado. + response = urllib.request.urlopen(url) + webContent = response.read().decode('UTF-8') + filename = 'search-result' + str(startValue) + f = open(filename + ".html", 'w') + f.write(webContent) + f.close + + # essa linha é nova! + startValue = startValue + 10 +``` + +Para executar essa nova função, adicione o argumento extra ao `download-searches.py` e execute o programa novamente: + +``` python +#download-searches.py +import obo + +query = 'mulatto*+negro*' + +obo.getSearchResults(query, "advanced", "1700", "00", "1750", "99", 13) +``` + +Ótimo! Agora temos as duas páginas de resultados de busca, chamadas `search-result0.html` e `search-result10.html`. Mas antes de seguirmos para o próximo passo do algoritmo, vamos cuidar de algumas "tarefas de organização". O nosso diretório `programming-historian` rapidamente se tornará difícil de controlar se fizermos o *download* de múltiplas páginas de resultados de busca e transcrições de julgamento. Vamos fazer com que o Python crie um novo diretório nomeado a partir dos nossos termos de busca. + +Desejamos adicionar essa nova funcionalidade em `getSearchResults`, de modo que os *downloads* das nossas páginas de resultados de busca sejam direcionadas a diretórios com o mesmo nome da nossa *query* de busca. Isso manterá o nosso diretório `programming-historian` mais organizado. Para fazê-lo, criaremos um novo diretório usando a biblioteca `os`, abreviação de "*operating system*" (sistema operacional). 
Essa biblioteca contém uma função chamada `makedirs` que, não surpreendentemente, cria um novo diretório. Pode testar usando o Terminal: + + +``` python +import os + +query = "meuNovoDiretório" +if not os.path.exists(query): + os.makedirs(query) +``` + +Esse programa irá verificar se o seu computador já possui um diretório com esse nome. Caso não possua, agora deve possuir um diretório chamado `meuNovoDiretório` no seu computador. Num Mac provavelmente está localizado no seu diretório `/Users/username/`, e no Windows deve ser capaz de encontrá-lo no diretório `Python` no seu computador, o mesmo no qual abriu o programa da linha de comandos. Se isso funcionou, pode deletar o diretório do seu disco rígido, já que isso foi só uma prática. Uma vez que desejamos criar um novo diretório nomeado a partir da *query* que inserimos no *Old Bailey Online website*, vamos usar diretamente esse argumento de função `query` da função `getSearchResults`. Para fazer isso, importe a biblioteca `os` após as outras e, depois, adicione o código que acabou de escrever imediatamente abaixo. A sua função `getSearchResults` deve agora se parecer com isso: + +``` python +# cria URLs para páginas de resultados de busca e armazena os ficheiros. +def getSearchResults(query, kwparse, fromYear, fromMonth, toYear, toMonth, entries): + + import urllib.request, math, os + + # Essa linha é nova! Cria um novo diretório. + if not os.path.exists(query): + os.makedirs(query) + + startValue = 0 + + # Determina quantos ficheiros precisam ser baixados. + pageCount = entries / 10 + pageCount = math.ceil(pageCount) + + for pages in range(1, pageCount +1): + + # cada parte do URL. Dividido para facilitar a leitura. 
+ url = 'https://www.oldbaileyonline.org/search.jsp?gen=1&form=searchHomePage&_divs_fulltext=' + url += query + url += '&kwparse=' + kwparse + url += '&_divs_div0Type_div1Type=sessionsPaper_trialAccount' + url += '&fromYear=' + fromYear + url += '&fromMonth=' + fromMonth + url += '&toYear=' + toYear + url += '&toMonth=' + toMonth + url += '&start=' + str(startValue) + url += '&count=0' + + # faz o download da página e salva o resultado. + response = urllib.request.urlopen(url) + webContent = response.read().decode('UTF-8') + + # armazena o resultado num novo diretório. + filename = 'search-result' + str(startValue) + + f = open(filename + ".html", 'w') + f.write(webContent) + f.close + + startValue = startValue + 10 +``` + +O último passo para essa função é garantir que, quando salvarmos as nossas páginas de resultados de busca, as armazenaremos nesse novo diretório. Para fazer isso, podemos fazer um pequeno ajuste à variável `filename` de modo a que o ficheiro termine no lugar certo. Há muitas formas de o fazer e a mais fácil é simplesmente adicionar o nome do novo diretório mais uma barra no nome do ficheiro: + +``` python +filename = query + '/' + 'search-result' + str(startValue) +``` + +Caso o seu computador esteja executando o Windows, precisará de uma barra invertida em vez da barra do exemplo acima. Adicione a linha acima à sua função `getSearchResults` no lugar da descrição atual do `filename`. + +Se estiver executando o Windows, é provável que o seu programa `download-searches.py` falhe quando o executar porque está tentando criar um diretório com um \* nele. O Windows não gosta disso. Para resolver esse problema podemos usar [expressões regulares](https://docs.python.org/3/library/re.html) para remover qualquer caractere não compatível com o Windows. Usamos expressões regulares anteriormente em [Contagem de Frequências de Palavras com Python](/pt/licoes/contar-frequencias-palavras-python). 
Para remover caracteres não-alfanuméricos da *query*, primeiro importe a biblioteca de expressões regulares imediatamente após importar a biblioteca `os` e, depois, use a função `re.sub()` para criar uma nova string chamada `cleanQuery` que contém apenas caracteres alfanuméricos. Depois precisará de substituir `cleanQuery` como a variável usada nas declarações de `os.path.exists()`, `os.makedirs()` e `filename`. + +``` python +import urllib.request, math, os, re +cleanQuery = re.sub(r'\W+', '', query) +if not os.path.exists(cleanQuery): + os.makedirs(cleanQuery) + +... + +filename = cleanQuery + '/' + 'search-result' + str(startValue) +``` + +A versão final da sua função deve-se parecer com isso: + +``` python +# cria URLs para páginas de resultados de busca e armazena os ficheiros. +def getSearchResults(query, kwparse, fromYear, fromMonth, toYear, toMonth, entries): + + import urllib.request, math, os, re + + cleanQuery = re.sub(r'\W+', '', query) + if not os.path.exists(cleanQuery): + os.makedirs(cleanQuery) + + startValue = 0 + + # Determina quantos ficheiros precisam ser baixados + pageCount = entries / 10 + pageCount = math.ceil(pageCount) + + for pages in range(1, pageCount +1): + + # cada parte do URL. Dividido para facilitar a leitura. + url = 'https://www.oldbaileyonline.org/search.jsp?gen=1&form=searchHomePage&_divs_fulltext=' + url += query + url += '&kwparse=' + kwparse + url += '&_divs_div0Type_div1Type=sessionsPaper_trialAccount' + url += '&fromYear=' + fromYear + url += '&fromMonth=' + fromMonth + url += '&toYear=' + toYear + url += '&toMonth=' + toMonth + url += '&start=' + str(startValue) + url += '&count=0' + + # faz o download da página e salva o resultado. 
+ response = urllib.request.urlopen(url) + webContent = response.read().decode('UTF-8') + filename = cleanQuery + '/' + 'search-result' + str(startValue) + f = open(filename + ".html", 'w') + f.write(webContent) + f.close + + startValue = startValue + 10 +``` + +Dessa vez dizemos ao programa para fazer o *download* dos julgamentos e armazená-los num novo diretório ao invés do nosso diretório `programming-historian`. Execute o programa `download-searches.py` mais uma vez para se certificar de que ele funcionou e que entendeu como armazenar os ficheiros num diretório particular usando Python. + +### Fazendo o *Download* das Entradas de Julgamento Individuais + +A este ponto, criamos uma função que é capaz de fazer o *download* de todos os ficheiros HTML de resultados de busca a partir do website *Old Bailey Online* para uma busca avançada que definimos e desenvolvemos de forma programática. Agora o próximo passo do algoritmo: extrair as URLs de cada transcrição de julgamento dos ficheiros HTML de resultados de busca. Nas lições que precedem esta (ex.: [Download de Páginas Web com Python](/pt/licoes/download-paginas-web-python)), trabalhamos com as versões para exibição das transcrições dos julgamentos e continuaremos a fazer isso. Sabemos que a versão de exibição do julgamento de Benjamin Bowsey está localizada na URL: + +``` +http://www.oldbaileyonline.org/print.jsp?div=t17800628-33 +``` + +Da mesma forma que alterar as *query strings* nas URLs gera resultados de busca diferentes, alterar a URL dos registros de julgamento - no caso, substituir um ID de julgamento por outro - nos fará obter a transcrição para aquele novo julgamento. Isso significa que, para encontrar e fazer o *download* dos 13 ficheiros que buscamos, tudo o que precisamos são esses IDs de julgamento. Uma vez que sabemos que essas páginas de resultados de busca geralmente contêm um *link* para as páginas descritas, há uma boa chance de que consigamos encontrar esses *links* integrados ao código HTML. 
Se formos capazes de raspar essa informação das páginas de resultados de busca em que fizemos *download*, podemos então usar essa informação para gerar uma URL que nos permitirá fazer o *download* de cada transcrição de julgamento. Essa é uma técnica que irá utilizar para a maioria das páginas de resultados de busca, não só o *Old Bailey Online*! Para fazer isso, primeiro precisamos encontrar onde os IDs de julgamento estão no código HTML dos ficheiros que fizemos o *download* e, depois, determinar uma maneira de isolá-los consistentemente usando código de modo a que, independentemente de qual página de resultado de busca fizermos o *download*, sejamos capazes de encontrar as transcrições de julgamento. Primeiro, abra `search-result0.html` no Komodo Edit e dê uma olhada na lista de julgamentos. A primeira entrada começa com "Anne Smith", então pode usar o recurso `find` no Komodo Edit para pular imediatamente para o lugar certo. Observe que o nome de Anne faz parte de um *link*: + +``` +browse.jsp?id=t17160113-18&div=t17160113-18&terms=mulatto*_negro*#highlight +``` + +Perfeito, o *link* contém o ID do julgamento! Percorra as entradas restantes e verá que isso é verdade em todos os casos. Para nossa sorte, o *site* é bem formatado e parece que cada *link* começa com `browse.jsp?id=` seguido pelo ID do julgamento e termina com um `&`, no caso de Anne: `browse.jsp?id=t17160113-18&`. Podemos escrever algumas linhas de código que sejam capazes de isolar esses IDs. Veja a função a seguir. Essa função também usa a biblioteca `os`, nesse caso para listar todos os ficheiros localizados no diretório criado na seção anterior. A biblioteca `os` possui uma gama de funções úteis que imitam os tipos de tarefas que esperaria ser capaz de fazer com o seu mouse no Mac Finder ou Windows, como abrir, fechar, criar, deletar e mover ficheiros e diretórios, e é uma boa biblioteca a ser masterizada - ou pelo menos para se familiarizar. 
+ +``` python +def getIndivTrials(query): + import os, re + + cleanQuery = re.sub(r'\W+', '', query) + searchResults = os.listdir(cleanQuery) + + print(searchResults) +``` + +Crie e execute um novo programa chamado `extract-trial-ids.py` com o código a seguir. Certifique-se de inserir o mesmo valor nos argumentos da *query* como fez no exemplo anterior: + +``` python +import obo + +obo.getIndivTrials("mulatto*+negro*") +``` + +Se tudo correu bem, deve ver uma lista contendo o nome de todos os ficheiros no seu novo diretório `mulattonegro`, que a essa altura devem ser as duas páginas de resultados de busca. Certifique-se de que isso funcionou antes de prosseguir. Uma vez que armazenamos todas as páginas de resultados de busca com um nome de ficheiro que inclui `search-result`, agora desejamos abrir todos os ficheiros cujo nome contenha `search-result` e extrair todos os IDs de julgamento encontrados neles. Nesse caso sabemos que temos 2, mas desejamos que o nosso código seja o mais reutilizável possível (com razão, é claro!). Restringir essa ação a ficheiros denominados `search-result` significará que este programa funcionará como pretendido, mesmo que o diretório contenha muitos outros ficheiros não relacionados, já que o programa ignorará qualquer coisa com nome diferente. + +Adicione o código a seguir à sua função `getIndivTrials()`, que verificará se cada ficheiro contém `search-result` no seu nome. Em caso verdadeiro, o ficheiro será aberto e o conteúdo será salvo na variável chamada `text`. Essa variável `text` será analisada na busca por um ID de julgamento, que sabemos que sempre segue `browse.jsp?id=`. Se e quando o ID de julgamento for encontrado, ele será armazenado numa lista e exibido na Saída de Comando, que nos deixa com todas as informações que precisamos para então escrever o programa que fará o *download* dos julgamentos desejados. 
+ +``` python +def getIndivTrials(query): + import os, re + + cleanQuery = re.sub(r'\W+', '', query) + searchResults = os.listdir(cleanQuery) + + urls = [] + + # encontra as páginas de resultados de busca. + for files in searchResults: + if files.find("search-result") != -1: + f = open(cleanQuery + "/" + files, 'r') + text = f.read().split(" ") + f.close() + + # busca os IDs de julgamento. + for words in text: + if words.find("browse.jsp?id=") != -1: + # isola o ID + urls.append(words[words.find("id=") +3: words.find("&")]) + + print(urls) +``` + +Essa última linha do `for` *loop* pode parecer confusa, mas certifique-se de que entendeu antes de seguir em frente. A variável `words` é verificada para saber se contém os caracteres `id=` (sem aspas), que obviamente se referem a um ID específico de transcrição de julgamento. Caso contenha, usamos o método de string `slice` para capturar apenas o trecho entre `id=` e `&` e o adicionamos à lista de url. Se soubéssemos as posições exatas dos índices dessa substring, poderíamos ter usado esses valores numéricos no lugar. No entanto, ao utilizar o método de string `find()`, criamos um programa muito mais flexível. O código a seguir faz exatamente a mesma coisa que essa última linha, mas de maneira menos condensada: + +``` python +idStart = words.find("id=") + 3 +idEnd = words.find("&") +trialID = words[idStart: idEnd] + +urls.append(trialID) +``` + +Ao executar novamente o programa `extract-trial-ids.py`, deve ver uma lista de todos os IDs de julgamento. Podemos adicionar algumas linhas extra para transformá-los em URLs propriamente ditos e fazer o *download* de toda a lista para o nosso novo diretório. Também vamos usar a biblioteca `time` para pausar o nosso programa por 3 segundos entre cada *download* - uma técnica chamada *throttling* (em português, estrangulamento). 
É considerada uma boa forma de não sobrecarregar o servidor de alguém com muitas solicitações por segundo; e o pequeno retardamento torna mais fácil que todos esses ficheiros sejam, de fato, baixados ao invés de ocorrer um [time out](https://en.wikipedia.org/wiki/Timeout_(computing)). Adicione o código a seguir ao final da sua função `getIndivTrials()`. Esse código vai gerar uma URL para cada página individualmente, fará o *download* da página no seu computador, irá colocá-lo no seu diretório, armazenar o ficheiro e pausar por 3 segundos antes de continuar para o próximo julgamento. Todo esse trabalho está contido num `for` *loop* e será executado uma vez para cada julgamento na sua lista de urls. + + +``` python +def getIndivTrials(query): + #... + import urllib.request, time + + # importa funções python built-in para criar caminhos de ficheiro. + from os.path import join as pjoin + + for items in urls: + # gera a URL. + url = "http://www.oldbaileyonline.org/print.jsp?div=" + items + + # faz o download da página. + response = urllib.request.urlopen(url) + webContent = response.read().decode('UTF-8') + + # cria o nome do ficheiro e coloca-o no novo diretório. + filename = items + '.html' + filePath = pjoin(cleanQuery, filename) + + # armazena o ficheiro. + f = open(filePath, 'w') + f.write(webContent) + f.close + + # pausa por 3 segundos. + time.sleep(3) +``` + +Se unirmos tudo numa única função, ela deve-se parecer com isso (note que adicionamos todas as chamadas por `import` no início para manter as coisas claras): + +``` python +def getIndivTrials(query): + import os, re, urllib.request, time + + # importa funções python built-in para criar caminhos de ficheiro. + from os.path import join as pjoin + + cleanQuery = re.sub(r'\W+', '', query) + searchResults = os.listdir(cleanQuery) + + urls = [] + + # encontra páginas de resultados de busca. 
+ for files in searchResults: + if files.find("search-result") != -1: + f = open(cleanQuery + "/" + files, 'r') + text = f.read().split(" ") + f.close() + + # busca por IDs de julgamento. + for words in text: + if words.find("browse.jsp?id=") != -1: + # isola o id + urls.append(words[words.find("id=") +3: words.find("&")]) + + # novo daqui em diante! + for items in urls: + # gera o URL + url = "http://www.oldbaileyonline.org/print.jsp?div=" + items + + # faz o download da página. + response = urllib.request.urlopen(url) + webContent = response.read().decode('UTF-8') + + # cria o nome do ficheiro e coloca-o no novo diretório. + filename = items + '.html' + filePath = pjoin(cleanQuery, filename) + + # armazena o ficheiro. + f = open(filePath, 'w') + f.write(webContent) + f.close + + # pausa por 3 segundos. + time.sleep(3) +``` + +Vamos adicionar a mesma pausa de três segundos à nossa função `getSearchResults` para ser amigável aos *servers* do *Old Bailey Online*: + +``` python +# cria URLs para páginas de resultados de busca e armazena os ficheiros. +def getSearchResults(query, kwparse, fromYear, fromMonth, toYear, toMonth, entries): + + import urllib.request, math, os, re, time + + cleanQuery = re.sub(r'\W+', '', query) + if not os.path.exists(cleanQuery): + os.makedirs(cleanQuery) + + startValue = 0 + + # Determina quantos ficheiros precisam de ser baixados. + pageCount = entries / 10 + pageCount = math.ceil(pageCount) + + for pages in range(1, pageCount +1): + + # cada parte da URL. Dividida para facilitar a leitura. + url = 'https://www.oldbaileyonline.org/search.jsp?gen=1&form=searchHomePage&_divs_fulltext=' + url += query + url += '&kwparse=' + kwparse + url += '&_divs_div0Type_div1Type=sessionsPaper_trialAccount' + url += '&fromYear=' + fromYear + url += '&fromMonth=' + fromMonth + url += '&toYear=' + toYear + url += '&toMonth=' + toMonth + url += '&start=' + str(startValue) + url += '&count=0' + + # faz o download da página e armazena o resultado. 
+ response = urllib.request.urlopen(url) + webContent = response.read().decode('UTF-8') + filename = cleanQuery + '/' + 'search-result' + str(startValue) + f = open(filename + ".html", 'w') + f.write(webContent) + f.close + + startValue = startValue + 10 + + # pausa por 3 segundos. + time.sleep(3) +``` + +Finalmente, chame a função no programa `download-searches.py`: + +``` python +#download-searches.py +import obo + +query = 'mulatto*+negro*' + +obo.getSearchResults(query, "advanced", "1700", "00", "1750", "99", 13) + +obo.getIndivTrials(query) +``` + +Agora criou um programa que é capaz de fazer a solicitação e o *download* de ficheiros do *Old Bailey website*, baseado em parâmetros de busca que definiu, tudo sem visitar o *site*! + +### No Caso de um Ficheiro Não Ser Baixado + +Verifique se o *download* dos treze ficheiros foi realizado corretamente. Se esse for o caso, ótimo! No entanto, há a possibilidade de que esse programa tenha parado no meio do caminho. Isso porque o nosso programa, ao ser executado na nossa máquina, depende de dois fatores além do nosso controle imediato: a velocidade da internet e o tempo de resposta do *server* do *Old Bailey Online* naquele momento. Uma coisa é pedir que o Python faça o *download* de um único ficheiro, mas quando começamos a solicitar um ficheiro a cada três segundos, há grandes chances de ocorrer um *time out* no *server* ou que ele falhe em nos enviar o ficheiro que estamos buscando. + +Se estivermos usando um navegador *web* para fazer essas solicitações, eventualmente receberíamos uma mensagem de que "a conexão expirou" ou algo do tipo. Todos nós vemos isso de tempos em tempos. No entanto, o nosso programa não foi desenvolvido para lidar ou retransmitir essas mensagens de erro, então só perceberá o problema quando o programa não tiver retornado o número esperado de ficheiros ou simplesmente não fizer nada. 
Para evitar frustrações e incertezas, queremos um sistema à prova de falha no nosso programa, que tentará baixar cada julgamento. Se por alguma razão ele falhar, apontaremos o problema e passaremos para o próximo julgamento. + +Para fazer isso, utilizaremos os mecanismos para lidar com erros do Python, [try / except](https://docs.python.org/tutorial/errors.html), bem como uma nova biblioteca: `socket`. `Try` e `Except` são muito parecidos com um `if / else` *statement*. Quando solicita que o Python `try` (em português, tente) algo, ele tentará executar o código; caso o código falhe em alcançar o que definiu, ele executará o código em `except` (em português, exceção). Isso é frequentemente usado ao lidar com erros, conhecido como “error handling”. Podemos usá-lo a nosso favor dizendo ao programa para tentar fazer o *download* de uma página. Caso o programa falhe, solicitaremos que ele nos informe qual ficheiro falhou e depois prossiga. Para fazer isso precisamos de usar a biblioteca `socket`, que nos permitirá definir um limite de tempo para um *download* antes de seguir em frente. Isso envolve alterar a função `getIndivTrials`. + +Primeiro, precisamos de carregar a biblioteca `socket`, o que deve ser feito da mesma forma que todas as outras importações de biblioteca. Depois, precisamos de importar a biblioteca `urllib.error`, que nos permite lidar com erros de *download*. Também precisamos de definir o tamanho do *timeout* padrão do *socket* - por quanto tempo desejamos tentar fazer o *download* de uma página antes de desistirmos. Isso deve entrar imediatamente após o comentário que começa com `# faz o download da página`: + + +``` python + import os, re, urllib.request, urllib.error, time, socket + + #... + # faz o download da página. + socket.setdefaulttimeout(10) +``` + +Então, precisamos de uma nova lista de Python que armazenará todas as urls cujo *download* falhou. 
Vamos chamá-la de `failedAttempts` e pode inseri-la imediatamente após as instruções de importação: + + +``` python +failedAttempts = [] +``` + +Finalmente, podemos adicionar o `try / except` *statement* de forma muito similar a como um `if / else` *statement* seria adicionado. Nesse caso, vamos colocar todo o código desenvolvido para fazer o *download* e armazenar os julgamentos no `try` *statement*, e no `except` *statement* vamos dizer ao programa o que desejamos que ele faça caso falhe. Aqui, vamos adicionar a url cujo *download* falhou à nossa nova lista, `failedAttempts`: + +``` python +#... + + socket.setdefaulttimeout(10) + + try: + response = urllib.request.urlopen(url) + webContent = response.read().decode('UTF-8') + + # cria o nome de ficheiro e coloca-o no novo diretório "trials". + filename = items + '.html' + filePath = pjoin(cleanQuery, filename) + + # armazena o ficheiro. + f = open(filePath, 'w') + f.write(webContent) + f.close + except urllib.error.URLError: + failedAttempts.append(url) +``` + +Finalmente, diremos ao programa para exibir os conteúdos da lista na Saída de Comando de modo que saibamos quais ficheiros falharam no *download*. Isso deve ser adicionado nas linhas finais da função: + +``` python +print("failed to download: " + str(failedAttempts)) +``` + +Agora ao executarmos o programa, caso haja algum problema no *download* de um ficheiro específico, receberá uma mensagem na janela de Saída de Comando do Komodo Edit. Essa mensagem irá conter quaisquer URLs dos ficheiros que falharam no *download*. Caso haja apenas um ou dois, provavelmente é mais fácil simplesmente visitar as páginas manualmente e usar o recurso de "Salvar Como" do seu navegador. Caso se esteja sentindo aventureiro, poderia modificar o programa para automaticamente fazer o *download* dos ficheiros faltantes. 
A versão final das suas funções `getSearchResults()` e `getIndivTrials()` deve-se parecer com isso: + +``` python +# cria URLs para páginas de resultados de busca e armazena os ficheiros. +def getSearchResults(query, kwparse, fromYear, fromMonth, toYear, toMonth, entries): + + import urllib.request, math, os, re, time + + cleanQuery = re.sub(r'\W+', '', query) + if not os.path.exists(cleanQuery): + os.makedirs(cleanQuery) + + startValue = 0 + + # determina quantos ficheiros precisam de ser baixados. + pageCount = entries / 10 + pageCount = math.ceil(pageCount) + + for pages in range(1, pageCount +1): + + # cada parte da URL. Dividida para facilitar a leitura. + url = 'https://www.oldbaileyonline.org/search.jsp?gen=1&form=searchHomePage&_divs_fulltext=' + url += query + url += '&kwparse=' + kwparse + url += '&_divs_div0Type_div1Type=sessionsPaper_trialAccount' + url += '&fromYear=' + fromYear + url += '&fromMonth=' + fromMonth + url += '&toYear=' + toYear + url += '&toMonth=' + toMonth + url += '&start=' + str(startValue) + url += '&count=0' + + # faz o download da página e salva o resultado. + response = urllib.request.urlopen(url) + webContent = response.read().decode('UTF-8') + filename = cleanQuery + '/' + 'search-result' + str(startValue) + f = open(filename + ".html", 'w') + f.write(webContent) + f.close + + startValue = startValue + 10 + + # pausa por 3 segundos. + time.sleep(3) + +def getIndivTrials(query): + import os, re, urllib.request, urllib.error, time, socket + + failedAttempts = [] + + # importa funções python built-in para criar caminhos de ficheiro. + from os.path import join as pjoin + + cleanQuery = re.sub(r'\W+', '', query) + searchResults = os.listdir(cleanQuery) + + urls = [] + + # encontra páginas de resultados de busca. + for files in searchResults: + if files.find("search-result") != -1: + f = open(cleanQuery + "/" + files, 'r') + text = f.read().split(" ") + f.close() + + # busca por IDs de julgamento. 
+ for words in text: + if words.find("browse.jsp?id=") != -1: + #isolate the id + urls.append(words[words.find("id=") +3: words.find("&")]) + + for items in urls: + # gera a URL. + url = "http://www.oldbaileyonline.org/print.jsp?div=" + items + + # faz o download da página. + socket.setdefaulttimeout(10) + try: + response = urllib.request.urlopen(url) + webContent = response.read().decode('UTF-8') + + # cria o nome do ficheiro e coloca-o no novo diretório. + filename = items + '.html' + filePath = pjoin(cleanQuery, filename) + + # armazena o ficheiro. + f = open(filePath, 'w') + f.write(webContent) + f.close + except urllib.error.URLError: + failedAttempts.append(url) + + # pausa por 3 segundos. + time.sleep(3) + + print("failed to download: " + str(failedAttempts)) +``` + +## Leituras Adicionais + +Para usuários mais avançados, ou para se tornar um usuário mais avançado, pode achar que vale a pena ler sobre como alcançar esse mesmo processo usando Interfaces de Programação de Aplicações (API). Geralmente, um *website* com uma API dá instruções de como solicitar certos documentos. É um processo bastante similar ao que acabamos de fazer interpretando a *Query String* de URL, mas sem o trabalho de investigação adicional necessário para decifrar o que cada variável faz. Caso esteja interessado no *Old Bailey Online*, recentemente liberaram uma API e a documentação pode ajudar bastante: + +- Old Bailey Online API () +- Melhor maneira de criar um diretório para gravação de ficheiros, se ele não existir, usando Python? () diff --git a/pt/licoes/download-paginas-web-python.md b/pt/licoes/download-paginas-web-python.md index 14405f5a58..a365033b8f 100644 --- a/pt/licoes/download-paginas-web-python.md +++ b/pt/licoes/download-paginas-web-python.md @@ -1,168 +1,168 @@ ---- -title: Download de páginas Web com Python -layout: lesson -slug: download-paginas-web-python -date: 2012-07-17 -translation_date: 2021-03-26 -authors: -- William J. 
Turkel -- Adam Crymble -reviewers: -- Jim Clifford -- Frederik Elwert -editors: -- Miriam Posner -translator: -- Bruno Gasparotto Ponne -translation-editor: -- Josir Cardoso Gomes -translation-reviewer: -- Felipe Lamarca -- Daniel Alves -difficulty: 2 -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/360 -activity: acquiring -topics: [python] -abstract: "Esta lição apresenta o conceito de *Localizador Uniforme de Recursos* (URL em inglês) e explica como usar o Python para fazer o download de uma página *Web* no seu disco local." -original: working-with-web-pages -avatar_alt: Um homem alto ao lado de uma mulher baixa -doi: 10.46430/phpt0010 ---- - - -{% include toc.html %} - -
    -O site do Old Bailey Online foi recentemente atualizado. Infelizmente, devido às diversas mudanças, muitos (se não todos) os elementos do site de exemplo usado nesta lição não funcionarão conforme descrito. No entanto, as metodologias ensinadas por esta lição permanecem relevantes e podem ser adaptadas pelos leitores para um site de exemplo diferente. Estamos trabalhando na adaptação da lição para o novo site do Old Bailey Online, mas ainda não temos cronograma preciso de quando a lição será atualizada. [Abril de 2024] -
    - -### Objetivos da Lição - -Esta lição apresenta o conceito de *Localizador Uniforme de Recursos* (URL em inglês) e explica como usar o Python para fazer o download de uma página *Web* no seu disco local. - -### Sobre URLs - -Uma página *Web* é um ficheiro hospedado noutro computador, conhecido como *servidor*. Quando um site é acessado, na realidade, o seu computador (o *cliente*) envia um pedido ao *servidor de hospedagem* por meio da rede e o servidor responde enviando uma cópia da página ao seu computador. Uma forma de acessar uma página por meio do seu navegador é seguir um link. É possível também colar ou digitar uma URL (localizador uniforme de recursos) diretamente no seu navegador. A URL informa ao seu navegador onde encontrar um recurso online, especificando o servidor, o diretório e o nome do ficheiro a ser recuperado, bem como o tipo de *protocolo* que o servidor e o seu navegador utilizarão para troca de informações (como o HTTP, *protocolo de transferência de hipertexto*). A estrutura básica de uma URL é - -``` -protocol://host:port/path?query -``` - -Vejamos alguns exemplos: - -``` xml -http://oldbaileyonline.org -``` - -O tipo mais básico de URL especifica apenas o protocolo e o domínio. Quando inserido em seu navegador, essa URL retornará a página principal do site [Old Bailey Online](https://www.oldbaileyonline.org). O pressuposto convencional é que a página principal num determinado diretório se chamará *index*, geralmente `index.html`. - -A URL pode incluir também um *número de porta* opcional. Sem entrar em muitos detalhes, o protocolo de rede em que se baseia a troca de informações na Internet permite que computadores se conectem de diferentes maneiras. Números de portas são utilizados para distinguir esses diferentes tipos de conexão. Uma vez que a porta padrão para HTTP é a 80, a seguinte URL é equivalente à anterior. - -``` xml -http://oldbaileyonline.org:80 -``` - -Geralmente há diversas páginas *Web* num determinado site. 
Essas páginas são armazenadas em diretórios no servidor e é possível especificar o caminho para uma página em particular. A página "About" para o site *The Old Bailey Online* tem a seguinte URL: - -``` xml -http://oldbaileyonline.org/static/Project.jsp -``` - -Por fim, algumas páginas permitem inserir *queries*, termo em inglês que significa pedido, solicitação. O site *The Old Bailey Online*, por exemplo, foi desenvolvido de forma que é possível requisitar uma de suas páginas utilizando uma *query string* (conjunto de caracteres que contém uma solicitação). A seguinte URL acessará uma página de resultado de buscas por registros de julgamentos criminais contendo a palavra "arsenic". - -``` xml -https://www.oldbaileyonline.org/search.jsp?form=searchHomePage&_divs_fulltext=arsenic&kwparse=and&_persNames_surname=&_persNames_given=&_persNames_alias=&_offences_offenceCategory_offenceSubcategory=&_verdicts_verdictCategory_verdictSubcategory=&_punishments_punishmentCategory_punishmentSubcategory=&_divs_div0Type_div1Type=&fromMonth=&fromYear=&toMonth=&toYear=&ref=&submit.x=0&submit.y=0 -``` - -O fragmento a seguir ao sinal "?" representa a *query*. Aprenda mais sobre como criar *queries* na lição [Downloading Multiple Records Using Query Strings](/en/lessons/downloading-multiple-records-using-query-strings) (em inglês). - -### Acessando URLs com Python - -Como um historiador da era digital, você frenquentemente desejará utilizar dados mantidos em sites acadêmicos. Para acessar esses dados, seria possível abrir as URLs uma por uma e copiar e colar os conteúdos num ficheiro de texto. Alternativamente, é possível utilizar Python para, automaticamente, coletar e processar os dados. Para isso, é preciso aprender como abrir uma URL por meio do seu próprio código. A linguagem Python inclui uma série de padrões para fazer isso. - -Como exemplo, vamos trabalhar com o tipo de documento que provavelmente você vai encontrar ao realizar uma pesquisa na área de História. 
Suponhamos que haja interesse nas relações raciais na Inglaterra do século XVIII. O site *The Old Bailey Online* é uma fonte rica de informações históricas e disponibiliza transcrições de julgamentos que ocorreram entre 1674 e 1913. - -{% include figure.html filename="old-bailey.png" caption="A homepage do site The Old Bailey Online" %} - -Para esse exemplo, utilizaremos a transcrição do julgamento de Benjamin Bowsey, um negro condenado por perturbar a paz durante os protestos de Gordon em 1780. A URL para o registro é - -``` xml -http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33 -``` - -Estudando a URL, podemos verificar algumas coisas. Primeiro, o site é programado em JSP (*JavaServer Pages*, uma linguagem de programação para a *web* cujo resultado é um ficheiro HTML). Segundo, é possível acessar registros de julgamentos individuais fazendo uso de *query strings*. Cada registro recebe um número único (*id=t* na URL), formado a partir da data da sessão de julgamento no formato (*AAAAMMDD*) e o número do julgamento naquela sessão do tribunal. Neste caso, *33*. Caso as duas ocorrências de `33` sejam trocadas por `34` no link acima, o seu navegador o encaminhará ao próximo julgamento. Infelizmente, nem todos os sites possuem URLs tão acessíveis e confiáveis quanto essa. - -{% include figure.html filename="bowsey-trial-page.png" caption="Transcrição do julgamento de Benjamin Bowsey, 1780" %} - -Observe a página do julgamento de Benjamin Bowsey. Mais importante do que o conteúdo são os elementos presentes na página. Note o link [View as XML](http://www.oldbaileyonline.org/browse.jsp?foo=bar&path=sessionsPapers/17800628.xml&div=t17800628-33&xml=yes) na parte inferior. Esse link apresenta uma versão repleta de marcações no texto que podem ser úteis para certos tipos de pesquisa. O [documento original digitalizado](http://www.oldbaileyonline.org/images.jsp?doc=178006280084) do julgamento também pode ser acessado. 
- -Agora vamos tentar abrir a página utilizando Python. Copie o seguinte programa no *Komodo Edit* e salve o ficheiro como `open-webpage.py`. Quando executar o programa, a página do julgamento será acessada, seus conteúdos serão lidos e copiados numa string chamada `webContent`. Na sequência, os primeiros 300 caracteres serão exibidos no *painel de saída de comandos*. Utilize `Ferramentas -> Ferramentas do Navegador -> Fonte da página` no navegador Firefox para verificar que o código HTML da página é o mesmo que o seu programa acessou. Outros navegadores podem ter caminhos distintos para acessar o código fonte. Caso não consiga encontrar o caminho no seu navegador, tente utilizar um mecanismo de busca para encontrá-lo. (Consulte a biblioteca de referência do Python para aprender mais sobre [urllib](https://docs.python.org/3/library/urllib.html?highlight=urllib).) - -``` python -# open-webpage.py - -import urllib.request, urllib.error, urllib.parse - -url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33' - -response = urllib.request.urlopen(url) -webContent = response.read().decode('UTF-8') - -print(webContent[0:300]) -``` - -Utilizando apenas essas cinco linhas de código, é possível obter resultados substanciais. Agora, vamos nos assegurar de que cada linha de código está clara e que é possível distinguir os blocos que permitem ao programa realizar a tarefa que desejamos. - -*url*, *response* e *webContent* são todas variáveis nomeadas por nós. - -*url* contém a URL da página que queremos baixar. Neste exemplo, trata-se do julgamento de Benjamin Bowsey. - -Na linha seguinte, chamamos a função `urlopen`, contida no módulo do Python chamado `urllib.py`, e solicitamos que ela acesse o site especificado na variável *url*. Em seguida, salvamos o resultado desse processo numa variável chamada *response*. Essa variável contém agora uma versão aberta do site solicitado. 
- -No próximo passo, utilizamos o método `read`, que já utilizamos anteriormente, para copiar os conteúdos do site numa nova variável chamada *webContent*. - -Assegure-se de ser capaz de identificar as variáveis (3), o módulo (1), os métodos (2) e os parâmetros (1) antes de prosseguir. - -No resultado do código acima, alguns marcadores da linguagem HTML poderão ser identificados: - -``` xml - - - - Browse - Central Criminal Court - Open File` no Firefox, abra o ficheiro criado no seu disco local (`obo-t17800628-33.html`) para confirmar que a cópia salva é a mesma que a online. - -``` python -# save-webpage.py - -import urllib.request, urllib.error, urllib.parse - -url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33' - -response = urllib.request.urlopen(url) -webContent = response.read().decode('UTF-8') - -f = open('obo-t17800628-33.html', 'w') -f.write(webContent) -f.close -``` - -Se é possível salvar um único ficheiro dessa maneira, seria possível escrever um programa para baixar um conjunto de ficheiros? Por exemplo, seria possível percorrer os identificadores de um conjunto de páginas e copiá-las para o seu computador? Sim. Aprenda como na lição [Downloading Multiple Files using Query Strings](/en/lessons/downloading-multiple-records-using-query-strings) (em inglês), que recomendamos depois que tenha terminado as lições introdutórias dessa série. - -### Leitura Sugerida - -- Mitchell, Ryan. “Web Scraping com Python: Coletando Mais Dados da Web Moderna" (O’Reilly, 2019). - -### Sincronização do Código - -Para acompanhar futuras lições, é importante ter os ficheiros e programas corretos no seu diretório “programming-historian”. Ao final de cada lição, é possível baixar o ficheiro zip “programming-historian” para ter certeza de que o ficheiro correto está sendo utilizado. 
- -- programming-historian-1 ([zip](/assets/python-lessons1.zip)) - +--- +title: Download de páginas Web com Python +layout: lesson +slug: download-paginas-web-python +date: 2012-07-17 +translation_date: 2021-03-26 +authors: +- William J. Turkel +- Adam Crymble +reviewers: +- Jim Clifford +- Frederik Elwert +editors: +- Miriam Posner +translator: +- Bruno Gasparotto Ponne +translation-editor: +- Josir Cardoso Gomes +translation-reviewer: +- Felipe Lamarca +- Daniel Alves +difficulty: 2 +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/360 +activity: acquiring +topics: [python] +abstract: "Esta lição apresenta o conceito de *Localizador Uniforme de Recursos* (URL em inglês) e explica como usar o Python para fazer o download de uma página *Web* no seu disco local." +original: working-with-web-pages +avatar_alt: Um homem alto ao lado de uma mulher baixa +doi: 10.46430/phpt0010 +--- + + +{% include toc.html %} + +
    +O site do Old Bailey Online foi recentemente atualizado. Infelizmente, devido às diversas mudanças, muitos (se não todos) os elementos do site de exemplo usado nesta lição não funcionarão conforme descrito. No entanto, as metodologias ensinadas por esta lição permanecem relevantes e podem ser adaptadas pelos leitores para um site de exemplo diferente. Estamos trabalhando na adaptação da lição para o novo site do Old Bailey Online, mas ainda não temos cronograma preciso de quando a lição será atualizada. [Abril de 2024] +
    + +### Objetivos da Lição + +Esta lição apresenta o conceito de *Localizador Uniforme de Recursos* (URL em inglês) e explica como usar o Python para fazer o download de uma página *Web* no seu disco local. + +### Sobre URLs + +Uma página *Web* é um ficheiro hospedado noutro computador, conhecido como *servidor*. Quando um site é acessado, na realidade, o seu computador (o *cliente*) envia um pedido ao *servidor de hospedagem* por meio da rede e o servidor responde enviando uma cópia da página ao seu computador. Uma forma de acessar uma página por meio do seu navegador é seguir um link. É possível também colar ou digitar uma URL (localizador uniforme de recursos) diretamente no seu navegador. A URL informa ao seu navegador onde encontrar um recurso online, especificando o servidor, o diretório e o nome do ficheiro a ser recuperado, bem como o tipo de *protocolo* que o servidor e o seu navegador utilizarão para troca de informações (como o HTTP, *protocolo de transferência de hipertexto*). A estrutura básica de uma URL é + +``` +protocol://host:port/path?query +``` + +Vejamos alguns exemplos: + +``` xml +http://oldbaileyonline.org +``` + +O tipo mais básico de URL especifica apenas o protocolo e o domínio. Quando inserido em seu navegador, essa URL retornará a página principal do site [Old Bailey Online](https://www.oldbaileyonline.org). O pressuposto convencional é que a página principal num determinado diretório se chamará *index*, geralmente `index.html`. + +A URL pode incluir também um *número de porta* opcional. Sem entrar em muitos detalhes, o protocolo de rede em que se baseia a troca de informações na Internet permite que computadores se conectem de diferentes maneiras. Números de portas são utilizados para distinguir esses diferentes tipos de conexão. Uma vez que a porta padrão para HTTP é a 80, a seguinte URL é equivalente à anterior. + +``` xml +http://oldbaileyonline.org:80 +``` + +Geralmente há diversas páginas *Web* num determinado site. 
Essas páginas são armazenadas em diretórios no servidor e é possível especificar o caminho para uma página em particular. A página "About" para o site *The Old Bailey Online* tem a seguinte URL: + +``` xml +http://oldbaileyonline.org/static/Project.jsp +``` + +Por fim, algumas páginas permitem inserir *queries*, termo em inglês que significa pedido, solicitação. O site *The Old Bailey Online*, por exemplo, foi desenvolvido de forma que é possível requisitar uma de suas páginas utilizando uma *query string* (conjunto de caracteres que contém uma solicitação). A seguinte URL acessará uma página de resultado de buscas por registros de julgamentos criminais contendo a palavra "arsenic". + +``` xml +https://www.oldbaileyonline.org/search.jsp?form=searchHomePage&_divs_fulltext=arsenic&kwparse=and&_persNames_surname=&_persNames_given=&_persNames_alias=&_offences_offenceCategory_offenceSubcategory=&_verdicts_verdictCategory_verdictSubcategory=&_punishments_punishmentCategory_punishmentSubcategory=&_divs_div0Type_div1Type=&fromMonth=&fromYear=&toMonth=&toYear=&ref=&submit.x=0&submit.y=0 +``` + +O fragmento a seguir ao sinal "?" representa a *query*. Aprenda mais sobre como criar *queries* na lição [Downloading Multiple Records Using Query Strings](/en/lessons/downloading-multiple-records-using-query-strings) (em inglês). + +### Acessando URLs com Python + +Como um historiador da era digital, você frequentemente desejará utilizar dados mantidos em sites acadêmicos. Para acessar esses dados, seria possível abrir as URLs uma por uma e copiar e colar os conteúdos num ficheiro de texto. Alternativamente, é possível utilizar Python para, automaticamente, coletar e processar os dados. Para isso, é preciso aprender como abrir uma URL por meio do seu próprio código. A linguagem Python inclui uma série de padrões para fazer isso. + +Como exemplo, vamos trabalhar com o tipo de documento que provavelmente você vai encontrar ao realizar uma pesquisa na área de História.
Suponhamos que haja interesse nas relações raciais na Inglaterra do século XVIII. O site *The Old Bailey Online* é uma fonte rica de informações históricas e disponibiliza transcrições de julgamentos que ocorreram entre 1674 e 1913. + +{% include figure.html filename="old-bailey.png" caption="A homepage do site The Old Bailey Online" %} + +Para esse exemplo, utilizaremos a transcrição do julgamento de Benjamin Bowsey, um negro condenado por perturbar a paz durante os protestos de Gordon em 1780. A URL para o registro é + +``` xml +http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33 +``` + +Estudando a URL, podemos verificar algumas coisas. Primeiro, o site é programado em JSP (*JavaServer Pages*, uma linguagem de programação para a *web* cujo resultado é um ficheiro HTML). Segundo, é possível acessar registros de julgamentos individuais fazendo uso de *query strings*. Cada registro recebe um número único (*id=t* na URL), formado a partir da data da sessão de julgamento no formato (*AAAAMMDD*) e o número do julgamento naquela sessão do tribunal. Neste caso, *33*. Caso as duas ocorrências de `33` sejam trocadas por `34` no link acima, o seu navegador o encaminhará ao próximo julgamento. Infelizmente, nem todos os sites possuem URLs tão acessíveis e confiáveis quanto essa. + +{% include figure.html filename="bowsey-trial-page.png" caption="Transcrição do julgamento de Benjamin Bowsey, 1780" %} + +Observe a página do julgamento de Benjamin Bowsey. Mais importante do que o conteúdo são os elementos presentes na página. Note o link [View as XML](https://www.oldbaileyonline.org/browse.jsp?foo=bar&path=sessionsPapers/17800628.xml&div=t17800628-33&xml=yes) na parte inferior. Esse link apresenta uma versão repleta de marcações no texto que podem ser úteis para certos tipos de pesquisa. O [documento original digitalizado](https://www.oldbaileyonline.org/images.jsp?doc=178006280084) do julgamento também pode ser acessado. 
+ +Agora vamos tentar abrir a página utilizando Python. Copie o seguinte programa no *Komodo Edit* e salve o ficheiro como `open-webpage.py`. Quando executar o programa, a página do julgamento será acessada, seus conteúdos serão lidos e copiados numa string chamada `webContent`. Na sequência, os primeiros 300 caracteres serão exibidos no *painel de saída de comandos*. Utilize `Ferramentas -> Ferramentas do Navegador -> Fonte da página` no navegador Firefox para verificar que o código HTML da página é o mesmo que o seu programa acessou. Outros navegadores podem ter caminhos distintos para acessar o código fonte. Caso não consiga encontrar o caminho no seu navegador, tente utilizar um mecanismo de busca para encontrá-lo. (Consulte a biblioteca de referência do Python para aprender mais sobre [urllib](https://docs.python.org/3/library/urllib.html?highlight=urllib).) + +``` python +# open-webpage.py + +import urllib.request, urllib.error, urllib.parse + +url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33' + +response = urllib.request.urlopen(url) +webContent = response.read().decode('UTF-8') + +print(webContent[0:300]) +``` + +Utilizando apenas essas cinco linhas de código, é possível obter resultados substanciais. Agora, vamos nos assegurar de que cada linha de código está clara e que é possível distinguir os blocos que permitem ao programa realizar a tarefa que desejamos. + +*url*, *response* e *webContent* são todas variáveis nomeadas por nós. + +*url* contém a URL da página que queremos baixar. Neste exemplo, trata-se do julgamento de Benjamin Bowsey. + +Na linha seguinte, chamamos a função `urlopen`, contida no módulo do Python chamado `urllib.request`, e solicitamos que ela acesse o site especificado na variável *url*. Em seguida, salvamos o resultado desse processo numa variável chamada *response*. Essa variável contém agora uma versão aberta do site solicitado.
+ +No próximo passo, utilizamos o método `read`, que já utilizamos anteriormente, para copiar os conteúdos do site numa nova variável chamada *webContent*. + +Assegure-se de ser capaz de identificar as variáveis (3), o módulo (1), os métodos (2) e os parâmetros (1) antes de prosseguir. + +No resultado do código acima, alguns marcadores da linguagem HTML poderão ser identificados: + +``` xml + + + + Browse - Central Criminal Court + Open File` no Firefox, abra o ficheiro criado no seu disco local (`obo-t17800628-33.html`) para confirmar que a cópia salva é a mesma que a online. + +``` python +# save-webpage.py + +import urllib.request, urllib.error, urllib.parse + +url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33' + +response = urllib.request.urlopen(url) +webContent = response.read().decode('UTF-8') + +f = open('obo-t17800628-33.html', 'w') +f.write(webContent) +f.close() +``` + +Se é possível salvar um único ficheiro dessa maneira, seria possível escrever um programa para baixar um conjunto de ficheiros? Por exemplo, seria possível percorrer os identificadores de um conjunto de páginas e copiá-las para o seu computador? Sim. Aprenda como na lição [Downloading Multiple Files using Query Strings](/en/lessons/downloading-multiple-records-using-query-strings) (em inglês), que recomendamos depois que tenha terminado as lições introdutórias dessa série. + +### Leitura Sugerida + +- Mitchell, Ryan. “Web Scraping com Python: Coletando Mais Dados da Web Moderna” (O’Reilly, 2019). + +### Sincronização do Código + +Para acompanhar futuras lições, é importante ter os ficheiros e programas corretos no seu diretório “programming-historian”. Ao final de cada lição, é possível baixar o ficheiro zip “programming-historian” para ter certeza de que o ficheiro correto está sendo utilizado.
+ +- programming-historian-1 ([zip](/assets/python-lessons1.zip)) + diff --git a/pt/licoes/explorar-analisar-dados-rede-python.md b/pt/licoes/explorar-analisar-dados-rede-python.md index 333ee6680e..115f109904 100644 --- a/pt/licoes/explorar-analisar-dados-rede-python.md +++ b/pt/licoes/explorar-analisar-dados-rede-python.md @@ -1,604 +1,604 @@ ---- -title: "Explorar e Analisar Dados de Rede com Python" -slug: explorar-analisar-dados-rede-python -original: exploring-and-analyzing-network-data-with-python -layout: lesson -collection: lessons -date: 2017-06-16 -translation_date: 2023-05-12 -authors: -- John R. Ladd -- Jessica Otis -- Christopher N. Warren -- Scott Weingart -reviewers: -- Elisa Beshero-Bondar -- Anne Chao -- Qiwei Li -editors: -- Brandon Walsh -translator: -- João Domingues Pereira -translation-editor: -- Eric Brasil -translation-reviewer: -- Josir Cardoso Gomes -- Daniel Alves -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/446 -difficulty: 2 -activity: analyzing -topics: [network-analysis, data-visualization] -abstract: Esta lição introduz métricas de rede e como tirar conclusões das mesmas quando se trabalha com dados de Humanidades. O leitor aprenderá como usar o pacote NetworkX do Python para produzir e trabalhar com estas estatísticas de rede. -avatar_alt: Caminhos-de-ferro intrincados -doi: 10.46430/phpt0041 -modified: 2023-08-25 -lesson-testers: John R. Ladd -tested-date: 2023-08-21 ---- - -{% include toc.html %} - -# Introdução - -## Objetivos da Lição - -Neste tutorial, o leitor irá aprender: -- A usar o pacote [**NetworkX**](https://perma.cc/F574-RREU) para trabalhar com dados de rede em [**Python**](/pt/licoes/introducao-instalacao-python); e -- A analisar dados de rede de Humanidades para encontrar: - - Estruturas de rede e comprimentos de caminho, - - Nós importantes ou centrais, e - - Comunidades e subgrupos. - -**n.b.**: Este é um tutorial para explorar estatísticas e métricas de rede. 
Assim sendo, iremos focar-nos em maneiras de analisar e tirar conclusões a partir de redes sem visualizá-las. Provavelmente, o leitor quererá uma combinação de visualização e métricas de rede no seu próprio projeto, e, por isso, nós recomendamos este artigo como um complemento a [este tutorial anterior do *Programming Historian*](/en/lessons/creating-network-diagrams-from-historical-sources) (em inglês)[^1]. - -## Pré-Requisitos - -Este tutorial assume que o leitor: - -- Tem uma familiaridade básica com redes e/ou leu [*From Hermeneutics to Data to Networks: Data Extraction and Network Visualization of Historical Sources*](/en/lessons/creating-network-diagrams-from-historical-sources) (em inglês), de Marten Düring, aqui no *Programming Historian*; -- Instalou o Python 3, não o Python 2 que é nativamente instalado em sistemas operacionais com base no Unix, como os Macs (se precisar de assistência com a instalação do Python 3, veja [The Hitchhiker's Guide to Python](https://perma.cc/DP2N-B4EN) (em inglês); e -- Instalou o instalador de pacotes `pip`[^2]. - -É possível ter duas versões do Python (2 *e* 3) instaladas no seu computador ao mesmo tempo. Por esta razão, ao aceder ao Python 3, o leitor frequentemente terá que o declarar explicitamente digitando `python3` e `pip3` em vez de simplesmente `python` e `pip`. Consulte os tutoriais do *Programming Historian* sobre a [instalação do Python](/pt/licoes/introducao-instalacao-python) e o [uso do pip](/pt/licoes/instalacao-modulos-python-pip) para mais informações[^3]. - -## O Que o Leitor Pode Aprender a Partir dos Dados de Rede? - -Há muito que as redes interessam aos pesquisadores nas Humanidades, mas muitos académicos recentes progrediram dum interesse grandemente qualitativo e metafórico em links e conexões para um séquito mais formal de ferramentas quantitativas para estudar mediadores, *hubs* (nós importantes) e estruturas interconectadas. Como o sociólogo Mark S. 
Granovetter apontou no seu importante artigo de maio de 1973 [*The Strength of Weak Ties*](https://perma.cc/A4PC-WPKN) (em inglês), raramente é suficiente notar que duas pessoas estavam conectadas uma à outra. Fatores como a sua relação estrutural com outras pessoas e se essas pessoas adicionais estavam, elas próprias, conectadas umas às outras têm influência decisiva nos eventos. Na medida em que até o mais perspicaz dos académicos tem dificuldade em perceber, digamos, o contorno geral duma rede (a sua "Topologia" de rede) e em identificar os nós mais significativos para conectar grupos, a análise quantitativa de rede oferece aos académicos um modo de transitar relativamente fluidamente entre o objeto social de larga escala (o "grafo") e as particularidades minuciosas das pessoas e laços sociais. - -Este tutorial irá ajudá-lo a responder questões como: -- Qual é a estrutura geral da rede? -- Quem são as pessoas importantes, ou *hubs*, na rede? -- Quais são os subgrupos e comunidades na rede? - - -## O Nosso Exemplo: a Sociedade dos Amigos - -Antes que existissem amigos do Facebook, havia a Sociedade dos Amigos, conhecida como os *quakers*. Fundados na Inglaterra em meados do século XVII, os *quakers* eram cristãos protestantes que divergiram da oficial Igreja da Inglaterra e que promoviam uma ampla tolerância religiosa, preferindo a suposta "luz interior" (*inner light*; **nota de tradução**: este conceito tinha uma extrema importância na Teologia *quaker*) e as consciências dos cristãos à ortodoxia imposta pelo Estado. O número de *quakers* cresceu rapidamente de meados para os finais do século XVII e os seus membros espalharam-se pelas Ilhas Britânicas, pela Europa e pelas colônias do Novo Mundo---especialmente pela Pensilvânia, fundada pelo líder *quaker* William Penn e lar dos quatro autores. 
- -Visto que os académicos há muito que ligam o crescimento e a persistência dos *quakers* à eficácia das suas redes, os dados usados neste tutorial são uma lista de nomes e relações entre os primevos *quakers* do século XVII. Este *dataset* é derivado do [*Oxford Dictionary of National Biography*](http://www.oxforddnb.com) (em inglês) e do trabalho em progresso do projeto [*Six Degrees of Francis Bacon*](http://www.sixdegreesoffrancisbacon.com) (em inglês), o qual está a reconstruir as redes sociais da Grã-Bretanha moderna (1500-1700). - -# Preparação dos Dados e Instalação do NetworkX - -Antes de iniciar este tutorial, o leitor precisará de fazer o download de dois ficheiros que, combinados, constituem o *dataset* da nossa rede. O ficheiro [quakers_nodelist.csv](/assets/exploring-and-analyzing-network-data-with-python/quakers_nodelist.csv) é uma lista de *quakers* modernos (nós) e o ficheiro [quakers_edgelist.csv](/assets/exploring-and-analyzing-network-data-with-python/quakers_edgelist.csv) é uma lista de relações entre esses *quakers* (*edges*). Para fazer o download destes ficheiros, basta clicar com o botão direito do *mouse* nos *links* e escolher "Guardar ligação como". - -Será extremamente útil ao leitor familiarizar-se com a estrutura do *dataset* antes de continuar. Para mais informações sobre a estrutura geral dos *datasets* de rede, veja [este tutorial](/en/lessons/creating-network-diagrams-from-historical-sources#developing-a-coding-scheme) (em inglês). Quando o leitor abrir o ficheiro de nós no programa da sua escolha, verá que cada *quaker* é primeiramente identificado pelo seu *name* (nome). 
Cada nó dum *quaker* também tem um número de atributos associados, incluindo *historical significance* (em português, significado histórico), *gender* (em português, género), *birth*/*death dates* (em português, datas de nascimento/morte), e o SDFB ID---um identificador numérico exclusivo que lhe permitirá cruzar nós neste *dataset* com o *dataset* original do *Six Degrees of Francis Bacon*, se desejado. Aqui estão as primeiras linhas: - -``` -Name,Historical Significance,Gender,Birthdate,Deathdate,ID -Joseph Wyeth,religious writer,male,1663,1731,10013191 -Alexander Skene of Newtyle,local politician and author,male,1621,1694,10011149 -James Logan,colonial official and scholar,male,1674,1751,10007567 -Dorcas Erbery,Quaker preacher,female,1656,1659,10003983 -Lilias Skene,Quaker preacher and poet,male,1626,1697,10011152 -``` - -Note que, embora as colunas não estejam corretamente alinhadas como ocorre numa tabela de dados, as vírgulas mantêm tudo apropriadamente separado. - -Quando o leitor abrir o ficheiro de *edges*, verá que nós usamos os *names* do ficheiro de nós para identificar os nós conectados por cada *edge*. Estas *edges* começam num nó ***source*** (em português, origem) e acabam num nó ***target*** (em português, destino). Embora esta linguagem derive das chamadas estruturas de rede **direcionadas**, nós usaremos os nossos dados como uma rede **não direcionada**: se a Pessoa A conhece a Pessoa B, então a Pessoa B também deve conhecer a Pessoa A. Nas redes direcionadas, as relações não precisam de ser recíprocas (a Pessoa A pode enviar uma carta à B sem receber uma em troca), mas nas redes não direcionadas as conexões são sempre recíprocas, ou **simétricas**. Uma vez que esta é uma rede de quem conhecia quem ao invés de, digamos, uma rede epistolar, um conjunto de relações não direcionadas é o mais apropriado. 
As relações simétricas nas redes não direcionadas são úteis sempre que estiver preocupado com relações que definem o mesmo papel para ambas as partes. Dois amigos têm uma relação simétrica: cada um deles é um amigo do outro. O autor e o destinatário duma carta têm uma relação assimétrica porque cada um tem um papel diferente. Tanto as redes direcionadas como as não direcionadas têm os seus próprios recursos (e, por vezes, as suas próprias métricas), e o leitor quererá escolher aquela que melhor se adapta aos tipos de relações que está a registrar e às questões que quer clarificar. Aqui estão as primeiras *edges* na rede *quaker* não direcionada: - -``` -Source,Target -George Keith,Robert Barclay -George Keith,Benjamin Furly -George Keith,Anne Conway Viscountess Conway and Killultagh -George Keith,Franciscus Mercurius van Helmont -George Keith,William Penn -``` - -Agora que fez o download dos dados *quakers* e viu como estão estruturados, está na hora de começar a trabalhar com esses dados no Python. Assim que tanto o Python como o pip estiverem instalados (ver Pré-Requisitos, acima), quererá instalar o NetworkX, digitando isto na sua [linha de comandos](/en/lessons/intro-to-bash) (em inglês):[^4] - -```python -pip3 install networkx==3.1 -``` - -Uma nota curta sobre controle de versão: este tutorial usa NetworkX 3.1, mas a biblioteca está em desenvolvimento ativo e é atualizada com frequência. Recomendamos usar o comando de instalação acima para garantir que a sua versão do NetworkX corresponde ao código abaixo (em vez de simplesmente instalar a versão mais recente). Se já tiver uma versão mais antiga do NetworkX instalada, execute `pip3 install networkx==3.1 --upgrade` antes de tentar o tutorial[^5]. - -Está feito! Está preparado para começar a codificar. 
- -# Começando - -## Ler Ficheiros, Importar Dados - -Inicie um novo ficheiro de texto simples, em branco, no mesmo diretório que os seus ficheiros de dados chamado `quaker_network.py` (para mais detalhes sobre a instalação e execução do Python, ver [este tutorial](/pt/licoes/instalacao-windows)). No topo desse ficheiro, importe as bibliotecas de que precisa. O leitor precisará de três bibliotecas---aquela que acabámos de instalar, e duas bibliotecas incorporadas no Python. Pode digitar: - -```python -import csv -from operator import itemgetter -import networkx as nx -from networkx.algorithms import community # Esta parte do NetworkX, para a deteção de comunidades, precisa de ser importada separadamente. -``` - -Agora pode ordenar ao programa para ler os seus ficheiros de CSV e retirar os dados de que precisa. Ironicamente, ler ficheiros e reorganizar os dados geralmente requer um código mais complexo que as funções para executar uma análise de redes sociais, portanto pedimos que tenha paciência connosco ao longo deste primeiro bloco de código. Aqui está um conjunto de comandos para abrir e ler os ficheiros das nossas listas de nós e de *edges*: - -```python -with open('quakers_nodelist.csv', 'r') as nodecsv: # Abra o ficheiro - nodereader = csv.reader(nodecsv) # Leia o CSV - # Retire os dados (usando a list comprehension e a list slicing do Python para remover a linha de cabeçalho, veja a nota de rodapé 6) - nodes = [n for n in nodereader][1:] - -node_names = [n[0] for n in nodes] # Obtenha uma lista apenas dos nomes dos nós - -with open('quakers_edgelist.csv', 'r') as edgecsv: # Abra o ficheiro - edgereader = csv.reader(edgecsv) # Leia o CSV - edges = [tuple(e) for e in edgereader][1:] # Retire os dados -``` - -Este código executa funções similares às [deste tutorial](/pt/licoes/trabalhando-ficheiros-texto-python), mas usa o módulo CSV para carregar os seus nós e *edges*. 
Mais tarde, o leitor voltará a atuar sobre os dados e obterá mais informação sobre os nós, mas, por agora, precisa de duas coisas: a lista completa de nós e uma lista de pares *edges* (como énuplos de nós)[^6]. Estas são as formas de que o NetworkX precisará para criar um "objeto grafo", um tipo de dados especial do NetworkX sobre o qual o leitor aprenderá na próxima secção. - -Nesta fase, antes de começar a usar o NetworkX, o leitor pode fazer algumas verificações de sanidade básicas para se certificar que os seus dados foram corretamente carregados usando funções e métodos incorporados no Python. Digitando: - -```python -print(len(node_names)) -``` - -e: - -```python -print(len(edges)) -``` - -e, depois, executando o seu *script* lhe mostrará quantos nós e *edges* carregou com sucesso no Python. Se o leitor vir 119 nós e 174 *edges*, então tem todos os dados necessários. - - -## Noções Básicas do NetworkX: Criar o Grafo - -Agora o leitor tem os seus dados como duas listas do Python: uma lista de nós (`node_names`) e uma lista de *edges* (`edges`). No NetworkX, o leitor pode juntar estas duas listas num só objeto rede que compreende como os nós e as *edges* se relacionam. Este objeto é chamado de **Grafo**, referindo-se a um dos termos comuns para dados organizados como uma rede **n.b.**: não se refere a alguma representação visual dos dados. Aqui, grafo é usado puramente num sentido matemático, de análise de rede. Primeiro, o leitor deve *inicializar* um objeto Grafo com o seguinte comando: - -```python -G = nx.Graph() -``` - -> **Nota de tradução**: em inglês, 'gráfico' pode ser traduzido como '*graphic*' ou, de forma diminutiva, como '*graph*', que também pode significar 'grafo', o termo aqui referido. Esta homografia não ocorre no português. - -Isto criará um novo objeto grafo, *G*, com nada nele. 
Agora, o leitor pode adicionar as suas listas de nós e de *edges* assim: - -```python -G.add_nodes_from(node_names) -G.add_edges_from(edges) -``` - -Esta é uma de várias maneiras de adicionar dados a um objeto rede. O leitor pode verificar a [documentação do NetworkX](https://perma.cc/3QVU-FLPF) (em inglês) para obter mais informações sobre como adicionar *weighted edges*, ou adicionar nós e *edges* uma de cada vez. - -Finalmente, o leitor pode obter informação básica sobre a sua rede recém-criada usando a função `info`: - -```python -print(G) -``` - -A função `info` informa o tipo da sua rede (neste caso, é um objeto Graph padrão) e o número de nós e arestas na mesma. O _output_ deve ser parecido a este: - -``` -Name: -Type: Graph -Number of nodes: 119 -Number of edges: 174 -Average degree: 2.9244 -``` - -Esta é uma forma rápida de obter informação geral sobre o seu grafo, mas como o leitor aprenderá em secções subsequentes, está apenas a passar pela superfície do que o NetworkX lhe pode indicar sobre os seus dados. 
- -Para recapitular, de momento o seu *script* será semelhante a isto: - -```python -import csv -from operator import itemgetter -import networkx as nx -from networkx.algorithms import community - -# Leia no ficheiro da lista de nós -with open('quakers_nodelist.csv', 'r') as nodecsv: - nodereader = csv.reader(nodecsv) - nodes = [n for n in nodereader][1:] - -# Obtenha uma lista apenas dos nomes dos nós (o primeiro item em cada linha) -node_names = [n[0] for n in nodes] - -# Leia no ficheiro da lista de edges -with open('quakers_edgelist.csv', 'r') as edgecsv: - edgereader = csv.reader(edgecsv) - edges = [tuple(e) for e in edgereader][1:] - -# Obtenha o número de nós e de edges nas nossas duas listas -print(len(node_names)) -print(len(edges)) - -G = nx.Graph() # Inicialize um objeto Grafo -G.add_nodes_from(node_names) # Adicione nós ao Grafo -G.add_edges_from(edges) # Adicione edges ao Grafo -print(G) # Obtenha informação sobre o Grafo -``` - -Até agora, o leitor leu dados de nós e de *edges* no Python a partir de ficheiros CSV, e, depois, contou esses nós e *edges*. Depois disso, o leitor criou um objeto grafo usando o NetworkX e carregou os seus dados para esse objeto. - -## Adicionar Atributos - -Para o NetworkX, um objeto grafo é uma coisa grande (a sua rede) composta por dois tipos de coisas mais pequenas (os seus nós e as suas *edges*). Até agora, o leitor carregou nós e *edges* (como pares de nós), mas o NetworkX permite-lhe adicionar *atributos* tanto aos nós como às *edges*, providenciando mais informação sobre cada um deles. Mais à frente neste tutorial, o leitor executará métricas e adicionará alguns dos resultados de volta ao Grafo como atributos. Por agora, vamos certificar-nos que o seu Grafo contém todos os atributos que estão atualmente no seu CSV. - -O leitor quererá retornar a uma lista que criou no início do seu *script*: `nodes`. 
Esta lista contém todas as linhas do `quakers_nodelist.csv`, incluindo colunas para o *name*, a *historical significance*, o *gender*, o *birth year*, o *death year* e o SDFB ID. O leitor quererá iterar por esta lista e adicionar esta informação ao nosso grafo. Existem algumas maneiras de fazer isto, mas o NetworkX providencia duas funções convenientes para adicionar atributos a todos os nós e *edges* dum Grafo duma só vez: `nx.set_node_attributes()` e `nx.set_edge_attributes()`. Para usar estas funções, o leitor irá precisar que os seus dados de atributos estejam na forma dum *dicionário* Python, no qual os nomes dos nós são as *chaves* e os atributos que quer adicionar são os *valores*[^7]. O leitor quererá criar um dicionário para cada um dos seus atributos, e, depois, adicioná-los usando as funções acima. A primeira coisa que o leitor deve fazer é criar cinco dicionários em branco, usando chavetas: - -```python -hist_sig_dict = {} -gender_dict = {} -birth_dict = {} -death_dict = {} -id_dict = {} -``` - -Agora nós podemos fazer o *loop* através da nossa lista de `nodes` e adicionar os itens apropriados a cada dicionário. Nós fazemos isto sabendo antecipadamente a posição, ou *índice*, de cada atributo. Porque o nosso ficheiro `quaker_nodelist.csv` está bem organizado, nós sabemos que o *name* da pessoa será sempre o primeiro item no lista: índice 0, visto que começamos sempre a contar do 0 no Python. A *historical significance* da pessoa será o índice 1, o seu *gender* será o índice 2, e assim por diante. Portanto, nós podemos construir os nossos dicionários desta forma[^8]: - -```python -for node in nodes: # Itere pela lista, uma linha de cada vez - hist_sig_dict[node[0]] = node[1] - gender_dict[node[0]] = node[2] - birth_dict[node[0]] = node[3] - death_dict[node[0]] = node[4] - id_dict[node[0]] = node[5] -``` - -Agora o leitor tem um conjunto de dicionários que pode usar para adicionar atributos a nós no seu objeto Grafo. 
A função `set_node_attributes` toma três variáveis: o Grafo ao qual o leitor está a adicionar o atributo, o dicionário de pares id-atributo, e o nome do novo atributo. O código para adicionar os seus seis atributos assemelha-se a isto: - -```python -nx.set_node_attributes(G, hist_sig_dict, 'historical_significance') -nx.set_node_attributes(G, gender_dict, 'gender') -nx.set_node_attributes(G, birth_dict, 'birth_year') -nx.set_node_attributes(G, death_dict, 'death_year') -nx.set_node_attributes(G, id_dict, 'sdfb_id') -``` - -Agora todos os seus nós têm estes seis atributos, e o leitor pode aceder a eles a qualquer momento. Por exemplo, o leitor pode obter todos os *birth years* dos seus nós iterando por eles e acedendo ao atributo `birth_year`, assim: - -```python -for n in G.nodes(): # Itere por cada nó, entre os nossos dados "n" estará o nome da pessoa - print(n, G.nodes[n]['birth_year']) # Aceda a cada nó pelo seu nome, e, depois, pelo atributo "birth_year" -``` - -A partir desta instrução, o leitor obterá uma linha de *output* para cada nó na rede. Deve parecer-se como uma simples lista de nomes e anos: - -``` -Anne Camm 1627 -Sir Charles Wager 1666 -John Bellers 1654 -Dorcas Erbery 1656 -Mary Pennyman 1630 -Humphrey Woolrich 1633 -John Stubbs 1618 -Richard Hubberthorne 1628 -Robert Barclay 1648 -William Coddington 1601 -``` - -Os passos acima são um método comum para adicionar atributos a nós que o leitor usará repetidamente mais tarde neste tutorial. 
Aqui está uma recapitulação do bloco de código desta secção: - -```python -# Crie um dicionário em branco para cada atributo -hist_sig_dict = {} -gender_dict = {} -birth_dict = {} -death_dict = {} -id_dict = {} - -for node in nodes: # Itere pela lista de nós, uma linha de cada vez - hist_sig_dict[node[0]] = node[1] # Aceda ao item correto, adicione-o ao dicionário correspondente - gender_dict[node[0]] = node[2] - birth_dict[node[0]] = node[3] - death_dict[node[0]] = node[4] - id_dict[node[0]] = node[5] - -# Adicione cada dicionário como um atributo de nó ao objeto Grafo -nx.set_node_attributes(G, hist_sig_dict, 'historical_significance') -nx.set_node_attributes(G, gender_dict, 'gender') -nx.set_node_attributes(G, birth_dict, 'birth_year') -nx.set_node_attributes(G, death_dict, 'death_year') -nx.set_node_attributes(G, id_dict, 'sdfb_id') - -# Itere por cada nó, para aceder e obter todos os atributos "birth_year" -for n in G.nodes(): - print(n, G.nodes[n]['birth_year']) -``` - -Agora o leitor aprendeu como criar um objeto Grafo e adicionar atributos ao mesmo. Nesta próxima secção, o leitor aprenderá sobre uma variedade de métricas disponíveis no NetworkX e como aceder às mesmas. Mas relaxe, acabou de aprender o maior parte do código de que precisará para o resto do tutorial! - -# Métricas Disponíveis no NetworkX - -Quando o leitor começa a trabalhar num novo *dataset*, é uma boa ideia obter uma visão geral dos dados. A primeira etapa, descrita acima, consiste simplesmente em abrir os ficheiros e ver o que está lá dentro. Porque é uma rede, o leitor sabe que existirão nós e *edges*, mas quantos de cada um existem? Que informação está anexada a cada nó ou *edge*? - -No nosso caso, existem 174 *edges* e 119 nós. Estas *edges* não têm direções (isto é, existe uma relação simétrica entre pessoas), nem incluem informação adicional. Para os nós, nós sabemos os seus *names*, a sua *historical significance*, o seu *genders*, a sua *birth date* e *death date*, e o SDFB ID. 
- -Estes detalhes informam o que o leitor pode ou devia fazer com o seu *dataset*. Muito poucos nós (digamos, 15), e uma análise de rede é menos útil que desenhar uma imagem ou fazer algumas leituras; demasiados (digamos, 15 milhões), e o leitor deveria considerar começar com um subconjunto ou encontrar um supercomputador. - -As propriedades da rede também guiam a sua análise. Porque esta rede é **não direcionada**, a sua análise tem que usar métricas que exigem *edges* simétricas entre nós. Por exemplo, o leitor pode determinar em que comunidades as pessoas se encontram, mas não pode determinar as rotas *direcionais* pelas quais a informação poderá fluir ao longo da rede (precisaria duma rede direcionada para isso). Ao usar as relações simétricas e não direcionadas neste caso, o leitor será capaz de encontrar subcomunidades e as pessoas que são importantes nessas comunidades, um processo que seria mais difícil (embora ainda possível) com uma rede direcionada. O NetworkX permite-lhe realizar a maior parte das análises que o leitor pode conceber, mas deve compreender as possibilidades do seu *dataset* e perceber que alguns algoritmos do NetworkX são mais apropriados do que outros. - -### O Formato da Rede - -Após ver a aparência do *dataset*, é importante ver a aparência da *rede*. Estas são coisas diferentes. O *dataset* é uma representação abstrata do que o leitor assume serem conexões entre entidades; a rede é a instanciação específica dessas suposições. A rede, pelo menos neste contexto, é como o computador lê as conexões que o leitor codificou num *dataset*. A rede tem uma [Topologia](https://perma.cc/8M84-GESG), ou uma forma conectiva, que pode ser centralizada ou descentralizada; densa ou esparsa; cíclica ou linear. Um *dataset* não tem, fora da estrutura da tabela na qual está digitado. - -O formato e as propriedades básicas da rede irão dar-lhe uma ideia sobre com o que está a trabalhar e que análises parecem razoáveis. 
O leitor já sabe o número de nós e de *edges*, mas a que a rede se 'assemelha'? Os nós agrupam-se, ou estão espalhados de forma regular? Existem estruturas complexas, ou cada nó está organizado numa linha reta? - -A visualização abaixo, criada na ferramenta de visualização de redes [Gephi](https://gephi.org/), lhe dará uma ideia da Topologia desta rede[^9]. O leitor poderia criar um gráfico similar no Palladio usando [este tutorial](/en/lessons/creating-network-diagrams-from-historical-sources) (em inglês). - -{% include figure.html filename="exploring-and-analyzing-network-data-with-python-1.png" alt="Imagem com uma representação de um gráfico de redes" caption="Visualização de rede baseada em força dos dados *quakers*, criado no Gephi." %} - -Existem várias formas de visualizar uma rede, e um [*layout* baseado em força](https://perma.cc/AM7G-BTWV) (em inglês), do qual a imagem acima é um exemplo, encontra-se entre as mais comuns. Grafos baseados em força tentam encontrar o posicionamento ideal para nós com uma calculação baseada na [tensão de cordas segundo a Lei de Hooke](https://perma.cc/2RTL-CYVL) (em inglês), a qual, para grafos mais pequenos, normalmente cria visualizações limpas e de leitura fácil. A visualização embutida acima mostra-lhe que existe um único grande **componente** de nós conectados (no centro) e vários componentes pequenos com apenas uma ou duas conexões nas periferias. Esta é uma estrutura de rede relativamente comum. Sabendo que existem múltiplos componentes na rede irá limitar de forma útil as calculações que o leitor quererá realizar nela. Ao dispor o número de conexões (conhecidas como **grau**, ver abaixo) como o tamanho dos nós, a visualização também mostra que existem alguns nós com muitas conexões que mantêm o componente central intricado. 
Estes grandes nós são conhecidos como ***hubs***, e o facto de eles aparecem tão claramente aqui dá-lhe uma pista em relação ao que o leitor encontrará quando medir a **centralidade** na próxima secção. - -Visualizações, no entanto, apenas o levam até certo ponto. Com quantas mais redes trabalhar, mais o leitor se aperceberá que a maior parte parece similar o suficiente ao ponto de ser difícil distinguir uma da outra. Métricas quantitativas deixam-no diferenciar redes, aprender sobre as suas Topologias, e tornar uma confusão de nós e *edges* em algo a partir do qual se pode aprender. - -Uma boa métrica com a qual começar é a **densidade** de rede. Isto é, simplesmente, o rácio de *edges* reais na rede face a todas as *edges* possíveis na rede. Numa rede não direcionada como esta, *poderia* haver uma única *edge* entre quaisquer dois nós, mas como o leitor viu na visualização, apenas algumas dessas *edges* possíveis estão realmente presentes. A densidade de rede dá-lhe uma ideia rápida do quão intimamente próxima a sua rede é. - -E as boas notícias são que muitas destas métricas requerem comandos simples e unilineares no Python. Daqui para a frente, o leitor pode continuar a construir o seu bloco de código das secções anteriores. O leitor não tem de apagar nada que já tenha digitado, e porque criou o seu objeto rede `G` no bloco de código acima, todas as métricas a partir daqui devem trabalhar corretamente. - -O leitor pode calcular a densidade da rede executando `nx.density(G)`. No entanto, a melhor maneira de fazer isto é armazenar a sua métrica numa variável para referência futura, e imprimir essa variável, como: - -```python -density = nx.density(G) -print("Network density:", density) -``` - -O *output* da densidade é um número, então é isso que o leitor verá quando imprimir o valor. Neste caso, a densidade da nossa rede é, aproximadamente, 0.0248. 
Numa escala de 0 a 1, não é uma rede muito densa, o que confere com o que o leitor consegue ver na visualização[^10]. Um 0 significaria que não existem quaisquer conexões de todo, e um 1 indicaria que todas as *edges possíveis* estão presentes (uma rede perfeitamente conectada): esta rede *quaker* está na extremidade inferior dessa escala, mas, mesmo assim, longe do 0. - -Uma medida de caminho mais curta é um pouco mais complexa. Ela calcula a série mais curta possível de nós e *edges* que se situam entre quaisquer dois nós, algo difícil de ver em visualizações de grandes redes. Esta medida corresponde, essencialmente, a encontrar amigos de amigos---se a minha mãe conhece alguém que eu não conheço, então a minha mãe é o caminho mais curto entre mim e essa pessoa. O jogo *Six Degrees of Kevin Bacon*, a partir do qual o [nosso projeto](http://sixdegreesoffrancisbacon.com/) (em inglês) retira o nome, é basicamente um jogo que consiste em encontrar os caminhos mais curtos (com um **comprimento de caminho** de seis ou menos) de Kevin Bacon a qualquer outro ator. - -Para calcular um caminho mais curto, o leitor precisa de passar por várias variáveis de *input* (informação que dá a uma função do Python): o grafo inteiro, o seu nó *source*, e o seu nó *target*. Vamos procurar o caminho mais curto entre Margaret Fell e George Whitehead. Como usámos nomes para identificar unicamente os nossos nós nesta rede, o leitor pode aceder a esses nós (como a ***source*** e o ***target*** do seu caminho) usando os nomes diretamente. - -```python -fell_whitehead_path = nx.shortest_path(G, source="Margaret Fell", target="George Whitehead") - -print("Shortest path between Fell and Whitehead:", fell_whitehead_path) -``` - -Dependendo do tamanho da sua rede, isto pode demorar algum tempo para calcular, visto que o Python primeiro encontra todos os caminhos possíveis e depois escolhe o mais curto. 
O *output* de `shortest_path` será uma lista dos nós que inclui a "source" (Fell), o "target" (Whitehead), e os nós entre eles. Neste caso, nós podemos ver que o fundador dos *quakers*, George Fox, se encontra no caminho mais curto entre eles. Como Fox é também um ***hub*** (ver centralidade de grau, abaixo) com muitas conexões, nós podemos supor que vários caminhos mais curtos passam por ele como mediador. O que é que isto pode indicar sobre a importância dos fundadores dos *quakers* para a sua rede social? - -O Python inclui várias ferramentas que calculam os caminhos mais curtos. Existem funções para os comprimentos dos caminhos mais curtos, para todos os caminhos mais curtos, e para saber se um caminho existe ou não de todo na [documentação](https://perma.cc/3MJE-7MQQ) (em inglês). O leitor poderia usar uma função separada para encontrar o comprimento do caminho *Fell-Whitehead* que acabámos de calcular, ou poderia simplesmente tomar o comprimento da lista menos um[^11], assim: - -```python -print("Length of that path:", len(fell_whitehead_path)-1) -``` - -Existem muitas métricas de rede derivadas dos comprimentos de caminho mais curtos. Uma tal medida é o **diâmetro**, que é o mais longo de todos os caminhos mais curtos. Depois de calcular todos os caminhos mais curtos entre cada par de nós possível na rede, o diâmetro é o comprimento do caminho entre os dois nós que estão mais afastados. A medida está projetada para lhe dar um senso do tamanho geral da rede, a distância duma extremidade da rede à outra. - -O diâmetro usa um comando simples: `nx.diameter(G)`. No entanto, executar este comando no grafo *quaker* dará uma mensagem de erro indicando que o Grafo não está conectado ("*not connected*"). Isto significa apenas que o seu grafo, como o leitor já viu, tem mais que um componente. Porque existem alguns nós que não têm um caminho de todo com outros, é impossível encontrar todos os caminhos mais curtos. 
Veja novamente a visualização do seu grafo: - -{% include figure.html filename="exploring-and-analyzing-network-data-with-python-1.png" alt="Imagem com uma representação de um gráfico de redes" caption="Visualização de rede baseada em força dos dados *quakers*, criado no Gephi." %} - -Como não há caminho entre nós dum componente e nós doutro, `nx.diameter()` retorna a mensagem de erro "*not connected*". O leitor pode remediar isto, primeiro, ao descobrir se o seu Grafo está conectado ("*is connected*") (*i.e.* tudo um componente) e, se não conectado, descobrir apenas o componente mais largo e calcular o diâmetro somente desse componente. Aqui está o código: - -```python -# Se o seu Grafo tiver mais do que um componente, isto retornará como 'False' -print(nx.is_connected(G)) - -# A seguir, use nx.connected_components para obter a lista de componentes, -# depois, use o comando max() para encontrar o mais pesado: -components = nx.connected_components(G) -largest_component = max(components, key=len) - -# Crie um 'Subgrafo' apenas com o componente mais pesado, -# depois, calcule o diâmetro do Subgrafo, tal como fez com a densidade. - -subgraph = G.subgraph(largest_component) -diameter = nx.diameter(subgraph) -print("Network diameter of largest component:", diameter) -``` - -Como nós tomámos o componente mais largo, nós podemos assumir que não há nenhum diâmetro mais largo para os outros componentes. Portanto, esta figura é uma boa representação para o diâmetro de todo o Grafo. O diâmetro de rede do componente mais largo desta rede é 8: existe um comprimento de rede de 8 entre os dois nós mais afastados na rede. Ao contrário da densidade, que é apresentada de 0 a 1, é difícil saber a partir deste número somente se 8 é um diâmetro largo ou curto. Para algumas métricas globais, pode ser melhor compará-lo a redes de tamanho e forma similar[^12]. - -O cálculo estrutural final que o leitor fará nesta rede concerne o conceito de **fechamento triádico**. 
Fechamento triádico supõe que se duas pessoas conhecem a mesma pessoa, elas provavelmente conhecem-se mutuamente. Se Fox conhece tanto Fell como Whitehead, então Fell e Whitehead podem perfeitamente conhecer-se mutuamente, completando um **triângulo** na visualização de três *edges* conectando Fox, Fell e Whitehead. O número destes triângulos fechados na rede pode ser usado para descobrir aglomerados e comunidades de indivíduos que se conhecem todos intimamente. - -Uma forma de medir o fechamento triádico é o chamado **coeficiente de aglomeração** por causa desta tendência aglomeradora, mas a medida estrutural de rede que o leitor aprenderá é conhecida como **transitividade**[^13]. Transitividade é o rácio de todos os triângulos sobre todos os triângulos possíveis. Um triângulo possível existe quando uma pessoa (Fox) conhece duas pessoas (Fell e Whitehead). Então, transitividade, como a densidade, expressa quão interconectado um grafo é em termos dum rácio de conexões reais sobre as possíveis. Lembre-se, medidas como a transitividade e a densidade lidam com *probabilidades* e não com *certezas*. Todos os *outputs* do seu *script* no Python devem ser interpretados, como qualquer outro objeto de pesquisa. A transitividade permite-lhe uma forma de pensar sobre todas as relações no seu grafo que *podem* existir, mas que, atualmente, não existem. - -O leitor pode calcular a transitividade numa só linha, da mesma forma que calculou a densidade: - -```python -triadic_closure = nx.transitivity(G) -print("Triadic closure:", triadic_closure) -``` - -Tal como a densidade, transitividade é numerada de 0 a 1, e o leitor pode ver que a transitividade da rede é de cerca de 0.1694, um valor um pouco mais alto que o da sua densidade de 0.0248. Porque o grafo não é muito denso, existem menos *triângulos possíveis*, o que pode resultar numa transitividade relativamente mais elevada. Isto é, nós que já têm várias conexões farão provavelmente parte destes triângulos fechados. 
Para suportar isto, o leitor quererá saber mais sobre nós com muitas conexões. - -## Centralidade - -Depois de obter algumas medidas básicas da estrutura da rede inteira, um bom próximo passo é descobrir quais nós são os mais importantes na sua rede. Na análise de redes, medidas da importância dos nós são referidas como medidas de **centralidade**. Porque existem várias maneiras de abordar a questão "Que nós são os mais importantes?", existem várias formas diferentes de calcular a centralidade. Aqui, o leitor aprenderá sobre as três medidas de centralidade mais comuns: o grau, a centralidade de intermediação, e a centralidade adjacente. - -O **grau** é a forma mais simples e comum de encontrar nós importantes. O grau dum nó é a soma das suas *edges*. Se um nó tem três linhas a estenderem-se a outros nós, o seu grau é de três. Cinco *edges*, o seu grau é de cinco. É extremamente simples. Como cada uma dessas edges terá sempre um nó na outra extremidade, o leitor pode pensar no grau como o número de pessoas às quais qualquer pessoa está diretamente conectada. Os nós com os graus mais elevados numa rede social são as pessoas que conhecem mais pessoas. Estes nós são geralmente referidos como ***hubs***, e calcular o grau é a forma mais rápida de identificar os *hubs*. - -Calcular a centralidade para cada nó no NetworkX não é exatamente tão simples como as métricas de toda a rede acima, mas continua a envolver comandos unilineares. Todos os comandos de centralidade que o leitor aprenderá nesta secção produzem dicionários nos quais as chaves são os nós e os valores são as medidas de centralidade. Isto significa que eles estão prontos para adicionar de volta à nossa rede como um atributo de nó, como o leitor fez na última secção. Comece por calcular o grau e adicione-o como um atributo à sua rede. 
- -```python -degree_dict = dict(G.degree(G.nodes())) -nx.set_node_attributes(G, degree_dict, 'degree') -``` - -O leitor acabou de executar o método `G.degree()` na lista completa de nós na sua rede (`G.nodes()`). Como o leitor adicionou-o como um atributo, agora pode ver o grau de William Penn, bem como com o resto da sua informação se aceder ao seu nó diretamente: - -```python -print(G.nodes['William Penn']) -``` - -Mas estes resultados são úteis para mais do que simplesmente adicionar atributos ao seu objeto Grafo. Como o leitor já está no Python, pode organizar e compará-los. O leitor pode usar a função incorporada `sorted()` para organizar um dicionário com as suas chaves ou valores e encontrar o *top* vinte dos nós por grau. Para fazer isto, o leitor vai precisar de usar `itemgetter`, o qual nós importámos no início do tutorial. Usando `sorted` e `itemgetter`, pode organizar o dicionário de graus assim: - -```python -sorted_degree = sorted(degree_dict.items(), key=itemgetter(1), reverse=True) -``` - -Aqui, há muitas coisas a acontecer nos bastidores, mas concentre-se só nas três variáveis de *input* que o leitor deu a `sorted()`. A primeira é o dicionário, `degree_dict.items()`, que quer organizar. A segunda é o que organizar por: neste caso, item "1" é o segundo item no par, ou o valor do seu dicionário. Finalmente, o leitor diz a `sorted()` para ir em `reverse` para que os nós de grau mais elevado apareçam primeiro na lista resultante. Assim que o leitor tiver criado esta lista organizada, pode iterar por ela e usar a *list slicing*[^6] para obter somente os primeiros 20 nós: - -```python -print("Top 20 nodes by degree:") -for d in sorted_degree[:20]: - print(d) -``` - -Como o leitor pode ver, o grau de Penn é 18, relativamente elevado para esta rede. Mas digitar estas informações de classificação ilustra as limitações do grau como uma medida de centralidade. 
O leitor provavelmente não precisava que o NetworkX lhe dissesse que William Penn, líder *quaker* e fundador da Pensilvânia, era importante. A maioria das redes sociais terão somente alguns *hubs* de grau muito elevado, com o resto de grau similar e baixo[^14]. O grau pode informá-lo sobre os maiores *hubs*, mas não pode dizer-lhe muito sobre o resto dos nós. E, em muitos casos, esses *hubs* sobre os quais o está a informar (como o Penn ou como a cofundadora do Quakerismo, Margaret Fell, com um grau de 13) não são especialmente surpreendentes. Neste caso, quase todos os *hubs* são fundadores da religião ou, noutros casos, figuras políticas importantes. - -Felizmente, existem outras medidas de centralidade que lhe podem dizer mais do que só os *hubs*. A [centralidade adjacente](https://perma.cc/VF28-JDCR) (em inglês) é um tipo de extensão do grau---analisa uma combinação das *edges* dum nó e as *edges* dos vizinhos desse nó. Centralidade adjacente preocupa-se se um nó é um *hub*, mas também se preocupa com quantos *hubs* um nó está conectado. É calculada como um valor de 0 a 1: quanto mais próximo do um, maior a centralidade. A centralidade adjacente é útil para compreender que nós podem obter informação a outros nós rapidamente. Se o leitor conhece muitas pessoas bem-conectadas, poderia espalhar uma mensagem muito eficientemente. Se o leitor usou o Google, então está já mais ou menos familiarizado com a centralidade adjacente. O seu algoritmo de PageRank usa uma extensão desta fórmula para decidir que páginas de internet são colocadas no topo da lista de resultados. - -A [centralidade de intermediação](https://perma.cc/C55J-7XAJ) (em inglês) é um pouco diferente das outras duas calculações na medida em que não se preocupa com o número de *edges* que qualquer nó ou grupo de nós tem. A centralidade de intermediação observa todos os **caminhos mais curtos** que passam por um nó em particular (ver acima). 
Para fazer isto, tem que primeiro calcular todos os possíveis caminhos mais curtos na sua rede, por isso mantenha em mente que a centralidade de intermediação vai demorar mais tempo para calcular que as outras medidas de centralidade (mas não será um problema num *dataset* desta dimensão). A centralidade de intermediação, que também é expressa numa escala de 0 a 1, é particularmente boa a encontrar nós que conectam duas partes distintas duma rede. Se o leitor é a única coisa conectando dois aglomerados, cada comunicação entre esses aglomerados tem que passar por si. Em contraste com um *hub*, este tipo de nó é regularmente referido como um ***broker***. A centralidade de intermediação não é a única maneira de encontrar *brokerage* (e outros métodos são mais sistemáticos), mas é uma forma rápida de lhe dar uma ideia de quais nós são importantes, não porque têm muitas conexões eles próprios, mas porque eles situam-se *entre* grupos, dando à rede conectividade e coesão. - -Estas duas medidas de centralidade são ainda mais simples de executar que um grau---eles não precisam de receber uma lista de nós, só o grafo `G`. 
O leitor pode executá-las com estas funções: - -```python -betweenness_dict = nx.betweenness_centrality(G) # Execute a centralidade de intermediação -eigenvector_dict = nx.eigenvector_centrality(G) # Execute a centralidade adjacente - -# Atribua cada a um atributo na sua rede -nx.set_node_attributes(G, betweenness_dict, 'betweenness') -nx.set_node_attributes(G, eigenvector_dict, 'eigenvector') -``` - -O leitor pode organizar a centralidade de intermediação (ou a adjacente) ao mudar os nomes das variáveis no código organizador acima, como: - -```python -sorted_betweenness = sorted(betweenness_dict.items(), key=itemgetter(1), reverse=True) - -print("Top 20 nodes by betweenness centrality:") -for b in sorted_betweenness[:20]: - print(b) -``` - -O leitor notará que muitos, mas não todos, dos nós que têm graus elevados também têm uma centralidade de intermediação alta. De facto, centralidade de intermediação apresenta duas mulheres, Elizabeth Leavens e Mary Penington, cuja importância tinha sido obscurecida pela métrica da centralidade de grau. Uma vantagem de fazer estes cálculos no Python é que o leitor pode rapidamente comparar dois conjuntos de cálculos. E se o leitor quiser saber quais dos nós com alta centralidade de intermediação têm graus baixos? Isto é o mesmo que dizer: quais nós de alta intermediação são inesperados? Pode usar uma combinação da lista organizada acima: - -```python -# Primeiro, obtenha uma lista do top 20 nós por intermediação -top_betweenness = sorted_betweenness[:20] - -# Depois, encontre e obtenha o grau de cada um -for tb in top_betweenness: # Itere por top_betweenness - degree = degree_dict[tb[0]] # Use degree_dict para aceder ao grau dum nó, veja a nota de rodapé 4 - print("Name:", tb[0], "| Betweenness Centrality:", tb[1], "| Degree:", degree) -``` - -O leitor pode confirmar a partir destes resultados que algumas pessoas, como Leavens e Penington, têm alta centralidade de intermediação, mas baixo grau. 
Isto pode significar que estas mulheres eram *brokers* importantes, conectando partes díspares do grafo. O leitor também pode aprender coisas inesperadas sobre pessoas sobre as quais já se sabe algo---nesta lista, consegue ver que Penn tem um grau inferior ao do fundador *quaker* George Fox, mas uma centralidade de intermediação mais elevada. Isto é o mesmo que dizer, simplesmente conhecer mais pessoas não é tudo. - -Isto aborda somente a superfície do que pode ser feito com métricas de rede no Python. O NetworkX oferece dezenas de funções e medidas para o leitor usar em várias combinações, e pode usar Python para estender estas medidas de formas quase ilimitadas. Uma linguagem de programação como o Python ou o R dar-lhe-á a flexibilidade para explorar a sua rede computacionalmente de formas que outros *interfaces* não podem ao permitir-lhe combinar e comparar os resultados estatísticos da sua rede com outros atributos dos seus dados (como as datas e ocupações que adicionou à rede no início deste tutorial!). - -## Noções Avançadas do NetworkX: Deteção de Comunidades com Modularidade - -Outra coisa regularmente questionada sobre o *dataset* duma rede é quais são os subgrupos e comunidades dentro da estrutura social mais larga. A sua rede é uma família grande e feliz na qual todos se conhecem? Ou é uma coleção de subgrupos mais pequenos que estão conectados por um ou dois intermediários? O campo da deteção de comunidades em redes está desenhado para responder a estas questões. Existem várias formas de calcular comunidades, cliques, e aglomerados na sua rede, mas o método mais popular atualmente é a **modularidade**. A modularidade é uma medida de densidade relativa na sua rede: uma comunidade (chamada um **módulo** ou **classe** modular) tem uma densidade elevada em relação a outros nós dentro do seu módulo, mas densidade baixa com os outros de fora. 
A modularidade dá-lhe uma pontuação geral de quão fracionada a sua rede é, e essa pontuação pode ser usada para **repartir** a rede e evidenciar as comunidades individuais[^15]. - -Redes muito densas são geralmente mais difíceis de dividir em repartições sensatas. Felizmente, como o leitor descobriu anteriormente, esta rede não é assim tão densa. Não existem tantas conexões reais quanto conexões possíveis, e existem componentes desconectados de todo. Vale a pena repartir esta rede esparsa com modularidade e ver se os resultados fazem sentido histórico e analítico. - -A deteção e repartição de comunidades no NetworkX requer um pouco mais de configuração do que algumas das outras métricas. Existem algumas abordagens incorporadas para a deteção de comunidades (como o [*minimum cut*](https://perma.cc/K59Y-WZRX) (em inglês)), mas modularidade não vem incluída com o NetworkX. Felizmente, existe um [módulo adicional no Python](https://github.com/taynaud/python-louvain/) (em inglês) que o leitor pode usar com o NetworkX, e que já instalou e importou no início deste tutorial. O leitor pode ler a [documentação completa](https://perma.cc/KW5K-ZX67) (em inglês) para todas as funções que oferece, mas para a maior parte dos propósitos da deteção de comunidades, quererá apenas `greedy_modularity_communities()`: - -```python -communities = community.greedy_modularity_communities(G) -``` - -O método `greedy_modularity_communities()` tenta determinar o número de comunidades apropriadas para o grafo, e agrupa todos os nós em subconjuntos baseados nestas comunidades. Ao contrário das funções de centralidade, o código acima não criará um dicionário. Ao invés, criará uma lista especial de objetos "*frozenset*" (similar a listas). Existe um conjunto para cada grupo, e os conjuntos contêm os nomes das pessoas em cada grupo. 
Para adicionar esta informação à sua rede na maneira agora familiar, o leitor tem que primeiro criar um dicionário que classifique cada pessoa com um valor numérico para o grupo ao qual pertencem: - -```python -modularity_dict = {} # Crie um dicionário vazio -for i,c in enumerate(communities): # Itere pela lista de comunidades, mantendo em mente o número para a comunidade - for name in c: # Itere por cada pessoa numa comunidade - modularity_dict[name] = i # Crie uma entrada no dicionário para a pessoa, na qual o valor é o grupo ao qual pertence. - -# Agora, o leitor pode adicionar a informação de modularidade como fez com as outras métricas -nx.set_node_attributes(G, modularity_dict, 'modularity') -``` - -Como sempre, o leitor pode combinar estas medidas com outras. Por exemplo, aqui está como encontrar os nós de centralidade adjacente mais elevada na classe modular 0 (a primeira): - -```python -# Primeiro, obtenha uma lista apenas dos nós nessa classe -class0 = [n for n in G.nodes() if G.nodes[n]['modularity'] == 0] - -# Depois, crie um dicionário das centralidades adjacentes desses nós -class0_eigenvector = {n:G.nodes[n]['eigenvector'] for n in class0} - -# Depois, organize esse dicionário e obtenha os primeiros 5 resultados -class0_sorted_by_eigenvector = sorted(class0_eigenvector.items(), key=itemgetter(1), reverse=True) - -print("Modularity Class 0 Sorted by Eigenvector Centrality:") -for node in class0_sorted_by_eigenvector[:5]: - print("Name:", node[0], "| Eigenvector Centrality:", node[1]) -``` - -Usando a centralidade adjacente como um *ranking* pode dar-lhe uma ideia das pessoas importantes nesta classe modular. O leitor notará que algumas destas pessoas, especialmente William Penn, William Bradford (*não* o fundador de Plymouth em que estará a pensar[^16]) e James Logan, passaram muito tempo na América. Também, Bradford e Tace Sowle eram ambos impressores *quakers* proeminentes. 
Com um pouco de pesquisa, nós podemos descobrir que existem tanto razões geográficas como ocupacionais que explicam que este grupo de pessoas se juntem. Isto é uma indicação de que a modularidade está a trabalhar como esperado. - -Em redes mais pequenas como esta, uma tarefa comum é encontrar e listar todas as classes modulares e seus membros[^17]. O leitor pode fazer isto ao percorrer pela lista `communities`: - -```python -for i,c in enumerate(communities): # Itere pela lista de comunidades - if len(c) > 2: # Filtre as classes modulares com 2 ou menos nós - print('Class '+str(i)+':', list(c)) # Obtenha as classes e os seus membros -``` - -Note no código acima que está a filtrar qualquer classe modular com dois ou menos nós, na linha `if len(c) > 2`. O leitor recordar-se-á da visualização que existiam vários componentes pequenos da rede com apenas dois nós. A modularidade encontrará estes componentes e tratá-los-á como classes separadas (visto que eles não estão conectados a mais nada). Ao filtrá-los, o leitor obtém uma ideia melhor das classes modulares maiores dentro do principal componente da rede. - -Trabalhando só com o NetworkX trá-lo-á longe, e o leitor pode encontrar muito sobre classes modulares apenas ao trabalhar com os dados diretamente. Mas quase sempre quer visualizar os seus dados (e, talvez, expressar a modularidade como a cor de nó). Na próxima secção, o leitor irá aprender como exportar os seus dados do NetworkX para uso noutros programas. - - - -# Exportar Dados - -O NetworkX suporta um grande número de formatos de ficheiros para [exportação de dados](https://perma.cc/X65S-HRCF) (em inglês). Se o leitor quiser exportar uma lista de *edges* em texto simples para carregar no Palladio, existe um [*wrapper* conveniente](https://perma.cc/P9ES-57X3) (em inglês) para isso. 
Frequentemente, no *Six Degrees of Francis Bacon*, nós exportamos dados do NetworkX no [formato JSON especializado do D3](https://perma.cc/SF8Z-DWPW) (em inglês), para visualização no navegador de internet. O leitor poderia até [exportar](https://perma.cc/Y6QJ-5VM8) (em inglês) o seu grafo como um [*dataframe* do Pandas](https://perma.cc/87NA-KCK4) (em inglês) se existissem operações estatísticas mais avançadas que quisesse executar. Existem várias opções, e se o leitor tiver adicionado diligentemente todas as suas métricas de volta no seu objeto Grafo como atributos, todos os seus dados serão exportados duma só vez. - -A maior parte das opções de exportação funcionam da mesma maneira, por isso, para este tutorial o leitor aprenderá como exportar os seus dados para o formato GEXF do Gephi. Assim que tiver exportado o ficheiro, o leitor pode fazer o *upload* [diretamente para o Gephi](https://gephi.org/quickstart/) (em inglês) para a visualização. - -Exportar dados é, normalmente, um simples comando unilinear. Tudo o que é preciso é escolher um nome de ficheiro. Neste caso, usaremos `quaker_network.gexf`. Para exportar, digite: - -```python -nx.write_gexf(G, 'quaker_network.gexf') -``` - -É só! Quando executar o seu *script* no Python, colocará automaticamente o novo ficheiro GEXF no mesmo diretório que o seu ficheiro Python.[^18] - -# Conclusões - -Agora, tendo realizado e revisto uma panóplia de métricas de rede no Python, o leitor tem as evidências a partir das quais os argumentos se contrõem e se retiram conclusões sobre esta rede de *quakers* na Grã-Bretanha moderna. O leitor sabe, por exemplo, que a rede tem uma **densidade** relativamente baixa, sugerindo associações ténues e/ou dados originais imcompletos. O leitor sabe que a comunidade está organizada em torno de vários ***hubs*** desproporcionalmente grandes, entre eles fundadores da denominação, como Margaret Fell e George Fox, bem como líderes políticos e religiosos importantes, como William Penn. 
Mais útil, o leitor sabe sobre mulheres com graus relativamente baixos, como Elizabeth Leavens e Mary Penington, que (como resultado de centralidade de intermediação elevada) podem ter agido como ***brokers***, conectando múltiplos grupos. Finalmente, o leitor aprendeu que a rede é feita dum grande **componente** e muitos muito pequenos. No interior desse grande componente, existem várias **comunidades** distintas, algumas das quais parecem organizadas em torno do tempo ou local (como Penn e os seus associados estadunidenses). Por causa dos metadados que adicionou à sua rede, o leitor tem as ferramentas para explorar estas métricas em profundidade e para, potencialmente, explicar alguns dos recursos estruturais que identificou. - -Cada uma destas descobertas é um convite para mais pesquisa ao invés dum ponto final ou prova. A análise de redes é um conjunto de ferramentas para perguntar questões específicas sobre a estrutura das relações num *dataset*, e o NetworkX providencia um interface relativamente simples a muitas das técnicas e métricas comuns. As redes são uma maneira útil de estender a sua pesquisa a um grupo ao providenciar informações sobre a estrutura da comunidade, e nós esperamos que o leitor será inspirado por este tutorial para usar métricas para enriquecer a sua própria pesquisa e para explorar a flexibilidade da análise de redes para além da visualização. - -[^1]: **Nota de tradução**: Como o leitor poderá confirmar mais abaixo, os autores desta lição transformaram os dados aqui analisados num gráfico, sem explicar tal passo, visto que o artigo lida com a análise dos dados, e não com a sua visualização. Se desejar, pode ler também a lição aqui referida e voltar a esta para confirmar se o seu gráfico se assemelha ao dos quatro autores. Aconselhamos que o faça após ter concluído todos os passos aqui descritos. - -[^2]: Em muitos (mas não todos os) casos, `pip` ou `pip3` serão instalados automaticamente com o Python3. 
- -[^3]: **Nota de tradução**: Isto pode estender-se ao uso de comandos, na sua *shell*, nomeadamente aquando da instalação do pip e de pacotes (ver Preparação dos Dados e Instalação do NetworkX). - -[^4]: Algumas instalações só quererão que o leitor digite `pip` sem "3," mas no Python 3, `pip3` é a mais comum. Se um não funcionar, tente o outro! - -[^5]: **Nota de tradução**: É importante lembrar que existem variações entre as diferentes versões do NetworkX que podem resultar em erros ou outputs diferentes. Tal é o caso da 2.6, com a qual obtivemos uma mensagem de erro durante a avaliação da modularidade e uma resposta diferente com a função print(nx.info(G)) daquela apresentada com a 2.4. - -[^6]: Existem algumas técnicas *pythónicas* que este código usa. A primeira é a 'compreensão de lista' (*list comprehensions*), que incorpora *loops* (`for n in nodes`) para criar novas listas (em parêntesis retos), assim: `new_list = [item for item in old_list]`. A segunda é a *list slicing*, que permite-lhe subdividir ou "*slice*" ("cortar") a lista. A notação da *list slicing* `[1:]` toma tudo *exceto* o primeiro item na lista. O 1 informa o Python para começar com o segundo item nesta lista (no Python, o leitor começa a contar do 0), e os dois pontos dizem ao Python para tomar tudo até ao fim da lista. Como a primeira linha em ambas destas listas é a fila de cabeçalho de cada CSV, nós não queremos que esses cabeçalhos sejam incluídos nos nossos dados. - -[^7]: Dicionários são um tipo de dados incorporados no Python, construídos com pares de chave-valor. Pense numa chave como a palavra-chave num dicionário, e o valor como a sua definição. Chaves têm que ser únicas (só uma de cada por dicionário), mas os valores podem ser qualquer coisa. Dicionários são representados por chavetas, com chaves e valores separados por dois pontos: `{key1:value1, key2:value2, ...}`. Dicionários são uma das maneiras mais rápidas de armazenar valores que o leitor pode necessitar mais tarde. 
De facto, um objeto Grafo do NetworkX é, ele próprio, feito de dicionários aninhados. - -[^8]: Note que este código usa parêntesis retos de duas formas. Usa números em parêntesis retos para aceder índices específicos numa lista de nós (por exemplo, o ano de nascimento no `node[4]`), mas também para designar uma *chave* (sempre `node[0]`, o ID) a qualquer um dos nossos dicionários vazios: `dictionary[key] = value`. Conveniente! - -[^9]: Por uma questão de simplicidade, removemos quaisquer nós que *não estão conectados a quaisquer outros* do *dataset* antes de termos começado. Isto foi feito simplesmente para reduzir a desordem, mas também é muito comum de se ver muitos destes nós solteiros no seu *dataset* de rede comum. - -[^10]: Mas mantenha em mente que isto é a densidade de *toda* a rede, incluindo esses componentes não conectados a flutuar em órbita. Existem várias conexões possíveis entre e com eles. Se o leitor tivesse tomado a densidade somente do componente maior, poderia ter obtido um número diferente. O leitor poderia fazê-lo ao encontrar o componente mais largo como nós lhe mostramos na próxima secção sobre o **diâmetro**, e, depois, ao executar o mesmo método de densidade somente nesse componente. - -[^11]: Nós tomamos o comprimento da lista *menos um* porque nós queremos o número de *edges* (ou passos) entre os nós listados aqui, ao invés do número de nós. - -[^12]: A forma mais correta de fazer este tipo de comparação é criar *grafos aleatórios* de tamanho idêntico para ver se as métricas diferem da norma. O NetworkX oferece várias ferramentas para [gerar grafos aleatórios](https://perma.cc/7Z4U-KAY7) (em inglês). - -[^13]: Porque se chama transitividade? O leitor pode recordar-se da propriedade transitiva de Geometria das aulas de Matemática no Ensino Secundário: se A=B e B=C, o A deve ser igual a C. 
Semelhantemente, no fechamento triádico, se a pessoa A conhece a pessoa B e a pessoa B conhece a pessoa C, então a pessoa A provavelmente conhece a pessoa C: logo, transitividade. - -[^14]: Aqueles com experiência em Estatística notarão que grau em redes sociais segue tipicamente uma *lei de potência*, mas isto não é nem pouco usual, nem especialmente útil saber. - -[^15]: Embora não venhamos a cobri-lo neste tutorial, é geralmente boa ideia obter a clasificação modular global primeiro para determinar se o leitor aprenderá qualquer coisa ao repartir a sua rede de acordo com a modularidade. Para ver a classificação geral da modularidade, tome as comunidades que calculou com `communities = community.best_partition(G)` e execute `global_modularity = community.modularity(communities, G)`. E depois basta aplicar `print(global_modularity)`. - -[^16]: **Nota de tradução**: [Plymouth](https://perma.cc/2EKN-TJPW) foi a primeira colónia inglesa permanente na região da Nova Inglaterra, no nordeste dos Estados Unidos da América, tendo sido fundada em 1620 por vários colonos puritanos, entre os quais um tal [William Bradford](https://perma.cc/UA8V-J4CX). Este [outro](https://perma.cc/TW4C-QWUY) referido foi um importante impressor *quaker*. - -[^17]: Em redes grandes, as listas seriam provavelmente ilegivelmente longas, mas o leitor poderia obter uma ideia de todas as classes modulares duma só vez ao visualizar a rede e adicionar cor aos nós baseada na sua classe modular. - -[^18]: Cada formato de ficheiro que é exportável é também importável. Se o leitor tiver um ficheiro GEXF do Gephi que quer pôr no NetworkX, digitaria `G = nx.read_gexf('some_file.gexf')`. - +--- +title: "Explorar e Analisar Dados de Rede com Python" +slug: explorar-analisar-dados-rede-python +original: exploring-and-analyzing-network-data-with-python +layout: lesson +collection: lessons +date: 2017-06-16 +translation_date: 2023-05-12 +authors: +- John R. Ladd +- Jessica Otis +- Christopher N. 
Warren +- Scott Weingart +reviewers: +- Elisa Beshero-Bondar +- Anne Chao +- Qiwei Li +editors: +- Brandon Walsh +translator: +- João Domingues Pereira +translation-editor: +- Eric Brasil +translation-reviewer: +- Josir Cardoso Gomes +- Daniel Alves +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/446 +difficulty: 2 +activity: analyzing +topics: [network-analysis, data-visualization] +abstract: Esta lição introduz métricas de rede e como tirar conclusões das mesmas quando se trabalha com dados de Humanidades. O leitor aprenderá como usar o pacote NetworkX do Python para produzir e trabalhar com estas estatísticas de rede. +avatar_alt: Caminhos-de-ferro intrincados +doi: 10.46430/phpt0041 +modified: 2023-08-25 +lesson-testers: John R. Ladd +tested-date: 2023-08-21 +--- + +{% include toc.html %} + +# Introdução + +## Objetivos da Lição + +Neste tutorial, o leitor irá aprender: +- A usar o pacote [**NetworkX**](https://perma.cc/F574-RREU) para trabalhar com dados de rede em [**Python**](/pt/licoes/introducao-instalacao-python); e +- A analisar dados de rede de Humanidades para encontrar: + - Estruturas de rede e comprimentos de caminho, + - Nós importantes ou centrais, e + - Comunidades e subgrupos. + +**n.b.**: Este é um tutorial para explorar estatísticas e métricas de rede. Assim sendo, iremos focar-nos em maneiras de analisar e tirar conclusões a partir de redes sem visualizá-las. Provavelmente, o leitor quererá uma combinação de visualização e métricas de rede no seu próprio projeto, e, por isso, nós recomendamos este artigo como um complemento a [este tutorial anterior do *Programming Historian*](/en/lessons/creating-network-diagrams-from-historical-sources) (em inglês)[^1]. 
+ +## Pré-Requisitos + +Este tutorial assume que o leitor: + +- Tem uma familiaridade básica com redes e/ou leu [*From Hermeneutics to Data to Networks: Data Extraction and Network Visualization of Historical Sources*](/en/lessons/creating-network-diagrams-from-historical-sources) (em inglês), de Marten Düring, aqui no *Programming Historian*; +- Instalou o Python 3, não o Python 2 que é nativamente instalado em sistemas operacionais com base no Unix, como os Macs (se precisar de assistência com a instalação do Python 3, veja [The Hitchhiker's Guide to Python](https://perma.cc/DP2N-B4EN) (em inglês)); e +- Instalou o instalador de pacotes `pip`[^2]. + +É possível ter duas versões do Python (2 *e* 3) instaladas no seu computador ao mesmo tempo. Por esta razão, ao aceder ao Python 3, o leitor frequentemente terá que o declarar explicitamente digitando `python3` e `pip3` em vez de simplesmente `python` e `pip`. Consulte os tutoriais do *Programming Historian* sobre a [instalação do Python](/pt/licoes/introducao-instalacao-python) e o [uso do pip](/pt/licoes/instalacao-modulos-python-pip) para mais informações[^3]. + +## O Que o Leitor Pode Aprender a Partir dos Dados de Rede? + +Há muito que as redes interessam aos pesquisadores nas Humanidades, mas muitos académicos recentes progrediram dum interesse grandemente qualitativo e metafórico em links e conexões para um séquito mais formal de ferramentas quantitativas para estudar mediadores, *hubs* (nós importantes) e estruturas interconectadas. Como o sociólogo Mark S. Granovetter apontou no seu importante artigo de maio de 1973 [*The Strength of Weak Ties*](https://perma.cc/A4PC-WPKN) (em inglês), raramente é suficiente notar que duas pessoas estavam conectadas uma à outra. Fatores como a sua relação estrutural com outras pessoas e se essas pessoas adicionais estavam, elas próprias, conectadas umas às outras têm influência decisiva nos eventos.
Na medida em que até o mais perspicaz dos académicos tem dificuldade em perceber, digamos, o contorno geral duma rede (a sua "Topologia" de rede) e em identificar os nós mais significativos para conectar grupos, a análise quantitativa de rede oferece aos académicos um modo de transitar relativamente fluidamente entre o objeto social de larga escala (o "grafo") e as particularidades minuciosas das pessoas e laços sociais. + +Este tutorial irá ajudá-lo a responder questões como: +- Qual é a estrutura geral da rede? +- Quem são as pessoas importantes, ou *hubs*, na rede? +- Quais são os subgrupos e comunidades na rede? + + +## O Nosso Exemplo: a Sociedade dos Amigos + +Antes que existissem amigos do Facebook, havia a Sociedade dos Amigos, conhecida como os *quakers*. Fundados na Inglaterra em meados do século XVII, os *quakers* eram cristãos protestantes que divergiram da oficial Igreja da Inglaterra e que promoviam uma ampla tolerância religiosa, preferindo a suposta "luz interior" (*inner light*; **nota de tradução**: este conceito tinha uma extrema importância na Teologia *quaker*) e as consciências dos cristãos à ortodoxia imposta pelo Estado. O número de *quakers* cresceu rapidamente de meados para os finais do século XVII e os seus membros espalharam-se pelas Ilhas Britânicas, pela Europa e pelas colônias do Novo Mundo---especialmente pela Pensilvânia, fundada pelo líder *quaker* William Penn e lar dos quatro autores. + +Visto que os académicos há muito que ligam o crescimento e a persistência dos *quakers* à eficácia das suas redes, os dados usados neste tutorial são uma lista de nomes e relações entre os primevos *quakers* do século XVII. Este *dataset* é derivado do [*Oxford Dictionary of National Biography*](https://www.oxforddnb.com) (em inglês) e do trabalho em progresso do projeto [*Six Degrees of Francis Bacon*](https://www.sixdegreesoffrancisbacon.com) (em inglês), o qual está a reconstruir as redes sociais da Grã-Bretanha moderna (1500-1700). 
+ +# Preparação dos Dados e Instalação do NetworkX + +Antes de iniciar este tutorial, o leitor precisará de fazer o download de dois ficheiros que, combinados, constituem o *dataset* da nossa rede. O ficheiro [quakers_nodelist.csv](/assets/exploring-and-analyzing-network-data-with-python/quakers_nodelist.csv) é uma lista de *quakers* modernos (nós) e o ficheiro [quakers_edgelist.csv](/assets/exploring-and-analyzing-network-data-with-python/quakers_edgelist.csv) é uma lista de relações entre esses *quakers* (*edges*). Para fazer o download destes ficheiros, basta clicar com o botão direito do *mouse* nos *links* e escolher "Guardar ligação como". + +Será extremamente útil ao leitor familiarizar-se com a estrutura do *dataset* antes de continuar. Para mais informações sobre a estrutura geral dos *datasets* de rede, veja [este tutorial](/en/lessons/creating-network-diagrams-from-historical-sources#developing-a-coding-scheme) (em inglês). Quando o leitor abrir o ficheiro de nós no programa da sua escolha, verá que cada *quaker* é primeiramente identificado pelo seu *name* (nome). Cada nó dum *quaker* também tem um número de atributos associados, incluindo *historical significance* (em português, significado histórico), *gender* (em português, género), *birth*/*death dates* (em português, datas de nascimento/morte), e o SDFB ID---um identificador numérico exclusivo que lhe permitirá cruzar nós neste *dataset* com o *dataset* original do *Six Degrees of Francis Bacon*, se desejado. 
Aqui estão as primeiras linhas: + +``` +Name,Historical Significance,Gender,Birthdate,Deathdate,ID +Joseph Wyeth,religious writer,male,1663,1731,10013191 +Alexander Skene of Newtyle,local politician and author,male,1621,1694,10011149 +James Logan,colonial official and scholar,male,1674,1751,10007567 +Dorcas Erbery,Quaker preacher,female,1656,1659,10003983 +Lilias Skene,Quaker preacher and poet,male,1626,1697,10011152 +``` + +Note que, embora as colunas não estejam corretamente alinhadas como ocorre numa tabela de dados, as vírgulas mantêm tudo apropriadamente separado. + +Quando o leitor abrir o ficheiro de *edges*, verá que nós usamos os *names* do ficheiro de nós para identificar os nós conectados por cada *edge*. Estas *edges* começam num nó ***source*** (em português, origem) e acabam num nó ***target*** (em português, destino). Embora esta linguagem derive das chamadas estruturas de rede **direcionadas**, nós usaremos os nossos dados como uma rede **não direcionada**: se a Pessoa A conhece a Pessoa B, então a Pessoa B também deve conhecer a Pessoa A. Nas redes direcionadas, as relações não precisam de ser recíprocas (a Pessoa A pode enviar uma carta à B sem receber uma em troca), mas nas redes não direcionadas as conexões são sempre recíprocas, ou **simétricas**. Uma vez que esta é uma rede de quem conhecia quem ao invés de, digamos, uma rede epistolar, um conjunto de relações não direcionadas é o mais apropriado. As relações simétricas nas redes não direcionadas são úteis sempre que estiver preocupado com relações que definem o mesmo papel para ambas as partes. Dois amigos têm uma relação simétrica: cada um deles é um amigo do outro. O autor e o destinatário duma carta têm uma relação assimétrica porque cada um tem um papel diferente. 
Tanto as redes direcionadas como as não direcionadas têm os seus próprios recursos (e, por vezes, as suas próprias métricas), e o leitor quererá escolher aquela que melhor se adapta aos tipos de relações que está a registrar e às questões que quer clarificar. Aqui estão as primeiras *edges* na rede *quaker* não direcionada: + +``` +Source,Target +George Keith,Robert Barclay +George Keith,Benjamin Furly +George Keith,Anne Conway Viscountess Conway and Killultagh +George Keith,Franciscus Mercurius van Helmont +George Keith,William Penn +``` + +Agora que fez o download dos dados *quakers* e viu como estão estruturados, está na hora de começar a trabalhar com esses dados no Python. Assim que tanto o Python como o pip estiverem instalados (ver Pré-Requisitos, acima), quererá instalar o NetworkX, digitando isto na sua [linha de comandos](/en/lessons/intro-to-bash) (em inglês):[^4] + +```python +pip3 install networkx==3.1 +``` + +Uma nota curta sobre controle de versão: este tutorial usa NetworkX 3.1, mas a biblioteca está em desenvolvimento ativo e é atualizada com frequência. Recomendamos usar o comando de instalação acima para garantir que a sua versão do NetworkX corresponde ao código abaixo (em vez de simplesmente instalar a versão mais recente). Se já tiver uma versão mais antiga do NetworkX instalada, execute `pip3 install networkx==3.1 --upgrade` antes de tentar o tutorial[^5]. + +Está feito! Está preparado para começar a codificar. + +# Começando + +## Ler Ficheiros, Importar Dados + +Inicie um novo ficheiro de texto simples, em branco, no mesmo diretório que os seus ficheiros de dados chamado `quaker_network.py` (para mais detalhes sobre a instalação e execução do Python, ver [este tutorial](/pt/licoes/instalacao-windows)). No topo desse ficheiro, importe as bibliotecas de que precisa. O leitor precisará de três bibliotecas---aquela que acabámos de instalar, e duas bibliotecas incorporadas no Python. 
Pode digitar: + +```python +import csv +from operator import itemgetter +import networkx as nx +from networkx.algorithms import community # Esta parte do NetworkX, para a deteção de comunidades, precisa de ser importada separadamente. +``` + +Agora pode ordenar ao programa para ler os seus ficheiros de CSV e retirar os dados de que precisa. Ironicamente, ler ficheiros e reorganizar os dados geralmente requer um código mais complexo que as funções para executar uma análise de redes sociais, portanto pedimos que tenha paciência connosco ao longo deste primeiro bloco de código. Aqui está um conjunto de comandos para abrir e ler os ficheiros das nossas listas de nós e de *edges*: + +```python +with open('quakers_nodelist.csv', 'r') as nodecsv: # Abra o ficheiro + nodereader = csv.reader(nodecsv) # Leia o CSV + # Retire os dados (usando a list comprehension e a list slicing do Python para remover a linha de cabeçalho, veja a nota de rodapé 6) + nodes = [n for n in nodereader][1:] + +node_names = [n[0] for n in nodes] # Obtenha uma lista apenas dos nomes dos nós + +with open('quakers_edgelist.csv', 'r') as edgecsv: # Abra o ficheiro + edgereader = csv.reader(edgecsv) # Leia o CSV + edges = [tuple(e) for e in edgereader][1:] # Retire os dados +``` + +Este código executa funções similares às [deste tutorial](/pt/licoes/trabalhando-ficheiros-texto-python), mas usa o módulo CSV para carregar os seus nós e *edges*. Mais tarde, o leitor voltará a atuar sobre os dados e obterá mais informação sobre os nós, mas, por agora, precisa de duas coisas: a lista completa de nós e uma lista de pares *edges* (como énuplos de nós)[^6]. Estas são as formas de que o NetworkX precisará para criar um "objeto grafo", um tipo de dados especial do NetworkX sobre o qual o leitor aprenderá na próxima secção. 
+ +Nesta fase, antes de começar a usar o NetworkX, o leitor pode fazer algumas verificações de sanidade básicas para se certificar que os seus dados foram corretamente carregados usando funções e métodos incorporados no Python. Digitando: + +```python +print(len(node_names)) +``` + +e: + +```python +print(len(edges)) +``` + +e, depois, executando o seu *script* lhe mostrará quantos nós e *edges* carregou com sucesso no Python. Se o leitor vir 119 nós e 174 *edges*, então tem todos os dados necessários. + + +## Noções Básicas do NetworkX: Criar o Grafo + +Agora o leitor tem os seus dados como duas listas do Python: uma lista de nós (`node_names`) e uma lista de *edges* (`edges`). No NetworkX, o leitor pode juntar estas duas listas num só objeto rede que compreende como os nós e as *edges* se relacionam. Este objeto é chamado de **Grafo**, referindo-se a um dos termos comuns para dados organizados como uma rede **n.b.**: não se refere a alguma representação visual dos dados. Aqui, grafo é usado puramente num sentido matemático, de análise de rede. Primeiro, o leitor deve *inicializar* um objeto Grafo com o seguinte comando: + +```python +G = nx.Graph() +``` + +> **Nota de tradução**: em inglês, 'gráfico' pode ser traduzido como '*graphic*' ou, de forma diminutiva, como '*graph*', que também pode significar 'grafo', o termo aqui referido. Esta homografia não ocorre no português. + +Isto criará um novo objeto grafo, *G*, com nada nele. Agora, o leitor pode adicionar as suas listas de nós e de *edges* assim: + +```python +G.add_nodes_from(node_names) +G.add_edges_from(edges) +``` + +Esta é uma de várias maneiras de adicionar dados a um objeto rede. O leitor pode verificar a [documentação do NetworkX](https://perma.cc/3QVU-FLPF) (em inglês) para obter mais informações sobre como adicionar *weighted edges*, ou adicionar nós e *edges* uma de cada vez. 
+ +Finalmente, o leitor pode obter informação básica sobre a sua rede recém-criada usando a função `info`: + +```python +print(G) +``` + +A função `info` informa o tipo da sua rede (neste caso, é um objeto Graph padrão) e o número de nós e arestas na mesma. O _output_ deve ser parecido a este: + +``` +Name: +Type: Graph +Number of nodes: 119 +Number of edges: 174 +Average degree: 2.9244 +``` + +Esta é uma forma rápida de obter informação geral sobre o seu grafo, mas como o leitor aprenderá em secções subsequentes, está apenas a passar pela superfície do que o NetworkX lhe pode indicar sobre os seus dados. + +Para recapitular, de momento o seu *script* será semelhante a isto: + +```python +import csv +from operator import itemgetter +import networkx as nx +from networkx.algorithms import community + +# Leia no ficheiro da lista de nós +with open('quakers_nodelist.csv', 'r') as nodecsv: + nodereader = csv.reader(nodecsv) + nodes = [n for n in nodereader][1:] + +# Obtenha uma lista apenas dos nomes dos nós (o primeiro item em cada linha) +node_names = [n[0] for n in nodes] + +# Leia no ficheiro da lista de edges +with open('quakers_edgelist.csv', 'r') as edgecsv: + edgereader = csv.reader(edgecsv) + edges = [tuple(e) for e in edgereader][1:] + +# Obtenha o número de nós e de edges nas nossas duas listas +print(len(node_names)) +print(len(edges)) + +G = nx.Graph() # Inicialize um objeto Grafo +G.add_nodes_from(node_names) # Adicione nós ao Grafo +G.add_edges_from(edges) # Adicione edges ao Grafo +print(G) # Obtenha informação sobre o Grafo +``` + +Até agora, o leitor leu dados de nós e de *edges* no Python a partir de ficheiros CSV, e, depois, contou esses nós e *edges*. Depois disso, o leitor criou um objeto grafo usando o NetworkX e carregou os seus dados para esse objeto. + +## Adicionar Atributos + +Para o NetworkX, um objeto grafo é uma coisa grande (a sua rede) composta por dois tipos de coisas mais pequenas (os seus nós e as suas *edges*). 
Até agora, o leitor carregou nós e *edges* (como pares de nós), mas o NetworkX permite-lhe adicionar *atributos* tanto aos nós como às *edges*, providenciando mais informação sobre cada um deles. Mais à frente neste tutorial, o leitor executará métricas e adicionará alguns dos resultados de volta ao Grafo como atributos. Por agora, vamos certificar-nos que o seu Grafo contém todos os atributos que estão atualmente no seu CSV. + +O leitor quererá retornar a uma lista que criou no início do seu *script*: `nodes`. Esta lista contém todas as linhas do `quakers_nodelist.csv`, incluindo colunas para o *name*, a *historical significance*, o *gender*, o *birth year*, o *death year* e o SDFB ID. O leitor quererá iterar por esta lista e adicionar esta informação ao nosso grafo. Existem algumas maneiras de fazer isto, mas o NetworkX providencia duas funções convenientes para adicionar atributos a todos os nós e *edges* dum Grafo duma só vez: `nx.set_node_attributes()` e `nx.set_edge_attributes()`. Para usar estas funções, o leitor irá precisar que os seus dados de atributos estejam na forma dum *dicionário* Python, no qual os nomes dos nós são as *chaves* e os atributos que quer adicionar são os *valores*[^7]. O leitor quererá criar um dicionário para cada um dos seus atributos, e, depois, adicioná-los usando as funções acima. A primeira coisa que o leitor deve fazer é criar cinco dicionários em branco, usando chavetas: + +```python +hist_sig_dict = {} +gender_dict = {} +birth_dict = {} +death_dict = {} +id_dict = {} +``` + +Agora nós podemos fazer o *loop* através da nossa lista de `nodes` e adicionar os itens apropriados a cada dicionário. Nós fazemos isto sabendo antecipadamente a posição, ou *índice*, de cada atributo. Porque o nosso ficheiro `quakers_nodelist.csv` está bem organizado, nós sabemos que o *name* da pessoa será sempre o primeiro item na lista: índice 0, visto que começamos sempre a contar do 0 no Python.
A *historical significance* da pessoa será o índice 1, o seu *gender* será o índice 2, e assim por diante. Portanto, nós podemos construir os nossos dicionários desta forma[^8]: + +```python +for node in nodes: # Itere pela lista, uma linha de cada vez + hist_sig_dict[node[0]] = node[1] + gender_dict[node[0]] = node[2] + birth_dict[node[0]] = node[3] + death_dict[node[0]] = node[4] + id_dict[node[0]] = node[5] +``` + +Agora o leitor tem um conjunto de dicionários que pode usar para adicionar atributos a nós no seu objeto Grafo. A função `set_node_attributes` toma três variáveis: o Grafo ao qual o leitor está a adicionar o atributo, o dicionário de pares id-atributo, e o nome do novo atributo. O código para adicionar os seus seis atributos assemelha-se a isto: + +```python +nx.set_node_attributes(G, hist_sig_dict, 'historical_significance') +nx.set_node_attributes(G, gender_dict, 'gender') +nx.set_node_attributes(G, birth_dict, 'birth_year') +nx.set_node_attributes(G, death_dict, 'death_year') +nx.set_node_attributes(G, id_dict, 'sdfb_id') +``` + +Agora todos os seus nós têm estes seis atributos, e o leitor pode aceder a eles a qualquer momento. Por exemplo, o leitor pode obter todos os *birth years* dos seus nós iterando por eles e acedendo ao atributo `birth_year`, assim: + +```python +for n in G.nodes(): # Itere por cada nó, entre os nossos dados "n" estará o nome da pessoa + print(n, G.nodes[n]['birth_year']) # Aceda a cada nó pelo seu nome, e, depois, pelo atributo "birth_year" +``` + +A partir desta instrução, o leitor obterá uma linha de *output* para cada nó na rede. 
Deve parecer-se como uma simples lista de nomes e anos: + +``` +Anne Camm 1627 +Sir Charles Wager 1666 +John Bellers 1654 +Dorcas Erbery 1656 +Mary Pennyman 1630 +Humphrey Woolrich 1633 +John Stubbs 1618 +Richard Hubberthorne 1628 +Robert Barclay 1648 +William Coddington 1601 +``` + +Os passos acima são um método comum para adicionar atributos a nós que o leitor usará repetidamente mais tarde neste tutorial. Aqui está uma recapitulação do bloco de código desta secção: + +```python +# Crie um dicionário em branco para cada atributo +hist_sig_dict = {} +gender_dict = {} +birth_dict = {} +death_dict = {} +id_dict = {} + +for node in nodes: # Itere pela lista de nós, uma linha de cada vez + hist_sig_dict[node[0]] = node[1] # Aceda ao item correto, adicione-o ao dicionário correspondente + gender_dict[node[0]] = node[2] + birth_dict[node[0]] = node[3] + death_dict[node[0]] = node[4] + id_dict[node[0]] = node[5] + +# Adicione cada dicionário como um atributo de nó ao objeto Grafo +nx.set_node_attributes(G, hist_sig_dict, 'historical_significance') +nx.set_node_attributes(G, gender_dict, 'gender') +nx.set_node_attributes(G, birth_dict, 'birth_year') +nx.set_node_attributes(G, death_dict, 'death_year') +nx.set_node_attributes(G, id_dict, 'sdfb_id') + +# Itere por cada nó, para aceder e obter todos os atributos "birth_year" +for n in G.nodes(): + print(n, G.nodes[n]['birth_year']) +``` + +Agora o leitor aprendeu como criar um objeto Grafo e adicionar atributos ao mesmo. Nesta próxima secção, o leitor aprenderá sobre uma variedade de métricas disponíveis no NetworkX e como aceder às mesmas. Mas relaxe, acabou de aprender o maior parte do código de que precisará para o resto do tutorial! + +# Métricas Disponíveis no NetworkX + +Quando o leitor começa a trabalhar num novo *dataset*, é uma boa ideia obter uma visão geral dos dados. A primeira etapa, descrita acima, consiste simplesmente em abrir os ficheiros e ver o que está lá dentro. 
Porque é uma rede, o leitor sabe que existirão nós e *edges*, mas quantos de cada um existem? Que informação está anexada a cada nó ou *edge*? + +No nosso caso, existem 174 *edges* e 119 nós. Estas *edges* não têm direções (isto é, existe uma relação simétrica entre pessoas), nem incluem informação adicional. Para os nós, nós sabemos os seus *names*, a sua *historical significance*, o seu *genders*, a sua *birth date* e *death date*, e o SDFB ID. + +Estes detalhes informam o que o leitor pode ou devia fazer com o seu *dataset*. Muito poucos nós (digamos, 15), e uma análise de rede é menos útil que desenhar uma imagem ou fazer algumas leituras; demasiados (digamos, 15 milhões), e o leitor deveria considerar começar com um subconjunto ou encontrar um supercomputador. + +As propriedades da rede também guiam a sua análise. Porque esta rede é **não direcionada**, a sua análise tem que usar métricas que exigem *edges* simétricas entre nós. Por exemplo, o leitor pode determinar em que comunidades as pessoas se encontram, mas não pode determinar as rotas *direcionais* pelas quais a informação poderá fluir ao longo da rede (precisaria duma rede direcionada para isso). Ao usar as relações simétricas e não direcionadas neste caso, o leitor será capaz de encontrar subcomunidades e as pessoas que são importantes nessas comunidades, um processo que seria mais difícil (embora ainda possível) com uma rede direcionada. O NetworkX permite-lhe realizar a maior parte das análises que o leitor pode conceber, mas deve compreender as possibilidades do seu *dataset* e perceber que alguns algoritmos do NetworkX são mais apropriados do que outros. + +### O Formato da Rede + +Após ver a aparência do *dataset*, é importante ver a aparência da *rede*. Estas são coisas diferentes. O *dataset* é uma representação abstrata do que o leitor assume serem conexões entre entidades; a rede é a instanciação específica dessas suposições. 
A rede, pelo menos neste contexto, é como o computador lê as conexões que o leitor codificou num *dataset*. A rede tem uma [Topologia](https://perma.cc/8M84-GESG), ou uma forma conectiva, que pode ser centralizada ou descentralizada; densa ou esparsa; cíclica ou linear. Um *dataset* não tem, fora da estrutura da tabela na qual está digitado. + +O formato e as propriedades básicas da rede irão dar-lhe uma ideia sobre com o que está a trabalhar e que análises parecem razoáveis. O leitor já sabe o número de nós e de *edges*, mas a que a rede se 'assemelha'? Os nós agrupam-se, ou estão espalhados de forma regular? Existem estruturas complexas, ou cada nó está organizado numa linha reta? + +A visualização abaixo, criada na ferramenta de visualização de redes [Gephi](https://gephi.org/), dar-lhe-á uma ideia da Topologia desta rede[^9]. O leitor poderia criar um gráfico similar no Palladio usando [este tutorial](/en/lessons/creating-network-diagrams-from-historical-sources) (em inglês). + +{% include figure.html filename="exploring-and-analyzing-network-data-with-python-1.png" alt="Imagem com uma representação de um gráfico de redes" caption="Visualização de rede baseada em força dos dados *quakers*, criado no Gephi." %} + +Existem várias formas de visualizar uma rede, e um [*layout* baseado em força](https://perma.cc/AM7G-BTWV) (em inglês), do qual a imagem acima é um exemplo, encontra-se entre as mais comuns. Grafos baseados em força tentam encontrar o posicionamento ideal para nós com uma calculação baseada na [tensão de cordas segundo a Lei de Hooke](https://perma.cc/2RTL-CYVL) (em inglês), a qual, para grafos mais pequenos, normalmente cria visualizações limpas e de leitura fácil. A visualização embutida acima mostra-lhe que existe um único grande **componente** de nós conectados (no centro) e vários componentes pequenos com apenas uma ou duas conexões nas periferias. Esta é uma estrutura de rede relativamente comum. 
Saber que existem múltiplos componentes na rede irá limitar de forma útil as calculações que o leitor quererá realizar nela. Ao dispor o número de conexões (conhecidas como **grau**, ver abaixo) como o tamanho dos nós, a visualização também mostra que existem alguns nós com muitas conexões que mantêm o componente central intricado. Estes grandes nós são conhecidos como ***hubs***, e o facto de eles aparecerem tão claramente aqui dá-lhe uma pista em relação ao que o leitor encontrará quando medir a **centralidade** na próxima secção. + +Visualizações, no entanto, apenas o levam até certo ponto. Com quantas mais redes trabalhar, mais o leitor se aperceberá que a maior parte parece similar o suficiente ao ponto de ser difícil distinguir uma da outra. Métricas quantitativas deixam-no diferenciar redes, aprender sobre as suas Topologias, e tornar uma confusão de nós e *edges* em algo a partir do qual se pode aprender. + +Uma boa métrica com a qual começar é a **densidade** de rede. Isto é, simplesmente, o rácio de *edges* reais na rede face a todas as *edges* possíveis na rede. Numa rede não direcionada como esta, *poderia* haver uma única *edge* entre quaisquer dois nós, mas como o leitor viu na visualização, apenas algumas dessas *edges* possíveis estão realmente presentes. A densidade de rede dá-lhe uma ideia rápida do quão intimamente próxima a sua rede é. + +E as boas notícias são que muitas destas métricas requerem comandos simples e unilineares no Python. Daqui para a frente, o leitor pode continuar a construir o seu bloco de código das secções anteriores. O leitor não tem de apagar nada que já tenha digitado, e porque criou o seu objeto rede `G` no bloco de código acima, todas as métricas a partir daqui devem trabalhar corretamente. + +O leitor pode calcular a densidade da rede executando `nx.density(G)`. 
No entanto, a melhor maneira de fazer isto é armazenar a sua métrica numa variável para referência futura, e imprimir essa variável, como: + +```python +density = nx.density(G) +print("Network density:", density) +``` + +O *output* da densidade é um número, então é isso que o leitor verá quando imprimir o valor. Neste caso, a densidade da nossa rede é, aproximadamente, 0.0248. Numa escala de 0 a 1, não é uma rede muito densa, o que confere com o que o leitor consegue ver na visualização[^10]. Um 0 significaria que não existem quaisquer conexões de todo, e um 1 indicaria que todas as *edges possíveis* estão presentes (uma rede perfeitamente conectada): esta rede *quaker* está na extremidade inferior dessa escala, mas, mesmo assim, longe do 0. + +Uma medida de caminho mais curta é um pouco mais complexa. Ela calcula a série mais curta possível de nós e *edges* que se situam entre quaisquer dois nós, algo difícil de ver em visualizações de grandes redes. Esta medida corresponde, essencialmente, a encontrar amigos de amigos---se a minha mãe conhece alguém que eu não conheço, então a minha mãe é o caminho mais curto entre mim e essa pessoa. O jogo *Six Degrees of Kevin Bacon*, a partir do qual o [nosso projeto](https://sixdegreesoffrancisbacon.com/) (em inglês) retira o nome, é basicamente um jogo que consiste em encontrar os caminhos mais curtos (com um **comprimento de caminho** de seis ou menos) de Kevin Bacon a qualquer outro ator. + +Para calcular um caminho mais curto, o leitor precisa de passar por várias variáveis de *input* (informação que dá a uma função do Python): o grafo inteiro, o seu nó *source*, e o seu nó *target*. Vamos procurar o caminho mais curto entre Margaret Fell e George Whitehead. Como usámos nomes para identificar unicamente os nossos nós nesta rede, o leitor pode aceder a esses nós (como a ***source*** e o ***target*** do seu caminho) usando os nomes diretamente. 
+ +```python +fell_whitehead_path = nx.shortest_path(G, source="Margaret Fell", target="George Whitehead") + +print("Shortest path between Fell and Whitehead:", fell_whitehead_path) +``` + +Dependendo do tamanho da sua rede, isto pode demorar algum tempo para calcular, visto que o Python primeiro encontra todos os caminhos possíveis e depois escolhe o mais curto. O *output* de `shortest_path` será uma lista dos nós que inclui a "source" (Fell), o "target" (Whitehead), e os nós entre eles. Neste caso, nós podemos ver que o fundador dos *quakers*, George Fox, se encontra no caminho mais curto entre eles. Como Fox é também um ***hub*** (ver centralidade de grau, abaixo) com muitas conexões, nós podemos supor que vários caminhos mais curtos passam por ele como mediador. O que é que isto pode indicar sobre a importância dos fundadores dos *quakers* para a sua rede social? + +O Python inclui várias ferramentas que calculam os caminhos mais curtos. Existem funções para os comprimentos dos caminhos mais curtos, para todos os caminhos mais curtos, e para saber se um caminho existe ou não de todo na [documentação](https://perma.cc/3MJE-7MQQ) (em inglês). O leitor poderia usar uma função separada para encontrar o comprimento do caminho *Fell-Whitehead* que acabámos de calcular, ou poderia simplesmente tomar o comprimento da lista menos um[^11], assim: + +```python +print("Length of that path:", len(fell_whitehead_path)-1) +``` + +Existem muitas métricas de rede derivadas dos comprimentos de caminho mais curtos. Uma tal medida é o **diâmetro**, que é o mais longo de todos os caminhos mais curtos. Depois de calcular todos os caminhos mais curtos entre cada par de nós possível na rede, o diâmetro é o comprimento do caminho entre os dois nós que estão mais afastados. A medida está projetada para lhe dar um senso do tamanho geral da rede, a distância duma extremidade da rede à outra. + +O diâmetro usa um comando simples: `nx.diameter(G)`. 
No entanto, executar este comando no grafo *quaker* dará uma mensagem de erro indicando que o Grafo não está conectado ("*not connected*"). Isto significa apenas que o seu grafo, como o leitor já viu, tem mais que um componente. Porque existem alguns nós que não têm um caminho de todo com outros, é impossível encontrar todos os caminhos mais curtos. Veja novamente a visualização do seu grafo: + +{% include figure.html filename="exploring-and-analyzing-network-data-with-python-1.png" alt="Imagem com uma representação de um gráfico de redes" caption="Visualização de rede baseada em força dos dados *quakers*, criado no Gephi." %} + +Como não há caminho entre nós dum componente e nós doutro, `nx.diameter()` retorna a mensagem de erro "*not connected*". O leitor pode remediar isto, primeiro, ao descobrir se o seu Grafo está conectado ("*is connected*") (*i.e.* tudo um componente) e, se não conectado, descobrir apenas o componente mais largo e calcular o diâmetro somente desse componente. Aqui está o código: + +```python +# Se o seu Grafo tiver mais do que um componente, isto retornará como 'False' +print(nx.is_connected(G)) + +# A seguir, use nx.connected_components para obter a lista de componentes, +# depois, use o comando max() para encontrar o mais pesado: +components = nx.connected_components(G) +largest_component = max(components, key=len) + +# Crie um 'Subgrafo' apenas com o componente mais pesado, +# depois, calcule o diâmetro do Subgrafo, tal como fez com a densidade. + +subgraph = G.subgraph(largest_component) +diameter = nx.diameter(subgraph) +print("Network diameter of largest component:", diameter) +``` + +Como nós tomámos o componente mais largo, nós podemos assumir que não há nenhum diâmetro mais largo para os outros componentes. Portanto, esta figura é uma boa representação para o diâmetro de todo o Grafo. O diâmetro de rede do componente mais largo desta rede é 8: existe um comprimento de rede de 8 entre os dois nós mais afastados na rede. 
Ao contrário da densidade, que é apresentada de 0 a 1, é difícil saber a partir deste número somente se 8 é um diâmetro largo ou curto. Para algumas métricas globais, pode ser melhor compará-lo a redes de tamanho e forma similar[^12]. + +O cálculo estrutural final que o leitor fará nesta rede concerne o conceito de **fechamento triádico**. Fechamento triádico supõe que se duas pessoas conhecem a mesma pessoa, elas provavelmente conhecem-se mutuamente. Se Fox conhece tanto Fell como Whitehead, então Fell e Whitehead podem perfeitamente conhecer-se mutuamente, completando um **triângulo** na visualização de três *edges* conectando Fox, Fell e Whitehead. O número destes triângulos fechados na rede pode ser usado para descobrir aglomerados e comunidades de indivíduos que se conhecem todos intimamente. + +Uma forma de medir o fechamento triádico é o chamado **coeficiente de aglomeração** por causa desta tendência aglomeradora, mas a medida estrutural de rede que o leitor aprenderá é conhecida como **transitividade**[^13]. Transitividade é o rácio de todos os triângulos sobre todos os triângulos possíveis. Um triângulo possível existe quando uma pessoa (Fox) conhece duas pessoas (Fell e Whitehead). Então, transitividade, como a densidade, expressa quão interconectado um grafo é em termos dum rácio de conexões reais sobre as possíveis. Lembre-se, medidas como a transitividade e a densidade lidam com *probabilidades* e não com *certezas*. Todos os *outputs* do seu *script* no Python devem ser interpretados, como qualquer outro objeto de pesquisa. A transitividade permite-lhe uma forma de pensar sobre todas as relações no seu grafo que *podem* existir, mas que, atualmente, não existem. 
+ +O leitor pode calcular a transitividade numa só linha, da mesma forma que calculou a densidade: + +```python +triadic_closure = nx.transitivity(G) +print("Triadic closure:", triadic_closure) +``` + +Tal como a densidade, transitividade é numerada de 0 a 1, e o leitor pode ver que a transitividade da rede é de cerca de 0.1694, um valor um pouco mais alto que o da sua densidade de 0.0248. Porque o grafo não é muito denso, existem menos *triângulos possíveis*, o que pode resultar numa transitividade relativamente mais elevada. Isto é, nós que já têm várias conexões farão provavelmente parte destes triângulos fechados. Para suportar isto, o leitor quererá saber mais sobre nós com muitas conexões. + +## Centralidade + +Depois de obter algumas medidas básicas da estrutura da rede inteira, um bom próximo passo é descobrir quais nós são os mais importantes na sua rede. Na análise de redes, medidas da importância dos nós são referidas como medidas de **centralidade**. Porque existem várias maneiras de abordar a questão "Que nós são os mais importantes?", existem várias formas diferentes de calcular a centralidade. Aqui, o leitor aprenderá sobre as três medidas de centralidade mais comuns: o grau, a centralidade de intermediação, e a centralidade adjacente. + +O **grau** é a forma mais simples e comum de encontrar nós importantes. O grau dum nó é a soma das suas *edges*. Se um nó tem três linhas a estenderem-se a outros nós, o seu grau é de três. Cinco *edges*, o seu grau é de cinco. É extremamente simples. Como cada uma dessas edges terá sempre um nó na outra extremidade, o leitor pode pensar no grau como o número de pessoas às quais qualquer pessoa está diretamente conectada. Os nós com os graus mais elevados numa rede social são as pessoas que conhecem mais pessoas. Estes nós são geralmente referidos como ***hubs***, e calcular o grau é a forma mais rápida de identificar os *hubs*. 
+ +Calcular a centralidade para cada nó no NetworkX não é exatamente tão simples como as métricas de toda a rede acima, mas continua a envolver comandos unilineares. Todos os comandos de centralidade que o leitor aprenderá nesta secção produzem dicionários nos quais as chaves são os nós e os valores são as medidas de centralidade. Isto significa que eles estão prontos para adicionar de volta à nossa rede como um atributo de nó, como o leitor fez na última secção. Comece por calcular o grau e adicione-o como um atributo à sua rede. + +```python +degree_dict = dict(G.degree(G.nodes())) +nx.set_node_attributes(G, degree_dict, 'degree') +``` + +O leitor acabou de executar o método `G.degree()` na lista completa de nós na sua rede (`G.nodes()`). Como o leitor adicionou-o como um atributo, agora pode ver o grau de William Penn, bem como com o resto da sua informação se aceder ao seu nó diretamente: + +```python +print(G.nodes['William Penn']) +``` + +Mas estes resultados são úteis para mais do que simplesmente adicionar atributos ao seu objeto Grafo. Como o leitor já está no Python, pode organizar e compará-los. O leitor pode usar a função incorporada `sorted()` para organizar um dicionário com as suas chaves ou valores e encontrar o *top* vinte dos nós por grau. Para fazer isto, o leitor vai precisar de usar `itemgetter`, o qual nós importámos no início do tutorial. Usando `sorted` e `itemgetter`, pode organizar o dicionário de graus assim: + +```python +sorted_degree = sorted(degree_dict.items(), key=itemgetter(1), reverse=True) +``` + +Aqui, há muitas coisas a acontecer nos bastidores, mas concentre-se só nas três variáveis de *input* que o leitor deu a `sorted()`. A primeira é o dicionário, `degree_dict.items()`, que quer organizar. A segunda é o que organizar por: neste caso, item "1" é o segundo item no par, ou o valor do seu dicionário. 
Finalmente, o leitor diz a `sorted()` para ir em `reverse` para que os nós de grau mais elevado apareçam primeiro na lista resultante. Assim que o leitor tiver criado esta lista organizada, pode iterar por ela e usar a *list slicing*[^6] para obter somente os primeiros 20 nós: + +```python +print("Top 20 nodes by degree:") +for d in sorted_degree[:20]: + print(d) +``` + +Como o leitor pode ver, o grau de Penn é 18, relativamente elevado para esta rede. Mas digitar estas informações de classificação ilustra as limitações do grau como uma medida de centralidade. O leitor provavelmente não precisava que o NetworkX lhe dissesse que William Penn, líder *quaker* e fundador da Pensilvânia, era importante. A maioria das redes sociais terão somente alguns *hubs* de grau muito elevado, com o resto de grau similar e baixo[^14]. O grau pode informá-lo sobre os maiores *hubs*, mas não pode dizer-lhe muito sobre o resto dos nós. E, em muitos casos, esses *hubs* sobre os quais o está a informar (como o Penn ou como a cofundadora do Quakerismo, Margaret Fell, com um grau de 13) não são especialmente surpreendentes. Neste caso, quase todos os *hubs* são fundadores da religião ou, noutros casos, figuras políticas importantes. + +Felizmente, existem outras medidas de centralidade que lhe podem dizer mais do que só os *hubs*. A [centralidade adjacente](https://perma.cc/VF28-JDCR) (em inglês) é um tipo de extensão do grau---analisa uma combinação das *edges* dum nó e as *edges* dos vizinhos desse nó. Centralidade adjacente preocupa-se se um nó é um *hub*, mas também se preocupa com quantos *hubs* um nó está conectado. É calculada como um valor de 0 a 1: quanto mais próximo do um, maior a centralidade. A centralidade adjacente é útil para compreender que nós podem obter informação a outros nós rapidamente. Se o leitor conhece muitas pessoas bem-conectadas, poderia espalhar uma mensagem muito eficientemente. 
Se o leitor usou o Google, então está já mais ou menos familiarizado com a centralidade adjacente. O seu algoritmo de PageRank usa uma extensão desta fórmula para decidir que páginas de internet são colocadas no topo da lista de resultados. + +A [centralidade de intermediação](https://perma.cc/C55J-7XAJ) (em inglês) é um pouco diferente das outras duas calculações na medida em que não se preocupa com o número de *edges* que qualquer nó ou grupo de nós tem. A centralidade de intermediação observa todos os **caminhos mais curtos** que passam por um nó em particular (ver acima). Para fazer isto, tem que primeiro calcular todos os possíveis caminhos mais curtos na sua rede, por isso mantenha em mente que a centralidade de intermediação vai demorar mais tempo para calcular que as outras medidas de centralidade (mas não será um problema num *dataset* desta dimensão). A centralidade de intermediação, que também é expressa numa escala de 0 a 1, é particularmente boa a encontrar nós que conectam duas partes distintas duma rede. Se o leitor é a única coisa conectando dois aglomerados, cada comunicação entre esses aglomerados tem que passar por si. Em contraste com um *hub*, este tipo de nó é regularmente referido como um ***broker***. A centralidade de intermediação não é a única maneira de encontrar *brokerage* (e outros métodos são mais sistemáticos), mas é uma forma rápida de lhe dar uma ideia de quais nós são importantes, não porque têm muitas conexões eles próprios, mas porque eles situam-se *entre* grupos, dando à rede conectividade e coesão. + +Estas duas medidas de centralidade são ainda mais simples de executar que um grau---eles não precisam de receber uma lista de nós, só o grafo `G`. 
O leitor pode executá-las com estas funções: + +```python +betweenness_dict = nx.betweenness_centrality(G) # Execute a centralidade de intermediação +eigenvector_dict = nx.eigenvector_centrality(G) # Execute a centralidade adjacente + +# Atribua cada a um atributo na sua rede +nx.set_node_attributes(G, betweenness_dict, 'betweenness') +nx.set_node_attributes(G, eigenvector_dict, 'eigenvector') +``` + +O leitor pode organizar a centralidade de intermediação (ou a adjacente) ao mudar os nomes das variáveis no código organizador acima, como: + +```python +sorted_betweenness = sorted(betweenness_dict.items(), key=itemgetter(1), reverse=True) + +print("Top 20 nodes by betweenness centrality:") +for b in sorted_betweenness[:20]: + print(b) +``` + +O leitor notará que muitos, mas não todos, dos nós que têm graus elevados também têm uma centralidade de intermediação alta. De facto, centralidade de intermediação apresenta duas mulheres, Elizabeth Leavens e Mary Penington, cuja importância tinha sido obscurecida pela métrica da centralidade de grau. Uma vantagem de fazer estes cálculos no Python é que o leitor pode rapidamente comparar dois conjuntos de cálculos. E se o leitor quiser saber quais dos nós com alta centralidade de intermediação têm graus baixos? Isto é o mesmo que dizer: quais nós de alta intermediação são inesperados? Pode usar uma combinação da lista organizada acima: + +```python +# Primeiro, obtenha uma lista do top 20 nós por intermediação +top_betweenness = sorted_betweenness[:20] + +# Depois, encontre e obtenha o grau de cada um +for tb in top_betweenness: # Itere por top_betweenness + degree = degree_dict[tb[0]] # Use degree_dict para aceder ao grau dum nó, veja a nota de rodapé 4 + print("Name:", tb[0], "| Betweenness Centrality:", tb[1], "| Degree:", degree) +``` + +O leitor pode confirmar a partir destes resultados que algumas pessoas, como Leavens e Penington, têm alta centralidade de intermediação, mas baixo grau. 
Isto pode significar que estas mulheres eram *brokers* importantes, conectando partes díspares do grafo. O leitor também pode aprender coisas inesperadas sobre pessoas sobre as quais já se sabe algo---nesta lista, consegue ver que Penn tem um grau inferior ao do fundador *quaker* George Fox, mas uma centralidade de intermediação mais elevada. Isto é o mesmo que dizer, simplesmente conhecer mais pessoas não é tudo. + +Isto aborda somente a superfície do que pode ser feito com métricas de rede no Python. O NetworkX oferece dezenas de funções e medidas para o leitor usar em várias combinações, e pode usar Python para estender estas medidas de formas quase ilimitadas. Uma linguagem de programação como o Python ou o R dar-lhe-á a flexibilidade para explorar a sua rede computacionalmente de formas que outros *interfaces* não podem ao permitir-lhe combinar e comparar os resultados estatísticos da sua rede com outros atributos dos seus dados (como as datas e ocupações que adicionou à rede no início deste tutorial!). + +## Noções Avançadas do NetworkX: Deteção de Comunidades com Modularidade + +Outra coisa regularmente questionada sobre o *dataset* duma rede é quais são os subgrupos e comunidades dentro da estrutura social mais larga. A sua rede é uma família grande e feliz na qual todos se conhecem? Ou é uma coleção de subgrupos mais pequenos que estão conectados por um ou dois intermediários? O campo da deteção de comunidades em redes está desenhado para responder a estas questões. Existem várias formas de calcular comunidades, cliques, e aglomerados na sua rede, mas o método mais popular atualmente é a **modularidade**. A modularidade é uma medida de densidade relativa na sua rede: uma comunidade (chamada um **módulo** ou **classe** modular) tem uma densidade elevada em relação a outros nós dentro do seu módulo, mas densidade baixa com os outros de fora. 
A modularidade dá-lhe uma pontuação geral de quão fracionada a sua rede é, e essa pontuação pode ser usada para **repartir** a rede e evidenciar as comunidades individuais[^15]. + +Redes muito densas são geralmente mais difíceis de dividir em repartições sensatas. Felizmente, como o leitor descobriu anteriormente, esta rede não é assim tão densa. Não existem tantas conexões reais quanto conexões possíveis, e existem componentes desconectados de todo. Vale a pena repartir esta rede esparsa com modularidade e ver se os resultados fazem sentido histórico e analítico. + +A deteção e repartição de comunidades no NetworkX requer um pouco mais de configuração do que algumas das outras métricas. Existem algumas abordagens incorporadas para a deteção de comunidades (como o [*minimum cut*](https://perma.cc/K59Y-WZRX) (em inglês)), mas modularidade não vem incluída com o NetworkX. Felizmente, existe um [módulo adicional no Python](https://github.com/taynaud/python-louvain/) (em inglês) que o leitor pode usar com o NetworkX, e que já instalou e importou no início deste tutorial. O leitor pode ler a [documentação completa](https://perma.cc/KW5K-ZX67) (em inglês) para todas as funções que oferece, mas para a maior parte dos propósitos da deteção de comunidades, quererá apenas `greedy_modularity_communities()`: + +```python +communities = community.greedy_modularity_communities(G) +``` + +O método `greedy_modularity_communities()` tenta determinar o número de comunidades apropriadas para o grafo, e agrupa todos os nós em subconjuntos baseados nestas comunidades. Ao contrário das funções de centralidade, o código acima não criará um dicionário. Ao invés, criará uma lista especial de objetos "*frozenset*" (similar a listas). Existe um conjunto para cada grupo, e os conjuntos contêm os nomes das pessoas em cada grupo. 
Para adicionar esta informação à sua rede na maneira agora familiar, o leitor tem que primeiro criar um dicionário que classifique cada pessoa com um valor numérico para o grupo ao qual pertencem: + +```python +modularity_dict = {} # Crie um dicionário vazio +for i,c in enumerate(communities): # Itere pela lista de comunidades, mantendo em mente o número para a comunidade + for name in c: # Itere por cada pessoa numa comunidade + modularity_dict[name] = i # Crie uma entrada no dicionário para a pessoa, na qual o valor é o grupo ao qual pertence. + +# Agora, o leitor pode adicionar a informação de modularidade como fez com as outras métricas +nx.set_node_attributes(G, modularity_dict, 'modularity') +``` + +Como sempre, o leitor pode combinar estas medidas com outras. Por exemplo, aqui está como encontrar os nós de centralidade adjacente mais elevada na classe modular 0 (a primeira): + +```python +# Primeiro, obtenha uma lista apenas dos nós nessa classe +class0 = [n for n in G.nodes() if G.nodes[n]['modularity'] == 0] + +# Depois, crie um dicionário das centralidades adjacentes desses nós +class0_eigenvector = {n:G.nodes[n]['eigenvector'] for n in class0} + +# Depois, organize esse dicionário e obtenha os primeiros 5 resultados +class0_sorted_by_eigenvector = sorted(class0_eigenvector.items(), key=itemgetter(1), reverse=True) + +print("Modularity Class 0 Sorted by Eigenvector Centrality:") +for node in class0_sorted_by_eigenvector[:5]: + print("Name:", node[0], "| Eigenvector Centrality:", node[1]) +``` + +Usando a centralidade adjacente como um *ranking* pode dar-lhe uma ideia das pessoas importantes nesta classe modular. O leitor notará que algumas destas pessoas, especialmente William Penn, William Bradford (*não* o fundador de Plymouth em que estará a pensar[^16]) e James Logan, passaram muito tempo na América. Também, Bradford e Tace Sowle eram ambos impressores *quakers* proeminentes. 
Com um pouco de pesquisa, nós podemos descobrir que existem tanto razões geográficas como ocupacionais que explicam que este grupo de pessoas se juntem. Isto é uma indicação de que a modularidade está a trabalhar como esperado. + +Em redes mais pequenas como esta, uma tarefa comum é encontrar e listar todas as classes modulares e seus membros[^17]. O leitor pode fazer isto ao percorrer pela lista `communities`: + +```python +for i,c in enumerate(communities): # Itere pela lista de comunidades + if len(c) > 2: # Filtre as classes modulares com 2 ou menos nós + print('Class '+str(i)+':', list(c)) # Obtenha as classes e os seus membros +``` + +Note no código acima que está a filtrar qualquer classe modular com dois ou menos nós, na linha `if len(c) > 2`. O leitor recordar-se-á da visualização que existiam vários componentes pequenos da rede com apenas dois nós. A modularidade encontrará estes componentes e tratá-los-á como classes separadas (visto que eles não estão conectados a mais nada). Ao filtrá-los, o leitor obtém uma ideia melhor das classes modulares maiores dentro do principal componente da rede. + +Trabalhando só com o NetworkX trá-lo-á longe, e o leitor pode encontrar muito sobre classes modulares apenas ao trabalhar com os dados diretamente. Mas quase sempre quer visualizar os seus dados (e, talvez, expressar a modularidade como a cor de nó). Na próxima secção, o leitor irá aprender como exportar os seus dados do NetworkX para uso noutros programas. + + + +# Exportar Dados + +O NetworkX suporta um grande número de formatos de ficheiros para [exportação de dados](https://perma.cc/X65S-HRCF) (em inglês). Se o leitor quiser exportar uma lista de *edges* em texto simples para carregar no Palladio, existe um [*wrapper* conveniente](https://perma.cc/P9ES-57X3) (em inglês) para isso. 
Frequentemente, no *Six Degrees of Francis Bacon*, nós exportamos dados do NetworkX no [formato JSON especializado do D3](https://perma.cc/SF8Z-DWPW) (em inglês), para visualização no navegador de internet. O leitor poderia até [exportar](https://perma.cc/Y6QJ-5VM8) (em inglês) o seu grafo como um [*dataframe* do Pandas](https://perma.cc/87NA-KCK4) (em inglês) se existissem operações estatísticas mais avançadas que quisesse executar. Existem várias opções, e se o leitor tiver adicionado diligentemente todas as suas métricas de volta no seu objeto Grafo como atributos, todos os seus dados serão exportados duma só vez. + +A maior parte das opções de exportação funcionam da mesma maneira, por isso, para este tutorial o leitor aprenderá como exportar os seus dados para o formato GEXF do Gephi. Assim que tiver exportado o ficheiro, o leitor pode fazer o *upload* [diretamente para o Gephi](https://gephi.org/quickstart/) (em inglês) para a visualização. + +Exportar dados é, normalmente, um simples comando unilinear. Tudo o que é preciso é escolher um nome de ficheiro. Neste caso, usaremos `quaker_network.gexf`. Para exportar, digite: + +```python +nx.write_gexf(G, 'quaker_network.gexf') +``` + +É só! Quando executar o seu *script* no Python, colocará automaticamente o novo ficheiro GEXF no mesmo diretório que o seu ficheiro Python.[^18] + +# Conclusões + +Agora, tendo realizado e revisto uma panóplia de métricas de rede no Python, o leitor tem as evidências a partir das quais os argumentos se constroem e se retiram conclusões sobre esta rede de *quakers* na Grã-Bretanha moderna. O leitor sabe, por exemplo, que a rede tem uma **densidade** relativamente baixa, sugerindo associações ténues e/ou dados originais incompletos. O leitor sabe que a comunidade está organizada em torno de vários ***hubs*** desproporcionalmente grandes, entre eles fundadores da denominação, como Margaret Fell e George Fox, bem como líderes políticos e religiosos importantes, como William Penn. 
Mais útil, o leitor sabe sobre mulheres com graus relativamente baixos, como Elizabeth Leavens e Mary Penington, que (como resultado de centralidade de intermediação elevada) podem ter agido como ***brokers***, conectando múltiplos grupos. Finalmente, o leitor aprendeu que a rede é feita dum grande **componente** e muitos muito pequenos. No interior desse grande componente, existem várias **comunidades** distintas, algumas das quais parecem organizadas em torno do tempo ou local (como Penn e os seus associados estadunidenses). Por causa dos metadados que adicionou à sua rede, o leitor tem as ferramentas para explorar estas métricas em profundidade e para, potencialmente, explicar alguns dos recursos estruturais que identificou. + +Cada uma destas descobertas é um convite para mais pesquisa ao invés dum ponto final ou prova. A análise de redes é um conjunto de ferramentas para colocar questões específicas sobre a estrutura das relações num *dataset*, e o NetworkX providencia uma interface relativamente simples a muitas das técnicas e métricas comuns. As redes são uma maneira útil de estender a sua pesquisa a um grupo ao providenciar informações sobre a estrutura da comunidade, e nós esperamos que o leitor seja inspirado por este tutorial para usar métricas para enriquecer a sua própria pesquisa e para explorar a flexibilidade da análise de redes para além da visualização. + +[^1]: **Nota de tradução**: Como o leitor poderá confirmar mais abaixo, os autores desta lição transformaram os dados aqui analisados num gráfico, sem explicar tal passo, visto que o artigo lida com a análise dos dados, e não com a sua visualização. Se desejar, pode ler também a lição aqui referida e voltar a esta para confirmar se o seu gráfico se assemelha ao dos quatro autores. Aconselhamos que o faça após ter concluído todos os passos aqui descritos. + +[^2]: Em muitos (mas não todos os) casos, `pip` ou `pip3` serão instalados automaticamente com o Python3. 
+ +[^3]: **Nota de tradução**: Isto pode estender-se ao uso de comandos, na sua *shell*, nomeadamente aquando da instalação do pip e de pacotes (ver Preparação dos Dados e Instalação do NetworkX). + +[^4]: Algumas instalações só quererão que o leitor digite `pip` sem "3," mas no Python 3, `pip3` é a mais comum. Se um não funcionar, tente o outro! + +[^5]: **Nota de tradução**: É importante lembrar que existem variações entre as diferentes versões do NetworkX que podem resultar em erros ou outputs diferentes. Tal é o caso da 2.6, com a qual obtivemos uma mensagem de erro durante a avaliação da modularidade e uma resposta diferente com a função print(nx.info(G)) daquela apresentada com a 2.4. + +[^6]: Existem algumas técnicas *pythónicas* que este código usa. A primeira é a 'compreensão de lista' (*list comprehensions*), que incorpora *loops* (`for n in nodes`) para criar novas listas (em parêntesis retos), assim: `new_list = [item for item in old_list]`. A segunda é a *list slicing*, que permite-lhe subdividir ou "*slice*" ("cortar") a lista. A notação da *list slicing* `[1:]` toma tudo *exceto* o primeiro item na lista. O 1 informa o Python para começar com o segundo item nesta lista (no Python, o leitor começa a contar do 0), e os dois pontos dizem ao Python para tomar tudo até ao fim da lista. Como a primeira linha em ambas destas listas é a fila de cabeçalho de cada CSV, nós não queremos que esses cabeçalhos sejam incluídos nos nossos dados. + +[^7]: Dicionários são um tipo de dados incorporados no Python, construídos com pares de chave-valor. Pense numa chave como a palavra-chave num dicionário, e o valor como a sua definição. Chaves têm que ser únicas (só uma de cada por dicionário), mas os valores podem ser qualquer coisa. Dicionários são representados por chavetas, com chaves e valores separados por dois pontos: `{key1:value1, key2:value2, ...}`. Dicionários são uma das maneiras mais rápidas de armazenar valores que o leitor pode necessitar mais tarde. 
De facto, um objeto Grafo do NetworkX é, ele próprio, feito de dicionários aninhados. + +[^8]: Note que este código usa parêntesis retos de duas formas. Usa números em parêntesis retos para aceder índices específicos numa lista de nós (por exemplo, o ano de nascimento no `node[4]`), mas também para designar uma *chave* (sempre `node[0]`, o ID) a qualquer um dos nossos dicionários vazios: `dictionary[key] = value`. Conveniente! + +[^9]: Por uma questão de simplicidade, removemos quaisquer nós que *não estão conectados a quaisquer outros* do *dataset* antes de termos começado. Isto foi feito simplesmente para reduzir a desordem, mas também é muito comum de se ver muitos destes nós solteiros no seu *dataset* de rede comum. + +[^10]: Mas mantenha em mente que isto é a densidade de *toda* a rede, incluindo esses componentes não conectados a flutuar em órbita. Existem várias conexões possíveis entre e com eles. Se o leitor tivesse tomado a densidade somente do componente maior, poderia ter obtido um número diferente. O leitor poderia fazê-lo ao encontrar o componente mais largo como nós lhe mostramos na próxima secção sobre o **diâmetro**, e, depois, ao executar o mesmo método de densidade somente nesse componente. + +[^11]: Nós tomamos o comprimento da lista *menos um* porque nós queremos o número de *edges* (ou passos) entre os nós listados aqui, ao invés do número de nós. + +[^12]: A forma mais correta de fazer este tipo de comparação é criar *grafos aleatórios* de tamanho idêntico para ver se as métricas diferem da norma. O NetworkX oferece várias ferramentas para [gerar grafos aleatórios](https://perma.cc/7Z4U-KAY7) (em inglês). + +[^13]: Porque se chama transitividade? O leitor pode recordar-se da propriedade transitiva de Geometria das aulas de Matemática no Ensino Secundário: se A=B e B=C, o A deve ser igual a C. 
Semelhantemente, no fechamento triádico, se a pessoa A conhece a pessoa B e a pessoa B conhece a pessoa C, então a pessoa A provavelmente conhece a pessoa C: logo, transitividade. + +[^14]: Aqueles com experiência em Estatística notarão que o grau em redes sociais segue tipicamente uma *lei de potência*, mas isto não é nem pouco usual, nem especialmente útil saber. + +[^15]: Embora não venhamos a cobri-lo neste tutorial, é geralmente boa ideia obter a classificação modular global primeiro para determinar se o leitor aprenderá qualquer coisa ao repartir a sua rede de acordo com a modularidade. Para ver a classificação geral da modularidade, tome as comunidades que calculou com `communities = community.best_partition(G)` e execute `global_modularity = community.modularity(communities, G)`. E depois basta aplicar `print(global_modularity)`. + +[^16]: **Nota de tradução**: [Plymouth](https://perma.cc/2EKN-TJPW) foi a primeira colónia inglesa permanente na região da Nova Inglaterra, no nordeste dos Estados Unidos da América, tendo sido fundada em 1620 por vários colonos puritanos, entre os quais um tal [William Bradford](https://perma.cc/UA8V-J4CX). Este [outro](https://perma.cc/TW4C-QWUY) referido foi um importante impressor *quaker*. + +[^17]: Em redes grandes, as listas seriam provavelmente ilegivelmente longas, mas o leitor poderia obter uma ideia de todas as classes modulares duma só vez ao visualizar a rede e adicionar cor aos nós baseada na sua classe modular. + +[^18]: Cada formato de ficheiro que é exportável é também importável. Se o leitor tiver um ficheiro GEXF do Gephi que quer pôr no NetworkX, digitaria `G = nx.read_gexf('some_file.gexf')`. 
+ diff --git a/pt/licoes/extrair-paginas-ilustradas-com-python.md b/pt/licoes/extrair-paginas-ilustradas-com-python.md index 578c8beb38..011e56ce10 100644 --- a/pt/licoes/extrair-paginas-ilustradas-com-python.md +++ b/pt/licoes/extrair-paginas-ilustradas-com-python.md @@ -1,485 +1,485 @@ ---- -title: Extrair Páginas Ilustradas de Bibliotecas Digitais com Python -slug: extrair-paginas-ilustradas-com-python -layout: lesson -date: 2019-01-14 -translation_date: 2023-05-03 -authors: -- Stephen Krewson -reviewers: -- Catherine DeRose -- Taylor Arnold -editors: -- Anandi Silva Knuppel -translator: -- João Domingues Pereira -translation-editor: -- Eric Brasil -translation-reviewer: -- Felipe Lamarca -- Salete Farias -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/447 -difficulty: 2 -activity: acquiring -topics: [api] -abstract: A aprendizagem de máquina e as extensões de API do HathiTrust e do Internet Archive estão a tornar mais fácil a extração de regiões de página com interesse visual de volumes digitalizados. Esta lição mostra como extrair eficientemente essas regiões e, ao fazê-lo, como fomentar novas questões sobre a pesquisa visual. -avatar_alt: Instrumento Científico de Medição -original: extracting-illustrated-pages -doi: 10.46430/phpt0040 ---- - -{% include toc.html %} - -# Visão Geral - -E se só quisesse ver as imagens num livro? Este é um pensamento que já ocorreu tanto a jovens crianças como a pesquisadores adultos. Se soubesse que o livro está disponível através duma biblioteca digital, seria útil fazer o *download* somente das páginas com imagens e ignorar o resto. - -Aqui estão as miniaturas de página dum volume do HathiTrust com o identificador exclusivo `osu.32435078698222`. Após o processo descrito nesta lição, apenas as páginas com imagens (31 no total) foram baixadas como JPEGs para uma pasta. 
- -{% include figure.html filename="file-explorer-example.png" alt="Imagem com a apresentação das páginas de um livro que contêm imagens" caption="Visualização dum volume para o qual só as páginas com imagens foram baixadas." %} - -Para ver quantas páginas *não ilustradas* foram filtradas, compare com o [conjunto total de miniaturas](https://babel.hathitrust.org/cgi/pt?id=osu.32435078698222;view=thumb;seq=1) para todas as 148 páginas nesta edição revisada de 1845 do livro infantil *bestseller* de Samuel Griswold Goodrich, *The Tales of Peter Parley About America* (1827). - -{% include figure.html filename="parley-full-thumbnails.png" alt="Imagem com a visualização de todas as miniaturas das páginas de um livro" caption="Visualização das miniaturas do HathiTrust para todas as páginas." %} - -Esta lição mostra como completar estas etapas de filtragem e de *download* para volumes de texto em domínio público detidos pelo HathiTrust (HT) e pelo Internet Archive (IA), duas das maiores bibliotecas digitais no mundo. Será do interesse de qualquer um que deseje criar coleções de imagens com o fim de aprender sobre a História da Ilustração e o *layout* (*mise en page*) dos livros. As abordagens visuais à bibliografia digital estão a tornar-se populares, seguindo os esforços pioneiros do [EBBA](https://perma.cc/3QYS-XNSF) e do [Aida](https://perma.cc/SH49-K56K). Projetos recentemente concluídos ou financiados exploram maneiras de [identificar notas de rodapé](https://web.archive.org/web/20190526050917/http://culturalanalytics.org/2018/12/detecting-footnotes-in-32-million-pages-of-ecco/) e de [rastrear notas de margem de página](https://perma.cc/QB4J-55GU), para dar só dois [exemplos](https://perma.cc/9RC2-PJBL). - -A minha própria pesquisa tenta responder a questões empíricas sobre alterações na frequência e modo de ilustração em textos médicos e educacionais do século dezanove. 
Isto envolve agregar múltiplas imagens por livro e tentar estimar que processo de impressão foi usado para fazer tais imagens. Um caso de uso mais direcionado para a extração de páginas ilustradas pode ser a catalogação de ilustrações ao longo de [diferentes edições](https://perma.cc/2FCU-YW6D) do mesmo livro. Trabalhos futuros poderão investigar com sucesso as características visuais e o *significado* das imagens extraídas: a sua cor, o seu tamanho, o seu tema, o seu género, o número de figuras e assim por diante. - -Como obter informação *localizada* sobre regiões visuais de interesse está para além do âmbito desta lição, visto que o processo envolve uma quantidade significativa de aprendizagem de máquina. No entanto, a classificação sim/não de páginas com (ou sem) imagens é um primeiro passo prático para reduzir o enorme volume de *todas* as páginas para cada livro numa coleção visada, tornando viável a localização de ilustrações. Para dar um ponto de referência, os textos médicos do século dezanove contêm (em média) ilustrações em 1-3% das suas páginas. Se estiver a tentar estudar a ilustração no interior dum *corpus* duma biblioteca digital sobre o qual não tem qualquer informação preexistente, é, consequentemente, razoável assumir que 90+% das páginas nesse *corpus* NÃO estarão ilustradas. - -O HT e o IA permitem que a questão com imagens/sem imagens seja respondida indiretamente através da análise dos dados gerados pelo *software* *optical character recognition* (OCR) ou reconhecimento ótico de caracteres, em português (o OCR é aplicado após um volume físico ser digitalizado com o objetivo de gerar uma transcrição do texto muitas vezes desordenada). Aproveitar o resultado do *output* do OCR para encontrar páginas ilustradas foi proposto primeiramente por Kalev Leetaru numa [colaboração de 2014](https://perma.cc/3J79-4QA6) com o Internet Archive e o Flickr. 
Esta lição transfere a abordagem de Leetaru para o HathiTrust e tira proveito de bibliotecas de processamento de XML mais rápidas no Python, bem como da gama recentemente ampliada de formatos de ficheiro de imagem do IA. - -Uma vez que o HT e o IA expõem a sua informação derivada do OCR de maneiras ligeiramente diferentes, eu irei adiar a apresentação dos detalhes das "características visuais" de cada biblioteca para as suas secções respetivas. - -# Objetivos - -No final da lição, o leitor será capaz de: - -- Configurar a versão "mínima" da distribuição Anaconda do Python (Miniconda) e criar um ambiente; -- Salvar e iterar sobre uma lista de IDs de volumes do HT ou do IA gerados por uma pesquisa; -- Acessar aos *application programming interfaces* (APIs) ou interfaces de programação de aplicações, em português, de dados do HT e do IA através das bibliotecas do Python; -- Encontrar características visuais ao nível da página; -- Fazer o *download* dos JPEGs de páginas programaticamente. - -O grande objetivo é fortalecer as competências de coleta e exploração de dados ao criar um *corpus* de ilustração histórica. Combinar dados de imagem com os metadados dum volume permite a formulação de questões de pesquisa promissoras sobre a mudança visual ao longo do tempo. - -# Requisitos - -Os requisitos de *software* desta lição são mínimos: o acesso a uma máquina executando um sistema operacional padrão e um navegador de internet. O Miniconda está disponível em duas versões de 32 e de 64 *bits* para Windows, macOS e Linux. O Python 3 é a versão estável atual da linguagem e será suportado indefinidamente[^1]. - -Este tutorial assume um conhecimento básico da linha de comando e da linguagem de programação Python. O leitor deve compreender as convenções para comentários e comandos num tutorial baseado num *shell*. 
Eu recomendo a [*Introduction to the Bash Command Line*](/en/lessons/intro-to-bash), de Ian Milligan e James Baker, para aprender ou para rever as suas competências com a linha de comando. - -# Configuração - -## Dependências - -Os leitores mais experientes podem querer simplesmente instalar as dependências e executar os *notebooks* nos seus ambientes de escolha. Mais informações sobre a minha própria configuração do Miniconda (e algumas diferenças entre o Windows e o *nix) são providenciadas. - -> **Nota de tradução**: Para instalar as dependências, altere o seu diretório de trabalho para a pasta onde se encontra instalado o Python executando o comando `cd` e, depois, digite o comando `pip install` ou `pip3 install` acompanhado pelas seguintes linhas: - -- `hathitrust-api` ou `hathitrust_api` ([Documentos de Instalação](https://github.com/rlmv/hathitrust-api)); -- `internetarchive` ([Documentos de Instalação](https://archive.org/services/docs/api/internetarchive/)); -- `jupyter` ([Documentos de Instalação](https://jupyter.org/install)); -- `requests` ([Documentos de Instalação](https://requests.readthedocs.io/en/latest/user/install/#install)) [o criador recomenda a instalação do`pipenv`; para a instalação do `pip`, veja [PyPI](https://pypi.org/project/requests/)]. - -## Ficheiros da Lição - -Faça o *download* desta [pasta comprimida](/assets/extracting-illustrated-pages/lesson-files.zip) que contém dois *Jupyter notebooks*, um para cada uma das bibliotecas digitais. A pasta também contém um ficheiro de metadados JSON de amostra descrevendo uma coleção do HathiTrust. Descomprima e confirme que os seguintes ficheiros estão presentes: `554050894-1535834127.json`, `hathitrust.ipynb` e `internetarchive.ipynb`. - -
    -Todos os comandos subsequentes assumem que o seu diretório de trabalho atual é a pasta que contém os ficheiros da lição. -
    - -### Destino do *Download* - -Aqui está o diretório predefinido que será criado assim que todas as células em ambos os *notebooks* tiverem sido executadas (como providenciado). Depois de obter uma lista de quais páginas num volume contêm imagens, as funções de *download* do HT e do IA solicitam essas páginas como JPEGs (nomeadas pelo número de página) e arquivam-nas em subdiretórios (nomeados pelo ID do item). É claro que o leitor pode usar diferentes listas de volumes ou mudar o destino `out_dir` para algo que não `items`. - -``` -items/ -├── hathitrust -│ ├── hvd.32044021161005 -│ │ ├── 103.jpg -│ │ └── ... -│ └── osu.32435078698222 -│ ├── 100.jpg -│ ├── ... -└── internetarchive - └── talespeterparle00goodgoog - ├── 103.jpg - └── ... - -5 diretórios, 113 ficheiros -``` - -As funções de *download* são lentas; se executar os *notebooks* novamente, com o diretório `items` similar ao que se apresenta em cima, qualquer item que já tenha a sua própria subpasta será ignorado. - -## Anaconda (Opcional) - -A Anaconda é a principal distribuição científica do Python. O seu gerenciador de pacotes `conda` permite-lhe instalar bibliotecas como a `numpy` e a `tensorflow` com facilidade. A versão "Miniconda" não é acompanhada por quaisquer pacotes supérfluos pré-instalados, o que incentiva o leitor a manter o seu ambiente de base limpo e a instalar apenas o que necessita para um projeto dentro dum ambiente nomeado. - -Faça o *download* e instale o [Miniconda](https://conda.io/miniconda.html). Escolha a versão estável mais recente do Python 3. Se tudo correr bem, o leitor conseguirá executar `which conda` (no Linux/macOS) ou `where conda` (no Windows) no seu *shell* e ver a localização do programa executável no *output*. - -A Anaconda tem uma [*cheat sheet*](http://web.archive.org/web/20190115051900/https://conda.io/docs/_downloads/conda-cheatsheet.pdf) ou folha de dicas, em português, útil para comandos de uso frequente. 
- -### Criar um Ambiente - -Os ambientes, entre outras coisas, ajudam a controlar a complexidade associada ao uso de múltiplos gerenciadores de pacotes em conjunto. Nem todas as bibliotecas do Python podem ser instaladas através do `conda`. Em alguns casos, nós recorreremos ao gestor de pacote padrão do Python, o `pip` (ou alterações planejadas, como o `pipenv`). No entanto, quando o fizermos, nós usaremos uma versão do `pip` instalada através do `conda`. Isto mantém todos os pacotes que nós precisamos para o projeto no mesmo espaço virtual. - -```bash -# O seu ambiente atual é precedido por um asterisco -# (será a "base" num novo shell) -conda env list - -# Pacotes instalados no ambiente atual -conda list -``` - -Agora nós criamos um ambiente nomeado, configuramo-lo para usar Python 3, e ativamo-lo. - -```bash -# Note a sinalização "--name", que toma um argumento de string (e.g. "extract-pages") -# e a sintaxe para especificar a versão do Python -conda create --name extract-pages python=3 - -# Indique o novo ambiente (no Linux/macOS) -source activate extract-pages -``` - -```bash -# O comando do Windows para ativar o ambiente é ligeiramente diferente -conda activate extract-pages -``` - -Para sair dum ambiente, execute `source deactivate` no Linux/macOS ou `deactivate` no Windows. Mas certifique-se que permanece no ambiente `extract-pages` durante o decorrer da lição! - -### Instalar os Pacotes do Conda - -Nós podemos usar o `conda` para instalar os nossos primeiros pacotes. Todos os outros pacotes necessários (gzip, JSON, os, sys e time) fazem parte da [biblioteca padrão do Python](https://docs.python.org/3/library/). Note como nós precisamos de especificar um canal em alguns casos. O leitor pode pesquisar por pacotes no [Anaconda Cloud](https://anaconda.org/). 
- - -```bash -# Para garantir que nós temos uma versão local do pip (veja a discussão em baixo) -conda install pip - -conda install jupyter - -conda install --channel anaconda requests -``` - -O Jupyter tem muitas dependências (outros pacotes dos quais depende), por isso esta etapa pode exigir alguns minutos. Recorde-se que quando o `conda` lhe pergunta se deseja continuar com a instalação por via da questão `Proceed ([y]/n)?`, o leitor deve digitar um `y` ou um `yes` e, depois, pressionar *Enter* para aceitar a instalação do pacote. - -
    -Nos bastidores, o conda está a trabalhar para certificar-se que todos os pacotes e dependências necessários serão instalados numa maneira compatível. -
    - -### Instalar Pacotes do Pip - -Se estiver a usar um ambiente `conda`, é melhor usar a versão local do `pip`. Confirme que os seguintes comandos dão como resultado do *output* um programa cujo caminho absoluto contém algo como `/Miniconda/envs/extract-pages/Scripts/pip`. - -```bash -which pip -``` - -```bash -# O equivalente do Windows ao "which" -where pip -``` - -Se vir duas versões do `pip` no *output* em cima, certifique-se de digitar o caminho absoluto para a versão do ambiente *local* ao instalar as bibliotecas *wrapper* da API. - -```bash -pip install hathitrust-api -pip install internetarchive -``` - -```bash -# Exemplo do Windows usando o caminho absoluto para o executável do pip local -C:\Users\stephen-krewson\Miniconda\envs\extract-pages\Scripts\pip.exe install hathitrust-api internetarchive -# Substitua "stephen-krewson" pelo seu nome de utilizador -``` - -## *Jupyter Notebooks* - -O [*Text Mining in Python Through the HTRC Feature Reader*](/en/lessons/text-mining-with-extracted-features#start-a-notebook), de Peter Organisciak e Boris Capitanu, explica os benefícios dos *notebooks* para o desenvolvimento e a exploração de dados. Também contém informação útil sobre como executar eficazmente as células. Visto que nós instalámos a versão minimalista da Anaconda, nós precisamos de iniciar o Jupyter a partir da linha de comandos. No seu *shell* (a partir do interior da pasta contendo os ficheiros da lição) execute `jupyter notebook`. - -Isto executará o servidor do *notebook* no seu *shell* e iniciará o seu navegador de internet predefinido com a página inicial do Jupyter[^2]. A página inicial mostra todos os ficheiros no diretório de trabalho atual. - -{% include figure.html filename="jupyter-home.png" alt="Imagem com a apresentação da estrutura de ficheiros da página inicial do Jupyter" caption="A página inicial do Jupyter mostrando os ficheiros da lição." %} - -
    -No seu shell, certifique-se que usou o comando cd para ir até ao diretório descomprimido lesson-files. -
    - -Clique nos *notebooks* `hathitrust.ipynb` e `internetarchive.ipynb` para abri-los em novas abas do navegador de internet. A partir daqui, nós não precisamos de executar qualquer comando no *shell*. Os *notebooks* permitem-nos executar o código Python e ter acesso total ao sistema de pastas do computador. Quando o leitor tiver terminado, pode parar o servidor do *notebook* carregando em "*Quit*" na página inicial do Jupyter ou executando `ctrl+c` no *shell*. - -# HathiTrust - -## Acesso à API - -O leitor precisa efetuar um registro no HathiTrust antes de usar o API de dados. Dirija-se ao [portal de registro](https://babel.hathitrust.org/cgi/kgs/request) e preencha o seu nome, a sua organização e o seu e-mail para requerer chaves de acesso. O leitor deverá receber uma resposta no e-mail dentro de cerca dum minuto (**nota de tradução**: verifique também a caixa de *spam*). Clique no link, que o trará a uma página temporária com ambas as chaves exibidas. - -No *notebook* `hathitrust.ipynb`, examine a primeira célula (mostrada em baixo). Preencha as suas chaves da API como indicado. Depois, execute a célula clicando em "*Run*" na barra de navegação do *notebook*. - -```python -# Importe o wrapper da API de dados do HT -from hathitrust_api import DataAPI - -# Substitua as strings com as suas credenciais do HT (deixando as aspas) -ht_access_key = "YOUR_ACCESS_KEY_HERE" -ht_secret_key = "YOUR_SECRET_KEY_HERE" - -# Instancie o objeto de conexão da API de dados -data_api = DataAPI(ht_access_key, ht_secret_key) -``` - -
    -Cuidado! Não exponha as suas chaves de acesso através dum repositório público no GitHub (ou outro host de controle de versões). Elas serão pesquisáveis por qualquer outra pessoa. Uma boa prática para um projeto Python é a de armazenar as suas chaves de acesso como variáveis de ambiente ou salvá-las num ficheiro que não é versionado. -
    - -## Criar uma Lista de Volumes - -O HT permite a qualquer um fazer uma coleção de itens—o leitor nem sequer tem que estar na sua conta! No entanto, o leitor deveria registrar uma conta se quiser salvar a sua lista de volumes. Siga as [instruções](https://babel.hathitrust.org/cgi/mb?colltype=updated) para fazer algumas pesquisas no texto completo e para, depois, adicionar resultados escolhidos a uma coleção. Atualmente, o HathiTrust não tem uma API de pesquisa pública para adquirir volumes programaticamente; o leitor precisa de pesquisar através da sua *interface* da internet. - -Ao atualizar uma coleção, o HT mantém o rastro dos metadados associados para cada item nela. Eu incluí nos ficheiros da lição os metadados para uma lição de amostra no formato JSON. Se quisesse usar o ficheiro da sua própria coleção do HT, o leitor navegaria até à página das suas coleções e colocaria o cursor do *mouse* sobre o link dos metadados à esquerda para revelar a opção para fazer o *download* como JSON, como observado na seguinte captura de tela. - -{% include figure.html filename="download-ht-json.png" alt="Imagem de uma página web do site HathiTrust com instruções para download de metadados de ficheiros JSON" caption="Captura de tela de como fazer o *download* dos metadados de coleções no formato JSON." %} - -Assim que o leitor tiver feito o *download* do ficheiro JSON, basta movê-lo para o diretório onde colocou os *Jupyter notebooks*. Substitua o nome do ficheiro JSON no *notebook* do HT com o nome do ficheiro da sua coleção. - -O *notebook* mostra como usar *list comprehension* para obter todas as *strings* `htitem_id` dentro do objeto `gathers` que contem todas as informações da coleção. 
- -```python -# O leitor pode especificar o ficheiro de metadados da sua coleção aqui -metadata_path = "554050894-1535834127.json" - -with open(metadata_path, "r") as fp: - data = json.load(fp) - -# Uma lista de todas as IDs exclusivas na coleção -vol_ids = [item['htitem_id'] for item in data['gathers']] -``` - -
    -Os tutoriais normalmente mostram-lhe como processar um item de exemplo (muitas vezes de tamanho ou complexidade trivial). Isto é pedagogicamente conveniente, mas significa que o leitor está menos equipado para aplicar esse código a múltiplos itens—de longe o caso de uso mais comum. Nos notebooks, o leitor verá como encapsular transformações aplicadas a um item em funções que podem ser usadas num loop sobre uma coleção de itens. -
    - -## Característica Visual: IMAGE_ON_PAGE - -Dada uma lista de volumes, nós queremos explorar que características visuais eles têm ao nível da página. A [documentação mais recente](https://perma.cc/Y6UU-G9HZ) (2015) para o API de dados descreve um objeto metadados chamado `htd:pfeat` nas páginas 9-10. `htd:pfeat` é a abreviação para "HathiTrust Data API: Page Features". - -> * `htd:pfeat`­ - the page feature key (if available): -> - CHAPTER_START -> - COPYRIGHT -> - FIRST_CONTENT_CHAPTER_START -> - FRONT_COVER -> - INDEX -> - REFERENCES -> - TABLE_OF_CONTENTS -> - TITLE - -O que o *wrapper* `hathitrust-api` faz é disponibilizar os metadados completos para um volume do HT como um objeto Python. Dado o identificador dum volume, nós podemos pedir os seus metadados e, depois, fazer o *drill down* através da *sequência* de páginas até à informação ao nível da página. A *lista* `htd:pfeat` está associada com cada página num volume e, em teoria, contém todas as características que se aplicam a essa página. Na prática, existem mais algumas *tags* de características do que as oito listadas em cima. Aquela com a qual nós iremos trabalhar chama-se `IMAGE_ON_PAGE` e é mais abstratamente visual que *tags* estruturais como `CHAPTER_START`. - -Tom Burton-West, um bibliotecário pesquisador na biblioteca da *University of Michigan*, trabalha em estreita colaboração com o HathiTrust e o HTRC, o Centro de Pesquisa do HathiTrust. O Tom disse-me por e-mail que o HathiTrust recebe a informação `htd:pfeat` via o Google, com o qual trabalham proximamente desde a fundação do HT, em 2008. Um contacto no Google deu permissão ao Tom para partilhar o seguinte: - -> Estas *tags* são derivadas duma combinação de Heurística, de aprendizagem de máquina e de anotação humana. - -Um exemplo heurístico pode ser o facto do primeiro elemento na sequência de páginas do volume ser quase sempre a `FRONT_COVER`. 
A aprendizagem de máquina pode ser usada para treinar modelos a discriminar, digamos, entre dados de imagem que são mais típicos das linhas de prosa numa escrita ocidental ou das linhas numa gravura. A anotação humana é a atribuição manual de etiquetas a imagens. A habilidade de ver as ilustrações dum volume nos bancos de dados do EEBO e do ECCO é um exemplo de anotação humana. - -O uso da "aprendizagem de máquina" pelo Google parece um pouco misterioso. Até o Google publicitar os seus métodos, é impossível saber todos os detalhes. No entanto, é provável que as *tags* `IMAGE_ON_PAGE` tenham sido propostas pela primeira vez após a deteção de blocos de "Imagens" nos ficheiros de _output_ do OCR (um processo discutido em baixo, na secção do Internet Archive). Mais filtragem pode, então, ser aplicada. - -## Passo a Passo Para o Código - -### Encontrar as imagens - -Nós vimos como criar uma lista de volumes e observámos que a API de dados pode ser usada para obter objetos metadados contendo características experimentais ao nível da página. A função essencial no *notebook* do HT tem a assinatura digital `ht_picture_download(item_id, out_dir=None)`. Dado um identificador exclusivo e um diretório de destino opcional, esta função irá, em primeiro lugar, obter os metadados do volume a partir da API e convertê-los num formato JSON. Depois, percorre a sequência de páginas e verifica se a *tag* `IMAGE_ON_PAGE` está na lista `htd:pfeat` (se a mesma existir). 
- -```python -# Metadados da API no formato JSON (diferente dos metadados da coleção do HT) -meta = json.loads(data_api.getmeta(item_id, json=True)) - -# A sequência dá-nos cada página do item digitalizado em ordem, com qualquer -# informação adicional que lhe pode estar disponível -sequence = meta['htd:seqmap'][0]['htd:seq'] - -# A lista de páginas com imagens (vazio para a iniciação) -img_pages = [] - -# O bloco try/except lida com situações onde nenhuma "pfeats" existe OU -# os números da sequência não são numéricos -for page in sequence: - try: - if 'IMAGE_ON_PAGE' in page['htd:pfeat']: - img_pages.append(int(page['pseq'])) - except (KeyError, TypeError) as e: - continue -``` - -Note que nós precisamos de fazer o *drill down* por vários níveis até ao objeto do nível de topo para obter o objeto `htd:seq`, sobre o qual nós podemos iterar. - -As duas exceções que eu quero evitar são o `KeyError`, que ocorre quando a página não tem qualquer característica ao nível da página a si associada, e o `TypeError`, que ocorre quando o campo `pseq` para a página é, por alguma razão, não numérico e, portanto, não pode ser destinado a um `int`. Se algo correr mal com uma página, nós simplesmente executamos `continue` para passar à próxima. O plano é obter todos os dados bons que conseguirmos. Não é limpar inconsistências ou falhas nos metadados do item. - -### Fazer o *Download* das Imagens - -Assim que `img_pages` contém a lista completa de páginas com a *tag* `IMAGE_ON_PAGE`, nós podemos fazer o download dessas páginas. Note que, se nenhum `out_dir` for fornecido a `ht_picture_download()`, então a função simplesmente retorna a lista `img_pages` e NÃO faz o *download* do quer que seja. - -A chamada da API `getpageimage()` retorna um JPEG por predefinição. Nós simplesmente colocamos os bytes do JPEG num ficheiro na forma normal. Dentro da subpasta do volume (ela própria dentro do `out_dir`), as páginas serão nomeadas `1.jpg` para a página 1 e assim sucessivamente. 
- -Uma coisa a considerar é a nossa taxa de uso da API. Nós não queremos abusar do nosso acesso ao fazer centenas de pedidos por minuto. Para estar a salvo, especialmente se pretendermos executar grandes trabalhos, nós esperamos dois segundos antes de fazer cada pedido de página. Isto pode ser frustrante a curto prazo, mas ajuda a evitar o sufocamento ou a suspenção da API. - - -```python -for i, page in enumerate(img_pages): - try: - # Uma simples mensagem de estado - print("[{}] Downloading page {} ({}/{})".format(item_id, page, i+1, total_pages)) - - img = data_api.getpageimage(item_id, page) - - # N.B.: O loop só é executado se "out_dir" não for "None" - img_out = os.path.join(out_dir, str(page) + ".jpg") - - # Escreva a imagem - with open(img_out, 'wb') as fp: - fp.write(img) - - # Para evitar exceder o uso da API permitido - time.sleep(2) - - except Exception as e: - print("[{}] Error downloading page {}: {}".format(item_id, page,e)) -``` - -# Internet Archive - -## Acesso à API - -Nós conectamos à biblioteca API do Python usando uma conta no Archive.org com e-mail e palavra-chave ao invés das chaves de acesso do API. Isto é discutido no [Guia Quickstart](https://archive.org/developers/index.html). Se não tiver uma conta, [registre-se](https://archive.org/account/login.createaccount.php) para obter o seu "Virtual Library Card". - -Na primeira célula do *notebook* `internetarchive.ipynb`, introduza as suas credenciais como indicado. Execute a célula para autenticar-se perante a API. - -> **Nota de tradução**: O comando `ia.configure(ia_email, ia_password)` é atualmente desnecessário e pode gerar um erro extenso, em cuja mensagem final consta: `InvalidURL: Invalid URL 'https:///services/xauthn/': No host supplied`. Sugerimos que o mesmo não seja executado no ficheiro IPYNB. 
- -## Criar uma Lista de Volumes - -A biblioteca IA do Python permite-lhe submeter *query strings* e receber uma lista de pares chave-valor correspondentes na qual a palavra "*identifier*", ou identificador, em português, é a chave e o verdadeiro identificador é o valor. A sintaxe para uma *query* é explicada na [página de Advanced Search](https://archive.org/advancedsearch.php) para o IA. O leitor pode especificar parâmetros ao usar uma palavra-chave como "*date*" ou "*mediatype*" seguida de dois pontos e o valor que quer atribuir a esse parâmetro. Por exemplo, eu só quero resultados que são *textos* (em oposição a vídeos, *etc.*). Certifique-se que os parâmetros e as opções que está a tentar usar são suportadas pela funcionalidade de pesquisa do IA. Caso contrário, pode perder ou obter resultados estranhos e não saber porquê. - -No *notebook*, eu gero uma lista de IDs do IA com o seguinte código: - -```python -# Uma pesquisa de amostra (deve gerar dois resultados) -query = "peter parley date:[1825 TO 1830] mediatype:texts" -vol_ids = [result['identifier'] for result in ia.search_items(query)] -``` - -## Característica Visual: Blocos de Imagens - -O Internet Archive não apresenta quaisquer características ao nível da página. Ao invés, disponibiliza um certo número de ficheiros brutos do processo de digitalização aos utilizadores. O mais importante destes para os nossos propósitos é o ficheiro XML Abbyy. Abbyy é uma empresa russa cujo *software* FineReader domina o mercado do OCR. - -Todas as versões recentes do FineReader produzem um [documento XML](https://perma.cc/83EK-LXP2) que associa diferentes "blocos" com cada página no documento digitalizado. O tipo de bloco mais comum é `Text` mas também existem blocos `Picture` ou "Imagem", em português. Aqui está um bloco de exemplo tirado dum ficheiro de XML Abbyy do IA. Os cantos superior esquerdo ("t" e "l") e inferior direito ("b" e "r") são suficientes para identificar a região de bloco retangular. 
- -```xml - - - -``` - -O equivalente no IA a ver as *tags* `IMAGE_ON_PAGE` no HT é a análise do ficheiro XML Abbyy e a iteração sobre cada página. Se existir pelo menos um bloco `Picture` nessa página, a página é sinalizada como possivelmente contendo uma imagem. - -Enquanto a característica `IMAGE_ON_PAGE` do HT não contém informação sobre a *localização* dessa imagem, os blocos `Picture` no ficheiro XML estão associados a uma região retangular na página. No entanto, porque o FineReader se especializa no reconhecimento de letras de conjuntos de caracteres ocidentais, é muito menos preciso a identificar regiões de imagem. O projeto de Leetaru (veja *Visão Geral*) usou as coordenadas da região para cortar imagens, mas nesta lição nós iremos simplesmente fazer o *download* da página inteira. - -Parte da diversão intelectual desta lição é usar um *dataset* (*tags* de bloco do OCR) por vezes confuso para um propósito largamente não intencional: identificar imagens e não palavras. A certa altura, tornar-se-á computacionalmente viável executar modelos de aprendizagem aprofundada em todas as páginas ilustradas nuas num volume e escolher o(s) tipo(s) de imagem(/ns) desejada(s). Mas, como a maior parte das páginas na maioria dos volumes não são ilustradas, esta é uma tarefa dispendiosa. Por agora, faz mais sentido aproveitar os dados existentes que nós detemos do processo de ingestão do OCR. - -Para mais informações sobre como o próprio OCR funciona e interage com o processo de digitalização, por favor, veja a lição do *PH* de Mila Oiva, [OCR With Tesseract and ScanTailor](/en/lessons/retired/OCR-with-Tesseract-and-ScanTailor) (atenção que esta lição já não é actualizada). Erros podem surgir por causa de distorções, artefactos e muitos outros problemas. Estes erros acabam por afetar a fiabilidade e a precisão dos blocos "Picture". Em muitos casos, o Abbyy estimará que páginas em branco ou descoloridas são, na realidade, imagens. 
Estas *tags* de bloco incorretas, ainda que indesejadas, podem ser combatidas com o uso de redes neurais convolucionais retreinadas. Pense nas páginas com imagens cujo download foi feito nesta lição como um primeiro passo num processo mais longo para obter um *dataset* limpo e útil de ilustrações históricas. - -## Passo a Passo do Código - -### Encontrar as Imagens - -Tal como com o HT, a função principal para o IA é `ia_picture_download(item_id, out_dir=None)`. - -Visto que envolve o I/O dum ficheiro, o processo para obter a lista `img_pages` é mais complicado do que o do HT. Usando a utilidade `ia` (que é instalada com a biblioteca) da linha de comando, o leitor pode obter uma ideia dos ficheiros de metadados disponíveis para um volume. Com muitas poucas exceções, um ficheiro com o formato "Abbyy GZ" deveria estar disponível para volumes com o tipo de *media* `text` no Internet Archive. - -Estes ficheiros, mesmo quando comprimidos, podem facilmente ter centenas de megabytes de tamanho! Se existir um ficheiro Abbyy para o volume, nós obtemos o seu nome e depois fazemos o *download*. A chamada `ia.download()` usa alguns parâmetros úteis para ignorar a solicitação se o ficheiro já existe e, se não, para fazer o seu *download* sem criar um diretório aninhado. Para salvar espaço, nós eliminamos o ficheiro Abbyy depois de o analisar. 
- -```python -# Use o cliente da linha de comandos para ver os formatos de metadados disponíveis: -# `ia metadata formats VOLUME_ID` - -# Para esta lição, só o ficheiro Abbyy é necessário -returned_files = list(ia.get_files(item_id, formats=["Abbyy GZ"])) - -# Certifique-se de que algo é devolvido -if len(returned_files) > 0: - abbyy_file = returned_files[0].name -else: - print("[{}] Could not get Abbyy file".format(item_id)) - return None - -# Faça o download do ficheiro Abbyy para o CWD -ia.download(item_id, formats=["Abbyy GZ"], ignore_existing=True, \ - destdir=os.getcwd(), no_directory=True) -``` - -Assim que nós tivermos o ficheiro, nós precisamos de analisar o XML usando a biblioteca padrão do Python. Nós tomamos vantagem do facto de que nós podemos abrir o ficheiro comprimido diretamente com a biblioteca `gzip`. Os ficheiros Abbyy são indexadas a partir do zero, por isso a primeira página na sequência digitalizada tem o índice de 0. No entanto, nós temos que filtrar 0 porque não pode ser exigido do IA. A exclusão do índice 0 por parte do IA não está documentada em qualquer lugar; em vez disso, eu descobri através de tentativa e erro. Se o leitor ver uma mensagem de erro de explicação difícil, tente rastrear a origem e não tenha medo em pedir ajuda, seja a alguém com experiência relevante, seja a alguém da própria organização. 
- -```python -# Colecione as páginas com pelo menos um bloco de imagem -img_pages = [] - -with gzip.open(abbyy_file) as fp: - tree = ET.parse(fp) - document = tree.getroot() - for i, page in enumerate(document): - for block in page: - try: - if block.attrib['blockType'] == 'Picture': - img_pages.append(i) - break - except KeyError: - continue - -# 0 não é uma página válida para a realização de solicitações GET ao IA, mas às vezes -# está no ficheiro Abbyy comprimido -img_pages = [page for page in img_pages if page > 0] - -# Acompanhe o progresso do download -total_pages = len(img_pages) - -# Os ficheiros do OCR são pesados, por isso elimine assim que tivermos a lista de páginas -os.remove(abbyy_file) -``` - -### Fazer o *Download* das Imagens - -O *wrapper* do IA incorporado no Python não providencia uma função de download de páginas únicas—apenas em massa. Isto significa que nós usaremos a RESTful API do IA para obter páginas específicas. Primeiro, nós construímos um URL para cada página de que nós precisamos. Depois, nós usamos a biblioteca `requests` para enviar uma solicitação `GET` de HTTP e, se tudo correr bem (*i.e.* o código 200 é enviado na resposta), nós escrevemos o conteúdo da resposta num ficheiro JPEG. - -O IA tem estado a trabalhar numa [versão *alpha*](https://perma.cc/F6HJ-YGM7) duma API para o corte e redimensionamento de imagens que obedeça às exigências do International Image Interoperability Framework ([IIIF](https://perma.cc/7ABF-GGJM)). O IIIF representa uma profunda melhoria face ao antigo método para *downloads* de páginas únicas que requeriam a realização do *download* de ficheiros JP2, um formato de ficheiro largamente não suportado. 
Agora, é extremamente simples obter um só JPEG duma página: - -```python -# Veja: https://iiif.archivelab.org/iiif/documentation -urls = ["https://iiif.archivelab.org/iiif/{}${}/full/full/0/default.jpg".format(item_id, page) - for page in img_pages] - -# Sem download de página direto a partir da biblioteca do Python, construa uma solicitação GET -for i, page, url in zip(range(1,total_pages), img_pages, urls): - - rsp = requests.get(url, allow_redirects=True) - - if rsp.status_code == 200: - print("[{}] Downloading page {} ({}/{})".format(item_id, \ - page, i+1, total_pages)) - - with open(os.path.join(out_dir, str(page) + ".jpg"), "wb") as fp: - fp.write(rsp.content) -``` - -# Próximos Passos - -Assim que o leitor tiver entendido as principais funções e o código de *unpacking* dos dados nos *notebooks*, sinta-se livre para executar as células em sequência ou carregar em "*Run All*" e ver as páginas ilustradas a entrar nas pastas. O leitor é encorajado a adaptar estes *scripts* e funções para as suas próprias questões de pesquisa. - -[^1]: **Nota de tradução**: Aconselhamos o leitor a adicionar o Python ao PATH, processo que pode ser feito na ocasião da sua instalação. Isto irá suavizar a incorporação das dependências (veja *Dependências*). - -[^2]: **Nota de tradução**: Inicialmente, aparece uma página de transição, a qual deverá remeter rapidamente para o Jupyter. Caso tal não aconteça, basta seguir as instruções nesta página. 
- - +--- +title: Extrair Páginas Ilustradas de Bibliotecas Digitais com Python +slug: extrair-paginas-ilustradas-com-python +layout: lesson +date: 2019-01-14 +translation_date: 2023-05-03 +authors: +- Stephen Krewson +reviewers: +- Catherine DeRose +- Taylor Arnold +editors: +- Anandi Silva Knuppel +translator: +- João Domingues Pereira +translation-editor: +- Eric Brasil +translation-reviewer: +- Felipe Lamarca +- Salete Farias +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/447 +difficulty: 2 +activity: acquiring +topics: [api] +abstract: A aprendizagem de máquina e as extensões de API do HathiTrust e do Internet Archive estão a tornar mais fácil a extração de regiões de página com interesse visual de volumes digitalizados. Esta lição mostra como extrair eficientemente essas regiões e, ao fazê-lo, como fomentar novas questões sobre a pesquisa visual. +avatar_alt: Instrumento Científico de Medição +original: extracting-illustrated-pages +doi: 10.46430/phpt0040 +--- + +{% include toc.html %} + +# Visão Geral + +E se só quisesse ver as imagens num livro? Este é um pensamento que já ocorreu tanto a jovens crianças como a pesquisadores adultos. Se soubesse que o livro está disponível através duma biblioteca digital, seria útil fazer o *download* somente das páginas com imagens e ignorar o resto. + +Aqui estão as miniaturas de página dum volume do HathiTrust com o identificador exclusivo `osu.32435078698222`. Após o processo descrito nesta lição, apenas as páginas com imagens (31 no total) foram baixadas como JPEGs para uma pasta. + +{% include figure.html filename="file-explorer-example.png" alt="Imagem com a apresentação das páginas de um livro que contêm imagens" caption="Visualização dum volume para o qual só as páginas com imagens foram baixadas." 
%} + +Para ver quantas páginas *não ilustradas* foram filtradas, compare com o [conjunto total de miniaturas](https://babel.hathitrust.org/cgi/pt?id=osu.32435078698222;view=thumb;seq=1) para todas as 148 páginas nesta edição revisada de 1845 do livro infantil *bestseller* de Samuel Griswold Goodrich, *The Tales of Peter Parley About America* (1827). + +{% include figure.html filename="parley-full-thumbnails.png" alt="Imagem com a visualização de todas as miniaturas das páginas de um livro" caption="Visualização das miniaturas do HathiTrust para todas as páginas." %} + +Esta lição mostra como completar estas etapas de filtragem e de *download* para volumes de texto em domínio público detidos pelo HathiTrust (HT) e pelo Internet Archive (IA), duas das maiores bibliotecas digitais no mundo. Será do interesse de qualquer um que deseje criar coleções de imagens com o fim de aprender sobre a História da Ilustração e o *layout* (*mise en page*) dos livros. As abordagens visuais à bibliografia digital estão a tornar-se populares, seguindo os esforços pioneiros do [EBBA](https://perma.cc/3QYS-XNSF) e do [Aida](https://perma.cc/SH49-K56K). Projetos recentemente concluídos ou financiados exploram maneiras de [identificar notas de rodapé](https://web.archive.org/web/20190526050917/https://culturalanalytics.org/2018/12/detecting-footnotes-in-32-million-pages-of-ecco/) e de [rastrear notas de margem de página](https://perma.cc/QB4J-55GU), para dar só dois [exemplos](https://perma.cc/9RC2-PJBL). + +A minha própria pesquisa tenta responder a questões empíricas sobre alterações na frequência e modo de ilustração em textos médicos e educacionais do século dezanove. Isto envolve agregar múltiplas imagens por livro e tentar estimar que processo de impressão foi usado para fazer tais imagens. Um caso de uso mais direcionado para a extração de páginas ilustradas pode ser a catalogação de ilustrações ao longo de [diferentes edições](https://perma.cc/2FCU-YW6D) do mesmo livro. 
Trabalhos futuros poderão investigar com sucesso as características visuais e o *significado* das imagens extraídas: a sua cor, o seu tamanho, o seu tema, o seu género, o número de figuras e assim por diante. + +Como obter informação *localizada* sobre regiões visuais de interesse está para além do âmbito desta lição, visto que o processo envolve uma quantidade significativa de aprendizagem de máquina. No entanto, a classificação sim/não de páginas com (ou sem) imagens é um primeiro passo prático para reduzir o enorme volume de *todas* as páginas para cada livro numa coleção visada, tornando viável a localização de ilustrações. Para dar um ponto de referência, os textos médicos do século dezanove contêm (em média) ilustrações em 1-3% das suas páginas. Se estiver a tentar estudar a ilustração no interior dum *corpus* duma biblioteca digital sobre o qual não tem qualquer informação preexistente, é, consequentemente, razoável assumir que 90+% das páginas nesse *corpus* NÃO estarão ilustradas. + +O HT e o IA permitem que a questão com imagens/sem imagens seja respondida indiretamente através da análise dos dados gerados pelo *software* *optical character recognition* (OCR) ou reconhecimento ótico de caracteres, em português (o OCR é aplicado após um volume físico ser digitalizado com o objetivo de gerar uma transcrição do texto muitas vezes desordenada). Aproveitar o resultado do *output* do OCR para encontrar páginas ilustradas foi proposto primeiramente por Kalev Leetaru numa [colaboração de 2014](https://perma.cc/3J79-4QA6) com o Internet Archive e o Flickr. Esta lição transfere a abordagem de Leetaru para o HathiTrust e tira proveito de bibliotecas de processamento de XML mais rápidas no Python, bem como da gama recentemente ampliada de formatos de ficheiro de imagem do IA. 
+ +Uma vez que o HT e o IA expõem a sua informação derivada do OCR de maneiras ligeiramente diferentes, eu irei adiar a apresentação dos detalhes das "características visuais" de cada biblioteca para as suas secções respetivas. + +# Objetivos + +No final da lição, o leitor será capaz de: + +- Configurar a versão "mínima" da distribuição Anaconda do Python (Miniconda) e criar um ambiente; +- Salvar e iterar sobre uma lista de IDs de volumes do HT ou do IA gerados por uma pesquisa; +- Acessar aos *application programming interfaces* (APIs) ou interfaces de programação de aplicações, em português, de dados do HT e do IA através das bibliotecas do Python; +- Encontrar características visuais ao nível da página; +- Fazer o *download* dos JPEGs de páginas programaticamente. + +O grande objetivo é fortalecer as competências de coleta e exploração de dados ao criar um *corpus* de ilustração histórica. Combinar dados de imagem com os metadados dum volume permite a formulação de questões de pesquisa promissoras sobre a mudança visual ao longo do tempo. + +# Requisitos + +Os requisitos de *software* desta lição são mínimos: o acesso a uma máquina executando um sistema operacional padrão e um navegador de internet. O Miniconda está disponível em duas versões de 32 e de 64 *bits* para Windows, macOS e Linux. O Python 3 é a versão estável atual da linguagem e será suportado indefinidamente[^1]. + +Este tutorial assume um conhecimento básico da linha de comando e da linguagem de programação Python. O leitor deve compreender as convenções para comentários e comandos num tutorial baseado num *shell*. Eu recomendo a [*Introduction to the Bash Command Line*](/en/lessons/intro-to-bash), de Ian Milligan e James Baker, para aprender ou para rever as suas competências com a linha de comando. + +# Configuração + +## Dependências + +Os leitores mais experientes podem querer simplesmente instalar as dependências e executar os *notebooks* nos seus ambientes de escolha. 
Mais informações sobre a minha própria configuração do Miniconda (e algumas diferenças entre o Windows e o *nix) são providenciadas. + +> **Nota de tradução**: Para instalar as dependências, altere o seu diretório de trabalho para a pasta onde se encontra instalado o Python executando o comando `cd` e, depois, digite o comando `pip install` ou `pip3 install` acompanhado pelas seguintes linhas: + +- `hathitrust-api` ou `hathitrust_api` ([Documentos de Instalação](https://github.com/rlmv/hathitrust-api)); +- `internetarchive` ([Documentos de Instalação](https://archive.org/services/docs/api/internetarchive/)); +- `jupyter` ([Documentos de Instalação](https://jupyter.org/install)); +- `requests` ([Documentos de Instalação](https://requests.readthedocs.io/en/latest/user/install/#install)) [o criador recomenda a instalação do `pipenv`; para a instalação do `pip`, veja [PyPI](https://pypi.org/project/requests/)]. + +## Ficheiros da Lição + +Faça o *download* desta [pasta comprimida](/assets/extracting-illustrated-pages/lesson-files.zip) que contém dois *Jupyter notebooks*, um para cada uma das bibliotecas digitais. A pasta também contém um ficheiro de metadados JSON de amostra descrevendo uma coleção do HathiTrust. Descomprima e confirme que os seguintes ficheiros estão presentes: `554050894-1535834127.json`, `hathitrust.ipynb` e `internetarchive.ipynb`. + +
    +Todos os comandos subsequentes assumem que o seu diretório de trabalho atual é a pasta que contém os ficheiros da lição. +
    + +### Destino do *Download* + +Aqui está o diretório predefinido que será criado assim que todas as células em ambos os *notebooks* tiverem sido executadas (como providenciado). Depois de obter uma lista de quais páginas num volume contêm imagens, as funções de *download* do HT e do IA solicitam essas páginas como JPEGs (nomeadas pelo número de página) e arquivam-nas em subdiretórios (nomeados pelo ID do item). É claro que o leitor pode usar diferentes listas de volumes ou mudar o destino `out_dir` para algo que não `items`. + +``` +items/ +├── hathitrust +│ ├── hvd.32044021161005 +│ │ ├── 103.jpg +│ │ └── ... +│ └── osu.32435078698222 +│ ├── 100.jpg +│ ├── ... +└── internetarchive + └── talespeterparle00goodgoog + ├── 103.jpg + └── ... + +5 diretórios, 113 ficheiros +``` + +As funções de *download* são lentas; se executar os *notebooks* novamente, com o diretório `items` similar ao que se apresenta em cima, qualquer item que já tenha a sua própria subpasta será ignorado. + +## Anaconda (Opcional) + +A Anaconda é a principal distribuição científica do Python. O seu gerenciador de pacotes `conda` permite-lhe instalar bibliotecas como a `numpy` e a `tensorflow` com facilidade. A versão "Miniconda" não é acompanhada por quaisquer pacotes supérfluos pré-instalados, o que incentiva o leitor a manter o seu ambiente de base limpo e a instalar apenas o que necessita para um projeto dentro dum ambiente nomeado. + +Faça o *download* e instale o [Miniconda](https://conda.io/miniconda.html). Escolha a versão estável mais recente do Python 3. Se tudo correr bem, o leitor conseguirá executar `which conda` (no Linux/macOS) ou `where conda` (no Windows) no seu *shell* e ver a localização do programa executável no *output*. + +A Anaconda tem uma [*cheat sheet*](https://web.archive.org/web/20190115051900/https://conda.io/docs/_downloads/conda-cheatsheet.pdf) ou folha de dicas, em português, útil para comandos de uso frequente. 
+ +### Criar um Ambiente + +Os ambientes, entre outras coisas, ajudam a controlar a complexidade associada ao uso de múltiplos gerenciadores de pacotes em conjunto. Nem todas as bibliotecas do Python podem ser instaladas através do `conda`. Em alguns casos, nós recorreremos ao gestor de pacote padrão do Python, o `pip` (ou alterações planejadas, como o `pipenv`). No entanto, quando o fizermos, nós usaremos uma versão do `pip` instalada através do `conda`. Isto mantém todos os pacotes que nós precisamos para o projeto no mesmo espaço virtual. + +```bash +# O seu ambiente atual é precedido por um asterisco +# (será a "base" num novo shell) +conda env list + +# Pacotes instalados no ambiente atual +conda list +``` + +Agora nós criamos um ambiente nomeado, configuramo-lo para usar Python 3, e ativamo-lo. + +```bash +# Note a sinalização "--name", que toma um argumento de string (e.g. "extract-pages") +# e a sintaxe para especificar a versão do Python +conda create --name extract-pages python=3 + +# Indique o novo ambiente (no Linux/macOS) +source activate extract-pages +``` + +```bash +# O comando do Windows para ativar o ambiente é ligeiramente diferente +conda activate extract-pages +``` + +Para sair dum ambiente, execute `source deactivate` no Linux/macOS ou `deactivate` no Windows. Mas certifique-se que permanece no ambiente `extract-pages` durante o decorrer da lição! + +### Instalar os Pacotes do Conda + +Nós podemos usar o `conda` para instalar os nossos primeiros pacotes. Todos os outros pacotes necessários (gzip, JSON, os, sys e time) fazem parte da [biblioteca padrão do Python](https://docs.python.org/3/library/). Note como nós precisamos de especificar um canal em alguns casos. O leitor pode pesquisar por pacotes no [Anaconda Cloud](https://anaconda.org/). 
+ + +```bash +# Para garantir que nós temos uma versão local do pip (veja a discussão em baixo) +conda install pip + +conda install jupyter + +conda install --channel anaconda requests +``` + +O Jupyter tem muitas dependências (outros pacotes dos quais depende), por isso esta etapa pode exigir alguns minutos. Recorde-se que quando o `conda` lhe pergunta se deseja continuar com a instalação por via da questão `Proceed ([y]/n)?`, o leitor deve digitar um `y` ou um `yes` e, depois, pressionar *Enter* para aceitar a instalação do pacote. + +
    +Nos bastidores, o conda está a trabalhar para certificar-se de que todos os pacotes e dependências necessários serão instalados de maneira compatível. +
    + +### Instalar Pacotes do Pip + +Se estiver a usar um ambiente `conda`, é melhor usar a versão local do `pip`. Confirme que os seguintes comandos dão como resultado do *output* um programa cujo caminho absoluto contém algo como `/Miniconda/envs/extract-pages/Scripts/pip`. + +```bash +which pip +``` + +```bash +# O equivalente do Windows ao "which" +where pip +``` + +Se vir duas versões do `pip` no *output* em cima, certifique-se de digitar o caminho absoluto para a versão do ambiente *local* ao instalar as bibliotecas *wrapper* da API. + +```bash +pip install hathitrust-api +pip install internetarchive +``` + +```bash +# Exemplo do Windows usando o caminho absoluto para o executável do pip local +C:\Users\stephen-krewson\Miniconda\envs\extract-pages\Scripts\pip.exe install hathitrust-api internetarchive +# Substitua "stephen-krewson" pelo seu nome de utilizador +``` + +## *Jupyter Notebooks* + +O [*Text Mining in Python Through the HTRC Feature Reader*](/en/lessons/text-mining-with-extracted-features#start-a-notebook), de Peter Organisciak e Boris Capitanu, explica os benefícios dos *notebooks* para o desenvolvimento e a exploração de dados. Também contém informação útil sobre como executar eficazmente as células. Visto que nós instalámos a versão minimalista da Anaconda, nós precisamos de iniciar o Jupyter a partir da linha de comandos. No seu *shell* (a partir do interior da pasta contendo os ficheiros da lição) execute `jupyter notebook`. + +Isto executará o servidor do *notebook* no seu *shell* e iniciará o seu navegador de internet predefinido com a página inicial do Jupyter[^2]. A página inicial mostra todos os ficheiros no diretório de trabalho atual. + +{% include figure.html filename="jupyter-home.png" alt="Imagem com a apresentação da estrutura de ficheiros da página inicial do Jupyter" caption="A página inicial do Jupyter mostrando os ficheiros da lição." %} + +
    +No seu shell, certifique-se de que usou o comando cd para ir até ao diretório descomprimido lesson-files. +
    + +Clique nos *notebooks* `hathitrust.ipynb` e `internetarchive.ipynb` para abri-los em novas abas do navegador de internet. A partir daqui, nós não precisamos de executar qualquer comando no *shell*. Os *notebooks* permitem-nos executar o código Python e ter acesso total ao sistema de pastas do computador. Quando o leitor tiver terminado, pode parar o servidor do *notebook* carregando em "*Quit*" na página inicial do Jupyter ou executando `ctrl+c` no *shell*. + +# HathiTrust + +## Acesso à API + +O leitor precisa efetuar um registro no HathiTrust antes de usar o API de dados. Dirija-se ao [portal de registro](https://babel.hathitrust.org/cgi/kgs/request) e preencha o seu nome, a sua organização e o seu e-mail para requerer chaves de acesso. O leitor deverá receber uma resposta no e-mail dentro de cerca dum minuto (**nota de tradução**: verifique também a caixa de *spam*). Clique no link, que o trará a uma página temporária com ambas as chaves exibidas. + +No *notebook* `hathitrust.ipynb`, examine a primeira célula (mostrada em baixo). Preencha as suas chaves da API como indicado. Depois, execute a célula clicando em "*Run*" na barra de navegação do *notebook*. + +```python +# Importe o wrapper da API de dados do HT +from hathitrust_api import DataAPI + +# Substitua as strings com as suas credenciais do HT (deixando as aspas) +ht_access_key = "YOUR_ACCESS_KEY_HERE" +ht_secret_key = "YOUR_SECRET_KEY_HERE" + +# Instancie o objeto de conexão da API de dados +data_api = DataAPI(ht_access_key, ht_secret_key) +``` + +
    +Cuidado! Não exponha as suas chaves de acesso através dum repositório público no GitHub (ou outro host de controle de versões). Elas serão pesquisáveis por qualquer outra pessoa. Uma boa prática para um projeto Python é a de armazenar as suas chaves de acesso como variáveis de ambiente ou salvá-las num ficheiro que não é versionado. +
    + +## Criar uma Lista de Volumes + +O HT permite a qualquer um fazer uma coleção de itens—o leitor nem sequer tem que estar na sua conta! No entanto, o leitor deveria registrar uma conta se quiser salvar a sua lista de volumes. Siga as [instruções](https://babel.hathitrust.org/cgi/mb?colltype=updated) para fazer algumas pesquisas no texto completo e para, depois, adicionar resultados escolhidos a uma coleção. Atualmente, o HathiTrust não tem uma API de pesquisa pública para adquirir volumes programaticamente; o leitor precisa de pesquisar através da sua *interface* da internet. + +Ao atualizar uma coleção, o HT mantém o rastro dos metadados associados para cada item nela. Eu incluí nos ficheiros da lição os metadados para uma coleção de amostra no formato JSON. Se quisesse usar o ficheiro da sua própria coleção do HT, o leitor navegaria até à página das suas coleções e colocaria o cursor do *mouse* sobre o link dos metadados à esquerda para revelar a opção para fazer o *download* como JSON, como observado na seguinte captura de tela. + +{% include figure.html filename="download-ht-json.png" alt="Imagem de uma página web do site HathiTrust com instruções para download de metadados de ficheiros JSON" caption="Captura de tela de como fazer o *download* dos metadados de coleções no formato JSON." %} + +Assim que o leitor tiver feito o *download* do ficheiro JSON, basta movê-lo para o diretório onde colocou os *Jupyter notebooks*. Substitua o nome do ficheiro JSON no *notebook* do HT com o nome do ficheiro da sua coleção. + +O *notebook* mostra como usar *list comprehension* para obter todas as *strings* `htitem_id` dentro do objeto `gathers` que contém todas as informações da coleção. 
+ +```python +# O leitor pode especificar o ficheiro de metadados da sua coleção aqui +metadata_path = "554050894-1535834127.json" + +with open(metadata_path, "r") as fp: + data = json.load(fp) + +# Uma lista de todas as IDs exclusivas na coleção +vol_ids = [item['htitem_id'] for item in data['gathers']] +``` + +
    +Os tutoriais normalmente mostram-lhe como processar um item de exemplo (muitas vezes de tamanho ou complexidade trivial). Isto é pedagogicamente conveniente, mas significa que o leitor está menos equipado para aplicar esse código a múltiplos itens—de longe o caso de uso mais comum. Nos notebooks, o leitor verá como encapsular transformações aplicadas a um item em funções que podem ser usadas num loop sobre uma coleção de itens. +
    + +## Característica Visual: IMAGE_ON_PAGE + +Dada uma lista de volumes, nós queremos explorar que características visuais eles têm ao nível da página. A [documentação mais recente](https://perma.cc/Y6UU-G9HZ) (2015) para o API de dados descreve um objeto metadados chamado `htd:pfeat` nas páginas 9-10. `htd:pfeat` é a abreviação para "HathiTrust Data API: Page Features". + +> * `htd:pfeat`­ - the page feature key (if available): +> - CHAPTER_START +> - COPYRIGHT +> - FIRST_CONTENT_CHAPTER_START +> - FRONT_COVER +> - INDEX +> - REFERENCES +> - TABLE_OF_CONTENTS +> - TITLE + +O que o *wrapper* `hathitrust-api` faz é disponibilizar os metadados completos para um volume do HT como um objeto Python. Dado o identificador dum volume, nós podemos pedir os seus metadados e, depois, fazer o *drill down* através da *sequência* de páginas até à informação ao nível da página. A *lista* `htd:pfeat` está associada com cada página num volume e, em teoria, contém todas as características que se aplicam a essa página. Na prática, existem mais algumas *tags* de características do que as oito listadas em cima. Aquela com a qual nós iremos trabalhar chama-se `IMAGE_ON_PAGE` e é mais abstratamente visual que *tags* estruturais como `CHAPTER_START`. + +Tom Burton-West, um bibliotecário pesquisador na biblioteca da *University of Michigan*, trabalha em estreita colaboração com o HathiTrust e o HTRC, o Centro de Pesquisa do HathiTrust. O Tom disse-me por e-mail que o HathiTrust recebe a informação `htd:pfeat` via o Google, com o qual trabalham proximamente desde a fundação do HT, em 2008. Um contacto no Google deu permissão ao Tom para partilhar o seguinte: + +> Estas *tags* são derivadas duma combinação de Heurística, de aprendizagem de máquina e de anotação humana. + +Um exemplo heurístico pode ser o facto do primeiro elemento na sequência de páginas do volume ser quase sempre a `FRONT_COVER`. 
A aprendizagem de máquina pode ser usada para treinar modelos a discriminar, digamos, entre dados de imagem que são mais típicos das linhas de prosa numa escrita ocidental ou das linhas numa gravura. A anotação humana é a atribuição manual de etiquetas a imagens. A habilidade de ver as ilustrações dum volume nos bancos de dados do EEBO e do ECCO é um exemplo de anotação humana. + +O uso da "aprendizagem de máquina" pelo Google parece um pouco misterioso. Até o Google publicitar os seus métodos, é impossível saber todos os detalhes. No entanto, é provável que as *tags* `IMAGE_ON_PAGE` tenham sido propostas pela primeira vez após a deteção de blocos de "Imagens" nos ficheiros de _output_ do OCR (um processo discutido em baixo, na secção do Internet Archive). Mais filtragem pode, então, ser aplicada. + +## Passo a Passo Para o Código + +### Encontrar as imagens + +Nós vimos como criar uma lista de volumes e observámos que a API de dados pode ser usada para obter objetos metadados contendo características experimentais ao nível da página. A função essencial no *notebook* do HT tem a assinatura `ht_picture_download(item_id, out_dir=None)`. Dado um identificador exclusivo e um diretório de destino opcional, esta função irá, em primeiro lugar, obter os metadados do volume a partir da API e convertê-los num formato JSON. Depois, percorre a sequência de páginas e verifica se a *tag* `IMAGE_ON_PAGE` está na lista `htd:pfeat` (se a mesma existir).
+ +```python +# Metadados da API no formato JSON (diferente dos metadados da coleção do HT) +meta = json.loads(data_api.getmeta(item_id, json=True)) + +# A sequência dá-nos cada página do item digitalizado em ordem, com qualquer +# informação adicional que lhe pode estar disponível +sequence = meta['htd:seqmap'][0]['htd:seq'] + +# A lista de páginas com imagens (vazio para a iniciação) +img_pages = [] + +# O bloco try/except lida com situações onde nenhuma "pfeats" existe OU +# os números da sequência não são numéricos +for page in sequence: + try: + if 'IMAGE_ON_PAGE' in page['htd:pfeat']: + img_pages.append(int(page['pseq'])) + except (KeyError, TypeError) as e: + continue +``` + +Note que nós precisamos de fazer o *drill down* por vários níveis dentro do objeto de nível de topo para obter o objeto `htd:seq`, sobre o qual nós podemos iterar. + +As duas exceções que eu quero evitar são o `KeyError`, que ocorre quando a página não tem qualquer característica ao nível da página a si associada, e o `TypeError`, que ocorre quando o campo `pseq` para a página é, por alguma razão, não numérico e, portanto, não pode ser convertido num `int`. Se algo correr mal com uma página, nós simplesmente executamos `continue` para passar à próxima. O plano é obter todos os dados bons que conseguirmos. Não é limpar inconsistências ou falhas nos metadados do item. + +### Fazer o *Download* das Imagens + +Assim que `img_pages` contém a lista completa de páginas com a *tag* `IMAGE_ON_PAGE`, nós podemos fazer o download dessas páginas. Note que, se nenhum `out_dir` for fornecido a `ht_picture_download()`, então a função simplesmente retorna a lista `img_pages` e NÃO faz o *download* do que quer que seja. + +A chamada da API `getpageimage()` retorna um JPEG por predefinição. Nós simplesmente colocamos os bytes do JPEG num ficheiro na forma normal. Dentro da subpasta do volume (ela própria dentro do `out_dir`), as páginas serão nomeadas `1.jpg` para a página 1 e assim sucessivamente.
+ +Uma coisa a considerar é a nossa taxa de uso da API. Nós não queremos abusar do nosso acesso ao fazer centenas de pedidos por minuto. Para estar a salvo, especialmente se pretendermos executar grandes trabalhos, nós esperamos dois segundos antes de fazer cada pedido de página. Isto pode ser frustrante a curto prazo, mas ajuda a evitar o sufocamento ou a suspensão da API. + + +```python +for i, page in enumerate(img_pages): + try: + # Uma simples mensagem de estado + print("[{}] Downloading page {} ({}/{})".format(item_id, page, i+1, total_pages)) + + img = data_api.getpageimage(item_id, page) + + # N.B.: O loop só é executado se "out_dir" não for "None" + img_out = os.path.join(out_dir, str(page) + ".jpg") + + # Escreva a imagem + with open(img_out, 'wb') as fp: + fp.write(img) + + # Para evitar exceder o uso da API permitido + time.sleep(2) + + except Exception as e: + print("[{}] Error downloading page {}: {}".format(item_id, page,e)) +``` + +# Internet Archive + +## Acesso à API + +Nós conectamos à biblioteca API do Python usando uma conta no Archive.org com e-mail e palavra-passe ao invés das chaves de acesso da API. Isto é discutido no [Guia Quickstart](https://archive.org/developers/index.html). Se não tiver uma conta, [registre-se](https://archive.org/account/login.createaccount.php) para obter o seu "Virtual Library Card". + +Na primeira célula do *notebook* `internetarchive.ipynb`, introduza as suas credenciais como indicado. Execute a célula para autenticar-se perante a API. + +> **Nota de tradução**: O comando `ia.configure(ia_email, ia_password)` é atualmente desnecessário e pode gerar um erro extenso, em cuja mensagem final consta: `InvalidURL: Invalid URL 'https:///services/xauthn/': No host supplied`. Sugerimos que o mesmo não seja executado no ficheiro IPYNB.
+ +## Criar uma Lista de Volumes + +A biblioteca IA do Python permite-lhe submeter *query strings* e receber uma lista de pares chave-valor correspondentes na qual a palavra "*identifier*", ou identificador, em português, é a chave e o verdadeiro identificador é o valor. A sintaxe para uma *query* é explicada na [página de Advanced Search](https://archive.org/advancedsearch.php) para o IA. O leitor pode especificar parâmetros ao usar uma palavra-chave como "*date*" ou "*mediatype*" seguida de dois pontos e o valor que quer atribuir a esse parâmetro. Por exemplo, eu só quero resultados que são *textos* (em oposição a vídeos, *etc.*). Certifique-se que os parâmetros e as opções que está a tentar usar são suportadas pela funcionalidade de pesquisa do IA. Caso contrário, pode perder ou obter resultados estranhos e não saber porquê. + +No *notebook*, eu gero uma lista de IDs do IA com o seguinte código: + +```python +# Uma pesquisa de amostra (deve gerar dois resultados) +query = "peter parley date:[1825 TO 1830] mediatype:texts" +vol_ids = [result['identifier'] for result in ia.search_items(query)] +``` + +## Característica Visual: Blocos de Imagens + +O Internet Archive não apresenta quaisquer características ao nível da página. Ao invés, disponibiliza um certo número de ficheiros brutos do processo de digitalização aos utilizadores. O mais importante destes para os nossos propósitos é o ficheiro XML Abbyy. Abbyy é uma empresa russa cujo *software* FineReader domina o mercado do OCR. + +Todas as versões recentes do FineReader produzem um [documento XML](https://perma.cc/83EK-LXP2) que associa diferentes "blocos" com cada página no documento digitalizado. O tipo de bloco mais comum é `Text` mas também existem blocos `Picture` ou "Imagem", em português. Aqui está um bloco de exemplo tirado dum ficheiro de XML Abbyy do IA. Os cantos superior esquerdo ("t" e "l") e inferior direito ("b" e "r") são suficientes para identificar a região de bloco retangular. 
+ +```xml + + + +``` + +O equivalente no IA a ver as *tags* `IMAGE_ON_PAGE` no HT é a análise do ficheiro XML Abbyy e a iteração sobre cada página. Se existir pelo menos um bloco `Picture` nessa página, a página é sinalizada como possivelmente contendo uma imagem. + +Enquanto a característica `IMAGE_ON_PAGE` do HT não contém informação sobre a *localização* dessa imagem, os blocos `Picture` no ficheiro XML estão associados a uma região retangular na página. No entanto, porque o FineReader se especializa no reconhecimento de letras de conjuntos de caracteres ocidentais, é muito menos preciso a identificar regiões de imagem. O projeto de Leetaru (veja *Visão Geral*) usou as coordenadas da região para cortar imagens, mas nesta lição nós iremos simplesmente fazer o *download* da página inteira. + +Parte da diversão intelectual desta lição é usar um *dataset* (*tags* de bloco do OCR) por vezes confuso para um propósito largamente não intencional: identificar imagens e não palavras. A certa altura, tornar-se-á computacionalmente viável executar modelos de aprendizagem aprofundada em todas as páginas ilustradas nuas num volume e escolher o(s) tipo(s) de imagem(/ns) desejada(s). Mas, como a maior parte das páginas na maioria dos volumes não são ilustradas, esta é uma tarefa dispendiosa. Por agora, faz mais sentido aproveitar os dados existentes que nós detemos do processo de ingestão do OCR. + +Para mais informações sobre como o próprio OCR funciona e interage com o processo de digitalização, por favor, veja a lição do *PH* de Mila Oiva, [OCR With Tesseract and ScanTailor](/en/lessons/retired/OCR-with-Tesseract-and-ScanTailor) (atenção que esta lição já não é actualizada). Erros podem surgir por causa de distorções, artefactos e muitos outros problemas. Estes erros acabam por afetar a fiabilidade e a precisão dos blocos "Picture". Em muitos casos, o Abbyy estimará que páginas em branco ou descoloridas são, na realidade, imagens. 
Estas *tags* de bloco incorretas, ainda que indesejadas, podem ser combatidas com o uso de redes neurais convolucionais retreinadas. Pense nas páginas com imagens cujo download foi feito nesta lição como um primeiro passo num processo mais longo para obter um *dataset* limpo e útil de ilustrações históricas. + +## Passo a Passo do Código + +### Encontrar as Imagens + +Tal como com o HT, a função principal para o IA é `ia_picture_download(item_id, out_dir=None)`. + +Visto que envolve o I/O dum ficheiro, o processo para obter a lista `img_pages` é mais complicado do que o do HT. Usando a utilidade `ia` (que é instalada com a biblioteca) da linha de comando, o leitor pode obter uma ideia dos ficheiros de metadados disponíveis para um volume. Com muito poucas exceções, um ficheiro com o formato "Abbyy GZ" deveria estar disponível para volumes com o tipo de *media* `text` no Internet Archive. + +Estes ficheiros, mesmo quando comprimidos, podem facilmente ter centenas de megabytes de tamanho! Se existir um ficheiro Abbyy para o volume, nós obtemos o seu nome e depois fazemos o *download*. A chamada `ia.download()` usa alguns parâmetros úteis para ignorar a solicitação se o ficheiro já existe e, se não, para fazer o seu *download* sem criar um diretório aninhado. Para poupar espaço, nós eliminamos o ficheiro Abbyy depois de o analisar.
+ +```python +# Use o cliente da linha de comandos para ver os formatos de metadados disponíveis: +# `ia metadata formats VOLUME_ID` + +# Para esta lição, só o ficheiro Abbyy é necessário +returned_files = list(ia.get_files(item_id, formats=["Abbyy GZ"])) + +# Certifique-se de que algo é devolvido +if len(returned_files) > 0: + abbyy_file = returned_files[0].name +else: + print("[{}] Could not get Abbyy file".format(item_id)) + return None + +# Faça o download do ficheiro Abbyy para o CWD +ia.download(item_id, formats=["Abbyy GZ"], ignore_existing=True, \ + destdir=os.getcwd(), no_directory=True) +``` + +Assim que nós tivermos o ficheiro, nós precisamos de analisar o XML usando a biblioteca padrão do Python. Nós tomamos vantagem do facto de que nós podemos abrir o ficheiro comprimido diretamente com a biblioteca `gzip`. Os ficheiros Abbyy são indexados a partir do zero, por isso a primeira página na sequência digitalizada tem o índice de 0. No entanto, nós temos que filtrar 0 porque não pode ser solicitado ao IA. A exclusão do índice 0 por parte do IA não está documentada em qualquer lugar; em vez disso, eu descobri através de tentativa e erro. Se o leitor vir uma mensagem de erro de explicação difícil, tente rastrear a origem e não tenha medo de pedir ajuda, seja a alguém com experiência relevante, seja a alguém da própria organização.
+ +```python +# Colecione as páginas com pelo menos um bloco de imagem +img_pages = [] + +with gzip.open(abbyy_file) as fp: + tree = ET.parse(fp) + document = tree.getroot() + for i, page in enumerate(document): + for block in page: + try: + if block.attrib['blockType'] == 'Picture': + img_pages.append(i) + break + except KeyError: + continue + +# 0 não é uma página válida para a realização de solicitações GET ao IA, mas às vezes +# está no ficheiro Abbyy comprimido +img_pages = [page for page in img_pages if page > 0] + +# Acompanhe o progresso do download +total_pages = len(img_pages) + +# Os ficheiros do OCR são pesados, por isso elimine assim que tivermos a lista de páginas +os.remove(abbyy_file) +``` + +### Fazer o *Download* das Imagens + +O *wrapper* do IA incorporado no Python não providencia uma função de download de páginas únicas—apenas em massa. Isto significa que nós usaremos a RESTful API do IA para obter páginas específicas. Primeiro, nós construímos um URL para cada página de que nós precisamos. Depois, nós usamos a biblioteca `requests` para enviar uma solicitação `GET` de HTTP e, se tudo correr bem (*i.e.* o código 200 é enviado na resposta), nós escrevemos o conteúdo da resposta num ficheiro JPEG. + +O IA tem estado a trabalhar numa [versão *alpha*](https://perma.cc/F6HJ-YGM7) duma API para o corte e redimensionamento de imagens que obedeça às exigências do International Image Interoperability Framework ([IIIF](https://perma.cc/7ABF-GGJM)). O IIIF representa uma profunda melhoria face ao antigo método para *downloads* de páginas únicas que requeriam a realização do *download* de ficheiros JP2, um formato de ficheiro largamente não suportado. 
Agora, é extremamente simples obter um só JPEG duma página: + +```python +# Veja: https://iiif.archivelab.org/iiif/documentation +urls = ["https://iiif.archivelab.org/iiif/{}${}/full/full/0/default.jpg".format(item_id, page) + for page in img_pages] + +# Sem download de página direto a partir da biblioteca do Python, construa uma solicitação GET +for i, page, url in zip(range(1,total_pages), img_pages, urls): + + rsp = requests.get(url, allow_redirects=True) + + if rsp.status_code == 200: + print("[{}] Downloading page {} ({}/{})".format(item_id, \ + page, i+1, total_pages)) + + with open(os.path.join(out_dir, str(page) + ".jpg"), "wb") as fp: + fp.write(rsp.content) +``` + +# Próximos Passos + +Assim que o leitor tiver entendido as principais funções e o código de *unpacking* dos dados nos *notebooks*, sinta-se livre para executar as células em sequência ou carregar em "*Run All*" e ver as páginas ilustradas a entrar nas pastas. O leitor é encorajado a adaptar estes *scripts* e funções para as suas próprias questões de pesquisa. + +[^1]: **Nota de tradução**: Aconselhamos o leitor a adicionar o Python ao PATH, processo que pode ser feito na ocasião da sua instalação. Isto irá suavizar a incorporação das dependências (veja *Dependências*). + +[^2]: **Nota de tradução**: Inicialmente, aparece uma página de transição, a qual deverá remeter rapidamente para o Jupyter. Caso tal não aconteça, basta seguir as instruções nesta página. + + diff --git a/pt/licoes/extrair-palavras-chave.md b/pt/licoes/extrair-palavras-chave.md index 8d7a0cb4d5..b6555e5e90 100644 --- a/pt/licoes/extrair-palavras-chave.md +++ b/pt/licoes/extrair-palavras-chave.md @@ -48,7 +48,7 @@ Se tem uma cópia de um texto armazenada no seu computador, é relativamente fá Esta lição é útil para qualquer um que trabalhe com fontes históricas armazenadas no seu próprio computador e que estejam transcritas em formatos mutáveis de texto eletrónico. 
É particularmente útil para pessoas interessadas em delimitar subgrupos de documentos que contenham uma ou mais de um grande número de palavras-chave. Isto pode ser útil para identificar um subconjunto relevante para leitura atenta ou para extrair e estruturar as palavras-chave num formato que possa ser usado noutra ferramenta digital como, por exemplo, dados de entrada para um exercício de mapeamento. -O presente tutorial mostrará aos usuários como extrair todas as menções a condados ingleses e gauleses de uma série de 6,692 minibiografias de indivíduos que ingressaram na Universidade de Oxford durante o reinado de Jaime I de Inglaterra (1603-1625). Estes registos foram transcritos pela [British History Online](http://www.british-history.ac.uk/alumni-oxon/1500-1714) (em inglês), através da versão impressa de *Alumni Oxonienses, 1500-1714*. Estas biografias contêm informação sobre cada aluno, incluindo a data dos seus estudos e a faculdade ou faculdades que frequentaram. Muitas vezes incluem informações adicionais, quando conhecidas, como a data de nascimento e morte, o nome e ocupação do pai, a sua naturalidade, e o percurso profissional posterior. As biografias são fontes ricas, das quais provêm informações relativamente comparáveis sobre um grande número de indivíduos semelhantes (homens ricos que frequentaram Oxford). Os 6,692 registos foram pré-processados pelo autor e salvos num [ficheiro CSV](https://perma.cc/MLL8-8BG4) (em inglês), com uma entrada por linha. +O presente tutorial mostrará aos usuários como extrair todas as menções a condados ingleses e gauleses de uma série de 6,692 minibiografias de indivíduos que ingressaram na Universidade de Oxford durante o reinado de Jaime I de Inglaterra (1603-1625). Estes registos foram transcritos pela [British History Online](https://www.british-history.ac.uk/alumni-oxon/1500-1714) (em inglês), através da versão impressa de *Alumni Oxonienses, 1500-1714*. 
Estas biografias contêm informação sobre cada aluno, incluindo a data dos seus estudos e a faculdade ou faculdades que frequentaram. Muitas vezes incluem informações adicionais, quando conhecidas, como a data de nascimento e morte, o nome e ocupação do pai, a sua naturalidade, e o percurso profissional posterior. As biografias são fontes ricas, das quais provêm informações relativamente comparáveis sobre um grande número de indivíduos semelhantes (homens ricos que frequentaram Oxford). Os 6,692 registos foram pré-processados pelo autor e salvos num [ficheiro CSV](https://perma.cc/MLL8-8BG4) (em inglês), com uma entrada por linha. Neste tutorial, o ["dataset"](https://perma.cc/V2B9-WVAK) envolve palavras-chave geográficas. Uma vez extraídas, os nomes de localidades podem ser georreferenciados para o seu local no globo e, depois, mapeados, recorrendo ao mapeamento digital. Isto torna possível determinar quais as faculdades que atraíam estudantes de determinadas partes do país, e se estes padrões se alteraram ao longo do tempo. Para um tutorial prático sobre como aplicar este próximo passo, veja a [lição de Fred Gibbs](https://perma.cc/64YX-2E2V) mencionada no final desta lição. Os leitores também podem estar interessados em ler [Georreferenciamento com o QGIS 3.20](/pt/licoes/georreferenciamento-qgis), também disponível no *Programming Historian*. diff --git a/pt/licoes/geocodificando-qgis.md b/pt/licoes/geocodificando-qgis.md index ec36275a3e..0f53109faf 100644 --- a/pt/licoes/geocodificando-qgis.md +++ b/pt/licoes/geocodificando-qgis.md @@ -80,10 +80,10 @@ A tradução deste tutorial foi feita utilizando o QGIS 3.32, numa máquina Wind Também precisará de utilizar uma base de dados relacional, como Microsoft Access ou o [LibreOffice Base](https://pt-br.libreoffice.org/baixar/) ou, então, ter bastante experiência com folhas de cálculo. 
As instruções no tutorial são feitas tendo o LibreOffice Base em mente, uma ferramenta de download gratuito como parte do pacote [LibreOffice](https://pt-br.libreoffice.org/baixar/).
    -Atenção: O LibreOffice requer uma instalação completa de Java para utilizar o aplicativo Base. Isto é facil de realizar fazendo o download e instalando o Java 8 Development Kit no seu sistema operacional pelo Oracle. O Java 8 Runtime Environment NÃO funciona com o LibreOffice no macOS 10.11. +Atenção: O LibreOffice requer uma instalação completa de Java para utilizar o aplicativo Base. Isto é fácil de realizar fazendo o download e instalando o Java 8 Development Kit no seu sistema operacional pelo Oracle. O Java 8 Runtime Environment NÃO funciona com o LibreOffice no macOS 10.11.
    -O tutorial irá mapear os dados extraídos do [*Alumni Oxonienses*](http://www.british-history.ac.uk/alumni-oxon/1500-1714) (em inglês), da lição do *Programming Historian*, [Using Gazetteers to Extract Sets of Keywords from Free-Flowing Texts](/en/lessons/extracting-keywords) (em inglês), utilizando mapas de condados históricos da Inglaterra e do País de Gales, mapas estes disponíveis publicamente. Completar esse tutorial primeiro ajudará a compreender os dados mapeados aqui. Esses dados são oferecidos tanto como um conjunto de dados completo, quanto como um arquivo à parte que reúne os nomes de ex-alunos de Oxford pelos condados de origem, criado a partir do primeiro arquivo utilizando uma PivotTable do Excel. +O tutorial irá mapear os dados extraídos do [*Alumni Oxonienses*](https://www.british-history.ac.uk/alumni-oxon/1500-1714) (em inglês), da lição do *Programming Historian*, [Using Gazetteers to Extract Sets of Keywords from Free-Flowing Texts](/en/lessons/extracting-keywords) (em inglês), utilizando mapas de condados históricos da Inglaterra e do País de Gales, mapas estes disponíveis publicamente. Completar esse tutorial primeiro ajudará a compreender os dados mapeados aqui. Esses dados são oferecidos tanto como um conjunto de dados completo, quanto como um arquivo à parte que reúne os nomes de ex-alunos de Oxford pelos condados de origem, criado a partir do primeiro arquivo utilizando uma PivotTable do Excel. ## Os dados @@ -126,7 +126,7 @@ Atenção: O QGIS é bastante sensível ao corrigir arquivos CSV (Comma Separate Existe uma diferença importante entre Sistemas de Coordenadas Geográficas, que meramente definem as unidades de medida e o datum, e Sistemas de Coordenadas Projetadas, que também definem a maneira com a qual o globo é “achatado” sobre um mapa. 
O [OSGB](https://perma.cc/6U2D-V8SZ) (em inglês) está disponível em ambas as variantes do QGIS, então escolha a versão “projetada” que lhe dará um mapa no qual o Reino Unido apareça da maneira esperada. Para mais detalhes sobre projeções em SIG, veja o [tutorial Working with Projections in QGIS.](https://perma.cc/U47A-7CGG) (em inglês). -* Faça download de um shapefile contendo polígonos dos condados históricos da Inglaterra e do País de Gales em [http://www.county-borders.co.uk](http://www.county-borders.co.uk/) (em inglês) (selecione o arquivo `Definition A: SHP OSGB36 Simplified`, que é uma versão das fronteiras entre os condados da Grã-Bretanha, pré-1843, projetada sobre o OSGB, sem porções destacadas dos condados). Extraia o conteúdo do arquivo ZIP para a mesma pasta do seu projeto +* Faça download de um shapefile contendo polígonos dos condados históricos da Inglaterra e do País de Gales em [https://www.county-borders.co.uk](https://www.county-borders.co.uk/) (em inglês) (selecione o arquivo `Definition A: SHP OSGB36 Simplified`, que é uma versão das fronteiras entre os condados da Grã-Bretanha, pré-1843, projetada sobre o OSGB, sem porções destacadas dos condados). Extraia o conteúdo do arquivo ZIP para a mesma pasta do seu projeto * Clique no botão _Adicionar Camada Vetorial_ (remete a uma linha de gráfico), na barra de ferramentas Administrar Camadas, e então em _Explorar_ para selecionar e adicionar o shapefile `UKDefinitionA.shp` da pasta extraída. @@ -165,7 +165,7 @@ Ao alterar qualquer uma destas configurações contidas na página estilo gradua A geocodificação é uma técnica para além da simplesmente unir tabelas, pois cada linha individual dos seus dados mantém-se visível e passível de análise dentro do próprio software SIG, como pontos individuais no mapa (como na tabela 2). A princípio, o objetivo é atribuir a cada dado um par de coordenadas. 
A maior parte dos dados históricos não podem ser geocodificados automaticamente por meio de ferramentas online ou plugins do QGIS. Portanto, o processo de geocodificação deve ser realizado manualmente para combinar cada linha de dados a uma localização. Isso é uma tarefa operacional simples, unindo (combinando) os seus dados com um gazetteer (uma lista de lugares com suas coordenadas). Vários dicionários geográficos estão disponíveis, mas apenas alguns são pertinentes em relação a dados históricos. Por exemplo, para a Inglaterra: -- [Association of British Counties Gazetteer](http://www.gazetteer.org.uk/index.php) (em inglês) (dados disponíveis para compra) +- [Association of British Counties Gazetteer](https://www.gazetteer.org.uk/index.php) (em inglês) (dados disponíveis para compra) - [The Historical Gazetteer of England's Place Names](https://placenames.org.uk/) (em inglês) permite geocodificar as localizações individuais apenas online. Infelizmente, a API para acessar esses dados para geocodificação automática, conhecida como DEEP, parte do Unlock, já não está disponível (final de 2016). Uma melhor [interface de navegação](https://www.nottingham.ac.uk/research/groups/ins/Resources/Digital-Survey-of-English-Place-Names.aspx) está disponível em [Survey of English Place-Names](https://www.nottingham.ac.uk/research/groups/epns/survey.aspx) (em inglês). Caso não tenha nenhum gazetteer pertinente para a sua área ou período de estudo, é possível facilmente criar o seu próprio através de um mapa vetorial, criando uma camada de pontos contendo a informação necessária dentro do QGIS (talvez ao mesclar as informações de camadas pré-existentes) e exportando o resultando com coordenadas XY. Para determinadas partes do mundo não existem nem dicionários geográficos históricos, nem mapas vetoriais adequados para certos períodos históricos. 
Nesses casos, terá que se aventurar a criar seu próprio vetor e a sua camada de pontos; consulte o tutorial [Criar novas camadas vetoriais com o QGIS 2.0](/pt/licoes/camadas-vetoriais-qgis). @@ -175,7 +175,7 @@ Caso não tenha nenhum gazetteer pertinente para a sua área ou período de estu Uma vez completa a primeira parte, pode-se avançar e seguir os passos abaixo no mesmo projeto. Caso contrário, ou caso deseje criar um novo projeto em branco, siga as instruções da primeira seção para: * Criar um novo arquivo de projeto no QGIS, e configurar o Sistema de Referência Coordenado para `OSGB 1936/the British National Grid` com a ID de autoridade `ESPG:27700` como um sistema de projeção de coordenadas usando **Projeto** > **Propriedades** > **SRC**; -* Faça o download de um shapefile contendo polígonos dos condados históricos da Inglaterra e do País de Gales em [http://www.county-borders.co.uk/](http://www.county-borders.co.uk/) (em inglês) (selecione a definição A e o OSGB). +* Faça o download de um shapefile contendo polígonos dos condados históricos da Inglaterra e do País de Gales em [https://www.county-borders.co.uk/](https://www.county-borders.co.uk/) (em inglês) (selecione a definição A e o OSGB). No seu projeto pré-existente, pode então começar a adicionar mais camadas para criar o gazetteer: @@ -197,7 +197,7 @@ Agora, estes dados podem ser comparados aos seus dados pré-existentes para fina Podemos, agora, criar uma Tabela Composta com esses locais e os dados da nossa tabela original. Isto se dá ao corresponder o nome do condado, no campo “Lugar” da tabela de ex-alunos, ao ponto correspondente no gazetteer novo, utilizando uma base de dados relacional. -Esse tutorial supõe que tem várias centenas ou milhares ou linhas de dados (como neste tutorial), sendo necessário um método automatizado. 
Caso tenha apenas algumas linhas, ou caso tenha dificuldades para utilizar estes métodos, é possível fazê-lo manualmente — consulte [Geocodificando os seus próprios dados histórico](#Geocodificar-seus-próprios-dados-históricos)” abaixo. +Esse tutorial supõe que tem várias centenas ou milhares ou linhas de dados (como neste tutorial), sendo necessário um método automatizado. Caso tenha apenas algumas linhas, ou caso tenha dificuldades para utilizar estes métodos, é possível fazê-lo manualmente — consulte [Geocodificando os seus próprios dados histórico](#geocodificar-seus-próprios-dados-históricos)” abaixo. Em contextos mais simples (como este, onde iremos apenas combinar um único atributo de “lugar” — ou seja, apenas “condado”), é possível codificar os seus dados de acordo com um gazetteer com a função [PROCV](https://perma.cc/3JQ4-226T) (em inglês) do Microsoft Excel (ou folhas de cálculo equivalentes), ou até mesmo com o plugin [MMQGIS](https://michaelminn.com/linux/mmqgis/) do QGIS. No entanto, na prática, geralmente será preciso combinar diversos atributos simultaneamente (por exemplo, cidade, condado e país — seria necessário distinguir entre Sudbury, em Suffolk, Inglaterra; Sudbury, em Derbyshire, Inglaterra; Sudbury, em Middlesex, Inglaterra; e Sudbury, em Ontario, Canadá). Isso pode ser realizado através de um método trabalhoso, com a função [ÍNDICE](https://perma.cc/4JSF-R3ER) do Excel, que é mais prático, e extensível, numa base de dados relacional, como Microsoft Access ou o LibreOffice Base. @@ -222,7 +222,7 @@ Este tutorial usa o LibreOffice, uma alternativa de código aberto ao Microsoft * Exporte os resultados no formato CSV. No LibreOffice Base isto é possível ao arrastar a própria consulta sobre a primeira célula de uma nova folha de cálculo no LibreOffice Calc. Em seguida, em _Salvar como_, selecione o formato CSV na aba **Tipo** na parte de baixo da janela Salvar, e clique em _Salvar_ para criar o arquivo como `GeocodExAlunos.csv`.
    -Atenção: Ainda que as consultas da base de dados relacionais como estas sejam muito boas para combinar múltiplos critérios simultaneamente, elas também podem apresentar resultados errôneos se não forem verificadas cuidadosamente. Consulte a seção Solucionar problemas com a base de dados dos gazetteers unidos no final desse tutorial para dicas sobre como inspecionar os resultados das uniões quando trabalha com os seus próprios dados. +Atenção: Ainda que as consultas da base de dados relacionais como estas sejam muito boas para combinar múltiplos critérios simultaneamente, elas também podem apresentar resultados errôneos se não forem verificadas cuidadosamente. Consulte a seção Solucionar problemas com a base de dados dos gazetteers unidos no final desse tutorial para dicas sobre como inspecionar os resultados das uniões quando trabalha com os seus próprios dados.
    ### Adicionar dados geocodificados ao QGIS @@ -272,7 +272,7 @@ Enquanto este selecionaria apenas os matriculados antes de 1612: O processo aqui percorrido — o de combinar usando consultas externas — é aplicável a uma grande variedade de cenários, sempre que possa adquirir ou criar um gazetteer adequado. Lembre-se que o seu sucesso vai depender da consistência e precisão dos seus dados. Certifique-se de que as mesmas convenções são seguidas tanto nos dados quanto no gazetteer, principalmente quanto à pontuação (veja: _Devon_ ou _Devonshire_, _Hay-on-Wye_, ou _Hay on Wye_). Caso tenha a sorte de trabalhar com dados apresentados num formato moderno (como países, ruas e, até mesmo, códigos postais modernos), é possível utilizar os processos mais simples de geocodificação automatizada. Consultas a seção abaixo. -Caso possua apenas um pequeno número de fileiras nos seus dados, ou caso tenha dificuldade em padronizar as informações de localização num único campo para que seja geocodificada pelos métodos ensinados nesse tutorial, lembre-se que é possível realizar este processo manualmente. Basta utilizar uma das diversas ferramentas online de geocodificação para encontrar as coordenadas X e Y para cada fileira de dados nas colunas X e Y da sua folha de cálculo ou base de dados manualmente. Mas, lembre-se de anotar o sistema de coordenadas utilizado pela ferramenta para encontrar tais coordenadas (provavelmente, WGS1984)! Caso tenha dados geocodificados manualmente dessa forma, apenas siga as instruções acima a partir de [Adicionar dados geocodificados ao QGIS](#Adicionar-dados-geocodificados-ao-QGIS). +Caso possua apenas um pequeno número de fileiras nos seus dados, ou caso tenha dificuldade em padronizar as informações de localização num único campo para que seja geocodificada pelos métodos ensinados nesse tutorial, lembre-se que é possível realizar este processo manualmente. 
Basta utilizar uma das diversas ferramentas online de geocodificação para encontrar as coordenadas X e Y para cada fileira de dados nas colunas X e Y da sua folha de cálculo ou base de dados manualmente. Mas, lembre-se de anotar o sistema de coordenadas utilizado pela ferramenta para encontrar tais coordenadas (provavelmente, WGS1984)! Caso tenha dados geocodificados manualmente dessa forma, apenas siga as instruções acima a partir de [Adicionar dados geocodificados ao QGIS](#adicionar-dados-geocodificados-ao-qgis). ### Solucionar problemas com a base de dados dos gazetteers unidos diff --git a/pt/licoes/georreferenciamento-qgis.md b/pt/licoes/georreferenciamento-qgis.md index 782ea0c0de..dfbd2a6495 100644 --- a/pt/licoes/georreferenciamento-qgis.md +++ b/pt/licoes/georreferenciamento-qgis.md @@ -1,210 +1,210 @@ ---- -title: Georreferenciamento com o QGIS 3.20 -layout: lesson -collection: lessons -slug: georreferenciamento-qgis -original: georeferencing-qgis -date: 2013-12-13 -translation_date: 2023-05-01 -authors: -- Jim Clifford -- Josh MacFadyen -- Daniel Macfarlane -reviewers: -- Finn Arne Jørgensen -- Peter Webster -- Abby Schreiber -editors: -- Adam Crymble -translator: -- Ângela Pité -translation-editor: -- Joana Vieira Paulino -translation-reviewer: -- Luis Ferla -- Ana Sofia Ribeiro -difficulty: 2 -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/434 -activity: transforming -topics: [mapping, data-visualization] -abstract: "Nesta lição aprenderá como georreferenciar mapas históricos para que possam ser adicionados a um SIG como uma camada raster." 
-avatar_alt: Mapa de uma cidade no topo de uma montanha -doi: 10.46430/phpt0039 ---- - -{% include toc.html %} - - -> Nota de tradução 1: Embora a lição original em inglês se refira à versão 2.0 do Quantum GIS (QGIS), na presente tradução da lição foi tomada a opção de usar uma versão mais recente do QGIS - a 3.20 - tendo-se efetuado as modificações necessárias para adaptar a lição a esta versão do software. -Tenha em atenção que, nos links que remetem para outras lições sobre o QGIS, a versão utilizada nestas será diferente da utilizada nesta tradução. - -> Nota de tradução 2: Na tradução desta lição usou-se a versão em pt-pt podendo-se, no entanto, optar também pela versão em pt-br do QGIS. - - -Objetivos da lição ------------- - -Nesta lição aprenderá como georreferenciar mapas históricos para que possam ser adicionados a um SIG como uma camada raster. O georreferenciamento é importante para quem queira digitalizar com precisão dados presentes num mapa em suporte papel e, visto que os historiadores trabalham sobretudo no domínio do documento em papel, o georreferenciamento é uma das ferramentas que mais frequentemente utilizamos. Esta técnica utiliza uma série de pontos de controlo para proporcionar a um objeto bidimensional, como um mapa em suporte papel, as coordenadas geográficas reais de que necessita para se alinhar com as características tridimensionais da terra no software SIG (em [Introdução ao Google Maps e Google Earth](/en/lessons/googlemaps-googleearth) (em inglês) vimos uma 'sobreposição', que é uma versão mais simplificada de georreferenciamento do Google Earth). - -O georreferenciamento de um mapa histórico requer um conhecimento tanto da geografia como da história do local que se está a estudar, de modo a garantir exatidão. As paisagens construídas e naturais mudaram ao longo do tempo e é importante confirmar se a localização dos seus pontos de controlo - quer sejam casas, intersecções ou mesmo cidades - tem permanecido constante. 
Introduzir pontos de controlo num SIG é fácil, mas nos bastidores o georreferenciamento usa processos complexos de transformação e compressão. Estes são utilizados para corrigir as distorções e imprecisões encontradas em muitos mapas históricos e ‘esticar’ os mapas para que se ajustem às coordenadas geográficas. Em cartografia isto é conhecido como [*rubber-sheeting*](https://perma.cc/4554-EWZB) (em inglês) - uma correção geométrica - pois trata o mapa como se fosse feito de borracha (*rubber*, em inglês) e os pontos de controlo como se fossem tachas 'fixando' o documento histórico a uma superfície tridimensional como o globo. - -## Começando - -Antes de começar a georreferenciar no QGIS é necessário ativar os Plugins apropriados (Módulos na versão do software em pt-pt). Na barra de ferramentas vá a Módulos (Plugins) -> Gerir e instalar módulos (plugins). - -{% include figure.html filename="tr-pt-georeferencing-qgis-1.png" alt="Imagem com detalhe do menu para gerir e instalar módulos" caption="Figura 1" %} - -Irá abrir uma janela intitulada "Módulos" (Plugins). Desça até *Georeferencer* GDAL, marque a caixa ao lado e clique "OK". - -{% include figure.html filename="tr-pt-georeferencing-qgis-2.png" alt="Imagem com lista dos módulos disponíveis" caption="Figura 2" %} - -- Neste ponto é preciso encerrar e reabrir o QGIS. Para o propósito deste exemplo, e para manter as coisas tão simples quanto possível, não reinicie o seu projeto existente e, em vez disso, inicie um novo projeto. -- Configure corretamente o [Sistema de Referência de Coordenadas (SRC) - *Coordenate Reference System (CRS)*](https://perma.cc/58HF-WURV) (em inglês). (Veja [Instalação do QGIS 2.0 e adição de camadas](/en/lessons/qgis-layers) (em inglês) para se relembrar. Tenha em mente que a versão do QGIS dessa lição será diferente da utilizada nesta tradução.) -- Guarde este novo projeto (no menu "Ficheiro", selecione "Guardar") e nomeie-o 'georreferenciamento'. -- Adicione a camada 'coastine_polygon'. 
(Veja [Instalação do QGIS 2.0 e adição de camadas](/en/lessons/qgis-layers) (em inglês) para relembrar. Tenha em atenção que a versão do QGIS dessa lição será diferente da utilizada nesta tradução.) - -## Abrir as Camadas SIG necessárias - -Para o estudo de caso da Ilha do Príncipe Eduardo (*Prince Edward Island* (PEI), em inglês) - utilizaremos os limites da cidade como pontos de controlo, pois estes foram estabelecidos em 1764 por Samuel Holland, para além de estarem identificados na maioria dos mapas da PEI e terem mudado pouco desde a sua criação. - -*Faça o download de 'lot_township_polygon':* - -Este é o *shapefile* que contém a camada vetorial atual que iremos usar para georreferenciar o mapa histórico. Note que, em 1764, não foram dados nomes aos municípios, mas um número de lote, pelo que normalmente são referidos na PEI como "Lotes" (*lots*, em inglês). Daí o nome do ficheiro 'lot_township_polygon'. - -- Download do ficheiro 'lot_township_polygon': - -[lot_town.SHP.zip](/assets/qgis-layers/lot_town.SHP.zip) - -- Depois de fazer o download do ficheiro coloque-o numa pasta que possa encontrar mais tarde e descompacte o ficheiro. (Lembre-se de manter todos os ficheiros juntos, uma vez que todos são necessários para abrir a camada no seu SIG). - -{% include figure.html filename="geo310.png" alt="Imagem da página com informação SIG no website Prince Edward Island" caption="Figura 3" %} - -*Adicione 'lot_township_polygon' ao QGIS:* - -- Em "Camada" no menu superior escolha "Adicionar" e "Adicionar Camada Vetorial" (alternativamente, o mesmo ícone que vê ao lado de "Adicionar Camada Vetorial" também pode ser selecionado a partir da barra de ferramentas). -- Clique em "Procurar". Navegue até ao seu ficheiro descompactado e selecione o ficheiro intitulado 'lot_township_polygon.shp'. -- Clique em "Abrir". 
- -{% include figure.html filename="geo41.png" alt="Imagem do ícone de menu Adicionar Camada Vetorial" caption="Figura 4" %} - -Para mais informações sobre como adicionar e visualizar camadas veja [Instalação do QGIS 2.0 e adição de camadas](/en/lessons/qgis-layers) (em inglês). Tenha em atenção que a versão do QGIS dessa lição será diferente da utilizada nesta tradução. - -{% include figure.html filename="tr-pt-georeferencing-qgis-5.png" alt="Imagem da área de trabalho do QGIS com os shapefiles incluídos" caption="Figura 5" %} - -## Abrir a ferramenta *Georeferencer* / Georreferenciador - -*Georeferencer* está agora disponível em "Raster" no menu superior - selecione-a. A ferramenta irá agora ter o título de "Georreferenciador". - -{% include figure.html filename="tr-pt-georeferencing-qgis-6.png" alt="Imagem com as opções do menu Raster" caption="Figura 6" %} - -*Adicione o seu mapa histórico:* - -- Na janela que surgirá clique no botão "Abrir Raster" no canto superior esquerdo (que é idêntico ao botão de "Adicionar camada raster"). - -{% include figure.html filename="geo71.png" alt="Imagem do ícone de menu Adicionar camada raster" caption="Figura 7" %} - -- Procure o ficheiro intitulado 'PEI_LakeMap1863.jpg' no seu computador e selecione "Abrir". [O download do ficheiro pode ser realizado aqui](https://geospatialhistorian.files.wordpress.com/2013/02/pei_lakemap1863.jpg), sendo que a sua localização original era no antigo repositório de mapas online *[Island Imagined](https://islandimagined.ca/islandora/object/imagined:208687)* (em inglês). -- Deverá, em seguida, definir o sistema de coordenadas desta camada. Na caixa "Filtro" procure por '2291′, e depois na caixa abaixo selecione 'NAD83 (CSRS98)/Príncipe Eduardo ...'. 
- -O resultado será o seguinte: - -{% include figure.html filename="tr-pt-georeferencing-qgis-8.png" alt="Imagem com visualização do ficheiro raster incluído" caption="Figura 8" %} - -*Adicionar pontos de controlo:* - -Planeie previamente as localizações que vai utilizar como pontos de controlo antes dos passos que se seguem. É muito mais fácil explorar primeiro todo o mapa histórico, e obter assim uma boa ideia dos melhores pontos a utilizar para os ter em conta mais tarde. - -Algumas sugestões para escolher os pontos de controlo: - -- **Quantos** pontos precisa? Normalmente quantos mais pontos atribuir, mais preciso será o seu mapa georreferenciado. Dois pontos de controlo indicarão ao SIG para escalar e rodar o mapa em relação a esses dois pontos, mas para se conseguir verdadeiramente executar um *rubbersheet* do mapa histórico é necessário adicionar mais pontos. -- **Onde** deve colocar os pontos de controlo? Escolha áreas tão próximas quanto possível dos quatro cantos do seu mapa para que essas áreas nas extremidades não sejam omitidas no *rubbersheeting*. -- Selecione pontos de controlo adicionais perto da sua área de interesse. Tudo entre os quatro pontos de controlo dos cantos deve ser georreferenciado de forma uniforme, mas se estiver preocupado com a precisão de um lugar em particular certifique-se de que seleciona pontos de controlo adicionais nessa área. -- Escolha o meio de cruzamentos e estradas, porque as margens das estradas mudaram ligeiramente ao longo do tempo à medida que as melhorias nestas iam sendo efetuadas. -- Verifique se os seus pontos de controlo não mudaram de localização ao longo do tempo. As estradas foram frequentemente redirecionadas, e mesmo casas e outros edifícios podem ter sido deslocados, especialmente nas [regiões atlânticas do Canadá](https://perma.cc/H8DK-KBXC) (em inglês). - -*Adicione o seu primeiro ponto de controlo:* - -**Primeiro**, navegue até a localização do seu primeiro ponto de controlo no **mapa histórico**. 
- -- Clique na lupa de zoom na barra de ferramentas da janela ou utilize a roda do mouse para fazer zoom. - -{% include figure.html filename="tr-pt-georeferencing-qgis-9.png" alt="Imagem com opções zoom no menu de ferramentas" caption="Figura 9" %} - -- Amplie para um ponto que possa reconhecer, tanto no seu mapa impresso como no seu SIG. - -- Clique em "Adicionar Ponto" na barra de ferramentas. - -{% include figure.html filename="tr-pt-georeferencing-qgis-10.png" alt="Imagem com opções de pontos de controlo no menu de ferramentas" caption="Figura 10" %} - -- Clique no local no mapa impresso que pode localizar no seu SIG (ou seja, o ponto de controlo). Uma janela abrirá para introduzir as coordenadas X e Y que correspondam ao ponto indicado ou, então, selecionar um ponto correspondente "A partir da tela do mapa". Clique nessa segunda opção. - -{% include figure.html filename="tr-pt-georeferencing-qgis-11.png" alt="Imagem com visualização do mapa e com janela de menu para introdução de coordenadas" caption="Figura 11" %} - -- A janela do "Georreferenciador" irá minimizar automaticamente. Clique no local do mapa no QGIS que coincida com o ponto de controlo. -- As coordenadas X e Y do ponto selecionado serão adicionadas imediatamente à janela "Introduza as coordenadas do mapa", assim como o SRC que lhes está associado. Se estiver satisfeito com o ponto selecionado clique em "OK" para criar o seu primeiro ponto de controlo. - -- Nesta fase identificámos um problema nos limites dos lotes. Planeámos utilizar a localização onde o limite sul do Lote 1 no extremo oeste da Província contém uma curva pronunciada perto do centro da massa terrestre. No entanto, nota-se que nem todas estas curvas pronunciadas nos limites dos lotes coincidem com o mapa histórico. É possível que os limites dos lotes tenham mudado um pouco nos 250 anos desde que foram estabelecidos, por isso é melhor escolher o ponto do qual se tem mais certezas. 
Neste caso a curva pronunciada entre o Lote 2 e o Lote 3 estava bem (veja a seta na imagem abaixo). Foi o limite dos Lotes 3 e 4 que mudou. A discrepância entre os limites dos lotes 1 e 2 mostra a necessidade de inserir mais pontos de controlo para executar corretamente um *rubbersheeting* neste mapa parcialmente distorcido de 1863, de forma a corresponder à camada da província no SIG. - -{% include figure.html filename="geo121.png" alt="Imagem com visualização da sobreposição dos mapas raster e vectorial" caption="Figura 12" %} - -*Adicione, pelo menos, mais um ponto de controlo:* - -- Regresse à janela do "Georreferenciador" e repita os passos em "*Adicione o seu primeiro ponto de controlo*" descritos acima, de modo a acrescentar mais pontos de controlo. -- Adicione um ponto perto do lado oposto do seu mapa impresso (quanto mais afastados estiverem os seus pontos de controlo, mais preciso é o processo de georreferenciamento) e outro perto de Charlottetown. -- Regresse à janela do "Georreferenciador". Deverá agora ver três pontos vermelhos no mapa impresso e três registos na tabela GCP (*Ground Control Points* - Pontos de Controlo no Terreno) na parte inferior da janela. - -{% include figure.html filename="tr-pt-georeferencing-qgis-13.png" alt="Imagem com visualização do mapa raster e respetivos pontos de controlo" caption="Figura 13" %} - -*Determine as configurações da transformação:* - -Antes de clicar em "Iniciar georreferenciamento" e começar o processo de georreferenciamento automático, especifique ao QGIS onde guardar o ficheiro (que será um ficheiro raster), como deve interpretar os seus pontos de controlo e como deve comprimir a imagem. - -- Clique no botão "Configuração da Transformação". - -{% include figure.html filename="geo141.png" alt="Imagem com ícone do botão Configuração da Transformação" caption="Figura 14" %} - -A maioria destas opções de configuração pode ser deixada como está predefinida. 
Neste exemplo foi usado: tipo de transformação "linear", método de reamostragem "vizinho mais próximo" e compressão "LZW". O SRC (Sistema de Referência de Coordenadas) de destino pode ficar o do projeto, mas pode também usar esta função para dar ao novo raster um sistema de referência diferente. - -- O seu novo ficheiro raster georreferenciado será guardado por predefinição na pasta do projeto. [Tif](https://perma.cc/WZ6W-J4YF) é o formato predefinido para rasters georreferenciados no QGIS. -- Tenha em mente que um ficheiro Tif vai ser muito mais pesado que o seu mapa original, mesmo com compressão LZW. Por isso, certifique-se de que tem espaço suficiente se estiver a utilizar, por exemplo, uma USB pen drive. (*Aviso*: o ficheiro Tif produzido a partir deste 6.8 Mb .jpg será **maior que 1GB** depois de georreferenciado). Uma forma de controlar o tamanho do ficheiro raster georreferenciado e manter uma resolução suficientemente alta para ter legibilidade é recortar apenas a área do mapa importante para o projeto. Poderá também procurar se está disponível uma versão de menor resolução da imagem do mapa histórico. - -- Não será necessário um [*world file*](https://perma.cc/A9RZ-J8VG) (em inglês), a menos que queira georreferenciar novamente a mesma imagem noutro SIG ou se alguém precisar de georreferenciar a imagem e não tiver acesso aos seus dados SIG, Sistema de Referência de Coordenadas, *etc.*,... -- É possível selecionar 'Use 0 para transparência quando necessário' de forma a eliminar espaços negros à volta das margens do mapa, mas não é essencial, e pode experimentar conforme precisar. -- Não será necessário definir a resolução de saída. -- Certifique-se de que "Carregar no QGIS quando concluído" está selecionado de modo a poupar um passo. Assim irá adicionar automaticamente o novo ficheiro ao seu SIG para que mais tarde não tenha de procurar o ficheiro Tif. Depois de configurada a transformação clique em "OK". 
- -{% include figure.html filename="tr-pt-georeferencing-qgis-15.png" alt="Imagem da janela de configurações da transformação" caption="Figura 15" %} - -## Georreferenciar! - -- Clique no botão "Iniciar georreferenciamento" na barra de ferramentas (ao lado de "Abrir Raster") - o que dá início ao processo de georreferenciamento. - -{% include figure.html filename="geo161.png" alt="Imagem do ícone do botão Iniciar georreferenciamento" caption="Figura 16" %} - -{% include figure.html filename="tr-pt-georeferencing-qgis-17.png" alt="Imagem de janela com barra de indicação de progresso do georreferenciamento" caption="Figura 17" %} - -{% include figure.html filename="tr-pt-georeferencing-qgis-18.png" alt="Imagem da área de trabalho do QGIS com o raster resultante do processo de georreferenciamento" caption="Figura 18" %} - -*Explore o seu mapa:* - -- Arraste a nova camada 'PEI_LakeMap1863_alterado' para o final do seu índice de camadas (ou seja, abaixo da camada 'lot_township_polygon'). - -{% include figure.html filename="tr-pt-georeferencing-qgis-19.png" alt="Imagem da área de trabalho do QGIS com o shapefile dos polígonos por cima do raster" caption="Figura 19" %} - -- Mude o preenchimento da camada 'lot_township_polygon' para "Sem preenchimento", selecionando a camada e depois em "Propriedades" escolher Simbologia -> Preenchimento Simples -> Estilo de Preenchimento -> Sem preenchimento. Clique em "OK". - -{% include figure.html filename="tr-pt-georeferencing-qgis-20.png" alt="Imagem com a janela das configurações de simbologia do shapefile" caption="Figura 20" %} - -- Agora deve conseguir ver a camada SIG atual com o mapa histórico no fundo. 
- -{% include figure.html filename="tr-pt-georeferencing-qgis-21.png" alt="Imagem da área de trabalho do QGIS com o shapefile dos polígonos transparentes por cima do raster" caption="Figura 21" %} - -Como já tem um mapa georreferenciado no seu SIG pode explorar a camada, ajustar a transparência, o contraste e o brilho e, novamente, [Criar novas camadas vetoriais com o QGIS 2.0](/pt/licoes/camadas-vetoriais-qgis) para digitalizar parte da informação histórica que foi criada. (Tenha em mente que a versão do QGIS da lição no link será diferente da utilizada nesta tradução.) -Por exemplo, este mapa georreferenciado da PEI mostra a localização de todas as habitações em 1863, incluindo o nome do chefe de família. Através da atribuição de pontos no mapa é possível introduzir as localizações das habitações e nomes dos proprietários e, a seguir, analisar ou partilhar essa nova camada geo-espacial como um *shapefile*. - -Ao digitalizar vetores de linhas, tais como estradas ou linhas costeiras, pode comparar a localização destes elementos com outros dados históricos ou simplesmente compará-los visualmente com a camada 'lot_township_polygon' neste SIG. - -Em processos mais avançados pode, inclusivamente, sobrepor esta imagem georreferenciada com um DEM (*Digital Elevation Model* - Modelo de Elevação Digital) para proporcionar-lhe um efeito de altura através de sombras (*hillshade*) ou um efeito 3D e, assim, realizar um '*fly-over*' e ter uma perspetiva aérea das habitações da PEI no século XIX. 
- -*Esta lição é parte do [Geospatial Historian](https://perma.cc/6AN6-N7LX).* - +--- +title: Georreferenciamento com o QGIS 3.20 +layout: lesson +collection: lessons +slug: georreferenciamento-qgis +original: georeferencing-qgis +date: 2013-12-13 +translation_date: 2023-05-01 +authors: +- Jim Clifford +- Josh MacFadyen +- Daniel Macfarlane +reviewers: +- Finn Arne Jørgensen +- Peter Webster +- Abby Schreiber +editors: +- Adam Crymble +translator: +- Ângela Pité +translation-editor: +- Joana Vieira Paulino +translation-reviewer: +- Luis Ferla +- Ana Sofia Ribeiro +difficulty: 2 +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/434 +activity: transforming +topics: [mapping, data-visualization] +abstract: "Nesta lição aprenderá como georreferenciar mapas históricos para que possam ser adicionados a um SIG como uma camada raster." +avatar_alt: Mapa de uma cidade no topo de uma montanha +doi: 10.46430/phpt0039 +--- + +{% include toc.html %} + + +> Nota de tradução 1: Embora a lição original em inglês se refira à versão 2.0 do Quantum GIS (QGIS), na presente tradução da lição foi tomada a opção de usar uma versão mais recente do QGIS - a 3.20 - tendo-se efetuado as modificações necessárias para adaptar a lição a esta versão do software. +Tenha em atenção que, nos links que remetem para outras lições sobre o QGIS, a versão utilizada nestas será diferente da utilizada nesta tradução. + +> Nota de tradução 2: Na tradução desta lição usou-se a versão em pt-pt podendo-se, no entanto, optar também pela versão em pt-br do QGIS. + + +Objetivos da lição +------------ + +Nesta lição aprenderá como georreferenciar mapas históricos para que possam ser adicionados a um SIG como uma camada raster. 
O georreferenciamento é importante para quem queira digitalizar com precisão dados presentes num mapa em suporte papel e, visto que os historiadores trabalham sobretudo no domínio do documento em papel, o georreferenciamento é uma das ferramentas que mais frequentemente utilizamos. Esta técnica utiliza uma série de pontos de controlo para proporcionar a um objeto bidimensional, como um mapa em suporte papel, as coordenadas geográficas reais de que necessita para se alinhar com as características tridimensionais da terra no software SIG (em [Introdução ao Google Maps e Google Earth](/en/lessons/googlemaps-googleearth) (em inglês) vimos uma 'sobreposição', que é uma versão mais simplificada de georreferenciamento do Google Earth). + +O georreferenciamento de um mapa histórico requer um conhecimento tanto da geografia como da história do local que se está a estudar, de modo a garantir exatidão. As paisagens construídas e naturais mudaram ao longo do tempo e é importante confirmar se a localização dos seus pontos de controlo - quer sejam casas, intersecções ou mesmo cidades - tem permanecido constante. Introduzir pontos de controlo num SIG é fácil, mas nos bastidores o georreferenciamento usa processos complexos de transformação e compressão. Estes são utilizados para corrigir as distorções e imprecisões encontradas em muitos mapas históricos e ‘esticar’ os mapas para que se ajustem às coordenadas geográficas. Em cartografia isto é conhecido como [*rubber-sheeting*](https://perma.cc/4554-EWZB) (em inglês) - uma correção geométrica - pois trata o mapa como se fosse feito de borracha (*rubber*, em inglês) e os pontos de controlo como se fossem tachas 'fixando' o documento histórico a uma superfície tridimensional como o globo. + +## Começando + +Antes de começar a georreferenciar no QGIS é necessário ativar os Plugins apropriados (Módulos na versão do software em pt-pt). Na barra de ferramentas vá a Módulos (Plugins) -> Gerir e instalar módulos (plugins). 
+ +{% include figure.html filename="tr-pt-georeferencing-qgis-1.png" alt="Imagem com detalhe do menu para gerir e instalar módulos" caption="Figura 1" %} + +Irá abrir uma janela intitulada "Módulos" (Plugins). Desça até *Georeferencer* GDAL, marque a caixa ao lado e clique "OK". + +{% include figure.html filename="tr-pt-georeferencing-qgis-2.png" alt="Imagem com lista dos módulos disponíveis" caption="Figura 2" %} + +- Neste ponto é preciso encerrar e reabrir o QGIS. Para o propósito deste exemplo, e para manter as coisas tão simples quanto possível, não reinicie o seu projeto existente e, em vez disso, inicie um novo projeto. +- Configure corretamente o [Sistema de Referência de Coordenadas (SRC) - *Coordinate Reference System (CRS)*](https://perma.cc/58HF-WURV) (em inglês). (Veja [Instalação do QGIS 2.0 e adição de camadas](/en/lessons/qgis-layers) (em inglês) para se relembrar. Tenha em mente que a versão do QGIS dessa lição será diferente da utilizada nesta tradução.) +- Guarde este novo projeto (no menu "Ficheiro", selecione "Guardar") e nomeie-o 'georreferenciamento'. +- Adicione a camada 'coastline_polygon'. (Veja [Instalação do QGIS 2.0 e adição de camadas](/en/lessons/qgis-layers) (em inglês) para relembrar. Tenha em atenção que a versão do QGIS dessa lição será diferente da utilizada nesta tradução.) + +## Abrir as Camadas SIG necessárias + +Para o estudo de caso da Ilha do Príncipe Eduardo (*Prince Edward Island* (PEI), em inglês) - utilizaremos os limites da cidade como pontos de controlo, pois estes foram estabelecidos em 1764 por Samuel Holland, para além de estarem identificados na maioria dos mapas da PEI e terem mudado pouco desde a sua criação. + +*Faça o download de 'lot_township_polygon':* + +Este é o *shapefile* que contém a camada vetorial atual que iremos usar para georreferenciar o mapa histórico.
Note que, em 1764, não foram dados nomes aos municípios, mas um número de lote, pelo que normalmente são referidos na PEI como "Lotes" (*lots*, em inglês). Daí o nome do ficheiro 'lot_township_polygon'. + +- Download do ficheiro 'lot_township_polygon': + +[lot_town.SHP.zip](/assets/qgis-layers/lot_town.SHP.zip) + +- Depois de fazer o download do ficheiro coloque-o numa pasta que possa encontrar mais tarde e descompacte o ficheiro. (Lembre-se de manter todos os ficheiros juntos, uma vez que todos são necessários para abrir a camada no seu SIG). + +{% include figure.html filename="geo310.png" alt="Imagem da página com informação SIG no website Prince Edward Island" caption="Figura 3" %} + +*Adicione 'lot_township_polygon' ao QGIS:* + +- Em "Camada" no menu superior escolha "Adicionar" e "Adicionar Camada Vetorial" (alternativamente, o mesmo ícone que vê ao lado de "Adicionar Camada Vetorial" também pode ser selecionado a partir da barra de ferramentas). +- Clique em "Procurar". Navegue até ao seu ficheiro descompactado e selecione o ficheiro intitulado 'lot_township_polygon.shp'. +- Clique em "Abrir". + +{% include figure.html filename="geo41.png" alt="Imagem do ícone de menu Adicionar Camada Vetorial" caption="Figura 4" %} + +Para mais informações sobre como adicionar e visualizar camadas veja [Instalação do QGIS 2.0 e adição de camadas](/en/lessons/qgis-layers) (em inglês). Tenha em atenção que a versão do QGIS dessa lição será diferente da utilizada nesta tradução. + +{% include figure.html filename="tr-pt-georeferencing-qgis-5.png" alt="Imagem da área de trabalho do QGIS com os shapefiles incluídos" caption="Figura 5" %} + +## Abrir a ferramenta *Georeferencer* / Georreferenciador + +*Georeferencer* está agora disponível em "Raster" no menu superior - selecione-a. A ferramenta irá agora ter o título de "Georreferenciador". 
+ +{% include figure.html filename="tr-pt-georeferencing-qgis-6.png" alt="Imagem com as opções do menu Raster" caption="Figura 6" %} + +*Adicione o seu mapa histórico:* + +- Na janela que surgirá clique no botão "Abrir Raster" no canto superior esquerdo (que é idêntico ao botão de "Adicionar camada raster"). + +{% include figure.html filename="geo71.png" alt="Imagem do ícone de menu Adicionar camada raster" caption="Figura 7" %} + +- Procure o ficheiro intitulado 'PEI_LakeMap1863.jpg' no seu computador e selecione "Abrir". [O download do ficheiro pode ser realizado aqui](https://geospatialhistorian.files.wordpress.com/2013/02/pei_lakemap1863.jpg), sendo que a sua localização original era no antigo repositório de mapas online *[Island Imagined](https://islandimagined.ca/islandora/object/imagined:208687)* (em inglês). +- Deverá, em seguida, definir o sistema de coordenadas desta camada. Na caixa "Filtro" procure por '2291', e depois na caixa abaixo selecione 'NAD83 (CSRS98)/Príncipe Eduardo ...'. + +O resultado será o seguinte: + +{% include figure.html filename="tr-pt-georeferencing-qgis-8.png" alt="Imagem com visualização do ficheiro raster incluído" caption="Figura 8" %} + +*Adicionar pontos de controlo:* + +Planeie previamente as localizações que vai utilizar como pontos de controlo antes dos passos que se seguem. É muito mais fácil explorar primeiro todo o mapa histórico, e obter assim uma boa ideia dos melhores pontos a utilizar para os ter em conta mais tarde. + +Algumas sugestões para escolher os pontos de controlo: + +- **Quantos** pontos precisa? Normalmente quantos mais pontos atribuir, mais preciso será o seu mapa georreferenciado. Dois pontos de controlo indicarão ao SIG para escalar e rodar o mapa em relação a esses dois pontos, mas para se conseguir verdadeiramente executar um *rubbersheet* do mapa histórico é necessário adicionar mais pontos. +- **Onde** deve colocar os pontos de controlo?
Escolha áreas tão próximas quanto possível dos quatro cantos do seu mapa para que essas áreas nas extremidades não sejam omitidas no *rubbersheeting*. +- Selecione pontos de controlo adicionais perto da sua área de interesse. Tudo entre os quatro pontos de controlo dos cantos deve ser georreferenciado de forma uniforme, mas se estiver preocupado com a precisão de um lugar em particular certifique-se de que seleciona pontos de controlo adicionais nessa área. +- Escolha o meio de cruzamentos e estradas, porque as margens das estradas mudaram ligeiramente ao longo do tempo à medida que as melhorias nestas iam sendo efetuadas. +- Verifique se os seus pontos de controlo não mudaram de localização ao longo do tempo. As estradas foram frequentemente redirecionadas, e mesmo casas e outros edifícios podem ter sido deslocados, especialmente nas [regiões atlânticas do Canadá](https://perma.cc/H8DK-KBXC) (em inglês). + +*Adicione o seu primeiro ponto de controlo:* + +**Primeiro**, navegue até a localização do seu primeiro ponto de controlo no **mapa histórico**. + +- Clique na lupa de zoom na barra de ferramentas da janela ou utilize a roda do mouse para fazer zoom. + +{% include figure.html filename="tr-pt-georeferencing-qgis-9.png" alt="Imagem com opções zoom no menu de ferramentas" caption="Figura 9" %} + +- Amplie para um ponto que possa reconhecer, tanto no seu mapa impresso como no seu SIG. + +- Clique em "Adicionar Ponto" na barra de ferramentas. + +{% include figure.html filename="tr-pt-georeferencing-qgis-10.png" alt="Imagem com opções de pontos de controlo no menu de ferramentas" caption="Figura 10" %} + +- Clique no local no mapa impresso que pode localizar no seu SIG (ou seja, o ponto de controlo). Uma janela abrirá para introduzir as coordenadas X e Y que correspondam ao ponto indicado ou, então, selecionar um ponto correspondente "A partir da tela do mapa". Clique nessa segunda opção. 
+ +{% include figure.html filename="tr-pt-georeferencing-qgis-11.png" alt="Imagem com visualização do mapa e com janela de menu para introdução de coordenadas" caption="Figura 11" %} + +- A janela do "Georreferenciador" irá minimizar automaticamente. Clique no local do mapa no QGIS que coincida com o ponto de controlo. +- As coordenadas X e Y do ponto selecionado serão adicionadas imediatamente à janela "Introduza as coordenadas do mapa", assim como o SRC que lhes está associado. Se estiver satisfeito com o ponto selecionado clique em "OK" para criar o seu primeiro ponto de controlo. + +- Nesta fase identificámos um problema nos limites dos lotes. Planeámos utilizar a localização onde o limite sul do Lote 1 no extremo oeste da Província contém uma curva pronunciada perto do centro da massa terrestre. No entanto, nota-se que nem todas estas curvas pronunciadas nos limites dos lotes coincidem com o mapa histórico. É possível que os limites dos lotes tenham mudado um pouco nos 250 anos desde que foram estabelecidos, por isso é melhor escolher o ponto do qual se tem mais certezas. Neste caso a curva pronunciada entre o Lote 2 e o Lote 3 estava bem (veja a seta na imagem abaixo). Foi o limite dos Lotes 3 e 4 que mudou. A discrepância entre os limites dos lotes 1 e 2 mostra a necessidade de inserir mais pontos de controlo para executar corretamente um *rubbersheeting* neste mapa parcialmente distorcido de 1863, de forma a corresponder à camada da província no SIG. + +{% include figure.html filename="geo121.png" alt="Imagem com visualização da sobreposição dos mapas raster e vectorial" caption="Figura 12" %} + +*Adicione, pelo menos, mais um ponto de controlo:* + +- Regresse à janela do "Georreferenciador" e repita os passos em "*Adicione o seu primeiro ponto de controlo*" descritos acima, de modo a acrescentar mais pontos de controlo. 
+- Adicione um ponto perto do lado oposto do seu mapa impresso (quanto mais afastados estiverem os seus pontos de controlo, mais preciso é o processo de georreferenciamento) e outro perto de Charlottetown. +- Regresse à janela do "Georreferenciador". Deverá agora ver três pontos vermelhos no mapa impresso e três registos na tabela GCP (*Ground Control Points* - Pontos de Controlo no Terreno) na parte inferior da janela. + +{% include figure.html filename="tr-pt-georeferencing-qgis-13.png" alt="Imagem com visualização do mapa raster e respetivos pontos de controlo" caption="Figura 13" %} + +*Determine as configurações da transformação:* + +Antes de clicar em "Iniciar georreferenciamento" e começar o processo de georreferenciamento automático, especifique ao QGIS onde guardar o ficheiro (que será um ficheiro raster), como deve interpretar os seus pontos de controlo e como deve comprimir a imagem. + +- Clique no botão "Configuração da Transformação". + +{% include figure.html filename="geo141.png" alt="Imagem com ícone do botão Configuração da Transformação" caption="Figura 14" %} + +A maioria destas opções de configuração pode ser deixada como está predefinida. Neste exemplo foi usado: tipo de transformação "linear", método de reamostragem "vizinho mais próximo" e compressão "LZW". O SRC (Sistema de Referência de Coordenadas) de destino pode ficar o do projeto, mas pode também usar esta função para dar ao novo raster um sistema de referência diferente. + +- O seu novo ficheiro raster georreferenciado será guardado por predefinição na pasta do projeto. [Tif](https://perma.cc/WZ6W-J4YF) é o formato predefinido para rasters georreferenciados no QGIS. +- Tenha em mente que um ficheiro Tif vai ser muito mais pesado que o seu mapa original, mesmo com compressão LZW. Por isso, certifique-se de que tem espaço suficiente se estiver a utilizar, por exemplo, uma USB pen drive. 
(*Aviso*: o ficheiro Tif produzido a partir deste 6.8 Mb .jpg será **maior que 1GB** depois de georreferenciado). Uma forma de controlar o tamanho do ficheiro raster georreferenciado e manter uma resolução suficientemente alta para ter legibilidade é recortar apenas a área do mapa importante para o projeto. Poderá também procurar se está disponível uma versão de menor resolução da imagem do mapa histórico. + +- Não será necessário um [*world file*](https://perma.cc/A9RZ-J8VG) (em inglês), a menos que queira georreferenciar novamente a mesma imagem noutro SIG ou se alguém precisar de georreferenciar a imagem e não tiver acesso aos seus dados SIG, Sistema de Referência de Coordenadas, *etc.*,... +- É possível selecionar 'Use 0 para transparência quando necessário' de forma a eliminar espaços negros à volta das margens do mapa, mas não é essencial, e pode experimentar conforme precisar. +- Não será necessário definir a resolução de saída. +- Certifique-se de que "Carregar no QGIS quando concluído" está selecionado de modo a poupar um passo. Assim irá adicionar automaticamente o novo ficheiro ao seu SIG para que mais tarde não tenha de procurar o ficheiro Tif. Depois de configurada a transformação clique em "OK". + +{% include figure.html filename="tr-pt-georeferencing-qgis-15.png" alt="Imagem da janela de configurações da transformação" caption="Figura 15" %} + +## Georreferenciar! + +- Clique no botão "Iniciar georreferenciamento" na barra de ferramentas (ao lado de "Abrir Raster") - o que dá início ao processo de georreferenciamento. 
+ +{% include figure.html filename="geo161.png" alt="Imagem do ícone do botão Iniciar georreferenciamento" caption="Figura 16" %} + +{% include figure.html filename="tr-pt-georeferencing-qgis-17.png" alt="Imagem de janela com barra de indicação de progresso do georreferenciamento" caption="Figura 17" %} + +{% include figure.html filename="tr-pt-georeferencing-qgis-18.png" alt="Imagem da área de trabalho do QGIS com o raster resultante do processo de georreferenciamento" caption="Figura 18" %} + +*Explore o seu mapa:* + +- Arraste a nova camada 'PEI_LakeMap1863_alterado' para o final do seu índice de camadas (ou seja, abaixo da camada 'lot_township_polygon'). + +{% include figure.html filename="tr-pt-georeferencing-qgis-19.png" alt="Imagem da área de trabalho do QGIS com o shapefile dos polígonos por cima do raster" caption="Figura 19" %} + +- Mude o preenchimento da camada 'lot_township_polygon' para "Sem preenchimento", selecionando a camada e depois em "Propriedades" escolher Simbologia -> Preenchimento Simples -> Estilo de Preenchimento -> Sem preenchimento. Clique em "OK". + +{% include figure.html filename="tr-pt-georeferencing-qgis-20.png" alt="Imagem com a janela das configurações de simbologia do shapefile" caption="Figura 20" %} + +- Agora deve conseguir ver a camada SIG atual com o mapa histórico no fundo. + +{% include figure.html filename="tr-pt-georeferencing-qgis-21.png" alt="Imagem da área de trabalho do QGIS com o shapefile dos polígonos transparentes por cima do raster" caption="Figura 21" %} + +Como já tem um mapa georreferenciado no seu SIG pode explorar a camada, ajustar a transparência, o contraste e o brilho e, novamente, [Criar novas camadas vetoriais com o QGIS 2.0](/pt/licoes/camadas-vetoriais-qgis) para digitalizar parte da informação histórica que foi criada. (Tenha em mente que a versão do QGIS da lição no link será diferente da utilizada nesta tradução.) 
+Por exemplo, este mapa georreferenciado da PEI mostra a localização de todas as habitações em 1863, incluindo o nome do chefe de família. Através da atribuição de pontos no mapa é possível introduzir as localizações das habitações e nomes dos proprietários e, a seguir, analisar ou partilhar essa nova camada geo-espacial como um *shapefile*. + +Ao digitalizar vetores de linhas, tais como estradas ou linhas costeiras, pode comparar a localização destes elementos com outros dados históricos ou simplesmente compará-los visualmente com a camada 'lot_township_polygon' neste SIG. + +Em processos mais avançados pode, inclusivamente, sobrepor esta imagem georreferenciada com um DEM (*Digital Elevation Model* - Modelo de Elevação Digital) para proporcionar-lhe um efeito de altura através de sombras (*hillshade*) ou um efeito 3D e, assim, realizar um '*fly-over*' e ter uma perspetiva aérea das habitações da PEI no século XIX. + +*Esta lição é parte do [Geospatial Historian](https://perma.cc/6AN6-N7LX).* + diff --git a/pt/licoes/git-ferramenta-metodologica-projetos-historia-1.md b/pt/licoes/git-ferramenta-metodologica-projetos-historia-1.md index 9c0c5ed324..fa21c260f9 100644 --- a/pt/licoes/git-ferramenta-metodologica-projetos-historia-1.md +++ b/pt/licoes/git-ferramenta-metodologica-projetos-historia-1.md @@ -417,7 +417,7 @@ nothing to commit, working tree clean #### Status de um ficheiro -Agora que já sabemos como adicionar um ficheiro ao repositório Git e como submeter alterações acompanhadas de mensagens, vamos detalhar e analisar os diferentes status de um ficheiro no Git. Para isso vamos criar um ficheiro novo chamado `resumo.txt` e salvá-lo no diretório `projeto-de-pesquisa`. Repetiremos o mesmo método utilizado para criar o ficheiro `README.md`, com o comando `echo` (veja o tópico [Comandos Básicos](#comandos-basicos)). No entanto, pode criar este ficheiro utilizando qualquer outro método. 
+Agora que já sabemos como adicionar um ficheiro ao repositório Git e como submeter alterações acompanhadas de mensagens, vamos detalhar e analisar os diferentes status de um ficheiro no Git. Para isso vamos criar um ficheiro novo chamado `resumo.txt` e salvá-lo no diretório `projeto-de-pesquisa`. Repetiremos o mesmo método utilizado para criar o ficheiro `README.md`, com o comando `echo` (veja o tópico [Comandos Básicos](#comandos-básicos)). No entanto, pode criar este ficheiro utilizando qualquer outro método. ```bash ~/Documentos/projeto-de-pesquisa$ echo "Resumo" >> resumo.txt diff --git a/pt/licoes/instalacao-linux.md b/pt/licoes/instalacao-linux.md index 3817315068..d98d7953bf 100644 --- a/pt/licoes/instalacao-linux.md +++ b/pt/licoes/instalacao-linux.md @@ -134,6 +134,6 @@ nossa sugestão é que você tente a próxima lição ‘[Noções básicas de p [outros editores]: https://wiki.python.org/python/PythonEditors [site do Komodo Edit]: https://github.com/ActiveState/OpenKomodoIDE/releases - [Noções básicas de páginas web e HTML]: nocoes-basicas-paginas-web-html + [Noções básicas de páginas web e HTML]: /pt/licoes/nocoes-basicas-paginas-web-html diff --git a/pt/licoes/instalacao-mac.md b/pt/licoes/instalacao-mac.md index 104d9314e7..7bcab45b8b 100644 --- a/pt/licoes/instalacao-mac.md +++ b/pt/licoes/instalacao-mac.md @@ -1,130 +1,130 @@ ---- -title: Configurar um ambiente de desenvolvimento integrado para Python (Mac) -slug: instalacao-mac -layout: lesson -date: 2012-07-17 -tested_date: 2023-11-16 -translation_date: 2021-05-13 -authors: -- William J. Turkel -- Adam Crymble -reviewers: -- Jim Clifford -- Amanda Morton -editors: -- Miriam Posner -translator: -- Josir C. 
Gomes -translation-editor: -- Danielle Sanches -translation-reviewer: -- Bruno Martins -- Renato Rocha Souza -difficulty: 1 -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/323 -activity: transforming -topics: [get-ready, python] -abstract: "Esta lição irá auxiliar na configuração de um ambiente de desenvolvimento integrado para o Python num computador com o Sistema Operacional Mac." -python_warning: false -original: mac-installation -avatar_alt: Uma banda com três músicos -doi: 10.46430/phpt0005 ---- - -{% include toc.html %} - - - - - -## Faça um backup do seu computador - -É sempre importante garantir que você tenha backups regulares e recentes do seu computador. Este é um bom conselho que serve para a vida toda e não se limita à pratica específica de programação. Usuários do Mac podem recorrer ao [Time Machine][] para isso. - -## Instale o Python 3 - -Você ainda pode ter o Python 2 na sua máquina. Como essa versão do Python foi descontinuada no fim de 2019, é importante que você instale o Python 3. Faça o download da versão mais estável da linguagem de programação Python (Version 3.8 de Novembro de 2019) e instale o software a partir do [site do Python][]. - -## Crie um diretório - -Para que você se organize, o ideal é que você tenha um diretório (i.e., pasta) no seu computador onde você irá armazenar os seus programas em Python (por exemplo, `programming-historian`). Crie esse diretório em qualquer outra pasta do seu computador. - -## Instale um editor de texto - -Existem vários editores de texto que você pode utilizar para escrever, armazenar e executar comandos em Python. O Sublime Text é utilizado nessa lição. Se vosê preferir usar outro editor, existem muitas outras [opções de editores de texto][]. Alguns dos nossos usuários preferem um programa chamado [BBEdit][]. A escolha é sua. Pode descarregar uma cópia do Sublime Text a partir do [website do Sublime Text][]. 
- -#### Configurar no Sublime Text - -Deve agora configurar o editor para que seja possível executar programas em Python. - -A partir do menu, escolha `Tools -> Build System -> Python`. - -## Passo 2 – “Olá Mundo” em Python --------------------------------- - -É uma tradição para quem está começando a programar em uma nova linguagem que o primeiro programa a ser construído emita a frase "Olá Mundo". - -O Python é uma boa linguagem de programação para iniciantes porque ela é de alto-nível. -Isto quer dizer que é possível escrever pequenos programas que realizam muitas funcionalidades. -Quanto menor o programa, mais provável que ele caiba em apenas um ecrã, e mais fácil será manter o controle dele em sua mente. - -O Python é uma lingugagem 'interpretada'. Isto significa que existe um programa especial (conhecido como Interpretador) que sabe como seguir as instruções da linguagem. Uma forma de utilizar o interpretador é guardar todas as instruções a executar em um ficheiro para, em seguida, solicitar ao interpretador que ele interprete o conteúdo desse ficheiro. - -Um ficheiro que contém instruções de linguagem de programação é conhecido como um programa. O interpretador irá executar cada uma das instruções que você incluiu no seu programa e no final irá parar. Vamos experimentar como isto funciona. - -No seu editor de texto, crie um novo ficheiro, entre o seguinte programa de duas linhas, e salve-o na pasta `programming-historian`: - -`ola-mundo.py` - -``` python -# ola-mundo.py -print('Olá Mundo') -``` - -O comando “*Run Python*” permite que você execute o seu programa. Se você escolheu um outro editor, este deve ter uma funcionalidade semelhante. Se está a usar Sublime Text, clique em `Tools -> Build` (ou digite `⌘B`). Se está a usar o BBEdit, clique em “#!” e no botão *Run*. 
Se tudo correu bem, o ecrã deverá mostrar algo como apresentado de seguida: - -{% include figure.html filename="BBEdit-ola-mundo.png" caption="Olá Mundo em Python no Mac, com BBEdit" %} - -Ou, com Sublime Text: - -{% include figure.html filename="pt-tr-sublimetext-ola-mundo.png" caption="Olá Mundo em Python no Mac, com Sublime Text" %} - -## Interagindo com a linha de comandos do Python - -Uma outra forma de interagir com o interpretador é utilizar o que é denominado por linha de comandos. Você pode digitar um comando na linha de comandos e pressionar a tecla Enter, sendo-lhe apresentada a resposta ao seu comando. Usar a linha de comandos é um ótimo método para testar os comandos, por forma a certificar que eles realmente fazem o que você está imaginando. - -Abra o *Finder*, faça duplo-clique em `Applications -> Utilities -> Terminal` e, em seguida, digite “`python3`” - -Este comando irá abrir a linha de comandos do Python, indicando assim que você já pode executar comandos Python. De seguida, digite: - -``` python -print('Olá Mundo') -``` -e pressione Enter. O computador irá responder com: - -``` python -Olá Mundo -``` - -Quando quisermos representar uma interação na linha de comandos, utilizaremos o símbolo `->` para indicar a resposta para o nosso comando, tal como no exemplo abaixo: - -``` python -print('Olá Mundo') --> Olá Mundo -``` - -No seu ecrã, você verá algo como: - -{% include figure.html filename="ola-mundo-terminal.png" caption="Olá Mundo em Python no Terminal do Mac" %} - -Agora que você e o seu computador estão preparados, podemos seguir para tarefas mais interessantes. 
Se você está seguindo as lições do Python, a nossa sugestão é que tente a próxima lição ‘[Noções básicas de páginas web e HTML][]‘ - - [Time Machine]: http://support.apple.com/kb/ht1427 - [site do Python]: https://www.python.org/downloads/mac-osx/ - [Beautiful Soup]: http://www.crummy.com/software/BeautifulSoup/ - [opções de editores de texto]: https://wiki.python.org/python/PythonEditors - [website do Sublime Text]: https://www.sublimetext.com/download - [BBEdit]: https://www.barebones.com/products/bbedit/ - [site do Komodo Edit]: https://www.activestate.com/products/komodo-ide/downloads/edit/ - [Noções básicas de páginas web e HTML]: nocoes-basicas-paginas-web-html - +--- +title: Configurar um ambiente de desenvolvimento integrado para Python (Mac) +slug: instalacao-mac +layout: lesson +date: 2012-07-17 +tested_date: 2023-11-16 +translation_date: 2021-05-13 +authors: +- William J. Turkel +- Adam Crymble +reviewers: +- Jim Clifford +- Amanda Morton +editors: +- Miriam Posner +translator: +- Josir C. Gomes +translation-editor: +- Danielle Sanches +translation-reviewer: +- Bruno Martins +- Renato Rocha Souza +difficulty: 1 +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/323 +activity: transforming +topics: [get-ready, python] +abstract: "Esta lição irá auxiliar na configuração de um ambiente de desenvolvimento integrado para o Python num computador com o Sistema Operacional Mac." +python_warning: false +original: mac-installation +avatar_alt: Uma banda com três músicos +doi: 10.46430/phpt0005 +--- + +{% include toc.html %} + + + + + +## Faça um backup do seu computador + +É sempre importante garantir que você tenha backups regulares e recentes do seu computador. Este é um bom conselho que serve para a vida toda e não se limita à pratica específica de programação. Usuários do Mac podem recorrer ao [Time Machine][] para isso. + +## Instale o Python 3 + +Você ainda pode ter o Python 2 na sua máquina. 
Como essa versão do Python foi descontinuada no fim de 2019, é importante que você instale o Python 3. Faça o download da versão mais estável da linguagem de programação Python (Version 3.8 de Novembro de 2019) e instale o software a partir do [site do Python][]. + +## Crie um diretório + +Para que você se organize, o ideal é que você tenha um diretório (i.e., pasta) no seu computador onde você irá armazenar os seus programas em Python (por exemplo, `programming-historian`). Crie esse diretório em qualquer outra pasta do seu computador. + +## Instale um editor de texto + +Existem vários editores de texto que você pode utilizar para escrever, armazenar e executar comandos em Python. O Sublime Text é utilizado nessa lição. Se você preferir usar outro editor, existem muitas outras [opções de editores de texto][]. Alguns dos nossos usuários preferem um programa chamado [BBEdit][]. A escolha é sua. Pode descarregar uma cópia do Sublime Text a partir do [website do Sublime Text][]. + +#### Configurar no Sublime Text + +Deve agora configurar o editor para que seja possível executar programas em Python. + +A partir do menu, escolha `Tools -> Build System -> Python`. + +## Passo 2 – “Olá Mundo” em Python +-------------------------------- + +É uma tradição para quem está começando a programar em uma nova linguagem que o primeiro programa a ser construído emita a frase "Olá Mundo". + +O Python é uma boa linguagem de programação para iniciantes porque ela é de alto-nível. +Isto quer dizer que é possível escrever pequenos programas que realizam muitas funcionalidades. +Quanto menor o programa, mais provável que ele caiba em apenas um ecrã, e mais fácil será manter o controle dele em sua mente. + +O Python é uma linguagem 'interpretada'. Isto significa que existe um programa especial (conhecido como Interpretador) que sabe como seguir as instruções da linguagem.
Uma forma de utilizar o interpretador é guardar todas as instruções a executar em um ficheiro para, em seguida, solicitar ao interpretador que ele interprete o conteúdo desse ficheiro. + +Um ficheiro que contém instruções de linguagem de programação é conhecido como um programa. O interpretador irá executar cada uma das instruções que você incluiu no seu programa e no final irá parar. Vamos experimentar como isto funciona. + +No seu editor de texto, crie um novo ficheiro, entre o seguinte programa de duas linhas, e salve-o na pasta `programming-historian`: + +`ola-mundo.py` + +``` python +# ola-mundo.py +print('Olá Mundo') +``` + +O comando “*Run Python*” permite que você execute o seu programa. Se você escolheu um outro editor, este deve ter uma funcionalidade semelhante. Se está a usar Sublime Text, clique em `Tools -> Build` (ou digite `⌘B`). Se está a usar o BBEdit, clique em “#!” e no botão *Run*. Se tudo correu bem, o ecrã deverá mostrar algo como apresentado de seguida: + +{% include figure.html filename="BBEdit-ola-mundo.png" caption="Olá Mundo em Python no Mac, com BBEdit" %} + +Ou, com Sublime Text: + +{% include figure.html filename="pt-tr-sublimetext-ola-mundo.png" caption="Olá Mundo em Python no Mac, com Sublime Text" %} + +## Interagindo com a linha de comandos do Python + +Uma outra forma de interagir com o interpretador é utilizar o que é denominado por linha de comandos. Você pode digitar um comando na linha de comandos e pressionar a tecla Enter, sendo-lhe apresentada a resposta ao seu comando. Usar a linha de comandos é um ótimo método para testar os comandos, por forma a certificar que eles realmente fazem o que você está imaginando. + +Abra o *Finder*, faça duplo-clique em `Applications -> Utilities -> Terminal` e, em seguida, digite “`python3`” + +Este comando irá abrir a linha de comandos do Python, indicando assim que você já pode executar comandos Python. De seguida, digite: + +``` python +print('Olá Mundo') +``` +e pressione Enter. 
O computador irá responder com: + +``` python +Olá Mundo +``` + +Quando quisermos representar uma interação na linha de comandos, utilizaremos o símbolo `->` para indicar a resposta para o nosso comando, tal como no exemplo abaixo: + +``` python +print('Olá Mundo') +-> Olá Mundo +``` + +No seu ecrã, você verá algo como: + +{% include figure.html filename="ola-mundo-terminal.png" caption="Olá Mundo em Python no Terminal do Mac" %} + +Agora que você e o seu computador estão preparados, podemos seguir para tarefas mais interessantes. Se você está seguindo as lições do Python, a nossa sugestão é que tente a próxima lição ‘[Noções básicas de páginas web e HTML][]‘ + + [Time Machine]: https://support.apple.com/en-gb/104984 + [site do Python]: https://www.python.org/downloads/mac-osx/ + [Beautiful Soup]: https://www.crummy.com/software/BeautifulSoup/ + [opções de editores de texto]: https://wiki.python.org/python/PythonEditors + [website do Sublime Text]: https://www.sublimetext.com/download + [BBEdit]: https://www.barebones.com/products/bbedit/ + [site do Komodo Edit]: https://www.activestate.com/products/komodo-ide/downloads/edit/ + [Noções básicas de páginas web e HTML]: /pt/licoes/nocoes-basicas-paginas-web-html + diff --git a/pt/licoes/instalacao-windows.md b/pt/licoes/instalacao-windows.md index e1b62f2dd0..6287cc0e38 100644 --- a/pt/licoes/instalacao-windows.md +++ b/pt/licoes/instalacao-windows.md @@ -159,6 +159,6 @@ Agora que você e o seu computador estão preparados, podemos seguir para tarefa [site do Python]: https://www.python.org/downloads/windows/ [outros editores]: https://wiki.python.org/python/PythonEditors [UTF-8]: https://pt.wikipedia.org/wiki/UTF-8 - [Noções básicas de páginas web e HTML]: nocoes-basicas-paginas-web-html + [Noções básicas de páginas web e HTML]: /pt/licoes/nocoes-basicas-paginas-web-html diff --git a/pt/licoes/introducao-ao-markdown.md b/pt/licoes/introducao-ao-markdown.md index 3022a23da0..94a714b737 100644 --- 
a/pt/licoes/introducao-ao-markdown.md +++ b/pt/licoes/introducao-ao-markdown.md @@ -1,318 +1,318 @@ ---- -title: Introdução ao Markdown -slug: introducao-ao-markdown -layout: lesson -date: 2015-11-13 -translation_date: 2021-03-30 -authors: -- Sarah Simpkin -reviewers: -- John Fink -- Nancy Lemay -editors: -- Ian Milligan -translator: -- João Gilberto Neves Saraiva -translation-editor: -- Joana Vieira Paulino -translation-reviewer: -- Josir Cardoso Gomes -- Bruno Martins -difficulty: 1 -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/363 -activity: presenting -topics: [data-management] -abstract: "Nesta lição é apresentado o Markdown, uma sintaxe baseada em texto simples para formatação de documentos. É explicado porque ele é usado, como formatar ficheiros Markdown e como pré-visualizar documentos formatados em Markdown na web." -original: getting-started-with-markdown -avatar_alt: Letras ornamentadas num manual tipográfico -doi: 10.46430/phpt0008 ---- - -{% include toc.html %} - - - - -### Objetivos da lição -Nesta lição, é apresentado o Markdown, uma sintáxe baseada em texto simples para formatação de documentos. É explicado porque ele é usado, como formatar ficheiros Markdown e como visualizar documentos formatados em Markdown na web. - -Como as lições do *Programming Historian em português* são submetidas em ficheiros Markdown, incluí exemplos do *Programming Historian* sempre que possível. Espero que este guia seja útil para quem estiver pensando em criar uma lição para este site. - -## O que é Markdown? - -Criado em 2004 por [John Gruber](http://daringfireball.net/projects/markdown/ "Markdown on Daring Fireball"), Markdown se refere a: (1) um modo de formatação de ficheiros de texto, e também (2) uma [ferramenta Perl](https://pt.wikipedia.org/wiki/Perl) para converter ficheiros Markdown em HTML. Nesta lição, nosso foco será na primeira parte, aprender a escrever ficheiros utilizando a sintaxe Markdown. 
- -Ficheiros de texto simples têm muitas vantagens sobre outros formatos. Uma delas é que são legíveis em praticamente qualquer dispositivo. Eles também resistem ao tempo melhor do que outros tipos de ficheiro - se abrir um documento salvo num formato de um processador de texto legado (como docx), estará familiarizado com os desafios de compatibilidade envolvidos. - -Utilizando a sintaxe Markdown, você será capaz de produzir ficheiros que são legíveis como texto simples e também prontos para ser estilizados em outras plataformas. Vários sistemas de blogs, geradores de sites estáticos e sites como o [GitHub](http://github.com "GitHub") também suportam Markdown, e renderizam esses ficheiros em HTML para exibição na web. Além disso, ferramentas como o Pandoc podem converter ficheiros de Markdown para outros formatos e vice-versa. Para mais informações sobre o Pandoc, visite a lição (em inglês) [Sustainable authorship in plain text using Pandoc and Markdown](/en/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown), produzida por Dennis Tenen e Grant Wythoff. - -## Sintaxe Markdown -Ficheiros Markdown são salvos com a extensão `.md` e podem ser abertos num editor de texto como TextEdit, Notepad, Sublime Text ou Vim. Diversos websites e plataformas de publicação dispôem de editores web e/ou extensões para entrada de texto utilizando sintaxe Markdown. - -Neste tutorial, vamos praticar a sintaxe Markdown no navegador utilizando o [StackEdit](https://stackedit.io). Nele é possível inserir um texto formatado em Markdown na esquerda e ver imediatamente a versão renderizada dele à direita. - -Como todas as lições do *Programming Historian em português* são escritas em Markdown, é possível examinar esses ficheiros no StackEdit também. No [StackEdit editor](https://stackedit.io/app), clique no `#` no canto superior direito para abrir o menu. 
Escolha `Import/Export` e depois `Import Markdown`, então cole o conteúdo da URL a seguir na janela do lado esquerdo para exibir a lição "Preservar os seus dados de investigação" no editor: - -``` -https://raw.githubusercontent.com/programminghistorian/jekyll/gh-pages/pt/licoes/preservar-os-seus-dados-de-investigacao.md -``` -Note que enquanto o painel direito apresenta uma renderização mais elegante do texto, o ficheiro original à esquerda fica ainda bem legível. - -Agora, vamos apronfundar conhecimentos escrevendo nós mesmos com a sintaxe Markdown. Crie um novo documento no StackEdit clicando no ícone de pasta no canto superior esquerdo e escolha a opção `New file`. Você pode inserir um título para o documento na caixa de texto no topo da página. - -### Cabeçalhos -Quatro níveis de cabeçalho estão disponíveis no Markdown e são indicatos pelo número de `#` antes do texto do título. Copie os exemplos a seguir na caixa de texto à sua esquerda. - -``` -# Primeiro nível de cabeçalho -## Segundo nível de cabeçalho -### Terceiro nível de cabeçalho -#### Quarto nível de cabeçalho -``` - -O primeiro e segundo níveis de cabeçalho podem ser inseridos da seguinte forma: - -``` -Primeiro nível de cabeçalho -======= - -Segundo nível de cabeçalho ----------- -``` - -**Eles serão renderizados como:** - -# Primeiro nível de cabeçalho - -## Segundo nível de cabeçalho - -### Terceiro nível de cabeçalho - -#### Quarto nível de cabeçalho - - -Observe como a sintaxe do Markdown permanece compreensível mesmo na versão de texto simples. - - -### Parágrafos & Quebras de linha - -Escreva a frase a seguir na caixa de texto: - -``` -Bem-vindo ao Programming Historian em português. - -Hoje vamos aprender sobre a sintaxe Markdown. -Esta frase é separada da anterior por uma quebra de linha simples. -``` -**Isso é renderizado como** - -Bem-vindo ao Programming Historian em português. - -Hoje vamos aprender sobre a sintaxe Markdown. 
-Esta frase é separada da anterior por uma quebra de linha simples. - - -Os parágrafos devem ser separados por uma linha vazia. Deixe uma linha em branco entre `Markdown.` e `Esta` para ver como isso funciona. Em algumas implementações de Markdown, uma quebra de linha simples pode ser indicada com dois espaços vazios no fim de uma linha. Isso não é aplicado na formatação Markdown do [GitHub](https://docs.github.com/pt/github/writing-on-github/basic-writing-and-formatting-syntax) que o StackEdit utiliza como padrão. - - -### Acrescentando Ênfase - -O texto pode ser posto em itálico colocando a palavra entre os símbolos `*` ou `_`. Da mesma forma, o texto em negrito pode ser escrito colocando a palavra entre `**` ou `__`. - -Tente adicionar ênfase à frase usando estes métodos: - -``` -Estou **muito** animado com os tutoriais do _Programming Historian_. -``` - -**Isto é renderizado como:** - -Estou **muito** animado com os tutoriais do _Programming Historian_. - -### Criando Listas - -Markdown inclui suporte para listas ordenadas ou não. Tente digitar a lista a seguir na caixa de texto: - -``` -Lista de compras ----------- -* Frutas - * Maçãs - * Laranjas - * Uvas -* Laticínios - * Leite - * Queijo -``` -Identar o `*` permite criar itens alinhados. - -**Isso é renderizado como:** - -Lista de compras ----------- -* Frutas - * Maçãs - * Laranjas - * Uvas -* Laticínios - * Leite - * Queijo - -Listas ordenadas são escritas numerando cada linha. Mais uma vez, o objetivo do Markdown é produzir documentos que sejam legíveis como texto simples e que possam ser transformados noutros formatos. - -``` -Lista de afazeres ----------- -1. Terminar o tutorial de Markdown -2. Ir fazer compras -3. Preparar o almoço -``` - -**Isso é renderizado como:** - -Lista de afazeres ----------- -1. Terminar o tutorial de Markdown -2. Ir fazer compras -3. 
Preparar o almoço - -### Trechos de código -Representar trechos de código de maneira diferente do resto de um documento é uma boa prática pois melhora a legibilidade. Comumente, códigos são representandos em Markdown com texto monoespaçado. Uma vez que o Markdown não faz distinção entre fontes, codígos são representandos entre caractéres de crase como `` ` ``. Por exemplo, `` `
    ` ``. Blocos inteiros de código são escritos digitando três caracteres `` ` `` antes e depois de cada bloco. Na janela de visualização do StackEdit, isso será renderizado como uma caixa sombreada com texto em uma fonte monoespaçada. - -Digite o trecho a seguir na caixa de texto: - - ``` - - - Título do Website - - - - - ``` - -**Isso é renderizado como:** - -``` - - - Título do Website - - - - -``` - -Observe como o bloco de código é renderizado em uma fonte monoespaçada. - -### Blocos de citações - -Adicionar um `>` antes de qualquer parágrafo para renderizá-lo como um elemento de bloco de citação. - -Tente digitar o seguinte texto na caixa de texto: - -``` -> Olá, sou um parágrafo de texto encerrado em um bloco de citação. Observe como estou deslocado da margem esquerda. -``` - -**Isso é renderizado como:** - -> Olá, sou um parágrafo de texto encerrado em um bloco de citação. Observe como estou deslocado da margem esquerda. - -### Links - -Os links podem ser escritos em dois estilos. - -Os links embutidos são escritos colocando o texto do link entre colchetes primeiro e, em seguida, incluindo a URL e o texto alternativo opcional entre parêntesis curvos. - -`Para mais tutoriais, por favor visite o [Programming Historian em português](/pt/).` - -**Isso é renderizado como:** - -Para mais tutoriais, por favor visite o [Programming Historian em português](/pt/) - -Os links de referência são úteis para notas de rodapé e podem manter seu documento de texto simples mais organizado. Eles são escritos com um conjunto adicional de colchetes para estabelecer um rótulo de ID de link. 
- -`Um exemplo é o website do [Programming Historian em português][1].` - -Você deve então adicionar o URL a outra parte do documento: - -`[1]: http://programminghistorian.org/pt/ "The Programming Historian em português".` - -**Isso é renderizado como:** - -Um exemplo é o website do [_Programming Historian em português_][1] - -[1]: /pt/ "The Programming Historian em português" - - -### Imagens - -As imagens podem ser referenciadas usando `!` seguido por algum texto alternativo entre colchetes. Depois, a URL da imagem e um título opcional. Eles não serão exibidos em seu documento de texto simples, mas serão incorporados em uma página HTML renderizada. - -`![Wikipedia logo](https://upload.wikimedia.org/wikipedia/en/8/80/Wikipedia-logo-v2.svg "Wikipedia logo")` - -**Isso é renderizado como:** - -![Wikipedia logo](https://upload.wikimedia.org/wikipedia/en/8/80/Wikipedia-logo-v2.svg "Wikipedia logo") - -#### Linhas Horizontais - -Linhas horizontais são produzidas quando três ou mais `-`,` * `ou` _` são incluídos em sequência, independentemente do número de espaços entre eles. Todas as combinações a seguir renderizarão linhas horizontais: - -``` -___ -* * * -- - - - - - -``` - -**Isso é renderizado como:** - ---- -*** -- - - - - - - - -### Tabelas - -Originalmente o Markdown não inclui tabelas. No entanto, alguns sites e aplicativos usam variantes do Markdown que podem incluir tabelas e outros recursos especiais. É o caso da formatação utilizada no [GitHub](https://docs.github.com/pt/github/writing-on-github/organizing-information-with-tables) que é usada para renderizar arquivos `.md` a partir do GitHub. - -Para criar uma tabela dentro do GitHub, use barras `|` para separar colunas e hifens `-` entre seus cabeçalhos e o resto do conteúdo da tabela. Embora as barras sejam realmente necessárias entre as colunas, é possível usá-las em qualquer lado da tabela para obter uma aparência melhor. 
As células podem conter qualquer comprimento de conteúdo e não é necessário que as barras sejam alinhadas verticalmente umas com as outras. - -``` -| Título 1 | Título 2 | Título 3 | -| --------- | --------- | --------- | -| Linha 1, coluna 1 | Linha 1, coluna 2 | Linha 1, coluna 3| -| Linha 2, coluna 1 | Linha 2, coluna 2 | Linha 2, coluna 3| -| Linha 3, coluna 1 | Linha 3, coluna 2 | Linha 3, coluna 3| -``` - -**Isso é renderizado como:** - -| Título 1 | Título 2 | Título 3 | -| --------- | --------- | --------- | -| Linha 1, coluna 1 | Linha 1, coluna 2 | Linha 1, coluna 3| -| Linha 2, coluna 1 | Linha 2, coluna 2 | Linha 2, coluna 3| -| Linha 3, coluna 1 | Linha 3, coluna 2 | Linha 3, coluna 3| - -Para especificar o alinhamento de cada coluna, dois pontos `:` podem ser adicionados à linha do cabeçalho da seguinte forma: - -``` -| Alinhado à esquerda | Centralizado | Alinhado à direita | -| :-------- | :-------: | --------: | -| Maçãs | Vermelho | 5000 | -| Bananas | Amarelo| 75 | -``` -**Isso é renderizado como:** - -| Alinhado à esquerda | Centralizado | Alinhado à direita | -| :-------- | :-------: | --------: | -| Maçãs | Vermelho | 5000 | -| Bananas | Amarelo| 75 | - - -## Limitações do Markdown -Embora o Markdown esteja se tornando cada vez mais popular, principalmente para estilizar documentos que podem ser visualizados na web, muitas pessoas e editores ainda esperam documentos tradicionais do Word, PDFs e outros formatos de arquivo. Isso pode ser atenuado parcialmente com ferramentas de conversão de linha de comandos, como o [Pandoc](https://pandoc.org/); no entanto, certos recursos do processador de texto, como o controle de alterações, ainda não são suportados. Visite a lição do Programming Historian (em inglês) de título [Sustainable authorship in plain text using Pandoc and Markdown](/en/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown) para obter mais informações sobre Pandoc. 
- - -## Conclusão -Markdown é uma ferramenta útil e um meio-termo entre arquivos de texto simples não estilizados e documentos legados de processadores de texto. Sua sintaxe simples é rápida de aprender e legível por si só e também quando renderizada em HTML e outros tipos de documentos. Por fim, escolher escrever seus próprios documentos em Markdown significa que eles serão utilizáveis e legíveis a longo prazo. +--- +title: Introdução ao Markdown +slug: introducao-ao-markdown +layout: lesson +date: 2015-11-13 +translation_date: 2021-03-30 +authors: +- Sarah Simpkin +reviewers: +- John Fink +- Nancy Lemay +editors: +- Ian Milligan +translator: +- João Gilberto Neves Saraiva +translation-editor: +- Joana Vieira Paulino +translation-reviewer: +- Josir Cardoso Gomes +- Bruno Martins +difficulty: 1 +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/363 +activity: presenting +topics: [data-management] +abstract: "Nesta lição é apresentado o Markdown, uma sintaxe baseada em texto simples para formatação de documentos. É explicado porque ele é usado, como formatar ficheiros Markdown e como pré-visualizar documentos formatados em Markdown na web." +original: getting-started-with-markdown +avatar_alt: Letras ornamentadas num manual tipográfico +doi: 10.46430/phpt0008 +--- + +{% include toc.html %} + + + + +### Objetivos da lição +Nesta lição, é apresentado o Markdown, uma sintaxe baseada em texto simples para formatação de documentos. É explicado porque ele é usado, como formatar ficheiros Markdown e como visualizar documentos formatados em Markdown na web. + +Como as lições do *Programming Historian em português* são submetidas em ficheiros Markdown, incluí exemplos do *Programming Historian* sempre que possível. Espero que este guia seja útil para quem estiver pensando em criar uma lição para este site. + +## O que é Markdown? 
+ +Criado em 2004 por [John Gruber](https://daringfireball.net/projects/markdown/ "Markdown on Daring Fireball"), Markdown se refere a: (1) um modo de formatação de ficheiros de texto, e também (2) uma [ferramenta Perl](https://pt.wikipedia.org/wiki/Perl) para converter ficheiros Markdown em HTML. Nesta lição, nosso foco será na primeira parte, aprender a escrever ficheiros utilizando a sintaxe Markdown. + +Ficheiros de texto simples têm muitas vantagens sobre outros formatos. Uma delas é que são legíveis em praticamente qualquer dispositivo. Eles também resistem ao tempo melhor do que outros tipos de ficheiro - se abrir um documento salvo num formato de um processador de texto legado (como docx), estará familiarizado com os desafios de compatibilidade envolvidos. + +Utilizando a sintaxe Markdown, você será capaz de produzir ficheiros que são legíveis como texto simples e também prontos para ser estilizados em outras plataformas. Vários sistemas de blogs, geradores de sites estáticos e sites como o [GitHub](https://github.com "GitHub") também suportam Markdown, e renderizam esses ficheiros em HTML para exibição na web. Além disso, ferramentas como o Pandoc podem converter ficheiros de Markdown para outros formatos e vice-versa. Para mais informações sobre o Pandoc, visite a lição (em inglês) [Sustainable authorship in plain text using Pandoc and Markdown](/en/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown), produzida por Dennis Tenen e Grant Wythoff. + +## Sintaxe Markdown +Ficheiros Markdown são salvos com a extensão `.md` e podem ser abertos num editor de texto como TextEdit, Notepad, Sublime Text ou Vim. Diversos websites e plataformas de publicação dispõem de editores web e/ou extensões para entrada de texto utilizando sintaxe Markdown. + +Neste tutorial, vamos praticar a sintaxe Markdown no navegador utilizando o [StackEdit](https://stackedit.io). 
Nele é possível inserir um texto formatado em Markdown na esquerda e ver imediatamente a versão renderizada dele à direita. + +Como todas as lições do *Programming Historian em português* são escritas em Markdown, é possível examinar esses ficheiros no StackEdit também. No [StackEdit editor](https://stackedit.io/app), clique no `#` no canto superior direito para abrir o menu. Escolha `Import/Export` e depois `Import Markdown`, então cole o conteúdo da URL a seguir na janela do lado esquerdo para exibir a lição "Preservar os seus dados de investigação" no editor: + +``` +https://raw.githubusercontent.com/programminghistorian/jekyll/gh-pages/pt/licoes/preservar-os-seus-dados-de-investigacao.md +``` +Note que enquanto o painel direito apresenta uma renderização mais elegante do texto, o ficheiro original à esquerda fica ainda bem legível. + +Agora, vamos aprofundar conhecimentos escrevendo nós mesmos com a sintaxe Markdown. Crie um novo documento no StackEdit clicando no ícone de pasta no canto superior esquerdo e escolha a opção `New file`. Você pode inserir um título para o documento na caixa de texto no topo da página. + +### Cabeçalhos +Quatro níveis de cabeçalho estão disponíveis no Markdown e são indicados pelo número de `#` antes do texto do título. Copie os exemplos a seguir na caixa de texto à sua esquerda. + +``` +# Primeiro nível de cabeçalho +## Segundo nível de cabeçalho +### Terceiro nível de cabeçalho +#### Quarto nível de cabeçalho +``` + +O primeiro e segundo níveis de cabeçalho podem ser inseridos da seguinte forma: + +``` +Primeiro nível de cabeçalho +======= + +Segundo nível de cabeçalho +---------- +``` + +**Eles serão renderizados como:** + +# Primeiro nível de cabeçalho + +## Segundo nível de cabeçalho + +### Terceiro nível de cabeçalho + +#### Quarto nível de cabeçalho + + +Observe como a sintaxe do Markdown permanece compreensível mesmo na versão de texto simples. 
+ + +### Parágrafos & Quebras de linha + +Escreva a frase a seguir na caixa de texto: + +``` +Bem-vindo ao Programming Historian em português. + +Hoje vamos aprender sobre a sintaxe Markdown. +Esta frase é separada da anterior por uma quebra de linha simples. +``` +**Isso é renderizado como** + +Bem-vindo ao Programming Historian em português. + +Hoje vamos aprender sobre a sintaxe Markdown. +Esta frase é separada da anterior por uma quebra de linha simples. + + +Os parágrafos devem ser separados por uma linha vazia. Deixe uma linha em branco entre `Markdown.` e `Esta` para ver como isso funciona. Em algumas implementações de Markdown, uma quebra de linha simples pode ser indicada com dois espaços vazios no fim de uma linha. Isso não é aplicado na formatação Markdown do [GitHub](https://docs.github.com/pt/github/writing-on-github/basic-writing-and-formatting-syntax) que o StackEdit utiliza como padrão. + + +### Acrescentando Ênfase + +O texto pode ser posto em itálico colocando a palavra entre os símbolos `*` ou `_`. Da mesma forma, o texto em negrito pode ser escrito colocando a palavra entre `**` ou `__`. + +Tente adicionar ênfase à frase usando estes métodos: + +``` +Estou **muito** animado com os tutoriais do _Programming Historian_. +``` + +**Isto é renderizado como:** + +Estou **muito** animado com os tutoriais do _Programming Historian_. + +### Criando Listas + +Markdown inclui suporte para listas ordenadas ou não. Tente digitar a lista a seguir na caixa de texto: + +``` +Lista de compras +---------- +* Frutas + * Maçãs + * Laranjas + * Uvas +* Laticínios + * Leite + * Queijo +``` +Indentar o `*` permite criar itens aninhados. + +**Isso é renderizado como:** + +Lista de compras +---------- +* Frutas + * Maçãs + * Laranjas + * Uvas +* Laticínios + * Leite + * Queijo + +Listas ordenadas são escritas numerando cada linha. 
Mais uma vez, o objetivo do Markdown é produzir documentos que sejam legíveis como texto simples e que possam ser transformados noutros formatos. + +``` +Lista de afazeres +---------- +1. Terminar o tutorial de Markdown +2. Ir fazer compras +3. Preparar o almoço +``` + +**Isso é renderizado como:** + +Lista de afazeres +---------- +1. Terminar o tutorial de Markdown +2. Ir fazer compras +3. Preparar o almoço + +### Trechos de código +Representar trechos de código de maneira diferente do resto de um documento é uma boa prática pois melhora a legibilidade. Comumente, códigos são representados em Markdown com texto monoespaçado. Uma vez que o Markdown não faz distinção entre fontes, códigos são representados entre caracteres de crase como `` ` ``. Por exemplo, `` `
    ` ``. Blocos inteiros de código são escritos digitando três caracteres `` ` `` antes e depois de cada bloco. Na janela de visualização do StackEdit, isso será renderizado como uma caixa sombreada com texto em uma fonte monoespaçada. + +Digite o trecho a seguir na caixa de texto: + + ``` + + + Título do Website + + + + + ``` + +**Isso é renderizado como:** + +``` + + + Título do Website + + + + +``` + +Observe como o bloco de código é renderizado em uma fonte monoespaçada. + +### Blocos de citações + +Adicionar um `>` antes de qualquer parágrafo para renderizá-lo como um elemento de bloco de citação. + +Tente digitar o seguinte texto na caixa de texto: + +``` +> Olá, sou um parágrafo de texto encerrado em um bloco de citação. Observe como estou deslocado da margem esquerda. +``` + +**Isso é renderizado como:** + +> Olá, sou um parágrafo de texto encerrado em um bloco de citação. Observe como estou deslocado da margem esquerda. + +### Links + +Os links podem ser escritos em dois estilos. + +Os links embutidos são escritos colocando o texto do link entre colchetes primeiro e, em seguida, incluindo a URL e o texto alternativo opcional entre parêntesis curvos. + +`Para mais tutoriais, por favor visite o [Programming Historian em português](/pt/).` + +**Isso é renderizado como:** + +Para mais tutoriais, por favor visite o [Programming Historian em português](/pt/) + +Os links de referência são úteis para notas de rodapé e podem manter seu documento de texto simples mais organizado. Eles são escritos com um conjunto adicional de colchetes para estabelecer um rótulo de ID de link. 
+ +`Um exemplo é o website do [Programming Historian em português][1].` + +Você deve então adicionar o URL a outra parte do documento: + +`[1]: http://programminghistorian.org/pt/ "The Programming Historian em português".` + +**Isso é renderizado como:** + +Um exemplo é o website do [_Programming Historian em português_][1] + +[1]: /pt/ "The Programming Historian em português" + + +### Imagens + +As imagens podem ser referenciadas usando `!` seguido por algum texto alternativo entre colchetes. Depois, a URL da imagem e um título opcional. Eles não serão exibidos em seu documento de texto simples, mas serão incorporados em uma página HTML renderizada. + +`![Wikipedia logo](https://upload.wikimedia.org/wikipedia/en/8/80/Wikipedia-logo-v2.svg "Wikipedia logo")` + +**Isso é renderizado como:** + +![Wikipedia logo](https://upload.wikimedia.org/wikipedia/en/8/80/Wikipedia-logo-v2.svg "Wikipedia logo") + +#### Linhas Horizontais + +Linhas horizontais são produzidas quando três ou mais `-`,` * `ou` _` são incluídos em sequência, independentemente do número de espaços entre eles. Todas as combinações a seguir renderizarão linhas horizontais: + +``` +___ +* * * +- - - - - - +``` + +**Isso é renderizado como:** + +--- +*** +- - - - - - - + +### Tabelas + +Originalmente o Markdown não inclui tabelas. No entanto, alguns sites e aplicativos usam variantes do Markdown que podem incluir tabelas e outros recursos especiais. É o caso da formatação utilizada no [GitHub](https://docs.github.com/pt/github/writing-on-github/organizing-information-with-tables) que é usada para renderizar arquivos `.md` a partir do GitHub. + +Para criar uma tabela dentro do GitHub, use barras `|` para separar colunas e hifens `-` entre seus cabeçalhos e o resto do conteúdo da tabela. Embora as barras sejam realmente necessárias entre as colunas, é possível usá-las em qualquer lado da tabela para obter uma aparência melhor. 
As células podem conter qualquer comprimento de conteúdo e não é necessário que as barras sejam alinhadas verticalmente umas com as outras. + +``` +| Título 1 | Título 2 | Título 3 | +| --------- | --------- | --------- | +| Linha 1, coluna 1 | Linha 1, coluna 2 | Linha 1, coluna 3| +| Linha 2, coluna 1 | Linha 2, coluna 2 | Linha 2, coluna 3| +| Linha 3, coluna 1 | Linha 3, coluna 2 | Linha 3, coluna 3| +``` + +**Isso é renderizado como:** + +| Título 1 | Título 2 | Título 3 | +| --------- | --------- | --------- | +| Linha 1, coluna 1 | Linha 1, coluna 2 | Linha 1, coluna 3| +| Linha 2, coluna 1 | Linha 2, coluna 2 | Linha 2, coluna 3| +| Linha 3, coluna 1 | Linha 3, coluna 2 | Linha 3, coluna 3| + +Para especificar o alinhamento de cada coluna, dois pontos `:` podem ser adicionados à linha do cabeçalho da seguinte forma: + +``` +| Alinhado à esquerda | Centralizado | Alinhado à direita | +| :-------- | :-------: | --------: | +| Maçãs | Vermelho | 5000 | +| Bananas | Amarelo| 75 | +``` +**Isso é renderizado como:** + +| Alinhado à esquerda | Centralizado | Alinhado à direita | +| :-------- | :-------: | --------: | +| Maçãs | Vermelho | 5000 | +| Bananas | Amarelo| 75 | + + +## Limitações do Markdown +Embora o Markdown esteja se tornando cada vez mais popular, principalmente para estilizar documentos que podem ser visualizados na web, muitas pessoas e editores ainda esperam documentos tradicionais do Word, PDFs e outros formatos de arquivo. Isso pode ser atenuado parcialmente com ferramentas de conversão de linha de comandos, como o [Pandoc](https://pandoc.org/); no entanto, certos recursos do processador de texto, como o controle de alterações, ainda não são suportados. Visite a lição do Programming Historian (em inglês) de título [Sustainable authorship in plain text using Pandoc and Markdown](/en/lessons/sustainable-authorship-in-plain-text-using-pandoc-and-markdown) para obter mais informações sobre Pandoc. 
+ + +## Conclusão +Markdown é uma ferramenta útil e um meio-termo entre arquivos de texto simples não estilizados e documentos legados de processadores de texto. Sua sintaxe simples é rápida de aprender e legível por si só e também quando renderizada em HTML e outros tipos de documentos. Por fim, escolher escrever seus próprios documentos em Markdown significa que eles serão utilizáveis e legíveis a longo prazo. diff --git a/pt/licoes/introducao-dados-abertos-conectados.md b/pt/licoes/introducao-dados-abertos-conectados.md index 1a531c59df..c3d3daf492 100644 --- a/pt/licoes/introducao-dados-abertos-conectados.md +++ b/pt/licoes/introducao-dados-abertos-conectados.md @@ -1,421 +1,420 @@ ---- -title: Introdução aos Dados Abertos Conectados -layout: lesson -collection: lessons -slug: introducao-dados-abertos-conectados -original: intro-to-linked-data -date: 2013-08-05 -translation_date: 2022-11-21 -authors: -- Jonathan Blaney -reviewers: -- Terhi Nurmikko-Fuller -- Matthew Lincoln -editors: -- Adam Crymble -translator: -- Francisco Nabais -translation-editor: -- Joana Vieira Paulino -translation-reviewer: -- Bruno Almeida -- Daniel Bonatto Seco -lesson-testers: David Valentine -tested_date: 2025-02-28 -difficulty: 1 -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/428 -activity: acquiring -topics: [lod] -abstract: "Este tutorial apresenta os principais conceitos de dados abertos conectados (*Linked Open Data*), incluindo URIs, ontologias, formatos RDF e uma breve introdução à linguagem de consulta de gráficos SPARQL." -avatar_alt: Um homem velho com uma mulher em cada braço -doi: 10.46430/phpt0033 ---- - -{% include toc.html %} - -Nota de Tradução: Alguns termos, por aparecerem constantemente e facilitarem a interpretação das imagens, apenas foram propositadamente traduzidos uma vez e serão colocados entre parênteses. 
Alertamos também para a existência de alguns exemplos que não foram propositadamente traduzidos para facilitar a sua introdução nos programas apresentados. - - -Introdução e Âmbito da lição ------------------------------ - -Esta lição oferece uma breve e concisa introdução aos [dados abertos conectados](https://pt.wikipedia.org/wiki/Linked_data#The_Linking_Open_Data_Project) (*Linked Open Data* ou LOD). Não é necessário conhecimento prévio para realizar este tutorial. Os leitores deverão obter uma compreensão clara dos conceitos por detrás dos dados abertos conectados, como são utilizados e como são criados. O tutorial está dividido em cinco partes, além de leituras adicionais: - -1. Dados abertos conectados: o que são? -2. O papel do [Identificador Uniforme de Recurso](https://pt.wikipedia.org/wiki/URI) (*Uniform Resource Identifier* ou URI) -3. Como o LOD organiza o conhecimento: [ontologias](https://pt.wikipedia.org/wiki/Ontologia_(ci%C3%AAncia_da_computa%C3%A7%C3%A3o)) -4. A [Estrutura de Descrição de Recursos](https://pt.wikipedia.org/wiki/Resource_Description_Framework) (*Resource Description Framework* ou RDF) e formatos de dados -5. Consulta de dados abertos conectados com [SPARQL](https://pt.wikipedia.org/wiki/SPARQL) -6. Outras leituras e recursos - -A conclusão deste tutorial poderá levar algumas horas e poderá ser útil reler algumas secções para solidificar a sua compreensão. Os termos técnicos foram ligados à sua página correspondente na Wikipedia e encoraja-se a que faça uma pausa e leia sobre termos que considere desafiadores. Depois de ter aprendido alguns dos princípios-chave do LOD, a melhor maneira de melhorar e solidificar esse conhecimento é praticar. Este tutorial fornece oportunidades para fazê-lo. No final da lição, deverá compreender os princípios básicos de LOD, incluindo termos e conceitos-chave. 
- -Se precisar aprender a como explorar LOD usando a linguagem de consulta [SPARQL](https://pt.wikipedia.org/wiki/SPARQL), recomenda-se a lição de Matthew Lincoln ['*Using SPARQL to access Linked Open Data*'](/en/lessons/retired/graph-databases-and-SPARQL) (em inglês) (Nota: a lição deste link encontra-se desatualizada e já não é mantida pelo _Programming Historian_. Por favor veja a nota inicial dessa página sobre a razão dessa lição ter sido retirada), que segue praticamente a visão geral fornecida nesta lição. - -Para proporcionar aos leitores uma base sólida dos princípios básicos de LOD, este tutorial não oferecerá uma cobertura abrangente de todos os seus conceitos. Estes **não** serão o foco desta lição: - -1. [Web Semântica](https://pt.wikipedia.org/wiki/Web_sem%C3%A2ntica) e [raciocínio semântico](https://en.wikipedia.org/wiki/Semantic_reasoner) (em inglês) de [datasets](https://pt.wikipedia.org/wiki/Conjunto_de_dados). Um raciocinador semântico deduziria que Jorge VI é o irmão ou meio-irmão de Eduardo VIII, dado que: a) Eduardo VIII é o filho de Jorge V e b) Jorge VI é o filho de Jorge V. Este tutorial não se foca neste tipo de tarefa. -2. Criação e *upload* de conjuntos de dados abertos conectados ligados à [Nuvem de dados conectados](http://linkeddatacatalog.dws.informatik.uni-mannheim.de/state/) (em inglês). Partilhar LOD é um princípio importante, que é encorajado abaixo. Contudo, os aspetos práticos de contribuir com LOD para a nuvem de dados conectados estão além do âmbito desta lição. Alguns recursos que podem ajudar a começar esta tarefa estão disponíveis no final deste tutorial. - -## Dados abertos conectados: O que são? - -LOD é informação estruturada num formato destinado a máquinas e, por isso, não é necessariamente um conceito de fácil definição. É importante não perder a motivação com esta informação já que, ao compreender os princípios, pode colocar uma máquina a fazer uma leitura autónoma. 
- -Se todos os datasets fossem publicados abertamente e utilizassem o mesmo formato para estruturar a informação, seria possível interrogá-los todos de uma só vez. A análise de grandes volumes de dados é potencialmente muito mais poderosa do que qualquer pessoa que utilize os seus próprios datasets individuais espalhados pela web nos chamados [silos de informação](https://en.wikipedia.org/wiki/Information_silo) (em inglês). Estes datasets interoperáveis são aquilo para que os profissionais de LOD estão a trabalhar. - -Para atingir este objetivo, ao trabalhar com LOD, é importante recordar três princípios: - -1. **Utilizar um formato padrão de LOD reconhecido**. Para que o LOD funcione, os dados devem ser [estruturados](https://pt.wikipedia.org/wiki/Estrutura_de_dados), utilizando normas reconhecidas para que os computadores que interrogam os dados possam processá-los de forma consistente. Há vários formatos de LOD, alguns dos quais são discutidos abaixo. -2. **Referir uma entidade da mesma forma que outras pessoas o fazem**. Se existirem dados sobre a mesma pessoa/local/coisa em dois ou mais locais, certifique-se de que se refere à pessoa/local/coisa da mesma forma em todos os casos. -3. **Publicar os seus dados abertamente**. Qualquer pessoa deverá poder utilizar os seus dados sem pagar uma taxa e num formato que não exija [software proprietário](https://pt.wikipedia.org/wiki/Software_propriet%C3%A1rio). - -Comecemos com um exemplo de dados sobre uma pessoa, utilizando uma abordagem comum [par atributo-valor](https://en.wikipedia.org/wiki/Attribute%E2%80%93value_pair) (em inglês) típica em computação: - - pessoa=número - -Neste caso, o 'atributo' é uma pessoa. E o valor - ou quem é essa pessoa - é representado por um número. O número pode ser atribuído aleatoriamente ou pode ser utilizado um número que já esteja associado a essa pessoa. 
Esta última abordagem tem grandes vantagens: se todos os que criarem um dataset que menciona essa pessoa utilizarem *exatamente o mesmo número, exatamente no mesmo formato*, então podemos encontrar esse indivíduo de forma fiável em qualquer dataset aderindo a essas regras. Vamos criar um exemplo usando Jack Straw: tanto o nome de um rebelde inglês do século XIV, como o de um ministro de gabinete britânico proeminente na administração de Tony Blair. É útil ser capaz de diferenciar as duas pessoas que partilham o mesmo nome. - -Utilizando o modelo acima, no qual cada pessoa é representada por um número único, vamos atribuir ao ministro britânico Jack Straw o número `64183282`. O seu par atributo-valor ficaria então com este aspeto: - - pessoa=64183282 - -E vamos atribuir a Jack Straw, descrito no *[Oxford Dictionary of National Biography](http://www.oxforddnb.com)* (em inglês) como 'o enigmático líder rebelde', o número `33059614`, fazendo com que o seu par atributo-valor se pareça com isto: - - pessoa=33059614 - -Desde que todos os que fazem LOD utilizem estes dois números para se referirem aos respetivos Jack Straws, podemos agora procurar a pessoa `64183282` num conjunto de dados abertos conectados e podemos estar confiantes de que estamos a obter a pessoa certa - neste caso, o ministro. - -Os pares atributo-valor também podem armazenar informações sobre outros tipos de entidades: lugares, por exemplo. Jack Straw, o político moderno, era membro do Parlamento britânico, representando o assento de Blackburn. Há mais do que um lugar no Reino Unido chamado Blackburn, para não mencionar outros Blackburn em todo o mundo. Usando os mesmos princípios acima delineados, podemos desambiguar entre os vários Blackburns, atribuindo um identificador único ao lugar correto: Blackburn em Lancashire, Inglaterra. - - Lugar=2655524 - -Neste momento pode estar pensando, "isso é o que um catálogo de biblioteca faz". 
É verdade que a ideia-chave aqui é a do [ficheiro de autoridade](https://pt.wikipedia.org/wiki/Controle_de_autoridade), central na biblioteconomia (um ficheiro de autoridade é uma lista definitiva de termos que podem ser utilizados num contexto particular, por exemplo, quando se cataloga um livro). Nos dois exemplos acima descritos, utilizamos ficheiros de autoridade para atribuir números (os identificadores únicos) aos Jacks e ao Blackburn. Os números que utilizamos para os dois Jack Straws provêm do [Virtual International Authority File](https://www.oclc.org/en/viaf.html) (em inglês) (VIAF) (Arquivo Internacional de Autoridade Virtual), que é mantido por um consórcio de bibliotecas de todo o mundo, de modo a tentar resolver o problema da miríade de maneiras pelas quais a mesma pessoa pode ser referida. O identificador único que utilizamos para o distrito eleitoral de Blackburn provém da [GeoNames](http://www.geonames.org/) (em inglês), uma base de dados geográfica gratuita. - -Vamos tentar ser mais precisos com o que, neste caso, queremos dizer com 'Blackburn'. Jack Straw representou o círculo eleitoral (uma área representada por um único membro do parlamento) de Blackburn, que mudou os seus limites ao longo do tempo. O projeto "[*Digging Into Linked Parliamentary Data*](https://repository.jisc.ac.uk/6544/1/DiLiPaD_final_report_1.pdf)" (Dilipad) (em inglês), no qual trabalhei, produziu identificadores únicos para as filiações partidárias e circunscrições eleitorais para cada membro do parlamento. Neste exemplo, Jack Straw representou o distrito eleitoral conhecido como 'Blackburn' na sua encarnação pós-1955: - - blackburn1955-presente - -Como o VIAF é um ficheiro de autoridade respeitado e bem mantido, fornece um conjunto óbvio de identificadores a utilizar para Jack Straw. 
Como o distrito eleitoral representado por Straw estava perfeitamente coberto pelos ficheiros de autoridade criados pelo projeto Dilipad, também era um ficheiro de autoridade lógico a utilizar. Infelizmente, nem sempre é tão óbvio qual das listas publicadas online é a melhor para se usar. Uma pode ser mais utilizada do que outra, mas esta última pode ser mais abrangente para um determinado fim. O GeoNames funcionaria melhor do que os identificadores da Dilipad em alguns casos. Haverá também casos em que não se consegue encontrar um dataset com essa informação. Por exemplo, se quiser escrever pares atributo-valor sobre si próprio e as suas relações familiares imediatas terá de inventar os seus próprios identificadores. - -Esta falta de ficheiros de autoridade coerentes é um dos maiores desafios que o LOD enfrenta neste momento. [Tim Berners-Lee](https://pt.wikipedia.org/wiki/Tim_Berners-Lee), que inventou uma forma de ligar documentos em rede e criou assim a World Wide Web, um dos principais proponentes de LOD, para encorajar uma maior utilização de dados conectados, sugeriu um "[sistema de classificação de cinco estrelas](https://www.w3.org/DesignIssues/LinkedData.html)" (em inglês) para que todos avançassem o mais longe possível em direção ao LOD. Essencialmente, Tim Berners-Lee apoia a publicação aberta de dados, especialmente ao utilizar formatos abertos e normas públicas, mas o melhor é que os dados se liguem também aos dados de outras pessoas. - -Com os identificadores únicos atribuídos a todos os elementos, o próximo passo fundamental na criação de LOD é ter uma forma de *descrição* da relação entre Jack Straw (`64183282`) e Blackburn (`blackburn1955-presente`). Em LOD, as relações são expressas utilizando o que é conhecido como '[tripla semântica](https://en.wikipedia.org/wiki/Semantic_triple)' (em inglês). 
Vamos fazer uma tripla semântica que represente a relação entre Jack Straw e o seu distrito eleitoral: - - pessoa:64183282 papel:representaNoParlamentoBritânicodistritoeleitoral:"blackburn1955-presente" . - -A apresentação (ou [sintaxe](https://pt.wikipedia.org/wiki/Sintaxe)) das triplas semânticas, incluindo a pontuação utilizada acima, será discutida mais tarde, na secção sobre RDF e formatos de dados. Por agora, vamos focar-nos na estrutura básica. A tripla semântica, não surpreendentemente, tem três partes. Estas são convencionalmente referidas como sujeito (*subject*), predicado (*predicate*) e objeto (*object*): - -| o sujeito | o predicado | o objeto | -| --------------- | ------------------------- | ----------------------- | -| pessoa 64183282 | representadaNoParlamentoBritânico | "blackburn1955-presente" | - -A forma tradicional de representar uma tripla semântica em forma esquemática é a seguinte (em inglês): - -{% include figure.html filename="pt-tr-introducao-dados-abertos-conectados-01.png" alt="Imagem com a representação de uma tripla semântica" caption="Figura 1. Forma tradicional de representar uma tripla semântica." %} - -Assim, a nossa tripla semântica do Jack Straw, apresentado de forma mais legível para o ser humano, poderia assumir a seguinte forma: - -{% include figure.html filename="pt-tr-introducao-dados-abertos-conectados-02.png" alt="Imagem com a representação de uma tripla semântica aplicada ao exemplo de Jack Straw" caption="Figura 2. Diagrama da tripla semântica que demonstra que Jack Straw representava Blackburn." 
%} - -Por enquanto, é importante fixar três pontos-chave: - -- O LOD deve estar aberto e disponível para qualquer pessoa na Internet (caso contrário, não está "aberto") -- Os defensores do LOD têm como objetivo normalizar as formas de referência a entidades únicas -- O LOD consiste em triplas semânticas que descrevem as relações entre entidades - -## O papel do *Uniform Resource Identifier* (URI) - -Uma parte essencial de LOD é o [Identificador Uniforme de Recurso](https://pt.wikipedia.org/wiki/URI)(*Uniform Resource Identifier* ou URI). O URI é uma forma única e fiável de representar uma entidade (uma pessoa, um objeto, uma relação, etc.), de uma forma que é utilizável por todos no mundo. - -Na secção anterior, utilizamos dois números diferentes para identificar os diferentes Jack Straws. - - pessoa="64183282" - - pessoa="33059614" - -O problema é que em todo o mundo existem muitas bases de dados que contêm pessoas com estes números e são, provavelmente, todas pessoas diferentes. Fora do nosso contexto imediato, estes números não identificam indivíduos únicos. Vamos tentar resolver isso. Aqui estão estes mesmos identificadores, mas como URIs: - - http://viaf.org/viaf/64183282/ - - http://viaf.org/viaf/33059614/ - -Tal como o número único desambiguou os nossos dois Jack Straws, o URI completo acima ajuda-nos a desambiguar entre todos os diferentes ficheiros de autoridade lá fora. Neste caso, é evidente que estamos a utilizar o VIAF como o nosso ficheiro de autoridade. Com certeza, já viu esta forma de desambiguação muitas vezes na web. Existem muitos websites em todo o mundo com páginas chamadas `/home` ou `/faq`. Mas não há confusão porque o [domínio](https://pt.wikipedia.org/wiki/Nome_de_dom%C3%ADnio) (a primeira parte do [Localizador Uniforme de Recursos](https://pt.wikipedia.org/wiki/URL) (*Uniform Resource Locator* ou URL) - ex. 
`bbc.co.uk`) é único, portanto, todas as páginas que fazem parte desse domínio são únicas em relação a outras páginas `/faq` de outros websites. No endereço `http://www.bbc.co.uk/faqs` é a parte `bbc.co.uk` que torna as páginas subsequentes únicas. Isto é tão óbvio para as pessoas que utilizam a web a toda a hora que não pensam sobre isso. Provavelmente, também sabe que se quiser criar um website chamado `bbc.co.uk` não conseguirá, porque esse nome já foi registado com a autoridade apropriada, que é o [Sistema de Nomes de Domínio](https://pt.wikipedia.org/wiki/Sistema_de_Nomes_de_Dom%C3%ADnio) (*Domain Name System*). O registo garante a singularidade. Os URIs também têm de ser únicos. - -Embora os exemplos acima se pareçam com URLs, também é possível construir um URI que não se pareça nada com um URL. Temos muitas formas de identificar pessoas e coisas de forma única e raramente pensamos ou nos preocupamos com isso. Os códigos de barras, números de passaporte, até mesmo os códigos postais são concebidos para serem únicos. Os números de telefone são frequentemente colocados em placas de loja precisamente porque são únicos. Todos eles podem ser utilizados como URIs. - -Quando criamos URIs para as entidades descritas pelo projeto '[Tobias](https://gtr.ukri.org/projects?ref=AH%2FN003446%2F1#/tabOverview)' (em inglês), escolhemos uma estrutura do tipo URL e escolhemos utilizar o nosso espaço web institucional, reservando `data.history.ac.uk/tobias-project/` como um lugar dedicado à hospedagem destes URIs. Ao colocá-lo em `data.history.ac.uk` em vez de `history.ac.uk`, houve uma separação clara entre URIs e as páginas do website. Por exemplo, um dos URIs do projeto Tobias era 'http://data.history.ac.uk/tobias-project/person/15601'. Embora o formato dos URIs acima mencionados seja o mesmo que um URL, eles não se ligam a websites (tente colá-lo num navegador web). Muitas pessoas novas no LOD acham isto confuso. Todos os URLs são URIs, mas nem todos os URIs são URLs.
(nota de tradução: tendo em conta que o site original do projeto Tobias já não se encontra disponível, o leitor da lição deve entender os exemplos aqui indicados como meramente ilustrativos daquilo que o autor pretende demonstrar) Um URI pode descrever qualquer coisa, enquanto o URL descreve a localização de algo na web. Assim, um URL diz-lhe a localização de uma página web, de um ficheiro ou algo semelhante. Um URI faz apenas o trabalho de identificar algo. Da mesma forma, o Número Internacional Normalizado do Livro (*International Standard Book Number* ou [ISBN](https://www.iso.org/standard/36563.html), em inglês) `978-0-1-873354-6` identifica exclusivamente uma edição de capa dura de _Baptism, Brotherhood and Belief in Reformation Germany_, de Kat Hill, mas não diz onde obter uma cópia. Para isso precisaria de algo como um [número de acesso](https://pt.wikipedia.org/wiki/N%C3%BAmero_de_acesso_(biblioteconomia)), que lhe dá uma localização exata de um livro numa prateleira de uma biblioteca específica. - -Há um pouco de jargão em torno de URIs. As pessoas falam sobre se são ou não [desreferenciáveis](https://pt.wikipedia.org/wiki/Refer%C3%AAncia_(ci%C3%AAncia_da_computa%C3%A7%C3%A3o)). Isso apenas significa: *podemos transformar uma referência abstrata em algo diferente?* Por exemplo, se colarmos um URI na barra de endereços de um browser, será que ele encontra algo? O URI do VIAF para o historiador Simon Schama é: - - http://viaf.org/viaf/46784579 - -Se o colocarmos no browser, receberemos de volta uma página web sobre Simon Schama que contém dados estruturados sobre ele e a sua história editorial. Isto é muito útil, desde logo porque a partir do URI não é óbvio quem ou mesmo o que é que está a ser referido. Da mesma forma, se tratarmos um número de telefone (com código internacional) como o URI de uma pessoa, então deve ser desreferenciável. Alguém pode atender o telefone e pode até ser Schama. - -Mas isto não é essencial.
Muitos URIs não são desreferenciáveis, como no exemplo acima do projeto Tobias. Não se pode encontrá-lo em lado nenhum; é uma convenção. - -O exemplo do VIAF leva-nos a outra coisa importante sobre os URIs: não os invente a não ser que tenha de o fazer. As pessoas e organizações têm feito esforços para construir boas listas de URI e o LOD não vai funcionar eficazmente se as pessoas duplicarem esse trabalho criando novos URIs desnecessariamente. Por exemplo, o VIAF tem o apoio de muitas bibliotecas internacionais. Se quiser construir URIs para pessoas, o VIAF é uma escolha muito boa. Se não conseguir encontrar algumas pessoas no VIAF, ou noutras listas de autoridade, só então poderá precisar fazer a sua própria. - -## Como o LOD organiza o conhecimento: ontologias - -Pode não ter sido óbvio a partir das triplas semânticas individuais que analisamos na secção anterior, mas o LOD pode responder a perguntas complexas. Quando se juntam as triplas semânticas, estas formam um [Mapa conceitual](https://pt.wikipedia.org/wiki/Mapa_conceitual), devido à forma como as triplas semânticas se interligam. Suponhamos que queremos encontrar uma lista de todas as pessoas que foram alunos do compositor Franz Liszt. Se a informação estiver em triplas semânticas de dados conectados sobre pianistas e os seus professores, podemos descobrir o que procuramos com uma consulta (veremos esta linguagem de consulta, chamada SPARQL, na secção final). - -Por exemplo, o pianista Charles Rosen foi aluno do pianista Moriz Rosenthal, que foi aluno de Franz Liszt. Vamos agora expressar isto em duas triplas semânticas (vamos cingir-nos às sequências de caracteres para os nomes em vez dos números de identificação, para tornar os exemplos mais legíveis): - - "Franz Liszt" ensinouPianoAo "Moriz Rosenthal" . - "Moriz Rosenthal" ensinouPianoAo "Charles Rosen" . - -Poderíamos igualmente ter criado as nossas triplas semânticas desta forma: - - "Charles Rosen" aprendeuPianoCom "Moriz Rosenthal" . 
- "Moriz Rosenthal" aprendeuPianoCom "Franz Liszt" . - -Estamos a inventar exemplos simplesmente para fins de ilustração, mas se quiser ligar os seus dados a outros datasets na "nuvem de dados conectados" deve olhar para as convenções que são utilizadas nesses datasets e fazer o mesmo. Na verdade, esta é uma das características mais úteis de LOD porque muito do trabalho já foi feito. As pessoas têm passado muito tempo a desenvolver formas de modelar a informação dentro de uma determinada área de estudo e a pensar sobre como as relações dentro dessa área podem ser representadas. Estes modelos são geralmente conhecidos como ontologias. Uma ontologia é uma abstração que permite a representação de um conhecimento particular sobre o mundo. As ontologias, neste sentido, são bastante recentes e foram concebidas para fazer o que uma [taxonomia](https://pt.wikipedia.org/wiki/Taxonomia_(geral)) hierárquica faz (pense na classificação das espécies na [Taxonomia de Lineu](https://pt.wikipedia.org/wiki/Taxonomia_de_Lineu)), mas de uma forma mais flexível. - -Uma ontologia é mais flexível porque não é hierárquica. Visa representar a fluidez do mundo real, onde as coisas podem ser relacionadas umas com as outras de formas mais complexas do que quando são representadas por uma estrutura hierárquica em forma de árvore. Em vez disso, uma ontologia é mais parecida com uma teia de aranha. - -O que quer que pretenda representar com LOD, sugerimos que encontre um vocabulário existente e que o utilize, em vez de tentar escrever o seu próprio vocabulário. Esta página tem [uma lista de alguns dos vocabulários mais populares](http://semanticweb.org/wiki/Main_Page.html) (em inglês). - -Uma vez que o nosso exemplo acima se concentra nos pianistas, seria uma boa ideia encontrar uma ontologia apropriada em vez de criar o nosso próprio sistema. De facto, existe [uma ontologia para música](http://web.archive.org/web/20170715094229/http://www.musicontology.com/) (em inglês).
Para além de uma especificação bem desenvolvida, esta tem também alguns exemplos úteis da sua utilização. Pode dar uma olhada nas [páginas de iniciação](http://web.archive.org/web/20170718143925/http://musicontology.com/docs/getting-started.html) (em inglês) para ter uma ideia de como se pode utilizar esta ontologia em particular. - -Infelizmente, não conseguimos encontrar nada que descreva a relação entre um professor e um aluno na Ontologia da Música. Mas a ontologia é publicada abertamente, logo podemos utilizá-la para descrever outras características da música e depois criar a nossa própria extensão. Se então publicássemos a nossa extensão abertamente, outros poderiam utilizá-la se assim o desejassem e este ato pode tornar-se num padrão. Embora o projeto *Music Ontology* (Ontologia Musical) não tenha a relação que precisamos, o [projeto *Linked Jazz*](https://linkedjazz.org/) (em inglês) permite o uso de '*mentorOf*', o que parece funcionar bem no nosso caso. Embora esta não seja uma solução ideal, é uma solução que faz um esforço para utilizar o que já existe por aí. - -Agora, se estivéssemos a estudar a história do pianismo, poderíamos querer identificar muitos pianistas que foram ensinados por alunos de Liszt, para estabelecer uma espécie de árvore genealógica e ver se estes 'netos' de Liszt têm algo em comum. Poderíamos pesquisar os alunos de Liszt, fazer uma grande lista deles, depois pesquisar cada um dos alunos e tentar fazer listas de quaisquer alunos que eles tivessem. Com LOD poderíamos (novamente, se as triplas semânticas existissem) escrever uma query semelhante a: - - Dá-me os nomes de todos os pianistas ensinados por x - onde x aprendeu piano com Liszt - -Isto encontraria todas as pessoas do dataset que eram alunos de alunos de Liszt. 
Não nos entusiasmemos demasiado: esta pergunta não nos dará todos os alunos de todos os alunos de Liszt que já existiram, porque essa informação provavelmente não existe e não existe dentro de nenhum conjunto de triplas semânticas existentes. Lidar com dados do mundo real mostra todo o tipo de omissões e inconsistências, que veremos quando olharmos para o maior conjunto de LOD, a [DBpedia](https://www.dbpedia.org/), na secção final. - -Se tiver utilizado [bases de dados relacionais](https://pt.wikipedia.org/wiki/Banco_de_dados_relacional) poderá estar a pensar que estas podem desempenhar a mesma função. No nosso caso de Liszt, a informação sobre pianistas acima descrita pode estar organizada numa [tabela](https://pt.wikipedia.org/wiki/Tabela_(banco_de_dados)) de base de dados denominada por algo como 'Alunos'. - -| IDaluno | IDprofessor | -| ------- | --------- | -| 31 | 17 | -| 35 | 17 | -| 49 | 28 | -| 56 | 28 | -| 72 | 40 | - -Se não estiver familiarizado com bases de dados não se preocupe. Mas, provavelmente, ainda pode ver que alguns pianistas nesta tabela tinham o mesmo professor (números 17 e 28). Sem entrar em pormenores, se Liszt estiver nesta tabela de bases de dados, seria bastante fácil extrair os alunos de Liszt, ao utilizar um ``Join`` ([*join*](https://pt.wikipedia.org/wiki/Join_(SQL))). - -De facto, as bases de dados relacionais podem oferecer resultados semelhantes ao LOD. A grande diferença é que o LOD pode ir mais longe: pode ligar datasets que foram criados sem intenção explícita de serem ligados entre si. A utilização do [Quadro de Descrição de Recursos](https://pt.wikipedia.org/wiki/Resource_Description_Framework) (*Resource Description Framework* ou RDF) e URIs permite que isto aconteça. 
- -## RDF e formatos de dados - -LOD utiliza uma norma, definida pelo [Consórcio World Wide Web](https://www.w3.org/) (em inglês) (*World Wide Web Consortium* ou W3C), chamada *[Resource Description Framework](https://pt.wikipedia.org/wiki/Resource_Description_Framework)* ou apenas RDF. As normas são úteis desde que sejam amplamente adotadas - pense no metro ou nos tamanhos de parafuso padrão - mesmo que sejam essencialmente arbitrárias. O RDF tem sido amplamente adotado como a norma LOD. - -Ouvirá frequentemente o LOD referido simplesmente como RDF. Atrasamos a conversa sobre o RDF até agora porque é bastante abstrato. RDF é um [modelo de dados](https://pt.wikipedia.org/wiki/Modelagem_de_dados) que descreve como é que os dados são estruturados num nível teórico. Assim, a insistência na utilização de triplas semânticas (em vez de quatro partes, ou duas ou nove, por exemplo) é uma regra no RDF. Mas quando se trata de questões mais práticas, há algumas escolhas quanto à implementação. Assim, o RDF diz-lhe o que tem de fazer, mas não exatamente como o tem de fazer. Estas escolhas dividem-se em duas áreas: como se escrevem as coisas (serialização) e as relações que as suas triplas semânticas descrevem. - -### Serialização - -A [Serialização](https://pt.wikipedia.org/wiki/Serializa%C3%A7%C3%A3o) é o termo técnico para "como se escrevem as coisas". O chinês padrão (mandarim) pode ser escrito em caracteres tradicionais, caracteres simplificados ou romanização Pinyin e a língua em si não muda. Tal como o mandarim, o RDF pode ser escrito de várias formas. Aqui vamos olhar para duas (há outras, mas por uma questão de simplicidade, vamos concentrar-nos nestas): - -1) [Turtle](https://en.wikipedia.org/wiki/Turtle_(syntax)) (em inglês) -2) [RDF/XML](https://pt.wikipedia.org/wiki/RDF/XML) - -Reconhecer a serialização que está a ser utilizada significa que podemos então escolher ferramentas apropriadas concebidas para esse formato. 
Por exemplo, o RDF pode vir serializado no formato [XML](https://pt.wikipedia.org/wiki/XML). Podemos então utilizar uma ferramenta ou biblioteca de códigos concebida para analisar esse formato em particular, o que é útil se já souber como trabalhar com ele. O reconhecimento do formato também lhe dá as palavras-chave certas para procurar ajuda online. Muitos recursos permitem descarregar as suas bases de dados LOD, podendo escolher qual a serialização que deseja fazer o *Download*. - -#### Turtle - -'Turtle' é um jogo de palavras. 'Tur' é a abreviatura de 'terse' e 'tle' - é a abreviatura de '*triple language*' (linguagem de triplos). Turtle é uma forma agradavelmente simples de escrever triplas semânticas. - -O Turtle usa apelidos ou atalhos, conhecidos como [prefixos](https://www.w3.org/TeamSubmission/turtle/#sec-tutorial) (em inglês), o que nos poupa ter de escrever URIs completos todas as vezes. Voltemos ao URI que criamos na secção anterior: - - http://data.history.ac.uk/tobias-project/person/15601 - -Não queremos escrever isto cada vez que nos referimos a esta pessoa (lembrar-se-á de Jack Straw). Por isso, só temos de enunciar o nosso atalho: - - @prefix toby: . - -Então Jack é `toby:15601`, que substitui o longo URI e é mais fácil à vista. Eu escolhi 'toby', mas poderia igualmente escolher qualquer sequência de letras. - -Vamos agora passar de Jack Straw para William Shakespeare e utilizar Turtle para descrever algumas coisas sobre as suas obras. Vamos ter de decidir sobre os ficheiros de autoridade a utilizar, um processo que, como mencionado acima, é melhor ser selecionado ao olhar para outros conjuntos de LOD. 
Aqui usaremos como prefixos o [*Dublin Core*](https://pt.wikipedia.org/wiki/Dublin_Core), uma norma de [metadados](https://pt.wikipedia.org/wiki/Metadados) de bibliotecas, o [Número de controle da Biblioteca do Congresso](https://en.wikipedia.org/wiki/Library_of_Congress_Control_Number) (*Library of Congress Control Number*) e um último (VIAF) que já lhe deverá ser familiar. Juntos, estes três ficheiros de autoridade fornecem identificadores únicos para todas as entidades que tenciono utilizar neste exemplo: - - @prefix lccn: . - @prefix dc: . - @prefix viaf: . - - lccn:n82011242 dc:creator viaf:96994048 . - -Note o espaçamento do ponto final após a última linha. Esta é a forma de Turtle indicar o fim. Tecnicamente não é necessário ter o espaço, mas facilita a leitura após uma longa sequência de caracteres. - -No exemplo acima, lccn:n82011242 representa Macbeth; dc:creator liga Macbeth ao seu autor; viaf:96994048 representa William Shakespeare. - -O Turtle também permite listar triplas semânticas sem se preocupar em repetir cada URI quando acabou de o usar. Acrescentemos a data em que os estudiosos pensam que Macbeth foi escrito, utilizando o par atributo-valor Dublin Core: `dc:created 'YYYY'`: - - @prefix lccn: . - @prefix dc: . - @prefix viaf: . - - lccn:n82011242 dc:creator viaf:96994048 ; - dc:created "1606" . - -Lembra-se da estrutura da tripla semântica discutida na secção 1? Aí demos este exemplo: - - 1 pessoa 15601 (o sujeito) 2 representadaNoParlamentoBritânico (o predicado) 3 "Blackburn" (o objeto) - -O essencial é que o predicado liga o sujeito e o objeto. Ele descreve a relação entre eles. O sujeito vem primeiro na tripla semântica, mas isso é uma questão de escolha, como discutimos com o exemplo de pessoas que foram ensinadas a tocar piano por Liszt.
- -Pode-se usar um ponto e vírgula se o sujeito for o mesmo mas o predicado e o objeto forem diferentes, ou uma vírgula se o sujeito e o predicado forem o mesmo e apenas o objeto for diferente. - - lccn:no2010025398 dc:creator viaf:96994048 , - viaf:12323361 . - -Aqui estamos a dizer que Shakespeare (96994048) e John Fletcher (12323361) foram ambos os criadores da obra *The Two Noble Kinsmen*. - -Quando analisamos as ontologias anteriormente sugeri que visse a [*Music Ontology*](http://web.archive.org/web/20170718143925/http://musicontology.com/docs/getting-started.html) (em inglês). Dê agora uma olhada novamente. Isto ainda é complicado, mas será que agora fazem mais sentido? - -Uma das ontologias mais acessíveis é a '*Friend of a Friend*' (amigo de um amigo) ou [FOAF](https://en.wikipedia.org/wiki/FOAF_(ontology)) (em inglês). Esta é concebida para descrever pessoas e, talvez por essa razão, é bastante intuitiva. Se, por exemplo, quiser escrever-me para me dizer que este curso é a melhor coisa que já leu, aqui está o meu email expresso como triplas semânticas em FOAF: - - @prefix foaf: . - - :"Jonathan Blaney" foaf:mbox . - -#### RDF/XML - -Em contraste com o Turtle, o RDF/XML pode parecer um pouco pesado. Para começar, vamos apenas converter uma tripla semântica da Turtle acima, aquela que refere que Shakespeare foi o criador de *The Two Noble Kinsmen*: - - lccn:no2010025398 dc:creator viaf:96994048 . - -Em RDF/XML, com os prefixos declarados dentro do trecho de código de XML, fica: - -``` xml - - - - - -``` - -O formato RDF/XML tem a mesma informação básica que o formato Turtle, mas é escrito de forma muito diferente, baseando-se nos princípios das etiquetas XML encaixadas. 
- -Passemos a um exemplo diferente para mostrar como o RDF/XML combina triplas semânticas e, ao mesmo tempo, introduz o [*Simple Knowledge Organization System*](https://pt.wikipedia.org/wiki/Simple_Knowledge_Organization_System) (SKOS) (Sistema Simples de Organização do Conhecimento), que foi concebido para codificar tesauros ou taxonomias. - - - Abdication - - -Aqui estamos a dizer que o conceito SKOS `21250`, *markdown abdication*, tem um rótulo preferido de "*abdication*". A forma como funciona é que o elemento sujeito (incluindo a parte da '*abdication*', que é um valor de atributo em termos de XML) tem o predicado e o objeto encaixados no seu interior. O elemento encaixado é o predicado e [o nó folha](https://pt.wikipedia.org/wiki/%C3%81rvore_(estrutura_de_dados)#Terminologia) (em inglês), é o objeto. Este exemplo é retirado de um projeto para publicar um [*Tesauro de História Britânica e Irlandesa*](https://www.history.ac.uk/research/digital-history) (em inglês). - -Tal como com o Turtle, podemos acrescentar mais triplas semânticas. Portanto, vamos declarar que o termo mais restrito na nossa hierarquia de sujeitos, um abaixo de *Abdication*, vai ser *Abdication crisis (1936)*. - - - Abdication - - - - - - -Lembra-se de como os predicados e os objetos são encaixados dentro do sujeito? Aqui já o fizemos duas vezes com o mesmo sujeito, para que possamos tornar isto menos prolixo, aninhando ambos os conjuntos de predicados como objetos dentro do mesmo sujeito: - - - Abdication - - - -Se estiver familiarizado com XML isto será fácil. Se não estiver, talvez prefira um formato como o Turtle. Mas a vantagem aqui é que ao criar o seu RDF/XML pode usar as ferramentas habituais disponíveis com XML, como editores e analisadores dedicados ao XML, para verificar se o seu RDF/XML está corretamente formatado. 
Se não for uma pessoa que use o XML, recomendo o Turtle, podendo usar uma [ferramenta online](http://www.easyrdf.org/converter) (em inglês) para verificar se a sua sintaxe está correta. - -## Consulta de RDF com SPARQL - -Para esta secção final iremos interrogar algum LOD e ver o que poderá ser feito com ele. - -A linguagem de consulta que usamos para LOD é chamada [SPARQL](https://pt.wikipedia.org/wiki/SPARQL). É um daqueles acrónimos recursivos amados pelos técnicos: ***S**PARQL **P**rotocol **a**nd **R**DF **Q**uery **L**anguage* (Protocolo SPARQL e Linguagem de Consulta RDF). - -Como mencionado no início, o *Programming Historian* tem [uma lição completa](/en/lessons/retired/graph-databases-and-SPARQL) (em inglês), de Matthew Lincoln, sobre a utilização do SPARQL (embora já não seja mantida; ver nota no início desta tradução). A secção final aqui presente é apenas uma visão geral dos conceitos básicos. Se o SPARQL despertar o seu interesse, pode obter uma fundamentação completa no tutorial de Lincoln. - -Vamos realizar as nossas consultas SPARQL na [DBpedia](https://www.dbpedia.org/), que é um enorme conjunto de LOD derivado da Wikipedia. Além de estar cheio de informação que é muito difícil de encontrar através da habitual interface da Wikipedia, tem vários "pontos de extremidade" (end points) SPARQL - interfaces onde se podem digitar as consultas SPARQL e obter resultados a partir das triplas semânticas da DBpedia. - -O end point de consulta SPARQL que é utilizado chama-se snorql: `http://dbpedia.org/snorql/` (em inglês). Estes end points ocasionalmente ficam offline. Se for o seu caso, tente procurar por *dbpedia sparql* e deverá encontrar um substituto semelhante. - -Se for ao URL snorql acima verá, no início, vários prefixos que já foram declarados para nós, o que é útil. Agora também irá reconhecer alguns dos prefixos.
- -{% include figure.html filename="en-or-intro-to-linked-data-03.png" alt="Captura de tela com a interface de criação de consultas snorql" caption="Figura 3. Caixa de consulta padrão do snorql, com alguns prefixos declarados para si." %} - -Na caixa de consulta abaixo das declarações de prefixo, deverá ver o seguinte: - - SELECT * WHERE { - ... - } - -Se alguma vez escreveu uma consulta de bases de dados em *Structured Query Language*, [mais conhecida como SQL](https://pt.wikipedia.org/wiki/SQL), isto vai parecer-lhe bastante familiar e vai ajudá-lo a aprender SPARQL. Se não, não se preocupe. As palavras-chave aqui utilizadas, ``SELECT`` (SELECIONAR) e ``WHERE`` (ONDE) não são sensíveis a maiúsculas e minúsculas, mas algumas partes de uma consulta SPARQL podem ser (ver abaixo), por isso recomendo que se cinja ao caso dado ao longo das consultas neste curso. - -Aqui `SELECT` significa "encontrar alguma coisa" e `*` significa "dá-me tudo". `WHERE` introduz uma condição, que é onde vamos colocar os detalhes de que tipo de coisas queremos que a consulta encontre. - -Vamos começar com algo simples para ver como é que isto funciona. Cole (ou, melhor, escreva) isto na caixa de consulta: - - SELECT * WHERE { - :Lyndal_Roper ?b ?c - } - -Clique em '*go*' (ir). Se deixar o menu *drop-down* como '*browse*' (navegar) deverá obter duas colunas com os rótulos "b" e "c". (Note que aqui, as maiúsculas/minúsculas importam: lyndal_roper não lhe dará resultados). - - -{% include figure.html filename="en-or-intro-to-linked-data-04.png" alt="Captura de tela com a interface de resultados de consultas snorql" caption="Figura 4. Topo das listas de resultados de uma consulta com todas as triplas semânticas com 'Lyndal_Roper' como sujeito." %} - -Então o que é que acabou de acontecer? E como é que soubemos o que escrever? - -Na verdade, não sabíamos. Esse é um dos problemas com end points do SPARQL. 
Quando se conhece um dataset, é preciso experimentar coisas e descobrir que termos são utilizados. Porque isto vem da *Wikipedia* e nós estávamos interessados sobre que informação sobre historiadores podíamos encontrar. Então vamos à página da *Wikipedia* da historiadora [Lyndal Roper](https://en.wikipedia.org/wiki/Lyndal_Roper) (em inglês). - -A parte final do URL é `Lyndal_Roper` e concluímos então que é provável que esta cadeia de caracteres seja a forma como Roper é referida na DBpedia. Porque não sabemos o que mais poderia estar em triplas semânticas que mencionam Roper, nós utilizamos `?b` e `?c`: estes são apenas marcadores de posição. Poderia igualmente ter digitado `?whatever` e `?you_like` e as colunas teriam esses rótulos. Quando quiser ser mais preciso sobre o que se está a pesquisar, será importante etiquetar as colunas de forma significativa. - -Experimente agora a sua própria consulta SPARQL: escolha uma página *Wikipedia* e copie a parte final do URL, após a barra final, e coloque-a no lugar de Lyndal_Roper. Depois clique em 'go'. - -A partir da informação que se obtém destes resultados é possível gerar *queries* mais precisas. Isto pode ser pouco fiável, por isso não se preocupe se algumas não funcionarem. - -Vamos voltar aos resultados para a consulta que fizemos há momentos: - - SELECT * WHERE { - :Lyndal_Roper ?b ?c - } - -Podemos ver uma longa lista na coluna etiquetada _c_. Estes são todos os atributos que Roper tem na *DBpedia* e que nos ajudarão a encontrar outras pessoas com estes atributos. Por exemplo, podemos ver ```http://dbpedia.org/class/yago/Historian110177150```. Poderemos utilizar isto para obter uma lista de historiadores? Vamos colocá-lo na nossa pergunta, mas em terceiro lugar, porque era onde estava quando a encontrei nos resultados da Lyndal Roper. A minha consulta tem este aspecto: - - SELECT * WHERE { - ?historian_name ?predicate - } - -Fizemos uma pequena mudança aqui. 
Se esta consulta funcionar de todo, então esperemos que os nossos historiadores estejam na primeira coluna, porque 'historiador' não parece poder ser um predicado: não funciona como um verbo numa frase; por isso vamos chamar à nossa primeira coluna de resultados 'historian_name' e à nossa segunda (sobre a qual não sabemos nada) 'predicate' (predicado). - -Execute a *query*. Deverá encontrar uma grande lista de historiadores. - -{% include figure.html filename="en-or-intro-to-linked-data-05.png" alt="Duas capturas de tela com a interface de consultas snorql e respectivos resultados" caption="Figura 5. Historiadores de acordo com a DBpedia." %} - -Assim, esta ferramenta funciona para criar listas, o que é útil, mas seria muito mais poderoso combinar listas para obter intersecções de conjuntos. Encontrei mais algumas coisas que podem ser interessantes consultar nos atributos DBpedia de Lyndal Roper: `http://dbpedia.org/class/yago/WikicatBritishHistorians` e `http://dbpedia.org/class/yago/WikicatWomenHistorians`. É muito fácil combiná-los pedindo uma variável a ser devolvida (no nosso caso isto é `?name` (nome)) e depois utilizando-a em múltiplas linhas de uma *query*. Note também o espaço e o ponto final no fim da primeira linha que começa com `?name`: - - SELECT ?name - WHERE { - ?name ?b . - ?name ?b - } - -Funciona! Devemos obter cinco resultados. Na altura em que escrevo, há cinco historiadoras britânicas na *DBpedia*... - -{% include figure.html filename="en-or-intro-to-linked-data-06.png" alt="Duas capturas de tela com a interface de consultas snorql e respectivos resultados" caption="Figura 6. Historiadoras britânicas segundo a DBpedia." %} - -Apenas cinco historiadoras britânicas? Claro que há, na realidade, muitas mais do que isso, como poderíamos facilmente mostrar substituindo o nome de, digamos, Alison Weir na nossa primeira consulta sobre Lyndal Roper.
Isto leva-nos ao problema com a *DBpedia* que mencionamos anteriormente: não está muito consistentemente marcada com informação estrutural do tipo que a própria *DBpedia* utiliza. A nossa consulta pode listar algumas historiadoras britânicas, mas acontece que não podemos utilizá-la para gerar uma lista significativa de pessoas nesta categoria. Tudo o que encontrámos foram as pessoas nas entradas da *Wikipedia* que alguém decidiu classificar como "historiadora britânica" e "historiadora". - -Com SPARQL na *DBpedia*, é preciso ter cuidado com as inconsistências do material de origem coletiva. Poderá usar o SPARQL exatamente da mesma forma num dataset mais confiável, por exemplo, os dados do governo britânico: https://data-gov.tw.rpi.edu//sparql (em inglês) e esperar obter resultados mais robustos (há aqui um breve tutorial para este dataset: https://data-gov.tw.rpi.edu/wiki/A_crash_course_in_SPARQL (em inglês)). - -No entanto, apesar das suas inconsistências, a *DBpedia* é um ótimo local para aprender SPARQL. Esta foi apenas uma breve introdução, mas há muito mais em [Usando SPARQL para aceder ao Linked Open Data](/en/lessons/retired/graph-databases-and-SPARQL) (em inglês).
- - -## Leituras e recursos adicionais - -* Dean Allemang e James Hendler, *Semantic Web for the Working Ontologist*, 2nd edn, Elsevier, 2011 -* Tim Berners-Lee [*Linked Data*](https://www.w3.org/DesignIssues/LinkedData.html) (em inglês) -* Bob DuCharme, *Learning SPARQL*, O'Reilly, 2011 -* [Blog de Bob DuCharme](http://www.snee.com/bobdc.blog/) (em inglês) também vale a pena ler -* Richard Gartner, *Metadata: Shaping Knowledge from Antiquity to the Semantic Web*, Springer, 2016 -* Seth van Hooland and Ruben Verborgh, *Linked Data for Libraries, Archives and Museums*, 2015 -* Matthew Lincoln ['*Using SPARQL to access Linked Open Data*'](/en/lessons/retired/graph-databases-and-SPARQL) (em inglês) -* [*Linked Data guides and tutorials*](https://web.archive.org/web/20170515070722/http://linkeddata.org/guides-and-tutorials) (em inglês) -* Dominic Oldman, Martin Doerr e Stefan Gradmann, '*Zen and the Art of Linked Data: New Strategies for a Semantic Web of Humanist Knowledge*', em *A New Companion to Digital Humanities*, editado por Susan Schreibman et al. -* Max Schmachtenberg, Christian Bizer e Heiko Paulheim, [*State of the LOD Cloud 2017*](http://linkeddatacatalog.dws.informatik.uni-mannheim.de/state/) (em inglês) -* David Wood, Marsha Zaidman e Luke Ruth, *Linked Data: Structured data on the Web*, Manning, 2014 - -## Agradecimentos - -Gostaria de agradecer aos meus dois colegas revisores, Matthew Lincoln e Terhi Nurmikko-Fuller e ao meu editor, Adam Crymble, por me ajudarem generosamente a melhorar esta lição com numerosas sugestões, esclarecimentos e correções. Este tutorial baseia-se num outro escrito como parte do '*Thesaurus of British and Irish History as SKOS*' [*(Tobias) project*](https://gtr.ukri.org/projects?ref=AH%2FN003446%2F1#/tabOverview) (em inglês), financiado pelo [AHRC](http://www.ahrc.ac.uk/) (em inglês). A lição foi revista para o projeto *Programming Historian*. 
- +--- +title: Introdução aos Dados Abertos Conectados +layout: lesson +collection: lessons +slug: introducao-dados-abertos-conectados +original: intro-to-linked-data +date: 2013-08-05 +translation_date: 2022-11-21 +authors: +- Jonathan Blaney +reviewers: +- Terhi Nurmikko-Fuller +- Matthew Lincoln +editors: +- Adam Crymble +translator: +- Francisco Nabais +translation-editor: +- Joana Vieira Paulino +translation-reviewer: +- Bruno Almeida +- Daniel Bonatto Seco +lesson-testers: David Valentine +tested_date: 2025-02-28 +difficulty: 1 +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/428 +activity: acquiring +topics: [lod] +abstract: "Este tutorial apresenta os principais conceitos de dados abertos conectados (*Linked Open Data*), incluindo URIs, ontologias, formatos RDF e uma breve introdução à linguagem de consulta de grafos SPARQL." +avatar_alt: Um homem velho com uma mulher em cada braço +doi: 10.46430/phpt0033 +--- + +{% include toc.html %} + +Nota de Tradução: Alguns termos, por aparecerem constantemente e facilitarem a interpretação das imagens, foram propositadamente traduzidos apenas uma vez e serão colocados entre parênteses. Alertamos também para a existência de alguns exemplos que, propositadamente, não foram traduzidos para facilitar a sua introdução nos programas apresentados. + + +Introdução e Âmbito da lição +----------------------------- + +Esta lição oferece uma breve e concisa introdução aos [dados abertos conectados](https://pt.wikipedia.org/wiki/Linked_data#The_Linking_Open_Data_Project) (*Linked Open Data* ou LOD). Não é necessário conhecimento prévio para realizar este tutorial. Os leitores deverão obter uma compreensão clara dos conceitos por detrás dos dados abertos conectados, como são utilizados e como são criados. O tutorial está dividido em cinco partes, além de leituras adicionais: + +1. Dados abertos conectados: o que são? +2.
O papel do [Identificador Uniforme de Recurso](https://pt.wikipedia.org/wiki/URI) (*Uniform Resource Identifier* ou URI) +3. Como o LOD organiza o conhecimento: [ontologias](https://pt.wikipedia.org/wiki/Ontologia_(ci%C3%AAncia_da_computa%C3%A7%C3%A3o)) +4. A [Estrutura de Descrição de Recursos](https://pt.wikipedia.org/wiki/Resource_Description_Framework) (*Resource Description Framework* ou RDF) e formatos de dados +5. Consulta de dados abertos conectados com [SPARQL](https://pt.wikipedia.org/wiki/SPARQL) +6. Outras leituras e recursos + +A conclusão deste tutorial poderá levar algumas horas e poderá ser útil reler algumas secções para solidificar a sua compreensão. Os termos técnicos foram ligados à sua página correspondente na Wikipedia e encoraja-se a que faça uma pausa e leia sobre termos que considere desafiadores. Depois de ter aprendido alguns dos princípios-chave do LOD, a melhor maneira de melhorar e solidificar esse conhecimento é praticar. Este tutorial fornece oportunidades para fazê-lo. No final da lição, deverá compreender os princípios básicos de LOD, incluindo termos e conceitos-chave. + +Se precisar aprender a como explorar LOD usando a linguagem de consulta [SPARQL](https://pt.wikipedia.org/wiki/SPARQL), recomenda-se a lição de Matthew Lincoln ['*Using SPARQL to access Linked Open Data*'](/en/lessons/retired/graph-databases-and-SPARQL) (em inglês) (Nota: a lição deste link encontra-se desatualizada e já não é mantida pelo _Programming Historian_. Por favor veja a nota inicial dessa página sobre a razão dessa lição ter sido retirada), que segue praticamente a visão geral fornecida nesta lição. + +Para proporcionar aos leitores uma base sólida dos princípios básicos de LOD, este tutorial não oferecerá uma cobertura abrangente de todos os seus conceitos. Estes **não** serão o foco desta lição: + +1. 
[Web Semântica](https://pt.wikipedia.org/wiki/Web_sem%C3%A2ntica) e [raciocínio semântico](https://en.wikipedia.org/wiki/Semantic_reasoner) (em inglês) de [datasets](https://pt.wikipedia.org/wiki/Conjunto_de_dados). Um raciocinador semântico deduziria que Jorge VI é o irmão ou meio-irmão de Eduardo VIII, dado que: a) Eduardo VIII é o filho de Jorge V e b) Jorge VI é o filho de Jorge V. Este tutorial não se foca neste tipo de tarefa. +2. Criação e *upload* de conjuntos de dados abertos conectados ligados à [Nuvem de dados conectados](https://linkeddatacatalog.dws.informatik.uni-mannheim.de/state/) (em inglês). Partilhar LOD é um princípio importante, que é encorajado abaixo. Contudo, os aspetos práticos de contribuir com LOD para a nuvem de dados conectados estão além do âmbito desta lição. Alguns recursos que podem ajudar a começar esta tarefa estão disponíveis no final deste tutorial. + +## Dados abertos conectados: O que são? + +LOD é informação estruturada num formato destinado a máquinas e, por isso, não é necessariamente um conceito de fácil definição. É importante não perder a motivação com esta informação já que, ao compreender os princípios, pode colocar uma máquina a fazer uma leitura autónoma. + +Se todos os datasets fossem publicados abertamente e utilizassem o mesmo formato para estruturar a informação, seria possível interrogá-los todos de uma só vez. A análise de grandes volumes de dados é potencialmente muito mais poderosa do que qualquer pessoa que utilize os seus próprios datasets individuais espalhados pela web nos chamados [silos de informação](https://en.wikipedia.org/wiki/Information_silo) (em inglês). Estes datasets interoperáveis são aquilo para que os profissionais de LOD estão a trabalhar. + +Para atingir este objetivo, ao trabalhar com LOD, é importante recordar três princípios: + +1. **Utilizar um formato padrão de LOD reconhecido**. 
Para que o LOD funcione, os dados devem ser [estruturados](https://pt.wikipedia.org/wiki/Estrutura_de_dados), utilizando normas reconhecidas para que os computadores que interrogam os dados possam processá-los de forma consistente. Há vários formatos de LOD, alguns dos quais são discutidos abaixo. +2. **Referir uma entidade da mesma forma que outras pessoas o fazem**. Se existirem dados sobre a mesma pessoa/local/coisa em dois ou mais locais, certifique-se de que se refere à pessoa/local/coisa da mesma forma em todos os casos. +3. **Publicar os seus dados abertamente**. Qualquer pessoa deverá poder utilizar os seus dados sem pagar uma taxa e num formato que não exija [software proprietário](https://pt.wikipedia.org/wiki/Software_propriet%C3%A1rio). + +Comecemos com um exemplo de dados sobre uma pessoa, utilizando uma abordagem comum [par atributo-valor](https://en.wikipedia.org/wiki/Attribute%E2%80%93value_pair) (em inglês) típica em computação: + + pessoa=número + +Neste caso, o 'atributo' é uma pessoa. E o valor - ou quem é essa pessoa - é representado por um número. O número pode ser atribuído aleatoriamente ou pode ser utilizado um número que já esteja associado a essa pessoa. Esta última abordagem tem grandes vantagens: se todos os que criarem um dataset que menciona essa pessoa utilizarem *exatamente o mesmo número, exatamente no mesmo formato*, então podemos encontrar esse indivíduo de forma fiável em qualquer dataset aderindo a essas regras. Vamos criar um exemplo usando Jack Straw: tanto o nome de um rebelde inglês do século XIV, como o de um ministro de gabinete britânico proeminente na administração de Tony Blair. É útil ser capaz de diferenciar as duas pessoas que partilham o mesmo nome. + +Utilizando o modelo acima, no qual cada pessoa é representada por um número único, vamos atribuir ao ministro britânico Jack Straw o número `64183282`. 
O seu par atributo-valor ficaria então com este aspeto: + + pessoa=64183282 + +E vamos atribuir a Jack Straw, descrito no *[Oxford Dictionary of National Biography](https://www.oxforddnb.com)* (em inglês) como 'o enigmático líder rebelde', o número `33059614`, fazendo com que o seu par atributo-valor se pareça com isto: + + pessoa=33059614 + +Desde que todos os que fazem LOD utilizem estes dois números para se referirem aos respetivos Jack Straws, podemos agora procurar a pessoa `64183282` num conjunto de dados abertos conectados e podemos estar confiantes de que estamos a obter a pessoa certa - neste caso, o ministro. + +Os pares atributo-valor também podem armazenar informações sobre outros tipos de entidades: lugares, por exemplo. Jack Straw, o político moderno, era membro do Parlamento britânico, representando o assento de Blackburn. Há mais do que um lugar no Reino Unido chamado Blackburn, para não mencionar outros Blackburn em todo o mundo. Usando os mesmos princípios acima delineados, podemos desambiguar entre os vários Blackburns, atribuindo um identificador único ao lugar correto: Blackburn em Lancashire, Inglaterra. + + Lugar=2655524 + +Neste momento pode estar pensando, "isso é o que um catálogo de biblioteca faz". É verdade que a ideia-chave aqui é a do [ficheiro de autoridade](https://pt.wikipedia.org/wiki/Controle_de_autoridade), central na biblioteconomia (um ficheiro de autoridade é uma lista definitiva de termos que podem ser utilizados num contexto particular, por exemplo, quando se cataloga um livro). Nos dois exemplos acima descritos, utilizamos ficheiros de autoridade para atribuir números (os identificadores únicos) aos Jacks e ao Blackburn. 
Os números que utilizamos para os dois Jack Straws provêm do [Virtual International Authority File](https://www.oclc.org/en/viaf.html) (em inglês) (VIAF) (Arquivo Internacional de Autoridade Virtual), que é mantido por um consórcio de bibliotecas de todo o mundo, de modo a tentar resolver o problema da miríade de maneiras pelas quais a mesma pessoa pode ser referida. O identificador único que utilizamos para o distrito eleitoral de Blackburn provém da [GeoNames](https://www.geonames.org/) (em inglês), uma base de dados geográfica gratuita. + +Vamos tentar ser mais precisos com o que, neste caso, queremos dizer com 'Blackburn'. Jack Straw representou o círculo eleitoral (uma área representada por um único membro do parlamento) de Blackburn, que mudou os seus limites ao longo do tempo. O projeto "[*Digging Into Linked Parliamentary Data*](https://repository.jisc.ac.uk/6544/1/DiLiPaD_final_report_1.pdf)" (Dilipad) (em inglês), no qual trabalhei, produziu identificadores únicos para as filiações partidárias e circunscrições eleitorais para cada membro do parlamento. Neste exemplo, Jack Straw representou o distrito eleitoral conhecido como 'Blackburn' na sua encarnação pós-1955: + + blackburn1955-presente + +Como o VIAF é um ficheiro de autoridade respeitado e bem mantido, fornece um conjunto óbvio de identificadores a utilizar para Jack Straw. Como o distrito eleitoral representado por Straw estava perfeitamente coberto pelos ficheiros de autoridade criados pelo projeto Dilipad, também era um ficheiro de autoridade lógico a utilizar. Infelizmente, nem sempre é tão óbvio qual das listas publicadas online é a melhor para se usar. Uma pode ser mais utilizada do que outra, mas esta última pode ser mais abrangente para um determinado fim. O GeoNames funcionaria melhor do que os identificadores da Dilipad em alguns casos. Haverá também casos em que não se consegue encontrar um dataset com essa informação. 
Por exemplo, se quiser escrever pares atributo-valor sobre si próprio e as suas relações familiares imediatas terá de inventar os seus próprios identificadores. + +Esta falta de ficheiros de autoridade coerentes é um dos maiores desafios que o LOD enfrenta neste momento. [Tim Berners-Lee](https://pt.wikipedia.org/wiki/Tim_Berners-Lee), que inventou uma forma de ligar documentos em rede, criando assim a World Wide Web, e que é um dos principais proponentes de LOD, sugeriu um "[sistema de classificação de cinco estrelas](https://www.w3.org/DesignIssues/LinkedData.html)" (em inglês) para encorajar uma maior utilização de dados conectados e para que todos avancem o mais longe possível em direção ao LOD. Essencialmente, Tim Berners-Lee apoia a publicação aberta de dados, especialmente ao utilizar formatos abertos e normas públicas, mas o melhor é que os dados se liguem também aos dados de outras pessoas. + +Com os identificadores únicos atribuídos a todos os elementos, o próximo passo fundamental na criação de LOD é ter uma forma de *descrição* da relação entre Jack Straw (`64183282`) e Blackburn (`blackburn1955-presente`). Em LOD, as relações são expressas utilizando o que é conhecido como '[tripla semântica](https://en.wikipedia.org/wiki/Semantic_triple)' (em inglês). Vamos fazer uma tripla semântica que represente a relação entre Jack Straw e o seu distrito eleitoral: + + pessoa:64183282 papel:representadaNoParlamentoBritânico distritoeleitoral:"blackburn1955-presente" . + +A apresentação (ou [sintaxe](https://pt.wikipedia.org/wiki/Sintaxe)) das triplas semânticas, incluindo a pontuação utilizada acima, será discutida mais tarde, na secção sobre RDF e formatos de dados. Por agora, vamos focar-nos na estrutura básica. A tripla semântica, não surpreendentemente, tem três partes. 
Estas são convencionalmente referidas como sujeito (*subject*), predicado (*predicate*) e objeto (*object*): + +| o sujeito | o predicado | o objeto | +| --------------- | ------------------------- | ----------------------- | +| pessoa 64183282 | representadaNoParlamentoBritânico | "blackburn1955-presente" | + +A forma tradicional de representar uma tripla semântica em forma esquemática é a seguinte (em inglês): + +{% include figure.html filename="pt-tr-introducao-dados-abertos-conectados-01.png" alt="Imagem com a representação de uma tripla semântica" caption="Figura 1. Forma tradicional de representar uma tripla semântica." %} + +Assim, a nossa tripla semântica do Jack Straw, apresentado de forma mais legível para o ser humano, poderia assumir a seguinte forma: + +{% include figure.html filename="pt-tr-introducao-dados-abertos-conectados-02.png" alt="Imagem com a representação de uma tripla semântica aplicada ao exemplo de Jack Straw" caption="Figura 2. Diagrama da tripla semântica que demonstra que Jack Straw representava Blackburn." %} + +Por enquanto, é importante fixar três pontos-chave: + +- O LOD deve estar aberto e disponível para qualquer pessoa na Internet (caso contrário, não está "aberto") +- Os defensores do LOD têm como objetivo normalizar as formas de referência a entidades únicas +- O LOD consiste em triplas semânticas que descrevem as relações entre entidades + +## O papel do *Uniform Resource Identifier* (URI) + +Uma parte essencial de LOD é o [Identificador Uniforme de Recurso](https://pt.wikipedia.org/wiki/URI)(*Uniform Resource Identifier* ou URI). O URI é uma forma única e fiável de representar uma entidade (uma pessoa, um objeto, uma relação, etc.), de uma forma que é utilizável por todos no mundo. + +Na secção anterior, utilizamos dois números diferentes para identificar os diferentes Jack Straws. 
+ + pessoa="64183282" + + pessoa="33059614" + +O problema é que em todo o mundo existem muitas bases de dados que contêm pessoas com estes números e são, provavelmente, todas pessoas diferentes. Fora do nosso contexto imediato, estes números não identificam indivíduos únicos. Vamos tentar resolver isso. Aqui estão estes mesmos identificadores, mas como URIs: + + https://viaf.org/viaf/64183282/ + + https://viaf.org/viaf/33059614/ + +Tal como o número único desambiguou os nossos dois Jack Straws, o URI completo acima ajuda-nos a desambiguar entre todos os diferentes ficheiros de autoridade lá fora. Neste caso, é evidente que estamos a utilizar o VIAF como o nosso ficheiro de autoridade. Com certeza, já viu esta forma de desambiguação muitas vezes na web. Existem muitos websites em todo o mundo com páginas chamadas `/home` ou `/faq`. Mas não há confusão porque o [domínio](https://pt.wikipedia.org/wiki/Nome_de_dom%C3%ADnio) (a primeira parte do [Localizador Uniforme de Recursos](https://pt.wikipedia.org/wiki/URL) (*Uniform Resource Locator* ou URL) - ex. `bbc.co.uk`) é único, portanto, todas as páginas que fazem parte desse domínio são únicas em outras páginas `/faq` de outros websites. No endereço `http://www.bbc.co.uk/faqs` é a parte `bbc.co.uk` que torna as páginas subsequentes únicas. Isto é tão óbvio para as pessoas que utilizam a web a toda a hora que não pensam sobre isso. Provavelmente, também sabe que se quiser criar um website chamado `bbc.co.uk` não conseguirá, porque esse nome já foi registado com a autoridade apropriada, que é o [Sistema de Nomes de Domínio](https://pt.wikipedia.org/wiki/Sistema_de_Nomes_de_Dom%C3%ADnio) (*Domain Name System*). O registo garante a singularidade. Os URIs também têm de ser únicos. + +Embora os exemplos acima se pareçam com URLs, também é possível construir um URI que não se pareça nada com um URL. Temos muitas formas de identificar pessoas e coisas de forma única e raramente pensamos ou nos preocupamos com isso. 
Os códigos de barras, números de passaporte, até mesmo os códigos postais são concebidos para serem únicos. Os números de telefone são frequentemente colocados em placas de lojas precisamente porque são únicos. Todos eles podem ser utilizados como URIs. + +Quando criamos URIs para as entidades descritas pelo projeto '[Tobias](https://gtr.ukri.org/projects?ref=AH%2FN003446%2F1#/tabOverview)' (em inglês), escolhemos uma estrutura do tipo URL e escolhemos utilizar o nosso espaço web institucional, reservando `data.history.ac.uk/tobias-project/` como um lugar dedicado à hospedagem destes URIs. Ao colocá-lo em `data.history.ac.uk` em vez de `history.ac.uk`, houve uma separação clara entre URIs e as páginas do website. Por exemplo, um dos URIs do projeto Tobias era 'https://data.history.ac.uk/tobias-project/person/15601'. Embora o formato dos URIs acima mencionados seja o mesmo que um URL, eles não se ligam a websites (tente colá-lo num navegador web). Muitas pessoas novas no LOD acham isto confuso. Todos os URLs são URIs, mas nem todos os URIs são URLs. (Nota de tradução: tendo em conta que o site original do projeto Tobias já não se encontra disponível, o leitor da lição deve entender os exemplos aqui indicados como meramente ilustrativos daquilo que o autor pretende demonstrar.) Um URI pode descrever qualquer coisa, enquanto o URL descreve a localização de algo na web. Assim, um URL diz-lhe a localização de uma página web, de um ficheiro ou algo semelhante. Um URI faz apenas o trabalho de identificar algo. Da mesma forma, o Número Internacional Normalizado do Livro (*International Standard Book Number* ou [ISBN](https://www.iso.org/standard/36563.html)) (em inglês) `978-0-1-873354-6` identifica exclusivamente uma edição de capa dura de _Baptism, Brotherhood and Belief in Reformation Germany_, de Kat Hill, mas não diz onde obter uma cópia. 
Para isso precisaria de algo como um [número de acesso](https://pt.wikipedia.org/wiki/N%C3%BAmero_de_acesso_(biblioteconomia)), que lhe dá uma localização exata de um livro numa prateleira de uma biblioteca específica. + +Há um pouco de jargão em torno de URIs. As pessoas falam sobre se são ou não [desreferenciáveis](https://pt.wikipedia.org/wiki/Refer%C3%AAncia_(ci%C3%AAncia_da_computa%C3%A7%C3%A3o)). Isso apenas significa que *podemos transformar uma referência abstrata em algo diferente?* Por exemplo, se colarmos um URI na barra de endereços de um browser, será que ele encontra algo? O VIAF URI para o historiador Simon Schama é: + + https://viaf.org/viaf/46784579 + +Se o colocarmos no browser, receberemos de volta uma página web sobre Simon Schama que contém dados estruturados sobre ele e a sua história editorial. Isto é muito útil por um motivo. A partir do URI não é óbvio quem ou mesmo o que é que está a ser referido. Da mesma forma, se tratarmos um número de telefone (com código internacional) como o URI de uma pessoa, então deve ser desreferenciável. Alguém pode atender o telefone e pode até ser Schama. + +Mas isto não é essencial. Muitos URIs não são desreferenciáveis, como no exemplo acima do projeto Tobias. Não se pode encontrá-lo em lado nenhum; é uma convenção. + +O exemplo do VIAF leva-nos a outra coisa importante sobre os URIs: não os invente a não ser que tenha de o fazer. As pessoas e organizações têm feito esforços para construir boas listas de URI e o LOD não vai funcionar eficazmente se as pessoas duplicarem esse trabalho criando novos URIs desnecessariamente. Por exemplo, o VIAF tem o apoio de muitas bibliotecas internacionais. Se quiser construir URIs para pessoas, o VIAF é uma escolha muito boa. Se não conseguir encontrar algumas pessoas no VIAF, ou noutras listas de autoridade, só então poderá precisar fazer a sua própria. 
+ +## Como o LOD organiza o conhecimento: ontologias + +Pode não ter sido óbvio a partir das triplas semânticas individuais que analisamos na secção anterior, mas o LOD pode responder a perguntas complexas. Quando se juntam as triplas semânticas, estas formam um [Mapa conceitual](https://pt.wikipedia.org/wiki/Mapa_conceitual), devido à forma como as triplas semânticas se interligam. Suponhamos que queremos encontrar uma lista de todas as pessoas que foram alunos do compositor Franz Liszt. Se a informação estiver em triplas semânticas de dados conectados sobre pianistas e os seus professores, podemos descobrir o que procuramos com uma consulta (veremos esta linguagem de consulta, chamada SPARQL, na secção final). + +Por exemplo, o pianista Charles Rosen foi aluno do pianista Moriz Rosenthal, que foi aluno de Franz Liszt. Vamos agora expressar isto em duas triplas semânticas (vamos cingir-nos às sequências de caracteres para os nomes em vez dos números de identificação, para tornar os exemplos mais legíveis): + + "Franz Liszt" ensinouPianoAo "Moriz Rosenthal" . + "Moriz Rosenthal" ensinouPianoAo "Charles Rosen" . + +Poderíamos igualmente ter criado as nossas triplas semânticas desta forma: + + "Charles Rosen" aprendeuPianoCom "Moriz Rosenthal" . + "Moriz Rosenthal" aprendeuPianoCom "Franz Liszt" . + +Estamos a inventar exemplos simplesmente para fins de ilustração, mas se quiser ligar os seus dados a outros datasets na "nuvem de dados conectados" deve olhar para as convenções que são utilizadas nesses datasets e fazer o mesmo. Na verdade, esta é uma das características mais úteis de LOD porque muito do trabalho já foi feito. As pessoas têm passado muito tempo a desenvolver formas de modelar a informação dentro de uma determinada área de estudo e a pensar sobre como as relações dentro dessa área podem ser representadas. Estes modelos são geralmente conhecidos como ontologias. 
Uma ontologia é uma abstração que permite a representação de um conhecimento particular sobre o mundo. Neste sentido, as ontologias são bastante recentes e foram concebidas para fazer o que uma [taxonomia](https://pt.wikipedia.org/wiki/Taxonomia_(geral)) hierárquica faz (pense na classificação das espécies na [Taxonomia de Lineu](https://pt.wikipedia.org/wiki/Taxonomia_de_Lineu)), mas de uma forma mais flexível. + +Uma ontologia é mais flexível porque não é hierárquica. Visa representar a fluidez do mundo real, onde as coisas podem ser relacionadas umas com as outras de formas mais complexas do que quando são representadas por uma estrutura hierárquica em forma de árvore. Em vez disso, uma ontologia é mais parecida com uma teia de aranha. + +O que quer que pretenda representar com LOD, sugerimos que encontre um vocabulário existente e que o utilize, em vez de tentar escrever o seu próprio vocabulário. Esta página tem [uma lista de alguns dos vocabulários mais populares](https://semanticweb.org/wiki/Main_Page.html) (em inglês). + +Uma vez que o nosso exemplo acima se concentra nos pianistas, seria uma boa ideia encontrar uma ontologia apropriada em vez de criar o nosso próprio sistema. De facto, existe [uma ontologia para música](https://web.archive.org/web/20170715094229/https://www.musicontology.com/) (em inglês). Para além de uma especificação bem desenvolvida, esta tem também alguns exemplos úteis da sua utilização. Pode dar uma olhada nas [páginas de iniciação](https://web.archive.org/web/20170718143925/https://musicontology.com/docs/getting-started.html) (em inglês) para ter uma ideia de como se pode utilizar esta ontologia em particular. + +Infelizmente, não conseguimos encontrar nada que descreva a relação entre um professor e um aluno na Ontologia da Música. Mas a ontologia é publicada abertamente, logo podemos utilizá-la para descrever outras características da música e depois criar a nossa própria extensão. 
Se então publicássemos a nossa extensão abertamente, outros poderiam utilizá-la se assim o desejassem e este ato pode tornar-se num padrão. Embora o projeto *Music Ontology* (Ontologia Musical) não tenha a relação que precisamos, o [projeto *Linked Jazz*](https://linkedjazz.org/) (em inglês) permite o uso de '*mentorOf*', o que parece funcionar bem no nosso caso. Embora esta não seja uma solução ideal, é uma solução que faz um esforço para utilizar o que já existe por aí. + +Agora, se estivéssemos a estudar a história do pianismo, poderíamos querer identificar muitos pianistas que foram ensinados por alunos de Liszt, para estabelecer uma espécie de árvore genealógica e ver se estes 'netos' de Liszt têm algo em comum. Poderíamos pesquisar os alunos de Liszt, fazer uma grande lista deles, depois pesquisar cada um dos alunos e tentar fazer listas de quaisquer alunos que eles tivessem. Com LOD poderíamos (novamente, se as triplas semânticas existissem) escrever uma query semelhante a: + + Dá-me os nomes de todos os pianistas ensinados por x + onde x aprendeu piano com Liszt + +Isto encontraria todas as pessoas do dataset que eram alunos de alunos de Liszt. Não nos entusiasmemos demasiado: esta pergunta não nos dará todos os alunos de todos os alunos de Liszt que já existiram, porque essa informação provavelmente não existe e não existe dentro de nenhum conjunto de triplas semânticas existentes. Lidar com dados do mundo real mostra todo o tipo de omissões e inconsistências, que veremos quando olharmos para o maior conjunto de LOD, a [DBpedia](https://www.dbpedia.org/), na secção final. + +Se tiver utilizado [bases de dados relacionais](https://pt.wikipedia.org/wiki/Banco_de_dados_relacional) poderá estar a pensar que estas podem desempenhar a mesma função. No nosso caso de Liszt, a informação sobre pianistas acima descrita pode estar organizada numa [tabela](https://pt.wikipedia.org/wiki/Tabela_(banco_de_dados)) de base de dados denominada por algo como 'Alunos'. 
+ +| IDaluno | IDprofessor | +| ------- | --------- | +| 31 | 17 | +| 35 | 17 | +| 49 | 28 | +| 56 | 28 | +| 72 | 40 | + +Se não estiver familiarizado com bases de dados não se preocupe. Mas, provavelmente, ainda pode ver que alguns pianistas nesta tabela tinham o mesmo professor (números 17 e 28). Sem entrar em pormenores, se Liszt estiver nesta tabela de bases de dados, seria bastante fácil extrair os alunos de Liszt, ao utilizar um ``Join`` ([*join*](https://pt.wikipedia.org/wiki/Join_(SQL))). + +De facto, as bases de dados relacionais podem oferecer resultados semelhantes ao LOD. A grande diferença é que o LOD pode ir mais longe: pode ligar datasets que foram criados sem intenção explícita de serem ligados entre si. A utilização do [Quadro de Descrição de Recursos](https://pt.wikipedia.org/wiki/Resource_Description_Framework) (*Resource Description Framework* ou RDF) e URIs permite que isto aconteça. + +## RDF e formatos de dados + +LOD utiliza uma norma, definida pelo [Consórcio World Wide Web](https://www.w3.org/) (em inglês) (*World Wide Web Consortium* ou W3C), chamada *[Resource Description Framework](https://pt.wikipedia.org/wiki/Resource_Description_Framework)* ou apenas RDF. As normas são úteis desde que sejam amplamente adotadas - pense no metro ou nos tamanhos de parafuso padrão - mesmo que sejam essencialmente arbitrárias. O RDF tem sido amplamente adotado como a norma LOD. + +Ouvirá frequentemente o LOD referido simplesmente como RDF. Atrasamos a conversa sobre o RDF até agora porque é bastante abstrato. RDF é um [modelo de dados](https://pt.wikipedia.org/wiki/Modelagem_de_dados) que descreve como é que os dados são estruturados num nível teórico. Assim, a insistência na utilização de triplas semânticas (em vez de quatro partes, ou duas ou nove, por exemplo) é uma regra no RDF. Mas quando se trata de questões mais práticas, há algumas escolhas quanto à implementação. Assim, o RDF diz-lhe o que tem de fazer, mas não exatamente como o tem de fazer. 
Estas escolhas dividem-se em duas áreas: como se escrevem as coisas (serialização) e as relações que as suas triplas semânticas descrevem. + +### Serialização + +A [Serialização](https://pt.wikipedia.org/wiki/Serializa%C3%A7%C3%A3o) é o termo técnico para "como se escrevem as coisas". O chinês padrão (mandarim) pode ser escrito em caracteres tradicionais, caracteres simplificados ou romanização Pinyin e a língua em si não muda. Tal como o mandarim, o RDF pode ser escrito de várias formas. Aqui vamos olhar para duas (há outras, mas por uma questão de simplicidade, vamos concentrar-nos nestas): + +1) [Turtle](https://en.wikipedia.org/wiki/Turtle_(syntax)) (em inglês) +2) [RDF/XML](https://pt.wikipedia.org/wiki/RDF/XML) + +Reconhecer a serialização que está a ser utilizada significa que podemos então escolher ferramentas apropriadas concebidas para esse formato. Por exemplo, o RDF pode vir serializado no formato [XML](https://pt.wikipedia.org/wiki/XML). Podemos então utilizar uma ferramenta ou biblioteca de códigos concebida para analisar esse formato em particular, o que é útil se já souber como trabalhar com ele. O reconhecimento do formato também lhe dá as palavras-chave certas para procurar ajuda online. Muitos recursos permitem descarregar as suas bases de dados LOD, podendo escolher a serialização que deseja descarregar. + +#### Turtle + +'Turtle' é um jogo de palavras. 'Tur' é a abreviatura de '*terse*' (conciso) e 'tle' é a abreviatura de '*triple language*' (linguagem de triplas). Turtle é uma forma agradavelmente simples de escrever triplas semânticas. + +O Turtle usa apelidos ou atalhos, conhecidos como [prefixos](https://www.w3.org/TeamSubmission/turtle/#sec-tutorial) (em inglês), o que nos poupa ter de escrever URIs completos todas as vezes. Voltemos ao URI que criamos na secção anterior: + + https://data.history.ac.uk/tobias-project/person/15601 + +Não queremos escrever isto cada vez que nos referimos a esta pessoa (que, como se lembrará, é Jack Straw). 
Por isso, só temos de enunciar o nosso atalho: + + @prefix toby: <https://data.history.ac.uk/tobias-project/person/> . + +Então Jack é `toby:15601`, que substitui o longo URI e é mais fácil à vista. Eu escolhi 'toby', mas poderia igualmente escolher qualquer sequência de letras. + +Vamos agora passar de Jack Straw para William Shakespeare e utilizar Turtle para descrever algumas coisas sobre as suas obras. Vamos ter de decidir sobre os ficheiros de autoridade a utilizar, uma escolha que, como mencionado acima, é melhor fazer olhando para outros conjuntos de LOD. Aqui usaremos como um dos nossos prefixos [*Dublin Core*](https://pt.wikipedia.org/wiki/Dublin_Core), uma norma de [metadados](https://pt.wikipedia.org/wiki/Metadados) de bibliotecas, o [Número de controle da Biblioteca do Congresso](https://en.wikipedia.org/wiki/Library_of_Congress_Control_Number) (*Library of Congress Control Number*) como outro, e o último (VIAF) deverá ser-lhe familiar. Juntos, estes três ficheiros de autoridade fornecem identificadores únicos para todas as entidades que tenciono utilizar neste exemplo: + + @prefix lccn: <https://id.loc.gov/authorities/names/> . + @prefix dc: <http://purl.org/dc/terms/> . + @prefix viaf: <https://viaf.org/viaf/> . + + lccn:n82011242 dc:creator viaf:96994048 . + +Note o espaçamento do ponto final após a última linha. Esta é a forma de Turtle indicar o fim. Tecnicamente não é necessário ter o espaço, mas facilita a leitura após uma longa sequência de caracteres. + +No exemplo acima, lccn:n82011242 representa Macbeth; dc:creator liga Macbeth ao seu autor; viaf:96994048 representa William Shakespeare. + +O Turtle também permite listar triplas semânticas sem se preocupar em repetir cada URI quando acabou de o usar. Acrescentemos a data em que os estudiosos pensam que Macbeth foi escrito, utilizando o par atributo-valor Dublin Core: `dc:created 'YYYY'`: + + @prefix lccn: <https://id.loc.gov/authorities/names/> . + @prefix dc: <http://purl.org/dc/terms/> . + @prefix viaf: <https://viaf.org/viaf/> . + + lccn:n82011242 dc:creator viaf:96994048 ; + dc:created "1606" . + +Lembra-se da estrutura da tripla semântica discutida na secção 1? 
Aí demos este exemplo: + + 1 pessoa 15601 (o sujeito) 2 representadaNoParlamentoBritânico (o predicado) 3 "Blackburn" (o objeto) + +O essencial é que o predicado liga o sujeito e o objeto. Ele descreve a relação entre eles. O sujeito vem primeiro na tripla semântica, mas isso é uma questão de escolha, como discutimos com o exemplo de pessoas que foram ensinadas a tocar piano por Liszt. + +Pode-se usar um ponto e vírgula se o sujeito for o mesmo mas o predicado e o objeto forem diferentes, ou uma vírgula se o sujeito e o predicado forem o mesmo e apenas o objeto for diferente. + + lccn:no2010025398 dc:creator viaf:96994048 , + viaf:12323361 . + +Aqui estamos a dizer que Shakespeare (96994048) e John Fletcher (12323361) foram ambos os criadores da obra *The Two Noble Kinsmen*. + +Quando analisamos as ontologias anteriormente sugeri que visse a [*Music Ontology*](https://web.archive.org/web/20170718143925/https://musicontology.com/docs/getting-started.html) (em inglês). Dê agora uma olhada novamente. Isto ainda é complicado, mas será que agora fazem mais sentido? + +Uma das ontologias mais acessíveis é a '*Friend of a Friend*' (amigo de um amigo) ou [FOAF](https://en.wikipedia.org/wiki/FOAF_(ontology)) (em inglês). Esta é concebida para descrever pessoas e, talvez por essa razão, é bastante intuitiva. Se, por exemplo, quiser escrever-me para me dizer que este curso é a melhor coisa que já leu, aqui está o meu email expresso como triplas semânticas em FOAF: + + @prefix foaf: . + + :"Jonathan Blaney" foaf:mbox . + +#### RDF/XML + +Em contraste com o Turtle, o RDF/XML pode parecer um pouco pesado. Para começar, vamos apenas converter uma tripla semântica da Turtle acima, aquela que refere que Shakespeare foi o criador de *The Two Noble Kinsmen*: + + lccn:no2010025398 dc:creator viaf:96994048 . 
+ +Em RDF/XML, com os prefixos declarados dentro do trecho de código de XML, fica: + +``` xml + + + + + +``` + +O formato RDF/XML tem a mesma informação básica que o formato Turtle, mas é escrito de forma muito diferente, baseando-se nos princípios das etiquetas XML encaixadas. + +Passemos a um exemplo diferente para mostrar como o RDF/XML combina triplas semânticas e, ao mesmo tempo, introduz o [*Simple Knowledge Organization System*](https://pt.wikipedia.org/wiki/Simple_Knowledge_Organization_System) (SKOS) (Sistema Simples de Organização do Conhecimento), que foi concebido para codificar tesauros ou taxonomias. + + + Abdication + + +Aqui estamos a dizer que o conceito SKOS `21250`, *markdown abdication*, tem um rótulo preferido de "*abdication*". A forma como funciona é que o elemento sujeito (incluindo a parte da '*abdication*', que é um valor de atributo em termos de XML) tem o predicado e o objeto encaixados no seu interior. O elemento encaixado é o predicado e [o nó folha](https://pt.wikipedia.org/wiki/%C3%81rvore_(estrutura_de_dados)#Terminologia) (em inglês), é o objeto. Este exemplo é retirado de um projeto para publicar um [*Tesauro de História Britânica e Irlandesa*](https://www.history.ac.uk/research/digital-history) (em inglês). + +Tal como com o Turtle, podemos acrescentar mais triplas semânticas. Portanto, vamos declarar que o termo mais restrito na nossa hierarquia de sujeitos, um abaixo de *Abdication*, vai ser *Abdication crisis (1936)*. + + + Abdication + + + + + + +Lembra-se de como os predicados e os objetos são encaixados dentro do sujeito? Aqui já o fizemos duas vezes com o mesmo sujeito, para que possamos tornar isto menos prolixo, aninhando ambos os conjuntos de predicados como objetos dentro do mesmo sujeito: + + + Abdication + + + +Se estiver familiarizado com XML isto será fácil. Se não estiver, talvez prefira um formato como o Turtle. 
Mas a vantagem aqui é que ao criar o seu RDF/XML pode usar as ferramentas habituais disponíveis com XML, como editores e analisadores dedicados ao XML, para verificar se o seu RDF/XML está corretamente formatado. Se não for uma pessoa que use o XML recomendo o Turtle, podendo usar uma [ferramenta online](https://www.easyrdf.org/converter) (em inglês) para verificar se a sua sintaxe está correta. + +## Consulta de RDF com SPARQL + +Para esta secção final iremos interrogar algum LOD e ver o que poderá ser feito com ele. + +A linguagem de consulta que usamos para LOD é chamada [SPARQL](https://pt.wikipedia.org/wiki/SPARQL). É um daqueles acrónimos recursivos amados pelos técnicos: ***S**PARQL **P**rotocol **a**nd **R**DF **Q**uery **L**anguage* (Protocolo SPARQL e Linguagem de Consulta RDF). + +Como mencionado no início, o *Programming Historian* tem [uma lição completa](/en/lessons/retired/graph-databases-and-SPARQL) (em inglês), de Matthew Lincoln, sobre a utilização do SPARQL (embora já não seja mantida; ver nota no início desta tradução). A secção final aqui presente é apenas uma visão geral dos conceitos básicos. Se o SPARQL despertar o seu interesse, pode obter uma fundamentação completa no tutorial de Lincoln. + +Vamos realizar as nossas consultas SPARQL na [DBpedia](https://www.dbpedia.org/), que é um enorme conjunto de LOD derivado da Wikipedia. Além de estar cheio de informação que é muito difícil de encontrar através da habitual interface da Wikipédia, tem vários "pontos de extremidade" (end points) SPARQL - interfaces onde se podem digitar as consultas SPARQL e obter resultados a partir das triplas semânticas da DBpedia. + +O end point de consulta SPARQL que é utilizado chama-se snorql: `http://dbpedia.org/snorql/` (em inglês). Estes end points ocasionalmente ficam offline. Se for o seu caso, tente procurar por *dbpedia sparql* e deverá encontrar um substituto semelhante. 
+ +Se for ao URL snorql acima verá, no início, vários prefixos que já foram declarados para nós, o que é útil. Agora também irá reconhecer alguns dos prefixos. + +{% include figure.html filename="en-or-intro-to-linked-data-03.png" alt="Captura de tela com a interface de criação de consultas snorql" caption="Figura 3. Caixa de consulta padrão do snorql, com alguns prefixos declarados para si." %} + +Na caixa de consulta abaixo das declarações de prefixo, deverá ver o seguinte: + + SELECT * WHERE { + ... + } + +Se alguma vez escreveu uma consulta de bases de dados em *Structured Query Language*, [mais conhecida como SQL](https://pt.wikipedia.org/wiki/SQL), isto vai parecer-lhe bastante familiar e vai ajudá-lo a aprender SPARQL. Se não, não se preocupe. As palavras-chave aqui utilizadas, ``SELECT`` (SELECIONAR) e ``WHERE`` (ONDE) não são sensíveis a maiúsculas e minúsculas, mas algumas partes de uma consulta SPARQL podem sê-lo (ver abaixo), por isso recomendo que mantenha as maiúsculas e minúsculas exatamente como aparecem nas consultas deste curso. + +Aqui `SELECT` significa "encontrar alguma coisa" e `*` significa "dá-me tudo". `WHERE` introduz uma condição, que é onde vamos colocar os detalhes de que tipo de coisas queremos que a consulta encontre. + +Vamos começar com algo simples para ver como é que isto funciona. Cole (ou, melhor, escreva) isto na caixa de consulta: + + SELECT * WHERE { + :Lyndal_Roper ?b ?c + } + +Clique em '*go*' (ir). Se deixar o menu *drop-down* como '*browse*' (navegar) deverá obter duas colunas com os rótulos "b" e "c". (Note que aqui, as maiúsculas/minúsculas importam: lyndal_roper não lhe dará resultados). + + +{% include figure.html filename="en-or-intro-to-linked-data-04.png" alt="Captura de tela com a interface de resultados de consultas snorql" caption="Figura 4. Topo das listas de resultados de uma consulta com todas as triplas semânticas com 'Lyndal_Roper' como sujeito." %} + +Então o que é que acabou de acontecer? E como é que soubemos o que escrever? 
+ +Na verdade, não sabíamos. Esse é um dos problemas com end points do SPARQL. Quando se começa a conhecer um dataset, é preciso experimentar coisas e descobrir que termos são utilizados. Como isto vem da *Wikipedia* e estávamos interessados em saber que informação sobre historiadores podíamos encontrar, fomos à página da *Wikipedia* da historiadora [Lyndal Roper](https://en.wikipedia.org/wiki/Lyndal_Roper) (em inglês). + +A parte final do URL é `Lyndal_Roper` e concluímos então que é provável que esta cadeia de caracteres seja a forma como Roper é referida na DBpedia. Porque não sabemos o que mais poderia estar em triplas semânticas que mencionam Roper, nós utilizamos `?b` e `?c`: estes são apenas marcadores de posição. Poderia igualmente ter digitado `?whatever` e `?you_like` e as colunas teriam esses rótulos. Quando quiser ser mais preciso sobre o que está a pesquisar, será importante etiquetar as colunas de forma significativa. + +Experimente agora a sua própria consulta SPARQL: escolha uma página *Wikipedia* e copie a parte final do URL, após a barra final, e coloque-a no lugar de Lyndal_Roper. Depois clique em 'go'. + +A partir da informação que se obtém destes resultados é possível gerar *queries* mais precisas. Isto pode ser pouco fiável, por isso não se preocupe se algumas não funcionarem. + +Vamos voltar aos resultados para a consulta que fizemos há momentos: + + SELECT * WHERE { + :Lyndal_Roper ?b ?c + } + +Podemos ver uma longa lista na coluna etiquetada _c_. Estes são todos os atributos que Roper tem na *DBpedia* e que nos ajudarão a encontrar outras pessoas com estes atributos. Por exemplo, podemos ver ```http://dbpedia.org/class/yago/Historian110177150```. Poderemos utilizar isto para obter uma lista de historiadores? Vamos colocá-lo na nossa consulta, mas em terceiro lugar, porque era onde estava quando o encontrei nos resultados da Lyndal Roper. 
A minha consulta tem este aspecto: + + SELECT * WHERE { + ?historian_name ?predicate <http://dbpedia.org/class/yago/Historian110177150> + } + +Fizemos uma pequena mudança aqui. Se esta consulta funcionar de todo, então esperemos que os nossos historiadores estejam na primeira coluna, porque 'historiador' não parece poder ser um predicado: não funciona como um verbo numa frase; por isso vamos chamar à nossa primeira coluna de resultados 'historian_name' e à nossa segunda (sobre a qual não sabemos nada) 'predicate' (predicado). + +Execute a *query*. Deverá encontrar uma grande lista de historiadores. + +{% include figure.html filename="en-or-intro-to-linked-data-05.png" alt="Duas capturas de tela com a interface de consultas snorql e respectivos resultados" caption="Figura 5. Historiadores de acordo com a DBpedia." %} + +Assim, esta ferramenta funciona para criar listas, o que é útil, mas seria muito mais poderoso combinar listas para obter intersecções de conjuntos. Encontrei mais algumas coisas que podem ser interessantes consultar nos atributos DBpedia de Lyndal Roper: `http://dbpedia.org/class/yago/WikicatBritishHistorians` e `http://dbpedia.org/class/yago/WikicatWomenHistorians`. É muito fácil combiná-los pedindo uma variável a ser devolvida (no nosso caso isto é `?name` (nome)) e depois utilizando-a em múltiplas linhas de uma *query*. Note também o espaço e o ponto final no fim da primeira linha que começa com `?name`: + + SELECT ?name + WHERE { + ?name ?b <http://dbpedia.org/class/yago/WikicatBritishHistorians> . + ?name ?b <http://dbpedia.org/class/yago/WikicatWomenHistorians> + } + +Funciona! Devemos obter cinco resultados. Na altura em que escrevo, há cinco historiadoras britânicas na *DBpedia*... + +{% include figure.html filename="en-or-intro-to-linked-data-06.png" alt="Duas capturas de tela com a interface de consultas snorql e respectivos resultados" caption="Figura 6. Historiadoras britânicas segundo a DBpedia." %} + +Apenas cinco historiadoras britânicas? 
Claro que há, na realidade, muitas mais do que isso, como poderíamos facilmente mostrar substituindo o nome de, digamos, Alison Weir na nossa primeira consulta sobre Lyndal Roper. Isto leva-nos ao problema com a *DBpedia* que mencionámos anteriormente: não é muito consistentemente marcado com informação estrutural do tipo que a *DBpedia* utiliza. A nossa consulta pode listar algumas historiadoras britânicas mas acontece que não podemos utilizá-la para gerar uma lista significativa de pessoas nesta categoria. Tudo o que encontrámos foram as pessoas nas entradas da *Wikipedia* que alguém decidiu classificar como "historiadora britânica" e "historiadora". + +Com SPARQL na *DBpedia*, é preciso ter cuidado com as inconsistências do material de origem coletiva. Poderá usar o SPARQL exatamente da mesma forma num dataset mais confiável, por exemplo, os dados do governo britânico: https://data-gov.tw.rpi.edu//sparql (em inglês) e esperar obter resultados mais robustos (há aqui um breve tutorial para este dataset: https://data-gov.tw.rpi.edu/wiki/A_crash_course_in_SPARQL (em inglês)). + +No entanto, apesar das suas inconsistências, a *DBpedia* é um ótimo local para aprender SPARQL. Esta foi apenas uma breve introdução, mas há muito mais em [Usando SPARQL para aceder ao Linked Open Data](/en/lessons/retired/graph-databases-and-SPARQL) (em inglês). 
+ + +## Leituras e recursos adicionais + +* Dean Allemang e James Hendler, *Semantic Web for the Working Ontologist*, 2nd edn, Elsevier, 2011 +* Tim Berners-Lee [*Linked Data*](https://www.w3.org/DesignIssues/LinkedData.html) (em inglês) +* Bob DuCharme, *Learning SPARQL*, O'Reilly, 2011 +* [Blog de Bob DuCharme](https://www.snee.com/bobdc.blog/) (em inglês) também vale a pena ler +* Richard Gartner, *Metadata: Shaping Knowledge from Antiquity to the Semantic Web*, Springer, 2016 +* Seth van Hooland and Ruben Verborgh, *Linked Data for Libraries, Archives and Museums*, 2015 +* Matthew Lincoln ['*Using SPARQL to access Linked Open Data*'](/en/lessons/retired/graph-databases-and-SPARQL) (em inglês) +* [*Linked Data guides and tutorials*](https://web.archive.org/web/20170515070722/http://linkeddata.org/guides-and-tutorials) (em inglês) +* Dominic Oldman, Martin Doerr e Stefan Gradmann, '*Zen and the Art of Linked Data: New Strategies for a Semantic Web of Humanist Knowledge*', em *A New Companion to Digital Humanities*, editado por Susan Schreibman et al. +* Max Schmachtenberg, Christian Bizer e Heiko Paulheim, [*State of the LOD Cloud 2017*](https://linkeddatacatalog.dws.informatik.uni-mannheim.de/state/) (em inglês) +* David Wood, Marsha Zaidman e Luke Ruth, *Linked Data: Structured data on the Web*, Manning, 2014 + +## Agradecimentos + +Gostaria de agradecer aos meus dois colegas revisores, Matthew Lincoln e Terhi Nurmikko-Fuller e ao meu editor, Adam Crymble, por me ajudarem generosamente a melhorar esta lição com numerosas sugestões, esclarecimentos e correções. Este tutorial baseia-se num outro escrito como parte do '*Thesaurus of British and Irish History as SKOS*' [*(Tobias) project*](https://gtr.ukri.org/projects?ref=AH%2FN003446%2F1#/tabOverview) (em inglês), financiado pelo [AHRC](https://www.ukri.org/councils/ahrc/) (em inglês). A lição foi revista para o projeto *Programming Historian*. 
diff --git a/pt/licoes/introducao-estilometria-python.md b/pt/licoes/introducao-estilometria-python.md index 8a52335fd8..13ce91ca8f 100644 --- a/pt/licoes/introducao-estilometria-python.md +++ b/pt/licoes/introducao-estilometria-python.md @@ -1,708 +1,708 @@ ---- -title: Introdução à estilometria com Python -layout: lesson -slug: introducao-estilometria-python -date: 2018-04-21 -translation_date: 2021-12-27 -authors: -- François Dominic Laramée -reviewers: -- Folgert Karsdorp -- Jan Rybicki -- Antonio Rojas Castro -editors: -- Adam Crymble -translator: -- Daniel Bonatto Seco -translation-editor: -- Jimmy Medeiros -translation-reviewer: -- Bruno Almeida -- Suemi HIguchi -difficulty: 2 -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/445 -activity: analyzing -topics: [distant-reading, python] -abstract: "Nesta lição, aprenderá a realizar análises estilométricas e a determinar a autoria de textos. A lição cobre três métodos: Curvas Características de Composição de Mendenhall, Método Qui-Quadrado de Kilgariff e Método Delta de John Burrows." -original: introduction-to-stylometry-with-python -avatar_alt: Mulher a ler junto a uma pintura -doi: 10.46430/phpt0024 ---- - - -{% include toc.html %} - -# Introdução - -[Estilometria](https://perma.cc/NYH2-KWLA) é o estudo quantitativo do estilo literário por meio de métodos de [leitura distante](https://perma.cc/XK8J-F6ZF) computacional. É baseado na observação de que os autores tendem a escrever de maneiras relativamente consistentes, reconhecíveis e únicas. Por exemplo: - -* Cada pessoa tem seu próprio vocabulário único, às vezes rico, às vezes limitado. Embora um vocabulário mais amplo esteja geralmente associado à qualidade literária, nem sempre é esse o caso. 
Ernest Hemingway é famoso por usar um número surpreendentemente pequeno de palavras diferentes em sua escrita,[^1] o que não o impediu de ganhar o Prêmio Nobel de Literatura em 1954; -* Algumas pessoas escrevem frases curtas, enquanto outras preferem blocos longos de texto compostos por muitas frases; -* Não há duas pessoas que usem ponto-e-vírgulas, travessões e outras formas de pontuação exatamente da mesma maneira. - -As maneiras como os escritores usam pequenas [*function words*](https://perma.cc/284C-CNHD), como artigos, preposições e conjunções, mostram-se particularmente reveladoras. Em uma pesquisa dos métodos estilométricos históricos e atuais, Efstathios Stamatatos aponta que as palavras funcionais são "usadas de maneira amplamente inconsciente pelos autores e são independentes do tópico"[^2]. Para a análise estilométrica, isso é muito vantajoso, visto que esse padrão inconsciente tende a variar menos no [*corpus*](https://perma.cc/9XQ4-J4A5) de um autor do que seu vocabulário geral (e também é muito difícil para um pretenso falsificador copiar). As palavras funcionais também foram identificadas como marcadores importantes do gênero literário e da cronologia. - -Os pesquisadores têm usado a estilometria como uma ferramenta para estudar uma variedade de questões culturais. Por exemplo, uma quantidade considerável de pesquisas estudou as diferenças entre as maneiras como homens e mulheres escrevem[^3] ou sobre o que escrevem.[^4] Outros pesquisadores estudaram as maneiras como uma mudança repentina no estilo de escrita em um único texto pode indicar plágio[^5] e até mesmo a maneira como as letras dos músicos John Lennon e Paul McCartney se tornaram cada vez menos alegres e menos ativas à medida que os [Beatles](https://perma.cc/DQ66-M79T) se aproximavam do fim de sua carreira de gravação na década de 1960.[^6] - -No entanto, uma das aplicações mais comuns da estilometria é na atribuição de autoria. 
Dado um texto anônimo, às vezes é possível inferir quem o escreveu medindo certas características, como o número médio de palavras por frase ou a propensão do autor de usar "todavia" em vez de "no entanto", e comparando as medidas com outros textos escritos pelo suposto autor. Este é o objetivo deste tutorial, onde a partir de um conjunto de obras clássicas de romancistas lusos e brasileiros do século XIX iremos comparar exemplares de suas obras com o estilo literário do conjunto de autores a fim de tentar inferir suas respectivas autorias (nota de tradução: foi decidido mudar o _corpus_ usado nesta lição para um que fosse culturalmente mais relevante para o público que fala e escreve português; foi mantida a restante estrutura da lição original, com excepção de ligeiras adaptações face à mudança do _corpus_). - -## Objetivos de aprendizado - -No final desta lição, teremos percorrido os seguintes tópicos: - -* Como aplicar vários métodos estilométricos para inferir a autoria de um texto anônimo ou conjunto de textos; -* Como usar estruturas de dados relativamente avançadas, incluindo [dicionários](https://perma.cc/TTF4-SJ23) de [strings](https://perma.cc/7DCC-M9AT) e dicionários de dicionários, em [Python](https://perma.cc/Z82S-3L3M); -* O básico do [Natural Language Toolkit](https://perma.cc/E7LZ-WECZ) (NLTK), um módulo Python popular dedicado a [processamento de linguagem natural](https://perma.cc/MFX4-LAVZ). - -## Leitura prévia - -Se você não tem experiência com a linguagem de programação Python ou está tendo dificuldade nos exemplos apresentados neste tutorial, o autor recomenda que você leia as lições [Trabalhando com ficheiros de texto em Python](/pt/licoes/trabalhando-ficheiros-texto-python) e [Manipular Strings com Python](/pt/licoes/manipular-strings-python). Note que essas lições foram escritas em Python versão 2, enquanto esta usa Python versão 3. As diferenças de [sintaxe](https://perma.cc/E5LQ-S65P) entre as duas versões da linguagem podem ser sutis. 
Se você ficar em dúvida, siga os exemplos conforme descritos nesta lição e use as outras lições como material de apoio. (Este tutorial encontra-se atualizado até à versão [Python 3.8.5](https://perma.cc/XCT2-Q4AT); as [strings literais formatadas](https://perma.cc/U6Q6-59V3) na linha `with open(f'data/pg{filename}.txt', 'r', encoding='utf-8') as f:`, por exemplo, requerem Python 3.6 ou uma versão mais recente da linguagem.) - -## Materiais requeridos - -Este tutorial usa conjuntos de dados e software que você terá que baixar e instalar. - -### O conjunto de dados ### - -Para trabalhar nesta lição, você precisará baixar e descompactar o ficheiro [.zip](/assets/introduction-to-stylometry-with-python/dataset_estilometria.zip) contendo as 15 obras que compõem o *corpus* que será utilizado neste tutorial. As obras foram originalmente extraídas do [Projeto Gutenberg](https://perma.cc/8GTT-3M9N). Ao descompactar o ficheiro, será criada uma pasta com o nome `dados`. Este será o seu [diretório de trabalho](https://perma.cc/9KVS-T3A5) e todo o trabalho deve ser salvo aqui durante a execução da lição. - -### O software ### - -Esta lição usa as seguintes versões da linguagem Python e [bibliotecas](https://pt.wikipedia.org/wiki/Biblioteca_(computa%C3%A7%C3%A3o)): -* [Python 3.x](https://www.python.org/downloads/) - a última versão estável é recomendada; -* [nltk](https://www.nltk.org/) - Natural Language Toolkit, geralmente abreviado `nltk`; -* [matplotlib](https://matplotlib.org/) - visualização de dados e geração de gráficos; -* [re](https://docs.python.org/pt-br/3/library/re.html) - limpeza de dados via Regex (veremos durante o tutorial o porquê). - -Alguns desses módulos podem não estar pré-instalados em seu computador. Se você encontrar mensagens de erro como: "Módulo não encontrado" ou similares, você terá que baixar e instalar o(s) módulo(s) ausente(s). A forma mais simples de realizar esta tarefa é através do comando `pip`. 
Mais detalhes estão disponíveis através do tutorial do *Programming Historian* [Instalação de Módulos Python com pip](/pt/licoes/instalacao-modulos-python-pip). - -## Algumas notas sobre Independência Linguística - -Este tutorial aplica a análise estilométrica a um conjunto de textos em português (PT-PT e PT-BR) usando uma biblioteca Python chamada `nltk`. Muitas das funcionalidades fornecidas pelo `nltk` operam com outros idiomas. Contanto que um idioma forneça uma maneira clara de distinguir os limites de uma palavra, o `nltk` deve ter um bom desempenho. Idiomas como o chinês, para os quais não há distinção clara entre os limites das palavras, podem ser problemáticos. O autor original desta lição utilizou `nltk` com textos em francês sem nenhum problema; outros idiomas que usam [diacríticos](https://perma.cc/7VGD-5968), como espanhol e alemão, também devem funcionar bem com `nltk`. Consulte a [documentação do nltk](https://perma.cc/S4EX-2DBT) para obter detalhes. - -Apenas uma das tarefas neste tutorial requer código dependente do idioma. Para dividir um texto em um conjunto de palavras em uma língua diferente do inglês, você precisará especificar o idioma apropriado como um parâmetro para o [tokenizador](https://perma.cc/NGM5-4MED) da biblioteca `nltk`, que usa o inglês como padrão. Isso será explicado no tutorial. - -Por fim, observe que algumas tarefas linguísticas, como [*part-of-speech tagging*](https://perma.cc/L9SU-PS9D), podem não ser suportadas pelo `nltk` em outros idiomas além do inglês. Este tutorial não cobre a aplicação de *part-of-speech tagging*. Se você precisar para os seus próprios projetos, consulte a [documentação do nltk](https://perma.cc/S4EX-2DBT) para obter orientações. 
- -# O *corpus* - Contextualização - -No [exemplo original deste tutorial em inglês](/en/lessons/introduction-to-stylometry-with-python), utilizaram-se os [papéis federalistas](https://perma.cc/DW5V-MH5W) como um exemplo de aplicação de estilometria, utilizando as técnicas que serão apresentadas para inferir a autoria dos textos contestados dentro do conjunto de documentos que configura o *corpus*.[^7] -Como na língua portuguesa não temos um conjunto de textos que possua estas mesmas características, no exemplo que apresentaremos traremos um total de 15 obras completas de 5 autores diferentes, três deles portugueses e dois brasileiros, todos romancistas do século XIX, disponibilizadas pelo [Projeto Gutenberg](https://perma.cc/5PRR-TM3D). Utilizaremos duas obras de cada autor para definir seus respectivos estilos e uma terceira para constituir o conjunto de testes, para avaliarmos se as técnicas utilizadas realizarão a inferência correta de autoria através do grau de similaridade de cada obra deste conjunto com o estilo obtido de cada autor. 
- -Os autores e obras utilizadas são os seguintes: - -| Autor | Obra 1 | Obra 2 | Obra 3 | -| --------- | --------- | --------- | --------- | -| [Machado de **Assis**](https://perma.cc/6BMU-UKZL) (Brasil)| [Quincas Borba](https://www.gutenberg.org/ebooks/55682) (**55682**) | [Memorias Posthumas de Braz Cubas](https://www.gutenberg.org/ebooks/54829) (**54829**) | [Dom Casmurro](https://www.gutenberg.org/ebooks/55752) (**55752**) | -| [José de **Alencar**](https://perma.cc/Y3Y2-VHJ5) (Brasil) | [Ubirajara](https://www.gutenberg.org/ebooks/38496) (**38496**) | [Cinco minutos](https://www.gutenberg.org/ebooks/44540) (**44540**) | [Como e porque sou romancista](https://www.gutenberg.org/ebooks/29040) (**29040**) | -| [Camilo **Castelo Branco**](https://perma.cc/Q4AJ-VZBH) (Portugal) | [Carlota Angela](https://www.gutenberg.org/ebooks/26025) (**26025**) | [Amor de Salvação](https://www.gutenberg.org/ebooks/26988) (**26988**) | [Amor de Perdição: Memorias d'uma familia](https://www.gutenberg.org/ebooks/16425) (**16425**) | -| [António Feliciano de **Castilho**](https://perma.cc/LZ9J-3H5Z) (Portugal) | [A Chave do Enigma](https://www.gutenberg.org/ebooks/32002) (**32002**) | [A Primavera](https://www.gutenberg.org/ebooks/65021) (**65021**) | [O presbyterio da montanha](https://www.gutenberg.org/ebooks/28127) (**28127**) | -| [Manuel Pinheiro **Chagas**](https://perma.cc/8LU3-RADW) (Portugal) | [Historia alegre de Portugal](https://www.gutenberg.org/ebooks/29394) (**29394**) | [A Lenda da Meia-Noite](https://www.gutenberg.org/ebooks/23400) (**23400**) | [Astucias de Namorada, e Um melodrama em Santo Thyrso](https://www.gutenberg.org/ebooks/29342) (**29342**) | - -As partes destacadas do nome de cada autor indicam como os mesmos serão referenciados neste tutorial a partir deste ponto. Para os códigos utilizaremos o `EBook-No.` (número de referência da obra no Projeto Gutenberg), presente no nome dos ficheiros disponibilizados. 
- -# Nossos casos de teste - -Nesta lição, usaremos obras de romancistas brasileiros e portugueses do século XIX como um estudo de caso para demonstrar três abordagens estilométricas diferentes: - -1. Curvas características de composição de Mendenhall -2. Método Qui-Quadrado de Kilgariff -3. Método Delta de John Burrows - -Em todas as abordagens acima mencionadas, utilizaremos os documentos das colunas **Obra 1** e **Obra 2** para definir o estilo de cada autor. Os documentos da coluna **Obra 3** serão testados individualmente com cada um dos 5 autores para tentarmos inferir a autoria pela proximidade de estilo. - -# Preparando os dados para análise - -Antes de prosseguirmos com a análise estilométrica, precisamos carregar os ficheiros contendo todas as 15 obras em [estruturas de dados](https://perma.cc/P843-J4LB) na memória do computador. - -O primeiro passo neste processo é designar cada obra para o seu respectivo conjunto. Como cada obra está relacionada com o seu respectivo `EBook-No.`, podemos atribuir cada obra (valor) à chave do seu autor (ou a uma chave separada, se ela fizer parte da amostra de teste) usando um *dicionário* Python. O dicionário é um tipo de conjunto de dados composto de um número arbitrário de pares de chave-valor; neste caso, os nomes dos autores servirão como chaves (separados entre treino e teste), enquanto os `EBook-No.` das obras serão os valores associados a essas chaves. - -```python -ids_obras = { - 'Assis' : [55752, 54829], - 'Alencar' : [38496, 44540], - 'Castelo Branco' : [26025, 26988], - 'Castilho' : [32002, 65021], - 'Chagas' : [29394, 23400], - 'Assis (teste)' : [55682], - 'Alencar (teste)' : [29040], - 'Castelo Branco (teste)' : [16425], - 'Castilho (teste)' : [28127], - 'Chagas (teste)' : [29342] -} -``` - -Os dicionários Python são muito flexíveis. 
Por exemplo, podemos acessar um valor específico *indexando* o dicionário com uma de suas chaves, podemos varrer o dicionário inteiro fazendo um loop em sua lista de chaves, etc. Faremos amplo uso desta funcionalidade à medida que avançarmos. - -A seguir, como estamos interessados no vocabulário de cada autor, definiremos uma breve [função](https://perma.cc/P8CA-Y43Q) em Python que irá criar uma longa lista de palavras em cada uma das obras atribuídas a um único autor. Isso será armazenado como uma [string](https://perma.cc/7DCC-M9AT). -Abra o seu ambiente de desenvolvimento Python escolhido. Se você não sabe como fazer isso, leia "Configurar um ambiente de desenvolvimento integrado para Python" ([Windows](/pt/licoes/instalacao-windows), [Linux](/pt/licoes/instalacao-linux), [Mac](/pt/licoes/instalacao-mac)) antes de prosseguir. - -```python -# Função que compila todos os ficheiros de texto de cada grupo em uma única string - -import re - -def ler_ficheiros_para_string(ids_ficheiros): - global texto - strings = [] - for id_ficheiro in ids_ficheiros: - with open(f'dados/pg{id_ficheiro}.txt', 'r', - encoding='utf-8') as f: - texto = f.read() - texto = re.search(r"(START.*?\*\*\*)(.*)(\*\*\* END)", - texto, - re.DOTALL).group(2) - strings.append(texto) - return '\n'.join(strings) -``` - -Perceba que, dentro da função, temos também uma etapa de limpeza dos textos usando [expressões regulares](https://perma.cc/DT3K-XUBG). Isso foi necessário para este corpus específico pois as obras publicadas no Projeto Gutenberg possuem uma estrutura de cabeçalho e rodapé de [metadados](https://perma.cc/E8P8-GKDR) que não pode ser considerada na análise estilométrica, uma vez que não foram redigidas pelos autores analisados. 
A utilização de expressões regulares não faz parte do escopo deste tutorial, então limitaremo-nos a compreender que estamos utilizando a biblioteca `re` para capturar apenas o conjunto de caracteres entre os marcadores `*** START OF THIS PROJECT GUTENBERG [NOME DA OBRA] ***` e `*** END OF THIS PROJECT GUTENBERG [NOME DA OBRA] ***` presentes em cada documento do projeto. Para maiores dúvidas sobre a utilização de expressões regulares e da biblioteca `re`, consulte a [documentação](https://perma.cc/JFP3-B4P4). - -Na sequência, construímos uma nova estrutura de dados chamando repetidamente a função `ler_ficheiros_para_string ()`, passando a ela uma lista diferente de documentos a cada vez. Armazenaremos os resultados em outro dicionário, este com nomes do autor/caso de teste como chaves e todo o texto dos respectivos documentos como valores. Para simplificar, iremos nos referir à string contendo uma lista de documentos como "corpus do autor". - -```python -# Criar um dicionário com os corpora dos autores -obras = {} -for autor, ids_ficheiros in ids_obras.items(): - obras[autor] = ler_ficheiros_para_string(ids_ficheiros) -``` - -Para nos certificarmos de que os ficheiros foram carregados corretamente, imprima os primeiros cem caracteres de cada entrada do dicionário na tela: - -```python -for autor in obras: - print(obras[autor][:100]) -``` - -Se esta operação de impressão exibir quaisquer trechos de texto no console, então a operação de leitura dos ficheiros funcionou conforme o esperado e você pode prosseguir para a análise estilométrica. - -
    -Se os ficheiros não forem carregados, o motivo mais provável é que o seu diretório de trabalho atual não seja o repositório `dados` criado ao descompactar o ficheiro da seção de Materiais Requeridos acima; mudar o seu diretório de trabalho deve resolver o problema. Como você faz isso depende do seu ambiente de desenvolvimento Python. -
    - -# Primeiro teste estilométrico: curvas características de composição de Mendenhall - -O pesquisador literário T. C. Mendenhall escreveu certa vez que a assinatura estilística de um autor pode ser encontrada contando a frequência com que usa palavras de tamanhos diferentes.[^8] Por exemplo, se contarmos os tamanhos de palavras em vários segmentos de 1.000 ou 5.000 palavras de qualquer romance e, em seguida, traçarmos um gráfico das distribuições de comprimento das palavras, as curvas pareceriam praticamente as mesmas, não importando que partes do romance tivéssemos escolhido. Na verdade, Mendenhall acreditava que se alguém contasse palavras suficientes selecionadas de várias partes da obra de toda a vida de um escritor (digamos, 100.000 ou mais), a "curva característica" de uso de comprimento de palavras do autor se tornaria tão precisa que seria constante ao longo de sua vida. - -Pelos padrões de hoje, contar o comprimento das palavras parece uma forma muito direta (e talvez simplista) de medir o estilo literário. O método de Mendenhall não leva em consideração as palavras do vocabulário de um autor, o que é obviamente problemático. Portanto, não devemos tratar as curvas características como uma fonte particularmente confiável de evidência estilométrica. No entanto, Mendenhall publicou a sua teoria há mais de cento e trinta anos e fez todos os cálculos à mão. É compreensível que ele tivesse optado por trabalhar com uma estatística que, embora grosseira, fosse ao menos fácil de compilar. Em honra ao valor histórico de sua tentativa inicial de estilometria, e porque a curva característica produz resultados visuais interessantes que podem ser implementados rapidamente, usaremos o método de Mendenhall como um primeiro passo em nossa exploração das técnicas de atribuição de autoria. 
- -O trecho de código necessário para calcular e exibir as curvas características para os autores e os documentos de teste é o seguinte: - -```python -# Carregar nltk e matpotlib -import nltk -nltk.download('punkt') -import matplotlib.pylab as plt - -obras_tokens = {} -obras_distribuicao_comprimento = {} - -id_subplot = 1 -fig = plt.figure(figsize=(20,20)) - -autores = list(obras.keys()) - -for autor in autores: - # Transformar os corpora dos autores em listas de tokens de palavras - tokens = nltk.word_tokenize(obras[autor], language="portuguese") - - # Filtrar pontuação - obras_tokens[autor] = ([token for token in tokens - if any(c.isalpha() for c in token)]) - -# Obter a distribuição de comprimentos de tokens -token_comprimentos = [len(token) for token in obras_tokens[autor]] -obras_distribuicao_comprimento[autor] = nltk.FreqDist(token_comprimentos) - - # Plotar a curva característica de composição - lista_chaves = [] - lista_valores = [] - - for i in range(1,16): - lista_chaves.append(i) - lista_valores.append(obras_distribuicao_comprimento[autor][i]) - - lista_valores_normalizado = [value/max(lista_valores) for value in lista_valores] - - plt.subplot(5, 5, id_subplot) - plt.plot(lista_chaves, lista_valores_normalizado) - plt.xticks(lista_chaves) - plt.title(autor) - id_subplot += 1 - -plt.savefig("stilometry_comparacao.jpeg", dpi=300, bbox_inches='tight') -plt.show() -``` - -Se você estiver trabalhando em um [Jupyter Notebook](http://jupyter.org/), adicione a expressão `%matplotlib inline` após a importação das bibliotecas; caso contrário, você pode não ver os gráficos em sua tela. Se você estiver trabalhando em um [Jupyter Lab](http://jupyterlab.readthedocs.io/en/stable/getting_started/installation.html), substitua esta expressão por `%matplotlib ipympl`. - -A primeira linha no trecho de código acima carrega o módulo *Natural Language Toolkit (nltk)*, que contém um número enorme de funções e recursos úteis para processamento de texto. 
Mal tocaremos em seus fundamentos nesta lição; se você decidir explorar mais a análise de texto em Python, recomendo fortemente que comece com [a documentação do nltk](https://www.nltk.org/). - -As próximas linhas configuram estruturas de dados que serão preenchidas pelo bloco de código dentro do loop `for`. Este loop faz os mesmos cálculos para todos os nossos "autores": - -* Invoca o método `word_tokenize()` do `nltk`, explicitando a linguagem do _corpus_ para português através do argumento `language="portuguese"`, e divide o _corpus_ em _tokens_, ou seja, palavras, números, pontuação, etc.; -* Olha para esta lista de tokens e filtra as não-palavras; -* Cria uma lista contendo os comprimentos de cada token de palavra restante; -* Cria um objeto de _distribuição de frequência_ a partir dessa lista de comprimentos de palavra, basicamente contando quantas palavras de uma letra, palavras de duas letras, etc., existem no _corpus_ do autor, e em seguida realiza a normalização dessa distribuição, ou seja, ajusta todos os valores em um intervalo entre 0 e 1. Esta etapa é realizada para comparar gráficos de distribuição em _corpus_ de tamanhos diferentes de forma mais clara; -* Plota um gráfico da distribuição de comprimentos de palavras no corpus, para todas as palavras de até 15 caracteres. - -Os resultados que obtemos são os seguintes: -{% include figure.html filename="introducao-estilometria-python-01.jpeg" caption="Imagem 1: Comparação da curva de Mendenhall para cada corpus." %} - -Como podemos ver pelos gráficos, é possível notar diferenças (embora sutis) entre todas as 5 curvas características de cada autor (linha superior de gráficos). Ao compararmos os documentos de teste (linha inferior de gráficos) com os autores, podemos notar que a curva característica dos documentos de teste dos autores Assis, Castilho e Chagas se assemelham mais à curva dos seus respectivos autores que de qualquer outro, o que seriam inferências corretas. 
O documento de Alencar é o que mais diverge da curva característica do autor. Isso pode ocorrer pelo fato do documento de teste ser uma autobiografia do autor, enquanto os documentos de treino são duas obras de ficção, o que poderia influenciar no seu estilo de escrita. Veremos nas próximas abordagens se conseguimos contornar esta situação. O documento de Castelo Branco também parece não ter se assemelhado à curva característica do autor. - -Para além desta análise meramente visual (que pode muitas vezes induzir ao erro), podemos ter um resultado quantitativo calculando a soma das distâncias entre os valores (normalizados) de frequência de cada documento de teste com os valores de frequência do *corpus* de cada possível autor. Por consequência, o autor que possuir a menor distância de frequência com o documento de teste seria o mais provável autor deste documento. Podemos implementar isso da seguinte forma: - -```python -# Dividir a lista de corpus entre autores e obras destacadas -autores = list(obras.keys())[:5] -obras_destacadas = list(obras.keys())[5:] - -obras_distribuicao_comprimento_normalizado = {} - -# Normalizar a distribuição de comprimentos de tokens em um novo dicionário -for index, obra in obras_distribuicao_comprimento.items(): - obras_distribuicao_comprimento_normalizado[index] = {k: - v/max(obra.values()) - for k, v in dict(obra).items()} - -# Calcular a soma da diferença da distribuição entre o documento de teste e cada autor (de 1 até 15 caracteres) -for obra in obras_destacadas: - for autor in autores: - soma_diferenca = 0 - for i in range(1,16): - diferenca = abs(obras_distribuicao_comprimento_normalizado[obra][i] - - obras_distribuicao_comprimento_normalizado[autor][i]) - soma_diferenca = soma_diferenca + diferenca - print('A soma da diferença do documento ' + - obra + - ' para o autor ' + - autor + - ' é ' + - str(soma_diferenca)) - print('\n') -``` - -O resultado deste trecho serão 5 blocos, cada um comparando um documento com os 5 possíveis 
autores. Abaixo o exemplo de como o primeiro bloco deve parecer: - -``` -A soma da diferença do documento Assis (teste) para o autor Assis é 0.25782806530977137 -A soma da diferença do documento Assis (teste) para o autor Alencar é 0.5192643726222002 -A soma da diferença do documento Assis (teste) para o autor Castelo Branco é 0.7410205025846326 -A soma da diferença do documento Assis (teste) para o autor Castilho é 0.46876355973646266 -A soma da diferença do documento Assis (teste) para o autor Chagas é 0.3466043230715998 -``` - -Vamos colocar os resultados dos 5 testes em uma [matriz de confusão](https://perma.cc/K42B-NQSR) (limitando a 4 casas decimais) para avaliarmos: - -| | Assis | Alencar | Castelo Branco | Castilho | Chagas | -| --------- | --------- | --------- | --------- | --------- | --------- | -| **Assis (teste)** | **0.2578** | 0.5192 | 0.7410 | 0.4687 | 0.3466 | -| **Alencar (teste)** | 0.9744 | **0.9844** | 0.4313 | 0.6979 | 0.7897 | -| **Castelo Branco (teste)** | 0.2812 | 0.4436 | **0.4761** | 0.2772 | 0.2803 | -| **Castilho (teste)** | 0.4396 | 0.4624 | 0.4114 | **0.1394** | 0.3184 | -| **Chagas (teste)** | 0.7746 | 0.5883 | 0.6636 | 0.6732 | **0.5888** | - -Os documentos de teste de Assis e Castilho possuem menor valor com seus respectivos autores, o que indica a maior proximidade. Isso é condizente com a similaridade dos gráficos que vimos anteriormente. O documento de teste de Chagas teve um "empate técnico" entre o estilo do próprio autor (0.5888) e Alencar (0.5883). Tanto os documentos de teste de Alencar quanto Castelo Branco ficaram com o maior valor em relação aos seus respectivos autores, logo a técnica não foi eficaz para estes dois autores. - -Se não tivéssemos informações adicionais para trabalharmos, poderíamos inferir corretamente 50% da atribuição de autoria (2 acertos, 2 erros e um "empate"), o que é um resultado considerável para uma técnica relativamente simples. 
Felizmente, a ciência estilométrica avançou muito desde a época de Mendenhall. - -# Segundo teste estilométrico: método qui-quadrado de Kilgariff - -Em um artigo de 2001, Adam Kilgarriff[^9] recomenda o uso da estatística qui-quadrado para determinar a autoria. Leitores familiarizados com métodos estatísticos podem se lembrar que o qui-quadrado às vezes é usado para testar se um conjunto de observações (digamos, as intenções dos eleitores conforme declarado em uma pesquisa) segue uma certa [distribuição de probabilidade](https://perma.cc/668N-9GPD) ou padrão. Não é isso que buscamos aqui. Em vez disso, simplesmente usaremos a estatística para medir a "distância" entre os vocabulários empregados em dois conjuntos de textos. Quanto mais semelhantes os vocabulários, mais provável é que o mesmo autor tenha escrito os textos em ambos os conjuntos. Isso pressupõe que o vocabulário de uma pessoa e os padrões de uso das palavras são relativamente constantes. - -Veja como aplicar a estatística para atribuição de autoria: - -* Pegue os corpora associados a dois autores; -* Junte-os em um único corpus, maior; -* Conte os tokens para cada uma das palavras que podem ser encontradas neste corpus maior; -* Selecione as [`n`](https://perma.cc/D9ND-3C83) palavras mais comuns no corpus maior; -* Calcule quantos tokens dessas `n` palavras mais comuns esperaríamos encontrar em cada um dos dois corpora originais se fossem do mesmo autor. 
Isso significa simplesmente dividir o número de tokens que observamos no corpus combinado em dois valores, com base nos tamanhos relativos das contribuições dos dois autores para o corpus comum; -* Calcule uma distância qui-quadrada somando, sobre as `n` palavras mais comuns, os _quadrados das diferenças entre os números reais de tokens encontrados no corpus de cada autor e os números esperados_, divididos pelos números esperados; A Figura 2 mostra a equação para a estatística qui-quadrado, onde C(i) representa o número observado de tokens para o recurso 'i' e E(i), o número esperado para esse recurso. - -{% include figure.html filename="stylometry-python-6.jpg" caption="Imagem 2: Equação para a estatística qui-quadrado." %} - -Quanto menor o valor do qui-quadrado, mais semelhantes são os dois corpora. Portanto, calcularemos o qui-quadrado de cada documento de teste com os 5 possíveis autores: os menores valores representarão a possível autoria de cada documento (assim como vimos no primeiro exemplo). - -Nota: Independentemente do método estilométrico que usamos, a escolha de `n`, o número de palavras a levar em consideração, é uma espécie de arte sombria. Na literatura pesquisada por Stamatatos[^2], pesquisadores sugeriram entre 100 e 1.000 das palavras mais comuns; um projeto chegou a usar cada palavra que aparecia no corpus pelo menos duas vezes. Como diretriz, quanto maior o corpus, maior o número de palavras que podem ser usadas como elementos sem correr o risco de dar importância indevida a uma palavra que ocorra apenas algumas vezes. Nesta lição, usaremos um `n` relativamente grande para o método qui-quadrado e um menor para o próximo método. Mudar o valor de `n` certamente mudará um pouco os resultados numéricos; no entanto, se uma pequena modificação de `n` causar uma mudança na atribuição de autoria, isso é um sinal de que o teste que você está realizando não é capaz de fornecer evidências significativas sobre o seu caso de teste. 
- -O seguinte trecho de código implementa o método de Kilgariff, com as frequências das 500 palavras mais comuns no corpus conjunto sendo usadas no cálculo: - -```python -# Converter os tokens para caracteres minúsculos para que a mesma palavra, -# maiúscula ou não, conte como uma palavra - -for autor in autores: - obras_tokens[autor] = ( - [token.lower() for token in obras_tokens[autor]]) - -# Calcular o qui-quadrado de cada documento de teste com cada um dos 5 autores -for obra in obras_destacadas: - for autor in autores: - - # Primeiro, construir um corpus conjunto e identificar - # as 500 palavras mais frequentes nele - corpus_conjunto= (obras_tokens[obra] + - obras_tokens[autor]) - freq_dist_conjunto = nltk.FreqDist(corpus_conjunto) - termos_comuns = list(freq_dist_conjunto.most_common(500)) - - # Que proporção do corpus conjunto é constituído pelos - # tokens do autor candidato? - autor_compartihado = (len(obras_tokens[autor]) - / len(corpus_conjunto)) - - # Agora, vamos observar as 500 palavras mais frequentes no corpus do candidato - # e comparar o número de vezes que elas podem ser observadas - # ao que seria esperado se os artigos do autor e o documento de teste - # fossem ambas amostras aleatórias do mesmo conjunto. - quiquadrado = 0 - for word,count_conjunto in termos_comuns: - - # Com que frequência vemos essa palavra comum? - autor_count = obras_tokens[autor].count(word) - obra_count = obras_tokens[obra].count(word) - - # Com que frequência deveríamos vê-la? 
- autor_count_esperado = count_conjunto * autor_compartihado - teste_count_esperado = count_conjunto * (1-autor_compartihado) - - # Adicionar a contribuição da palavra para a estatística qui-quadrado - quiquadrado += ((autor_count-autor_count_esperado) * - (autor_count-autor_count_esperado) / - autor_count_esperado) - - quiquadrado += ((obra_count-teste_count_esperado) * - (obra_count-teste_count_esperado) - / teste_count_esperado) - - print("A estatística de qui-quadrado do documento", - obra, - "para o candidato", - autor, - "é =", - quiquadrado) - print("\n") -``` - -Assim como no primeiro exemplo, o resultado será 5 blocos de resultados, cada um para um documento de teste. O primeiro bloco se parecerá com isso: -``` -A estatística de qui-quadrado do documento Assis (teste) para o candidato Assis é = 12266.387624251674 -A estatística de qui-quadrado do documento Assis (teste) para o candidato Alencar é = 13832.008019914058 -A estatística de qui-quadrado do documento Assis (teste) para o candidato Castelo Branco é = 15659.980573183348 -A estatística de qui-quadrado do documento Assis (teste) para o candidato Castilho é = 19458.24314684532 -A estatística de qui-quadrado do documento Assis (teste) para o candidato Chagas é = 13681.732446564287 -``` - -
    -No código acima, convertemos os tokens em minúsculas para não contar os tokens de palavras que começam com uma letra maiúscula porque aparecem no início de uma frase e os tokens minúsculos da mesma palavra como duas palavras diferentes. Às vezes, isso pode causar alguns erros, por exemplo, quando um substantivo próprio e um substantivo comum são escritos da mesma forma, exceto para maiúsculas, mas geralmente esta técnica aumenta a precisão. -
    - -Agora, vamos dar uma olhada na matriz de confusão dos resultados para esta técnica: - -| | Assis | Alencar | Castelo Branco | Castilho | Chagas | -| --------- | --------- | --------- | --------- | --------- | --------- | -| **Assis (teste)** | **12266** | 13832| 15659 | 19458 | 13681 | -| **Alencar (teste)** | 2550 | **3153** | 2581 | 2663 | 2765 | -| **Castelo Branco (teste)** | 17294 | 12063 | **11187** | 18133 | 13954 | -| **Castilho (teste)** | 11349 | 9203 | 8925 | **4531** | 7548 | -| **Chagas (teste)** | 6683 | 5700 | 5836 | 6970 | **5332** | - -Como podemos observar, o teste de qui-quadrado obteve um resultado superior à curva característica de composição de Mendenhall. Assis e Castilho permanecem com a inferência correta de autoria. Chagas, que passou pelo "empate técnico" na curva de composição, com o qui-quadrado também faz a inferência correta com uma distância considerável entre os demais possíveis autores. Dos autores que não haviam sido avaliados corretamente na curva de composição, Castelo Branco possui o menor valor de qui-quadrado, outra inferência correta. Alencar, no entanto, segue como o maior valor entre os 5 possíveis autores. De qualquer forma, já passamos de 50% de acerto com a curva característica de composição para 80% com o método qui-quadrado! - -No entanto, o qui-quadrado ainda é um método pouco refinado. Por um lado, palavras que aparecem com muita frequência tendem a ter um peso desproporcional no cálculo final. Às vezes, isso é bom; outras vezes, diferenças sutis de estilo representadas pelas maneiras como os autores usam palavras mais incomuns passarão despercebidas. 
- -## Uma nota sobre classes gramaticais - -Em alguns casos e idiomas, pode ser útil aplicar a marcação de [Part-of-speech (classes gramaticais)](https://perma.cc/ER5P-CFQE) aos tokens de palavras antes de contá-los, de modo que a mesma palavra usada como duas classes gramaticais diferentes possa contar como dois elementos diferentes (por exemplo, o termo "mais" sendo usado como substantivo ou como advérbio de intensidade). Esta lição não usa marcação de classes gramaticais, mas poderia refinar os resultados em estudos de caso mais complexos. - -Se você precisar aplicar a marcação de classe gramatical aos seus próprios dados, poderá fazer o download de marcadores para outros idiomas, para trabalhar com uma ferramenta de terceiros como [Tree Tagger](https://perma.cc/DG9G-S5T2), ou mesmo para treinar o seu próprio marcador, mas essas técnicas estão muito além do escopo da lição atual. - -# Terceiro teste estilométrico: método Delta de John Burrows (avançado) - -Os primeiros dois métodos estilométricos foram mais fáceis de implementar. Este próximo, baseado na estatística *Delta* de John Burrows[^10], é consideravelmente mais complexo, tanto conceitualmente (a matemática é mais complicada) quanto computacionalmente (mais código necessário). É, no entanto, um dos métodos estilométricos mais proeminentes em uso hoje. - -Assim como o qui-quadrado de Kilgariff, o método Delta de Burrows é uma medida da "distância" entre um texto cuja autoria queremos averiguar e algum outro corpus. Ao contrário do qui-quadrado, no entanto, o método Delta é projetado para comparar um texto anônimo (ou conjunto de textos) com as assinaturas de vários autores diferentes ao mesmo tempo. Mais precisamente, o método Delta mede como o texto anônimo *e conjuntos de textos escritos por um número arbitrário de autores conhecidos* divergem da média de todos eles juntos. 
Além disso, o método Delta atribui peso igual a todas as características que mede, evitando assim o problema de palavras comuns sobrecarregarem os resultados, o que era um problema com os testes de qui-quadrado. Por todas essas razões, o método Delta de John Burrows é geralmente uma solução mais eficaz para a questão da autoria. - -O algoritmo original de Burrows pode ser resumido da seguinte forma: - -* Reúna um grande corpus composto por textos escritos por um número arbitrário de autores; digamos que o número de autores seja `x`; -* Encontre as `n` palavras mais frequentes no corpus para usar como elementos; -* Para cada uma dessas `n` características, calcule a participação de cada subcorpora dos `x` autores, como uma porcentagem do número total de palavras. Por exemplo, a palavra "ele" pode representar 4,72% das palavras no subcorpus do Autor A; -* Em seguida, calcule a média e o desvio padrão desses `x` valores e use-os como a média oficial e o desvio padrão para esse elemento em todo o corpus. Em outras palavras, estaremos usando uma _média de médias_ em vez de calcular um único valor que represente a parcela de todo o corpus dado por cada palavra. Fazemos isso porque queremos evitar que um subcorpus maior tenha maior influência nos resultados a seu favor e defina a norma do corpus de tal forma que se espere que tudo se pareça com ele; -* Para cada um dos `n` elementos e `x` subcorpora, calcule um [`z-score`](https://perma.cc/S2RH-LF9K) descrevendo o quão distante da norma do corpus está o uso desse elemento particular neste subcorpus específico. Para fazer isso, subtraia a "média das médias" de um dado elemento da frequência com que ela é encontrada no subcorpus e divida o resultado pelo seu desvio padrão. 
A Figura 3 mostra a equação de z-score para o elemento 'i', onde C(i) representa a frequência observada, a letra grega mu representa a média das médias e a letra grega sigma, o desvio padrão; - -{% include figure.html filename="stylometry-python-7.jpg" caption="Imagem 3: Equação para a estatística de z-score." %} - -* Em seguida, calcule os mesmos `z-scores` para cada elemento no texto para o qual queremos determinar a autoria; -* Finalmente, calcule um *score delta* comparando o documento de teste com o subcorpus de cada candidato. Para fazer isso, tome a *média dos valores absolutos das diferenças entre os `z-scores` para cada elemento entre o documento de teste e o subcorpus do candidato*. (leia duas vezes!) Isso dá peso igual a cada elemento, não importa a frequência com que as palavras ocorram nos textos; caso contrário, os 3 ou 4 principais elementos sobrecarregariam todo o resto. A Figura 4 mostra a equação para Delta, onde Z(c,i) é o `z-score` para o elemento 'i' no candidato 'c', e Z(t,i) é o `z-score` para o elemento 'i' no caso de teste; - -{% include figure.html filename="stylometry-python-8.jpg" caption="Imagem 4: Equação para a estatística Delta de John Burrows." %} - -* O candidato "vencedor", assim como nas duas outras técnicas que aplicamos, é o autor para o qual a pontuação delta entre o subcorpus do autor e o documento de teste é a mais baixa. - -Stefan Evert _et al_.[^11] fornece uma discussão aprofundada das variantes, refinamentos e complexidades do método, mas nos ateremos ao essencial para os propósitos desta lição. Uma explicação diferente de Delta, escrita em espanhol, e uma aplicação a um corpus de romances espanhóis também podem ser encontradas em um artigo recente de José Calvo Tello.[^12] - -## Seleção de elementos - -Vamos combinar todos os subcorpora em um único corpus para Delta calcular um "padrão" para trabalhar. Então, vamos selecionar um número de palavras para usar como característica. 
Lembre-se de que usamos 500 palavras para calcular o qui-quadrado de Kilgariff; desta vez, usaremos um conjunto menor de 30 palavras (a maioria, senão todas, palavras funcionais e verbos comuns) como nossos elementos. - -```python -# Combinar todos os corpora, exceto os documentos de teste, em um único corpus -corpus_completo = [] -for autor in autores: - corpus_completo += obras_tokens[autor] - -# Obter uma distribuição de frequência -freq_dist_corpus_completo = list(nltk.FreqDist(corpus_completo).most_common(30)) -freq_dist_corpus_completo[ :10 ] -``` - -Uma amostra das palavras mais frequentes e suas respectivas ocorrências parece com o seguinte: - -``` -[('a', 17619), - ('que', 17345), - ('de', 17033), - ('e', 15449), - ('o', 14283), - ('não', 7086), - ('do', 6019), - ('da', 5647), - ('os', 5299), - ('um', 4873)] -``` - -## Calculando elementos para cada subcorpus - -Vejamos as frequências de cada característica no subcorpus de cada candidato, como uma proporção do número total de tokens no subcorpus. Vamos calcular esses valores e armazená-los em um dicionário de dicionários, uma maneira conveniente de construir um [array bidimensional](https://perma.cc/HR9K-24MG) em Python. - -```python -# Criar uma lista com os elementos e a estrutura principal de dados -features = [word for word,freq in freq_dist_corpus_completo] -feature_freqs = {} - -for autor in autores: - # Criar um dicionário para os elementos de cada candidato - feature_freqs[autor] = {} - - # Obter um valor auxiliar contendo o número de tokens no subcorpus do autor - geral = len(obras_tokens[autor]) - - # Calcular a presença de cada elemento no subcorpus - for feature in features: - presenca = obras_tokens[autor].count(feature) - feature_freqs[autor][feature] = presenca / geral -``` - -## Calculando médias de elementos e desvios-padrão - -Dadas as frequências de elementos para todos os subcorpora que acabamos de calcular, podemos encontrar uma "média das médias" e um desvio padrão para cada elemento. 
Armazenaremos esses valores em outro "dicionário de dicionários". - -```python -import math - -# A estrutura de dados na qual iremos armazenar -# as "estatísticas padrão do corpus" -corpus_features = {} - -# Para cada elemento... -for feature in features: - # Criar um subdicionário que conterá a média e o desvio padrão do elemento - corpus_features[feature] = {} - - # Calcular a média das frequências expressas no subcorpora - feature_average = 0 - for autor in autores: - feature_average += feature_freqs[autor][feature] - feature_average /= len(autores) - corpus_features[feature]["Mean"] = feature_average - - # Calcular o desvio padrão usando a fórmula básica para uma amostra - feature_stdev = 0 - for autor in autores: - diff = feature_freqs[autor][feature] - corpus_features[feature]["Mean"] - feature_stdev += diff * diff - feature_stdev /= (len(autores) - 1) - feature_stdev = math.sqrt(feature_stdev) - corpus_features[feature]["StdDev"] = feature_stdev -``` - -## Calculando z-scores - -Em seguida, transformamos as frequências de características observadas no subcorpora dos cinco candidatos em `z-scores`, descrevendo o quão distante da "estatística padrão do corpus" essas observações estão. Nada extravagante aqui: nós meramente aplicamos a definição do `z-score` para cada elemento e armazenamos os resultados em outro array bidimensional. - -```python -feature_zscores = {} - -for autor in autores: - feature_zscores[autor] = {} - - for feature in features: - # Definição do z-score = (value - mean) / stddev - # Usamos variáveis intermediárias para tornar o - # código mais fácil de ler - feature_val = feature_freqs[autor][feature] - feature_mean = corpus_features[feature]["Mean"] - feature_stdev = corpus_features[feature]["StdDev"] - feature_zscores[autor][feature] = ((feature_val-feature_mean) / - feature_stdev) -``` - -## Calculando elementos, z-scores e Delta para nosso caso de teste - -Em seguida, precisamos comparar os documentos de teste com o corpus. 
O seguinte trecho de código, que essencialmente recapitula tudo o que fizemos até agora, conta as frequências de cada um de nossos 30 elementos nos documentos de teste e calcula os `z-scores` de acordo. -Por fim, usamos a fórmula para Delta definida por Burrows para extrair uma única pontuação comparando cada documento de teste com cada um dos cinco "autores candidatos". Lembre-se: quanto menor a pontuação Delta, mais semelhante a assinatura estilométrica do documento à do candidato. - -```python -for obra in obras_destacadas: - # Tokenizar o documento de teste - testcase_tokens = nltk.word_tokenize(obras[obra]) - - # Filtrar a pontuação e colocar os tokens em minúsculas - testcase_tokens = [token.lower() for token in testcase_tokens - if any(c.isalpha() for c in token)] - - # Calcular as frequências dos elementos do documento de teste - geral = len(testcase_tokens) - testcase_freqs = {} - for feature in features: - presenca = testcase_tokens.count(feature) - testcase_freqs[feature] = presenca / geral - - # Calcular os z-scores dos elementos do documento de teste - testcase_zscores = {} - for feature in features: - feature_val = testcase_freqs[feature] - feature_mean = corpus_features[feature]["Mean"] - feature_stdev = corpus_features[feature]["StdDev"] - testcase_zscores[feature] = (feature_val - feature_mean) / feature_stdev - - # Calcular Delta para cada autor - for autor in autores: - delta = 0 - for feature in features: - delta += math.fabs((testcase_zscores[feature] - - feature_zscores[autor][feature])) - delta /= len(features) - print( "Delta score do documento", - obra, - "para o candidato", - autor, - "é =", - delta ) - print("\n") -``` - -Como nas outras duas técnicas, o resultado serão 5 blocos de código dando o valor de Delta de cada documento para cada suposto autor. 
O primeiro bloco se parecerá com isso: - -``` -Delta score do documento Assis (teste) para o candidato Assis é = 0.8715781237572774 -Delta score do documento Assis (teste) para o candidato Alencar é = 1.2624531605759595 -Delta score do documento Assis (teste) para o candidato Castelo Branco é = 1.2303968803032856 -Delta score do documento Assis (teste) para o candidato Castilho é = 1.6276770882853728 -Delta score do documento Assis (teste) para o candidato Chagas é = 1.0527125070730734 -``` - -Vamos avaliar todos os valores Delta na nossa matriz de confusão (reduzidos para 4 casas decimais): - -| | Assis | Alencar | Castelo Branco | Castilho | Chagas | -| --------- | --------- | --------- | --------- | --------- | --------- | -| **Assis (teste)** | **0.8715** | 1.2624 | 1.2303 | 1.6276 | 1.0527 | -| **Alencar (teste)** | 1.9762 | **1.3355** | 1.3878 | 1.6425 | 1.5042 | -| **Castelo Branco (teste)** | 1.004 | 1.3208 | **0.8182** | 1.5202 | 1.2829 | -| **Castilho (teste)** | 1.5705 | 1.2553 | 1.0970 | **0.4518** | 0.8176 | -| **Chagas (teste)** | 1.1444 | 1.0169 | 0.9462 | 0.9864 | **0.7756** | - -Com o método Delta, pudemos inferir corretamente 100% da autoria dos documentos de teste! Alencar, que teve o pior valor nas duas outras técnicas, aqui aparece com o menor valor entre os 5 candidatos. -Ao utilizarmos autores brasileiros e portugueses, tínhamos em mente também a possibilidade de que a comparação entre ficheiros de autores de uma mesma nacionalidade pudessem ter valores mais próximos que entre autores de nacionalidades distintas, em função de particularidades linguísticas, o que parece que não foi o caso aqui. Por se tratarem de obras do século XIX, poderíamos buscar explicações para isso na maior similaridade das línguas na época, na influência da Academia Portuguesa no Brasil, ou mesmo do letramento e influências dos autores. 
Uma segunda análise com obras mais contemporâneas seria um excelente segundo passo para esta análise, e fica como sugestão para o leitor. - -# Leituras adicionais e recursos - -## Estudos de caso interessantes - -Estilometria e/ou atribuição de autoria têm sido utilizadas em diversos contextos, empregando diversas técnicas. Aqui estão alguns estudos de caso interessantes: - -* Javier de la Rosa e Juan Luis Suárez procuram o autor de um famoso romance espanhol do século XVI entre uma lista considerável de candidatos. [^13] -* Maria Slautina e Mikhail Marusenko usam o reconhecimento de padrões em um conjunto de recursos sintáticos, gramaticais e lexicais, desde a contagem de palavras simples (com marcação de classe gramatical) a vários tipos de frases, a fim de estabelecer semelhanças estilísticas entre os textos medievais.[^14] -* Ellen Jordan, Hugh Craig e Alexis Antonia examinam o caso de periódicos britânicos do século XIX, nos quais os artigos geralmente não eram assinados, para determinar o autor de quatro resenhas de trabalhos de ou sobre as irmãs Brontë.[^15] Este estudo de caso aplica uma versão inicial de outro método desenvolvido por John Burrows, o método Zeta, que se concentra nas palavras favoritas de um autor em vez de palavras de função comum.[^16] -* Valérie Beaudoin e François Yvon analisaram 58 peças em verso dos dramaturgos franceses Corneille, Racine e Molière, descobrindo que as duas primeiras foram muito mais consistentes na maneira como estruturaram sua escrita do que as últimas.[^17] -* Marcelo Luiz Brocardo, Issa Traore, Sherif Saad e Isaac Woungang aplicam [aprendizagem supervisionada](https://perma.cc/7TAQ-JECD) e [modelos n-gram](https://perma.cc/X34K-5R9X) para determinar a autoria de mensagens curtas com um grande número de autores em potencial, como e-mails e tweets.[^18] -* Moshe Koppel e Winter Yaron propõem o "método do impostor", que tenta determinar se dois textos foram escritos pelo mesmo autor, inserindo-os em um conjunto de 
textos escritos por falsos candidatos.[^19] Justin Anthony Stover _et al._ recentemente aplicou a técnica para determinar a autoria de um manuscrito do século II recém-descoberto.[^20] -* Finalmente, uma equipe liderada por David I. Holmes estudou o caso peculiar de documentos escritos por um soldado da Guerra Civil ou por sua viúva que pode ter copiado intencionalmente seu estilo de escrita.[^21] - -## Referências adicionais sobre autoria e estilometria - -A referência mais exaustiva em todos os assuntos relacionados à atribuição de autoria, incluindo a história do campo, seus fundamentos matemáticos e linguísticos e seus vários métodos, foi escrita por Patrick Juola em 2007.[^22] O Capítulo 7, em particular, mostra como a atribuição de autoria pode servir como um marcador para várias identidades de grupo (gênero, nacionalidade, dialeto, etc.), para mudanças na linguagem ao longo do tempo, e até mesmo para personalidade e saúde mental. - -Uma pesquisa mais curta pode ser encontrada em Moshe Koppel _et al._, que discute casos em que há um único autor candidato cuja autoria deve ser confirmada, um grande número de candidatos para os quais apenas pequenas amostras de escrita estão disponíveis para treinar um algoritmo de aprendizado de máquina, ou nenhum candidato conhecido.[^23] - -O artigo de Stamatatos citado anteriormente[^2] também contém uma pesquisa qualitativa do campo. - -## Varia - -*Programming historians* que desejam explorar mais a estilometria podem fazer o download do pacote [Stylo](https://cran.r-project.org/web/packages/stylo/index.html),[^24] que se tornou um padrão _de facto_. Entre outras coisas, o pacote Stylo fornece uma implementação do método Delta, funcionalidade de extração de recursos e interfaces gráficas de usuário convenientes tanto para manipulação de dados quanto para produção de resultados visualmente atraentes. 
Observe que o Stylo é escrito em [R](https://www.r-project.org/), o que significa que você precisará do R instalado no seu computador para executá-lo, mas entre a interface gráfica do usuário e os tutoriais, pouco ou nenhum conhecimento prévio de programação R deve ser necessário. - -Leitores fluentes em francês interessados em explorar as implicações [epistemológicas](https://perma.cc/6DFE-QTWV) das interações entre métodos quantitativos e qualitativos na análise do estilo de escrita devem ler Clémence Jacquot.[^25] - -Surpreendentemente, os dados obtidos por meio de [reconhecimento ótico de caracteres](https://perma.cc/R9U6-TRGE) (OCR) se mostraram adequados para fins de atribuição de autoria, mesmo quando os dados sofrem de altas taxas de erro de OCR.[^26] - -Por fim, existe um [grupo Zotero](https://www.zotero.org/groups/643516/stylometry_bibliography/items) dedicado à estilometria, onde você pode encontrar muitas outras referências a métodos e estudos. - -# Agradecimentos - -Agradecimentos a Stéfan Sinclair e Andrew Piper, em cujos seminários na Universidade McGill este projeto começou. Também agradeço à minha orientadora de tese, Susan Dalton, cuja orientação é sempre inestimável. - -# Notas finais - -[^1]: Veja, por exemplo, Justin Rice, ["What Makes Hemingway Hemingway? A statistical analysis of the data behind Hemingway's style"](https://perma.cc/W8TR-UH6S) - -[^2]: Efstathios Stamatatos, “A Survey of Modern Authorship Attribution Method,” _Journal of the American Society for Information Science and Technology_, vol. 60, no. 3 (December 2008), p. 538–56, citation on p. 540, [https://doi.org/10.1002/asi.21001](https://doi.org/10.1002/asi.21001). - -[^3]: Jan Rybicki, “Vive La Différence: Tracing the (Authorial) Gender Signal by Multivariate Analysis of Word Frequencies,” _Digital Scholarship in the Humanities_, vol. 31, no. 4 (December 2016), pp. 746–61, [https://doi.org/10.1093/llc/fqv023](https://doi.org/10.1093/llc/fqv023). Sean G. 
Weidman e James O’Sullivan, “The Limits of Distinctive Words: Re-Evaluating Literature’s Gender Marker Debate,” _Digital Scholarship in the Humanities_, 2017, [https://doi.org/10.1093/llc/fqx017](https://doi.org/10.1093/llc/fqx017). - -[^4]: Ted Underwood, David Bamman, e Sabrina Lee, “The Transformation of Gender in English-Language Fiction”, _Cultural Analytics_, Feb. 13, 2018, [https://doi.org/10.22148/16.019](https://doi.org/10.22148/16.019). - -[^5]: Sven Meyer zu Eissen e Benno Stein, “Intrinsic Plagiarism Detection,” in _ECIR 2006_, edited by Mounia Lalmas, Andy MacFarlane, Stefan Rüger, Anastasios Tombros, Theodora Tsikrika, e Alexei Yavlinsky, Berlin, Heidelberg: Springer, 2006, pp. 565–69, [https://doi.org/10.1007/11735106_66](https://doi.org/10.1007/11735106_66). - -[^6]: Cynthia Whissell, “Traditional and Emotional Stylometric Analysis of the Songs of Beatles Paul McCartney and John Lennon,” _Computers and the Humanities_, vol. 30, no. 3 (1996), pp. 257–65. - -[^7]: Douglass Adair, "The Authorship of the Disputed Federalist Papers", _The William and Mary Quarterly_, vol. 1, no. 2 (April 1944), pp. 97-122. - -[^8]: T. C. Mendenhall, "The Characteristic Curves of Composition", _Science_, vol. 9, no. 214 (Mar. 11, 1887), pp. 237-249. - -[^9]: Adam Kilgarriff, "Comparing Corpora", _International Journal of Corpus Linguistics_, vol. 6, no. 1 (2001), pp. 97-133. - -[^10]: John Burrows, "'Delta': a Measure of Stylistic Difference and a Guide to Likely Authorship", _Literary and Linguistic Computing_, vol. 17, no. 3 (2002), pp. 267-287. - -[^11]: Stefan Evert et al., "Understanding and explaining Delta measures for authorship attribution", _Digital Scholarship in the Humanities_, vol. 32, no. suppl_2 (2017), pp. ii4-ii16. - -[^12]: José Calvo Tello, “Entendiendo Delta desde las Humanidades,” [_Caracteres_, vol.5, no.1 (May 27 2016)](https://perma.cc/LNF3-QP8V), pp.140-176. 
- -[^13]: Javier de la Rosa and Juan Luis Suárez, “The Life of Lazarillo de Tormes and of His Machine Learning Adversities,” _Lemir_, vol. 20 (2016), pp. 373-438. - -[^14]: Maria Slautina e Mikhaïl Marusenko, “L’émergence du style, The emergence of style,” _Les Cahiers du numérique_, vol. 10, no. 4 (November 2014), pp. 179–215, [https://doi.org/10.3166/LCN.10.4.179-215](https://doi.org/10.3166/LCN.10.4.179-215). - -[^15]: Ellen Jordan, Hugh Craig, e Alexis Antonia, “The Brontë Sisters and the ‘Christian Remembrancer’: A Pilot Study in the Use of the ‘Burrows Method’ to Identify the Authorship of Unsigned Articles in the Nineteenth-Century Periodical Press,” _Victorian Periodicals Review_, vol. 39, no. 1 (2006), pp. 21–45. - -[^16]: John Burrows, “All the Way Through: Testing for Authorship in Different Frequency Strata,” _Literary and Linguistic Computing_, vol. 22, no. 1 (April 2007), pp. 27–47, [https://doi.org/10.1093/llc/fqi067](https://doi.org/10.1093/llc/fqi067). - -[^17]: Valérie Beaudoin e François Yvon, “Contribution de La Métrique à La Stylométrie,” _JADT 2004: 7e Journées internationales d'Analyse statistique des Données Textuelles_, vol. 1, Louvain La Neuve, Presses Universitaires de Louvain, 2004, pp. 107–18. - -[^18]: Marcelo Luiz Brocardo, Issa Traore, Sherif Saad e Isaac Woungang, “Authorship Verification for Short Messages Using Stylometry,” _2013 International Conference on Computer, Information and Telecommunication Systems (CITS)_, 2013, [https://doi.org/10.1109/CITS.2013.6705711](https://doi.org/10.1109/CITS.2013.6705711). - -[^19]: Moshe Koppel e Winter Yaron, “Determining If Two Documents Are Written by the Same Author,” _Journal of the Association for Information Science and Technology_, vol. 65, no. 1 (October 2013), pp. 178–87, [https://doi.org/10.1002/asi.22954](https://doi.org/10.1002/asi.22954). 
- -[^20]: Justin Anthony Stover et al., "Computational authorship verification method attributes a new work to a major 2nd century African author", _Journal of the Association for Information Science and Technology_, vol. 67, no. 1 (2016), pp. 239–242. - -[^21]: David I. Holmes, Lesley J. Gordon, e Christine Wilson, "A widow and her soldier: Stylometry and the American Civil War", _Literary and Linguistic Computing_, vol. 16, no 4 (2001), pp. 403–420. - -[^22]: Patrick Juola, “Authorship Attribution,” _Foundations and Trends in Information Retrieval_, vol. 1, no. 3 (2007), pp. 233–334, [https://doi.org/10.1561/1500000005](https://doi.org/10.1561/1500000005). - -[^23]: Moshe Koppel, Jonathan Schler, e Shlomo Argamon, “Computational Methods in Authorship Attribution,” _Journal of the Association for Information Science and Technology_. vol. 60, no. 1 (January 2009), pp. 9–26, [https://doi.org/10.1002/asi.v60:1](https://doi.org/10.1002/asi.v60:1). - -[^24]: Maciej Eder, Jan Rybicki, e Mike Kestemont, “Stylometry with R: A Package for Computational Text Analysis,” _The R Journal_, vol. 8, no. 1 (2016), pp. 107–21. - -[^25]: Clémence Jacquot, “Rêve d'une épiphanie du style: visibilité et saillance en stylistique et en stylométrie,” _Revue d’Histoire Littéraire de la France_ , vol. 116, no. 3 (2016), pp. 619–39. - -[^26]: Patrick Juola, John Noecker Jr, e Michael Ryan, "Authorship Attribution and Optical Character Recognition Errors", _TAL_, vol. 53, no. 3 (2012), pp. 101–127. 
+--- +title: Introdução à estilometria com Python +layout: lesson +slug: introducao-estilometria-python +date: 2018-04-21 +translation_date: 2021-12-27 +authors: +- François Dominic Laramée +reviewers: +- Folgert Karsdorp +- Jan Rybicki +- Antonio Rojas Castro +editors: +- Adam Crymble +translator: +- Daniel Bonatto Seco +translation-editor: +- Jimmy Medeiros +translation-reviewer: +- Bruno Almeida +- Suemi Higuchi +difficulty: 2 +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/445 +activity: analyzing +topics: [distant-reading, python] +abstract: "Nesta lição, aprenderá a realizar análises estilométricas e a determinar a autoria de textos. A lição cobre três métodos: Curvas Características de Composição de Mendenhall, Método Qui-Quadrado de Kilgarriff e Método Delta de John Burrows." +original: introduction-to-stylometry-with-python +avatar_alt: Mulher a ler junto a uma pintura +doi: 10.46430/phpt0024 +--- + + +{% include toc.html %} + +# Introdução + +[Estilometria](https://perma.cc/NYH2-KWLA) é o estudo quantitativo do estilo literário por meio de métodos de [leitura distante](https://perma.cc/XK8J-F6ZF) computacional. É baseado na observação de que os autores tendem a escrever de maneiras relativamente consistentes, reconhecíveis e únicas. Por exemplo: + +* Cada pessoa tem seu próprio vocabulário único, às vezes rico, às vezes limitado. Embora um vocabulário mais amplo esteja geralmente associado à qualidade literária, nem sempre é esse o caso. Ernest Hemingway é famoso por usar um número surpreendentemente pequeno de palavras diferentes em sua escrita,[^1] o que não o impediu de ganhar o Prêmio Nobel de Literatura em 1954; +* Algumas pessoas escrevem frases curtas, enquanto outras preferem blocos longos de texto compostos por muitas frases; +* Não há duas pessoas que usem ponto-e-vírgulas, travessões e outras formas de pontuação exatamente da mesma maneira. 
+ +As maneiras como os escritores usam pequenas [*function words*](https://perma.cc/284C-CNHD), como artigos, preposições e conjunções, mostram-se particularmente reveladoras. Em uma pesquisa dos métodos estilométricos históricos e atuais, Efstathios Stamatatos aponta que as palavras funcionais são "usadas de maneira amplamente inconsciente pelos autores e são independentes do tópico"[^2]. Para a análise estilométrica, isso é muito vantajoso, visto que esse padrão inconsciente tende a variar menos no [*corpus*](https://perma.cc/9XQ4-J4A5) de um autor do que seu vocabulário geral (e também é muito difícil para um pretenso falsificador copiar). As palavras funcionais também foram identificadas como marcadores importantes do gênero literário e da cronologia. + +Os pesquisadores têm usado a estilometria como uma ferramenta para estudar uma variedade de questões culturais. Por exemplo, uma quantidade considerável de pesquisas estudou as diferenças entre as maneiras como homens e mulheres escrevem[^3] ou sobre o que escrevem.[^4] Outros pesquisadores estudaram as maneiras como uma mudança repentina no estilo de escrita em um único texto pode indicar plágio[^5] e até mesmo a maneira como as letras dos músicos John Lennon e Paul McCartney se tornaram cada vez menos alegres e menos ativas à medida que os [Beatles](https://perma.cc/DQ66-M79T) se aproximavam do fim de sua carreira de gravação na década de 1960.[^6] + +No entanto, uma das aplicações mais comuns da estilometria é na atribuição de autoria. Dado um texto anônimo, às vezes é possível inferir quem o escreveu medindo certas características, como o número médio de palavras por frase ou a propensão do autor de usar "todavia" em vez de "no entanto", e comparando as medidas com outros textos escritos pelo suposto autor. 
Este é o objetivo deste tutorial, onde a partir de um conjunto de obras clássicas de romancistas lusos e brasileiros do século XIX iremos comparar exemplares de suas obras com o estilo literário do conjunto de autores a fim de tentar inferir suas respectivas autorias (nota de tradução: foi decidido mudar o _corpus_ usado nesta lição para um que fosse culturalmente mais relevante para o público que fala e escreve português; foi mantida a restante estrutura da lição original, com excepção de ligeiras adaptações face à mudança do _corpus_). + +## Objetivos de aprendizado + +No final desta lição, teremos percorrido os seguintes tópicos: + +* Como aplicar vários métodos estilométricos para inferir a autoria de um texto anônimo ou conjunto de textos; +* Como usar estruturas de dados relativamente avançadas, incluindo [dicionários](https://perma.cc/TTF4-SJ23) de [strings](https://perma.cc/7DCC-M9AT) e dicionários de dicionários, em [Python](https://perma.cc/Z82S-3L3M); +* O básico do [Natural Language Toolkit](https://perma.cc/E7LZ-WECZ) (NLTK), um módulo Python popular dedicado a [processamento de linguagem natural](https://perma.cc/MFX4-LAVZ). + +## Leitura prévia + +Se você não tem experiência com a linguagem de programação Python ou está tendo dificuldade nos exemplos apresentados neste tutorial, o autor recomenda que você leia as lições [Trabalhando com ficheiros de texto em Python](/pt/licoes/trabalhando-ficheiros-texto-python) e [Manipular Strings com Python](/pt/licoes/manipular-strings-python). Note que essas lições foram escritas em Python versão 2, enquanto esta usa Python versão 3. As diferenças de [sintaxe](https://perma.cc/E5LQ-S65P) entre as duas versões da linguagem podem ser sutis. Se você ficar em dúvida, siga os exemplos conforme descritos nesta lição e use as outras lições como material de apoio. 
(Este tutorial encontra-se atualizado até à versão [Python 3.8.5](https://perma.cc/XCT2-Q4AT); as [strings literais formatadas](https://perma.cc/U6Q6-59V3) na linha `with open(f'dados/pg{id_ficheiro}.txt', 'r', encoding='utf-8') as f:`, por exemplo, requerem Python 3.6 ou uma versão mais recente da linguagem.) + +## Materiais requeridos + +Este tutorial usa conjuntos de dados e software que você terá que baixar e instalar. + +### O conjunto de dados ### + +Para trabalhar nesta lição, você precisará baixar e descompactar o ficheiro [.zip](/assets/introduction-to-stylometry-with-python/dataset_estilometria.zip) contendo as 15 obras que compõem o *corpus* que será utilizado neste tutorial. As obras foram originalmente extraídas do [Projeto Gutenberg](https://perma.cc/8GTT-3M9N). Ao descompactar o ficheiro, será criada uma pasta com o nome `dados`. Este será o seu [diretório de trabalho](https://perma.cc/9KVS-T3A5) e todo o trabalho deve ser salvo aqui durante a execução da lição. + +### O software ### + +Esta lição usa as seguintes versões da linguagem Python e [bibliotecas](https://pt.wikipedia.org/wiki/Biblioteca_(computa%C3%A7%C3%A3o)): +* [Python 3.x](https://www.python.org/downloads/) - a última versão estável é recomendada; +* [nltk](https://www.nltk.org/) - Natural Language Toolkit, geralmente abreviado `nltk`; +* [matplotlib](https://matplotlib.org/) - visualização de dados e geração de gráficos; +* [re](https://docs.python.org/pt-br/3/library/re.html) - limpeza de dados via Regex (veremos durante o tutorial o porquê). + +Alguns desses módulos podem não estar pré-instalados em seu computador. Se você encontrar mensagens de erro como: "Módulo não encontrado" ou similares, você terá que baixar e instalar o(s) módulo(s) ausente(s). A forma mais simples de realizar esta tarefa é através do comando `pip`. Mais detalhes estão disponíveis através do tutorial do *Programming Historian* [Instalação de Módulos Python com pip](/pt/licoes/instalacao-modulos-python-pip). 
+ +## Algumas notas sobre Independência Linguística + +Este tutorial aplica a análise estilométrica a um conjunto de textos em português (PT-PT e PT-BR) usando uma biblioteca Python chamada `nltk`. Muitas das funcionalidades fornecidas pelo `nltk` operam com outros idiomas. Contanto que um idioma forneça uma maneira clara de distinguir os limites de uma palavra, o `nltk` deve ter um bom desempenho. Idiomas como o chinês, para os quais não há distinção clara entre os limites das palavras, podem ser problemáticos. O autor original desta lição utilizou `nltk` com textos em francês sem nenhum problema; outros idiomas que usam [diacríticos](https://perma.cc/7VGD-5968), como espanhol e alemão, também devem funcionar bem com `nltk`. Consulte a [documentação do nltk](https://perma.cc/S4EX-2DBT) para obter detalhes. + +Apenas uma das tarefas neste tutorial requer código dependente do idioma. Para dividir um texto em um conjunto de palavras em uma língua diferente do inglês, você precisará especificar o idioma apropriado como um parâmetro para o [tokenizador](https://perma.cc/NGM5-4MED) da biblioteca `nltk`, que usa o inglês como padrão. Isso será explicado no tutorial. + +Por fim, observe que algumas tarefas linguísticas, como [*part-of-speech tagging*](https://perma.cc/L9SU-PS9D), podem não ser suportadas pelo `nltk` em outros idiomas além do inglês. Este tutorial não cobre a aplicação de *part-of-speech tagging*. Se você precisar para os seus próprios projetos, consulte a [documentação do nltk](https://perma.cc/S4EX-2DBT) para obter orientações. 
+ +# O *corpus* - Contextualização + +No [exemplo original deste tutorial em inglês](/en/lessons/introduction-to-stylometry-with-python), utilizaram-se os [papéis federalistas](https://perma.cc/DW5V-MH5W) como um exemplo de aplicação de estilometria, utilizando as técnicas que serão apresentadas para inferir a autoria dos textos contestados dentro do conjunto de documentos que configura o *corpus*.[^7] +Como na língua portuguesa não temos um conjunto de textos que possua estas mesmas características, no exemplo que apresentaremos traremos um total de 15 obras completas de 5 autores diferentes, três deles portugueses e dois brasileiros, todos romancistas do século XIX, disponibilizadas pelo [Projeto Gutenberg](https://perma.cc/5PRR-TM3D). Utilizaremos duas obras de cada autor para definir seus respectivos estilos e uma terceira para constituir o conjunto de testes, para avaliarmos se as técnicas utilizadas realizarão a inferência correta de autoria através do grau de similaridade de cada obra deste conjunto com o estilo obtido de cada autor. 
+ +Os autores e obras utilizadas são os seguintes: + +| Autor | Obra 1 | Obra 2 | Obra 3 | +| --------- | --------- | --------- | --------- | +| [Machado de **Assis**](https://perma.cc/6BMU-UKZL) (Brasil)| [Dom Casmurro](https://www.gutenberg.org/ebooks/55752) (**55752**) | [Memorias Posthumas de Braz Cubas](https://www.gutenberg.org/ebooks/54829) (**54829**) | [Quincas Borba](https://www.gutenberg.org/ebooks/55682) (**55682**) | +| [José de **Alencar**](https://perma.cc/Y3Y2-VHJ5) (Brasil) | [Ubirajara](https://www.gutenberg.org/ebooks/38496) (**38496**) | [Cinco minutos](https://www.gutenberg.org/ebooks/44540) (**44540**) | [Como e porque sou romancista](https://www.gutenberg.org/ebooks/29040) (**29040**) | +| [Camilo **Castelo Branco**](https://perma.cc/Q4AJ-VZBH) (Portugal) | [Carlota Angela](https://www.gutenberg.org/ebooks/26025) (**26025**) | [Amor de Salvação](https://www.gutenberg.org/ebooks/26988) (**26988**) | [Amor de Perdição: Memorias d'uma familia](https://www.gutenberg.org/ebooks/16425) (**16425**) | +| [António Feliciano de **Castilho**](https://perma.cc/LZ9J-3H5Z) (Portugal) | [A Chave do Enigma](https://www.gutenberg.org/ebooks/32002) (**32002**) | [A Primavera](https://www.gutenberg.org/ebooks/65021) (**65021**) | [O presbyterio da montanha](https://www.gutenberg.org/ebooks/28127) (**28127**) | +| [Manuel Pinheiro **Chagas**](https://perma.cc/8LU3-RADW) (Portugal) | [Historia alegre de Portugal](https://www.gutenberg.org/ebooks/29394) (**29394**) | [A Lenda da Meia-Noite](https://www.gutenberg.org/ebooks/23400) (**23400**) | [Astucias de Namorada, e Um melodrama em Santo Thyrso](https://www.gutenberg.org/ebooks/29342) (**29342**) | + +As partes destacadas do nome de cada autor indicam como os mesmos serão referenciados neste tutorial a partir deste ponto. Para os códigos utilizaremos o `EBook-No.` (número de referência da obra no Projeto Gutenberg), presente no nome dos ficheiros disponibilizados. 
+ +# Nossos casos de teste + +Nesta lição, usaremos obras de romancistas brasileiros e portugueses do século XIX como um estudo de caso para demonstrar três abordagens estilométricas diferentes: + +1. Curvas características de composição de Mendenhall +2. Método Qui-Quadrado de Kilgarriff +3. Método Delta de John Burrows + +Em todas as abordagens acima mencionadas, utilizaremos os documentos das colunas **Obra 1** e **Obra 2** para definir o estilo de cada autor. Os documentos da coluna **Obra 3** serão testados individualmente com cada um dos 5 autores para tentarmos inferir a autoria pela proximidade de estilo. + +# Preparando os dados para análise + +Antes de prosseguirmos com a análise estilométrica, precisamos carregar os ficheiros contendo todas as 15 obras em [estruturas de dados](https://perma.cc/P843-J4LB) na memória do computador. + +O primeiro passo neste processo é designar cada obra para o seu respectivo conjunto. Como cada obra está relacionada com o seu respectivo `EBook-No.`, podemos atribuir cada obra (valor) à chave do seu autor (ou a uma chave separada, se ela fizer parte da amostra de teste) usando um *dicionário* Python. O dicionário é um tipo de conjunto de dados composto de um número arbitrário de pares de chave-valor; neste caso, os nomes dos autores servirão como chaves (separados entre treino e teste), enquanto os `EBook-No.` das obras serão os valores associados a essas chaves. + +```python +ids_obras = { + 'Assis' : [55752, 54829], + 'Alencar' : [38496, 44540], + 'Castelo Branco' : [26025, 26988], + 'Castilho' : [32002, 65021], + 'Chagas' : [29394, 23400], + 'Assis (teste)' : [55682], + 'Alencar (teste)' : [29040], + 'Castelo Branco (teste)' : [16425], + 'Castilho (teste)' : [28127], + 'Chagas (teste)' : [29342] +} +``` + +Os dicionários Python são muito flexíveis. 
Por exemplo, podemos acessar um valor específico *indexando* o dicionário com uma de suas chaves, podemos varrer o dicionário inteiro fazendo um loop em sua lista de chaves, etc. Faremos amplo uso desta funcionalidade à medida que avançarmos. + +A seguir, como estamos interessados no vocabulário de cada autor, definiremos uma breve [função](https://perma.cc/P8CA-Y43Q) em Python que irá criar uma longa lista de palavras em cada uma das obras atribuídas a um único autor. Isso será armazenado como uma [string](https://perma.cc/7DCC-M9AT). +Abra o seu ambiente de desenvolvimento Python escolhido. Se você não sabe como fazer isso, leia "Configurar um ambiente de desenvolvimento integrado para Python" ([Windows](/pt/licoes/instalacao-windows), [Linux](/pt/licoes/instalacao-linux), [Mac](/pt/licoes/instalacao-mac)) antes de prosseguir. + +```python +# Função que compila todos os ficheiros de texto de cada grupo em uma única string + +import re + +def ler_ficheiros_para_string(ids_ficheiros): + global texto + strings = [] + for id_ficheiro in ids_ficheiros: + with open(f'dados/pg{id_ficheiro}.txt', 'r', + encoding='utf-8') as f: + texto = f.read() + texto = re.search(r"(START.*?\*\*\*)(.*)(\*\*\* END)", + texto, + re.DOTALL).group(2) + strings.append(texto) + return '\n'.join(strings) +``` + +Perceba que, dentro da função, temos também uma etapa de limpeza dos textos usando [expressões regulares](https://perma.cc/DT3K-XUBG). Isso foi necessário para este corpus específico pois as obras publicadas no Projeto Gutenberg possuem uma estrutura de cabeçalho e rodapé de [metadados](https://perma.cc/E8P8-GKDR) que não pode ser considerada na análise estilométrica, uma vez que não foram redigidas pelos autores analisados. 
A utilização de expressões regulares não faz parte do escopo deste tutorial, então limitar-nos-emos a compreender que estamos utilizando a biblioteca `re` para capturar apenas o conjunto de caracteres entre os marcadores `*** START OF THIS PROJECT GUTENBERG [NOME DA OBRA] ***` e `*** END OF THIS PROJECT GUTENBERG [NOME DA OBRA] ***` presentes em cada documento do projeto. Para maiores dúvidas sobre a utilização de expressões regulares e da biblioteca `re`, consulte a [documentação](https://perma.cc/JFP3-B4P4). + +Na sequência, construímos uma nova estrutura de dados chamando repetidamente a função `ler_ficheiros_para_string()`, passando a ela uma lista diferente de documentos a cada vez. Armazenaremos os resultados em outro dicionário, este com nomes do autor/caso de teste como chaves e todo o texto dos respectivos documentos como valores. Para simplificar, iremos nos referir à string contendo uma lista de documentos como "corpus do autor". + +```python +# Criar um dicionário com os corpora dos autores +obras = {} +for autor, ids_ficheiros in ids_obras.items(): + obras[autor] = ler_ficheiros_para_string(ids_ficheiros) +``` + +Para nos certificarmos de que os ficheiros foram carregados corretamente, imprima os primeiros cem caracteres de cada entrada do dicionário na tela: + +```python +for autor in obras: + print(obras[autor][:100]) +``` + +Se esta operação de impressão exibir quaisquer trechos de texto no console, então a operação de leitura dos ficheiros funcionou conforme o esperado e você pode prosseguir para a análise estilométrica. + +
    +Se os ficheiros não forem carregados, o motivo mais provável é que o seu diretório de trabalho atual não seja o repositório `dados` criado ao descompactar o ficheiro da seção de Materiais Requeridos acima; mudar o seu diretório de trabalho deve resolver o problema. Como você faz isso depende do seu ambiente de desenvolvimento Python. +
    + +# Primeiro teste estilométrico: curvas características de composição de Mendenhall + +O pesquisador literário T. C. Mendenhall escreveu certa vez que a assinatura estilística de um autor pode ser encontrada contando a frequência com que usa palavras de tamanhos diferentes.[^8] Por exemplo, se contarmos os tamanhos de palavras em vários segmentos de 1.000 ou 5.000 palavras de qualquer romance e, em seguida, traçarmos um gráfico das distribuições de comprimento das palavras, as curvas pareceriam praticamente as mesmas, não importando que partes do romance tivéssemos escolhido. Na verdade, Mendenhall acreditava que se alguém contasse palavras suficientes selecionadas de várias partes da obra de toda a vida de um escritor (digamos, 100.000 ou mais), a "curva característica" de uso de comprimento de palavras do autor se tornaria tão precisa que seria constante ao longo de sua vida. + +Pelos padrões de hoje, contar o comprimento das palavras parece uma forma muito direta (e talvez simplista) de medir o estilo literário. O método de Mendenhall não leva em consideração as palavras do vocabulário de um autor, o que é obviamente problemático. Portanto, não devemos tratar as curvas características como uma fonte particularmente confiável de evidência estilométrica. No entanto, Mendenhall publicou a sua teoria há mais de cento e trinta anos e fez todos os cálculos à mão. É compreensível que ele tivesse optado por trabalhar com uma estatística que, embora grosseira, fosse ao menos fácil de compilar. Em honra ao valor histórico de sua tentativa inicial de estilometria, e porque a curva característica produz resultados visuais interessantes que podem ser implementados rapidamente, usaremos o método de Mendenhall como um primeiro passo em nossa exploração das técnicas de atribuição de autoria. 
+ +O trecho de código necessário para calcular e exibir as curvas características para os autores e os documentos de teste é o seguinte: + +```python +# Carregar nltk e matplotlib +import nltk +nltk.download('punkt') +import matplotlib.pylab as plt + +obras_tokens = {} +obras_distribuicao_comprimento = {} + +id_subplot = 1 +fig = plt.figure(figsize=(20,20)) + +autores = list(obras.keys()) + +for autor in autores: + # Transformar os corpora dos autores em listas de tokens de palavras + tokens = nltk.word_tokenize(obras[autor], language="portuguese") + + # Filtrar pontuação + obras_tokens[autor] = ([token for token in tokens + if any(c.isalpha() for c in token)]) + + # Obter a distribuição de comprimentos de tokens + token_comprimentos = [len(token) for token in obras_tokens[autor]] + obras_distribuicao_comprimento[autor] = nltk.FreqDist(token_comprimentos) + + # Plotar a curva característica de composição + lista_chaves = [] + lista_valores = [] + + for i in range(1,16): + lista_chaves.append(i) + lista_valores.append(obras_distribuicao_comprimento[autor][i]) + + lista_valores_normalizado = [value/max(lista_valores) for value in lista_valores] + + plt.subplot(5, 5, id_subplot) + plt.plot(lista_chaves, lista_valores_normalizado) + plt.xticks(lista_chaves) + plt.title(autor) + id_subplot += 1 + +plt.savefig("stilometry_comparacao.jpeg", dpi=300, bbox_inches='tight') +plt.show() +``` + +Se você estiver trabalhando em um [Jupyter Notebook](https://jupyter.org/), adicione a expressão `%matplotlib inline` após a importação das bibliotecas; caso contrário, você pode não ver os gráficos em sua tela. Se você estiver trabalhando em um [Jupyter Lab](https://jupyterlab.readthedocs.io/en/stable/getting_started/installation.html), substitua esta expressão por `%matplotlib ipympl`. + +A primeira linha no trecho de código acima carrega o módulo *Natural Language Toolkit (nltk)*, que contém um número enorme de funções e recursos úteis para processamento de texto. 
Mal tocaremos em seus fundamentos nesta lição; se você decidir explorar mais a análise de texto em Python, recomendo fortemente que comece com [a documentação do nltk](https://www.nltk.org/). + +As próximas linhas configuram estruturas de dados que serão preenchidas pelo bloco de código dentro do loop `for`. Este loop faz os mesmos cálculos para todos os nossos "autores": + +* Invoca o método `word_tokenize()` do `nltk`, explicitando a linguagem do _corpus_ para português através do argumento `language="portuguese"`, e divide o _corpus_ em _tokens_, ou seja, palavras, números, pontuação, etc.; +* Olha para esta lista de tokens e filtra as não-palavras; +* Cria uma lista contendo os comprimentos de cada token de palavra restante; +* Cria um objeto de _distribuição de frequência_ a partir dessa lista de comprimentos de palavra, basicamente contando quantas palavras de uma letra, palavras de duas letras, etc., existem no _corpus_ do autor, e em seguida realiza a normalização dessa distribuição, ou seja, ajusta todos os valores em um intervalo entre 0 e 1. Esta etapa é realizada para comparar gráficos de distribuição em _corpus_ de tamanhos diferentes de forma mais clara; +* Plota um gráfico da distribuição de comprimentos de palavras no corpus, para todas as palavras de até 15 caracteres. + +Os resultados que obtemos são os seguintes: +{% include figure.html filename="introducao-estilometria-python-01.jpeg" caption="Imagem 1: Comparação da curva de Mendenhall para cada corpus." %} + +Como podemos ver pelos gráficos, é possível notar diferenças (embora sutis) entre todas as 5 curvas características de cada autor (linha superior de gráficos). Ao compararmos os documentos de teste (linha inferior de gráficos) com os autores, podemos notar que a curva característica dos documentos de teste dos autores Assis, Castilho e Chagas se assemelham mais à curva dos seus respectivos autores que de qualquer outro, o que seriam inferências corretas. 
O documento de Alencar é o que mais diverge da curva característica do autor. Isso pode ocorrer pelo fato do documento de teste ser uma autobiografia do autor, enquanto os documentos de treino são duas obras de ficção, o que poderia influenciar no seu estilo de escrita. Veremos nas próximas abordagens se conseguimos contornar esta situação. O documento de Castelo Branco também parece não ter se assemelhado à curva característica do autor. + +Para além desta análise meramente visual (que pode muitas vezes induzir ao erro), podemos ter um resultado quantitativo calculando a soma das distâncias entre os valores (normalizados) de frequência de cada documento de teste com os valores de frequência do *corpus* de cada possível autor. Por consequência, o autor que possuir a menor distância de frequência com o documento de teste seria o mais provável autor deste documento. Podemos implementar isso da seguinte forma: + +```python +# Dividir a lista de corpus entre autores e obras destacadas +autores = list(obras.keys())[:5] +obras_destacadas = list(obras.keys())[5:] + +obras_distribuicao_comprimento_normalizado = {} + +# Normalizar a distribuição de comprimentos de tokens em um novo dicionário +for index, obra in obras_distribuicao_comprimento.items(): + obras_distribuicao_comprimento_normalizado[index] = {k: + v/max(obra.values()) + for k, v in dict(obra).items()} + +# Calcular a soma da diferença da distribuição entre o documento de teste e cada autor (de 1 até 15 caracteres) +for obra in obras_destacadas: + for autor in autores: + soma_diferenca = 0 + for i in range(1,16): + diferenca = abs(obras_distribuicao_comprimento_normalizado[obra][i] - + obras_distribuicao_comprimento_normalizado[autor][i]) + soma_diferenca = soma_diferenca + diferenca + print('A soma da diferença do documento ' + + obra + + ' para o autor ' + + autor + + ' é ' + + str(soma_diferenca)) + print('\n') +``` + +O resultado deste trecho serão 5 blocos, cada um comparando um documento com os 5 possíveis 
autores. Abaixo o exemplo de como o primeiro bloco deve parecer: + +``` +A soma da diferença do documento Assis (teste) para o autor Assis é 0.25782806530977137 +A soma da diferença do documento Assis (teste) para o autor Alencar é 0.5192643726222002 +A soma da diferença do documento Assis (teste) para o autor Castelo Branco é 0.7410205025846326 +A soma da diferença do documento Assis (teste) para o autor Castilho é 0.46876355973646266 +A soma da diferença do documento Assis (teste) para o autor Chagas é 0.3466043230715998 +``` + +Vamos colocar os resultados dos 5 testes em uma [matriz de confusão](https://perma.cc/K42B-NQSR) (limitando a 4 casas decimais) para avaliarmos: + +| | Assis | Alencar | Castelo Branco | Castilho | Chagas | +| --------- | --------- | --------- | --------- | --------- | --------- | +| **Assis (teste)** | **0.2578** | 0.5192 | 0.7410 | 0.4687 | 0.3466 | +| **Alencar (teste)** | 0.9744 | **0.9844** | 0.4313 | 0.6979 | 0.7897 | +| **Castelo Branco (teste)** | 0.2812 | 0.4436 | **0.4761** | 0.2772 | 0.2803 | +| **Castilho (teste)** | 0.4396 | 0.4624 | 0.4114 | **0.1394** | 0.3184 | +| **Chagas (teste)** | 0.7746 | 0.5883 | 0.6636 | 0.6732 | **0.5888** | + +Os documentos de teste de Assis e Castilho possuem menor valor com seus respectivos autores, o que indica a maior proximidade. Isso é condizente com a similaridade dos gráficos que vimos anteriormente. O documento de teste de Chagas teve um "empate técnico" entre o estilo do próprio autor (0.5888) e Alencar (0.5883). Tanto os documentos de teste de Alencar quanto Castelo Branco ficaram com o maior valor em relação aos seus respectivos autores, logo a técnica não foi eficaz para estes dois autores. + +Se não tivéssemos informações adicionais para trabalharmos, poderíamos inferir corretamente 50% da atribuição de autoria (2 acertos, 2 erros e um "empate"), o que é um resultado considerável para uma técnica relativamente simples. 
Felizmente, a ciência estilométrica avançou muito desde a época de Mendenhall. + +# Segundo teste estilométrico: método qui-quadrado de Kilgarriff + +Em um artigo de 2001, Adam Kilgarriff[^9] recomenda o uso da estatística qui-quadrado para determinar a autoria. Leitores familiarizados com métodos estatísticos podem se lembrar que o qui-quadrado às vezes é usado para testar se um conjunto de observações (digamos, as intenções dos eleitores conforme declarado em uma pesquisa) segue uma certa [distribuição de probabilidade](https://perma.cc/668N-9GPD) ou padrão. Não é isso que buscamos aqui. Em vez disso, simplesmente usaremos a estatística para medir a "distância" entre os vocabulários empregados em dois conjuntos de textos. Quanto mais semelhantes os vocabulários, mais provável é que o mesmo autor tenha escrito os textos em ambos os conjuntos. Isso pressupõe que o vocabulário de uma pessoa e os padrões de uso das palavras são relativamente constantes. + +Veja como aplicar a estatística para atribuição de autoria: + +* Pegue os corpora associados a dois autores; +* Junte-os em um único corpus, maior; +* Conte os tokens para cada uma das palavras que podem ser encontradas neste corpus maior; +* Selecione as [`n`](https://perma.cc/D9ND-3C83) palavras mais comuns no corpus maior; +* Calcule quantos tokens dessas `n` palavras mais comuns esperaríamos encontrar em cada um dos dois corpora originais se fossem do mesmo autor. 
Isso significa simplesmente dividir o número de tokens que observamos no corpus combinado em dois valores, com base nos tamanhos relativos das contribuições dos dois autores para o corpus comum; +* Calcule uma distância qui-quadrada somando, sobre as `n` palavras mais comuns, os _quadrados das diferenças entre os números reais de tokens encontrados no corpus de cada autor e os números esperados_, divididos pelos números esperados. A Imagem 2 mostra a equação para a estatística qui-quadrado, onde C(i) representa o número observado de tokens para o recurso 'i' e E(i), o número esperado para esse recurso. + +{% include figure.html filename="stylometry-python-6.jpg" caption="Imagem 2: Equação para a estatística qui-quadrado." %} + +Quanto menor o valor do qui-quadrado, mais semelhantes são os dois corpora. Portanto, calcularemos o qui-quadrado de cada documento de teste com os 5 possíveis autores: os menores valores representarão a possível autoria de cada documento (assim como vimos no primeiro exemplo). + +Nota: Independentemente do método estilométrico que usamos, a escolha de `n`, o número de palavras a levar em consideração, é uma espécie de arte sombria. Na literatura pesquisada por Stamatatos[^2], pesquisadores sugeriram entre 100 e 1.000 das palavras mais comuns; um projeto chegou a usar cada palavra que aparecia no corpus pelo menos duas vezes. Como diretriz, quanto maior o corpus, maior o número de palavras que podem ser usadas como elementos sem correr o risco de dar importância indevida a uma palavra que ocorra apenas algumas vezes. Nesta lição, usaremos um `n` relativamente grande para o método qui-quadrado e um menor para o próximo método. Mudar o valor de `n` certamente mudará um pouco os resultados numéricos; no entanto, se uma pequena modificação de `n` causar uma mudança na atribuição de autoria, isso é um sinal de que o teste que você está realizando não é capaz de fornecer evidências significativas sobre o seu caso de teste. 
+ +O seguinte trecho de código implementa o método de Kilgariff, com as frequências das 500 palavras mais comuns no corpus conjunto sendo usadas no cálculo: + +```python +# Converter os tokens para caracteres minúsculos para que a mesma palavra, +# maiúscula ou não, conte como uma palavra + +for autor in autores: + obras_tokens[autor] = ( + [token.lower() for token in obras_tokens[autor]]) + +# Calcular o qui-quadrado de cada documento de teste com cada um dos 5 autores +for obra in obras_destacadas: + for autor in autores: + + # Primeiro, construir um corpus conjunto e identificar + # as 500 palavras mais frequentes nele + corpus_conjunto= (obras_tokens[obra] + + obras_tokens[autor]) + freq_dist_conjunto = nltk.FreqDist(corpus_conjunto) + termos_comuns = list(freq_dist_conjunto.most_common(500)) + + # Que proporção do corpus conjunto é constituído pelos + # tokens do autor candidato? + autor_compartihado = (len(obras_tokens[autor]) + / len(corpus_conjunto)) + + # Agora, vamos observar as 500 palavras mais frequentes no corpus do candidato + # e comparar o número de vezes que elas podem ser observadas + # ao que seria esperado se os artigos do autor e o documento de teste + # fossem ambas amostras aleatórias do mesmo conjunto. + quiquadrado = 0 + for word,count_conjunto in termos_comuns: + + # Com que frequência vemos essa palavra comum? + autor_count = obras_tokens[autor].count(word) + obra_count = obras_tokens[obra].count(word) + + # Com que frequência deveríamos vê-la? 
+ autor_count_esperado = count_conjunto * autor_compartihado + teste_count_esperado = count_conjunto * (1-autor_compartihado) + + # Adicionar a contribuição da palavra para a estatística qui-quadrado + quiquadrado += ((autor_count-autor_count_esperado) * + (autor_count-autor_count_esperado) / + autor_count_esperado) + + quiquadrado += ((obra_count-teste_count_esperado) * + (obra_count-teste_count_esperado) + / teste_count_esperado) + + print("A estatística de qui-quadrado do documento", + obra, + "para o candidato", + autor, + "é =", + quiquadrado) + print("\n") +``` + +Assim como no primeiro exemplo, o resultado será 5 blocos de resultados, cada um para um documento de teste. O primeiro bloco se parecerá com isso: +``` +A estatística de qui-quadrado do documento Assis (teste) para o candidato Assis é = 12266.387624251674 +A estatística de qui-quadrado do documento Assis (teste) para o candidato Alencar é = 13832.008019914058 +A estatística de qui-quadrado do documento Assis (teste) para o candidato Castelo Branco é = 15659.980573183348 +A estatística de qui-quadrado do documento Assis (teste) para o candidato Castilho é = 19458.24314684532 +A estatística de qui-quadrado do documento Assis (teste) para o candidato Chagas é = 13681.732446564287 +``` + +
    +No código acima, convertemos os tokens em minúsculas para não contar os tokens de palavras que começam com uma letra maiúscula porque aparecem no início de uma frase e os tokens minúsculos da mesma palavra como duas palavras diferentes. Às vezes, isso pode causar alguns erros, por exemplo, quando um substantivo próprio e um substantivo comum são escritos da mesma forma, exceto para maiúsculas, mas geralmente esta técnica aumenta a precisão. +
    + +Agora, vamos dar uma olhada na matriz de confusão dos resultados para esta técnica: + +| | Assis | Alencar | Castelo Branco | Castilho | Chagas | +| --------- | --------- | --------- | --------- | --------- | --------- | +| **Assis (teste)** | **12266** | 13832| 15659 | 19458 | 13681 | +| **Alencar (teste)** | 2550 | **3153** | 2581 | 2663 | 2765 | +| **Castelo Branco (teste)** | 17294 | 12063 | **11187** | 18133 | 13954 | +| **Castilho (teste)** | 11349 | 9203 | 8925 | **4531** | 7548 | +| **Chagas (teste)** | 6683 | 5700 | 5836 | 6970 | **5332** | + +Como podemos observar, o teste de qui-quadrado obteve um resultado superior à curva característica de composição de Mendenhall. Assis e Castilho permanecem com a inferência correta de autoria. Chagas, que passou pelo "empate técnico" na curva de composição, com o qui-quadrado também faz a inferência correta com uma distância considerável entre os demais possíveis autores. Dos autores que não haviam sido avaliados corretamente na curva de composição, Castelo Branco possui o menor valor de qui-quadrado, outra inferência correta. Alencar, no entanto, segue como o maior valor entre os 5 possíveis autores. De qualquer forma, já passamos de 50% de acerto com a curva característica de composição para 80% com o método qui-quadrado! + +No entanto, o qui-quadrado ainda é um método pouco refinado. Por um lado, palavras que aparecem com muita frequência tendem a ter um peso desproporcional no cálculo final. Às vezes, isso é bom; outras vezes, diferenças sutis de estilo representadas pelas maneiras como os autores usam palavras mais incomuns passarão despercebidas. 
+ +## Uma nota sobre classes gramaticais + +Em alguns casos e idiomas, pode ser útil aplicar a marcação de [Part-of-speech (classes gramaticais)](https://perma.cc/ER5P-CFQE) aos tokens de palavras antes de contá-los, de modo que a mesma palavra usada como duas classes gramaticais diferentes possa contar como dois elementos diferentes (por exemplo, o termo "mais" sendo usado como substantivo ou como advérbio de intensidade). Esta lição não usa marcação de classes gramaticais, mas poderia refinar os resultados em estudos de caso mais complexos. + +Se você precisar aplicar a marcação de classe gramatical aos seus próprios dados, poderá fazer o download de marcadores para outros idiomas, para trabalhar com uma ferramenta de terceiros como [Tree Tagger](https://perma.cc/DG9G-S5T2), ou mesmo para treinar o seu próprio marcador, mas essas técnicas estão muito além do escopo da lição atual. + +# Terceiro teste estilométrico: método Delta de John Burrows (avançado) + +Os primeiros dois métodos estilométricos foram mais fáceis de implementar. Este próximo, baseado na estatística *Delta* de John Burrows[^10], é consideravelmente mais complexo, tanto conceitualmente (a matemática é mais complicada) quanto computacionalmente (mais código necessário). É, no entanto, um dos métodos estilométricos mais proeminentes em uso hoje. + +Assim como o qui-quadrado de Kilgariff, o método Delta de Burrows é uma medida da "distância" entre um texto cuja autoria queremos averiguar e algum outro corpus. Ao contrário do qui-quadrado, no entanto, o método Delta é projetado para comparar um texto anônimo (ou conjunto de textos) com as assinaturas de vários autores diferentes ao mesmo tempo. Mais precisamente, o método Delta mede como o texto anônimo *e conjuntos de textos escritos por um número arbitrário de autores conhecidos* divergem da média de todos eles juntos. 
Além disso, o método Delta atribui peso igual a todas as características que mede, evitando assim o problema de palavras comuns sobrecarregarem os resultados, o que era um problema com os testes de qui-quadrado. Por todas essas razões, o método Delta de John Burrows é geralmente uma solução mais eficaz para a questão da autoria. + +O algoritmo original de Burrows pode ser resumido da seguinte forma: + +* Reúna um grande corpus composto por textos escritos por um número arbitrário de autores; digamos que o número de autores seja `x`; +* Encontre as `n` palavras mais frequentes no corpus para usar como elementos; +* Para cada uma dessas `n` características, calcule a participação de cada subcorpora dos `x` autores, como uma porcentagem do número total de palavras. Por exemplo, a palavra "ele" pode representar 4,72% das palavras no subcorpus do Autor A; +* Em seguida, calcule a média e o desvio padrão desses `x` valores e use-os como a média oficial e o desvio padrão para esse elemento em todo o corpus. Em outras palavras, estaremos usando uma _média de médias_ em vez de calcular um único valor que represente a parcela de todo o corpus dado por cada palavra. Fazemos isso porque queremos evitar que um subcorpus maior tenha maior influência nos resultados a seu favor e defina a norma do corpus de tal forma que se espere que tudo se pareça com ele; +* Para cada um dos `n` elementos e `x` subcorpora, calcule um [`z-score`](https://perma.cc/S2RH-LF9K) descrevendo o quão distante da norma do corpus está o uso desse elemento particular neste subcorpus específico. Para fazer isso, subtraia a "média das médias" de um dado elemento da frequência com que ela é encontrada no subcorpus e divida o resultado pelo seu desvio padrão. 
A Imagem 3 mostra a equação de z-score para o elemento 'i', onde C(i) representa a frequência observada, a letra grega mu representa a média das médias e a letra grega sigma, o desvio padrão; + +{% include figure.html filename="stylometry-python-7.jpg" caption="Imagem 3: Equação para a estatística de z-score." %} + +* Em seguida, calcule os mesmos `z-scores` para cada elemento no texto para o qual queremos determinar a autoria; +* Finalmente, calcule um *score delta* comparando o documento de teste com o subcorpus de cada candidato. Para fazer isso, tome a *média dos valores absolutos das diferenças entre os `z-scores` para cada elemento entre o documento de teste e o subcorpus do candidato*. (Leia duas vezes!) Isso dá peso igual a cada elemento, não importa a frequência com que as palavras ocorram nos textos; caso contrário, os 3 ou 4 principais elementos sobrecarregariam todo o resto. A Imagem 4 mostra a equação para Delta, onde Z(c,i) é o `z-score` para o elemento 'i' no candidato 'c', e Z(t,i) é o `z-score` para o elemento 'i' no caso de teste; + +{% include figure.html filename="stylometry-python-8.jpg" caption="Imagem 4: Equação para a estatística Delta de John Burrows." %} + +* O candidato "vencedor", assim como nas duas outras técnicas que aplicamos, é o autor para o qual a pontuação delta entre o subcorpus do autor e o documento de teste é a mais baixa. + +Stefan Evert _et al_.[^11] fornecem uma discussão aprofundada das variantes, refinamentos e complexidades do método, mas nos ateremos ao essencial para os propósitos desta lição. Uma explicação diferente de Delta, escrita em espanhol, e uma aplicação a um corpus de romances espanhóis também podem ser encontradas em um artigo recente de José Calvo Tello.[^12] + +## Seleção de elementos + +Vamos combinar todos os subcorpora em um único corpus para Delta calcular um "padrão" para trabalhar. Então, vamos selecionar um número de palavras para usar como característica. 
Lembre-se de que usamos 500 palavras para calcular o qui-quadrado de Kilgariff; desta vez, usaremos um conjunto menor de 30 palavras (a maioria, senão todas, palavras funcionais e verbos comuns) como nossos elementos. + +```python +# Combinar todos os corpora, exceto os documentos de teste, em um único corpus +corpus_completo = [] +for autor in autores: + corpus_completo += obras_tokens[autor] + +# Obter uma distribuição de frequência +freq_dist_corpus_completo = list(nltk.FreqDist(corpus_completo).most_common(30)) +freq_dist_corpus_completo[ :10 ] +``` + +Uma amostra das palavras mais frequentes e suas respectivas ocorrências parece com o seguinte: + +``` +[('a', 17619), + ('que', 17345), + ('de', 17033), + ('e', 15449), + ('o', 14283), + ('não', 7086), + ('do', 6019), + ('da', 5647), + ('os', 5299), + ('um', 4873)] +``` + +## Calculando elementos para cada subcorpus + +Vejamos as frequências de cada característica no subcorpus de cada candidato, como uma proporção do número total de tokens no subcorpus. Vamos calcular esses valores e armazená-los em um dicionário de dicionários, uma maneira conveniente de construir um [array bidimensional](https://perma.cc/HR9K-24MG) em Python. + +```python +# Criar uma lista com os elementos e a estrutura principal de dados +features = [word for word,freq in freq_dist_corpus_completo] +feature_freqs = {} + +for autor in autores: + # Criar um dicionário para os elementos de cada candidato + feature_freqs[autor] = {} + + # Obter um valor auxiliar contendo o número de tokens no subcorpus do autor + geral = len(obras_tokens[autor]) + + # Calcular a presença de cada elemento no subcorpus + for feature in features: + presenca = obras_tokens[autor].count(feature) + feature_freqs[autor][feature] = presenca / geral +``` + +## Calculando médias de elementos e desvios-padrão + +Dadas as frequências de elementos para todos os subcorpora que acabamos de calcular, podemos encontrar uma "média das médias" e um desvio padrão para cada elemento. 
Armazenaremos esses valores em outro "dicionário de dicionários". + +```python +import math + +# A estrutura de dados na qual iremos armazenar +# as "estatísticas padrão do corpus" +corpus_features = {} + +# Para cada elemento... +for feature in features: + # Criar um subdicionário que conterá a média e o desvio padrão do elemento + corpus_features[feature] = {} + + # Calcular a média das frequências expressas no subcorpora + feature_average = 0 + for autor in autores: + feature_average += feature_freqs[autor][feature] + feature_average /= len(autores) + corpus_features[feature]["Mean"] = feature_average + + # Calcular o desvio padrão usando a fórmula básica para uma amostra + feature_stdev = 0 + for autor in autores: + diff = feature_freqs[autor][feature] - corpus_features[feature]["Mean"] + feature_stdev += diff * diff + feature_stdev /= (len(autores) - 1) + feature_stdev = math.sqrt(feature_stdev) + corpus_features[feature]["StdDev"] = feature_stdev +``` + +## Calculando z-scores + +Em seguida, transformamos as frequências de características observadas no subcorpora dos cinco candidatos em `z-scores`, descrevendo o quão distante da "estatística padrão do corpus" essas observações estão. Nada extravagante aqui: nós meramente aplicamos a definição do `z-score` para cada elemento e armazenamos os resultados em outro array bidimensional. + +```python +feature_zscores = {} + +for autor in autores: + feature_zscores[autor] = {} + + for feature in features: + # Definição do z-score = (value - mean) / stddev + # Usamos variáveis intermediárias para tornar o + # código mais fácil de ler + feature_val = feature_freqs[autor][feature] + feature_mean = corpus_features[feature]["Mean"] + feature_stdev = corpus_features[feature]["StdDev"] + feature_zscores[autor][feature] = ((feature_val-feature_mean) / + feature_stdev) +``` + +## Calculando elementos, z-scores e Delta para nosso caso de teste + +Em seguida, precisamos comparar os documentos de teste com o corpus. 
O seguinte trecho de código, que essencialmente recapitula tudo o que fizemos até agora, conta as frequências de cada um de nossos 30 elementos nos documentos de teste e calcula os `z-scores` de acordo. +Por fim, usamos a fórmula para Delta definida por Burrows para extrair uma única pontuação comparando cada documento de teste com cada um dos cinco "autores candidatos". Lembre-se: quanto menor a pontuação Delta, mais semelhante a assinatura estilométrica do documento à do candidato. + +```python +for obra in obras_destacadas: + # Tokenizar o documento de teste + testcase_tokens = nltk.word_tokenize(obras[obra]) + + # Filtrar a pontuação e colocar os tokens em minúsculas + testcase_tokens = [token.lower() for token in testcase_tokens + if any(c.isalpha() for c in token)] + + # Calcular as frequências dos elementos do documento de teste + geral = len(testcase_tokens) + testcase_freqs = {} + for feature in features: + presenca = testcase_tokens.count(feature) + testcase_freqs[feature] = presenca / geral + + # Calcular os z-scores dos elementos do documento de teste + testcase_zscores = {} + for feature in features: + feature_val = testcase_freqs[feature] + feature_mean = corpus_features[feature]["Mean"] + feature_stdev = corpus_features[feature]["StdDev"] + testcase_zscores[feature] = (feature_val - feature_mean) / feature_stdev + + # Calcular Delta para cada autor + for autor in autores: + delta = 0 + for feature in features: + delta += math.fabs((testcase_zscores[feature] - + feature_zscores[autor][feature])) + delta /= len(features) + print( "Delta score do documento", + obra, + "para o candidato", + autor, + "é =", + delta ) + print("\n") +``` + +Como nas outras duas técnicas, o resultado serão 5 blocos de código dando o valor de Delta de cada documento para cada suposto autor. 
O primeiro bloco se parecerá com isso: + +``` +Delta score do documento Assis (teste) para o candidato Assis é = 0.8715781237572774 +Delta score do documento Assis (teste) para o candidato Alencar é = 1.2624531605759595 +Delta score do documento Assis (teste) para o candidato Castelo Branco é = 1.2303968803032856 +Delta score do documento Assis (teste) para o candidato Castilho é = 1.6276770882853728 +Delta score do documento Assis (teste) para o candidato Chagas é = 1.0527125070730734 +``` + +Vamos avaliar todos os valores Delta na nossa matriz de confusão (reduzidos para 4 casas decimais): + +| | Assis | Alencar | Castelo Branco | Castilho | Chagas | +| --------- | --------- | --------- | --------- | --------- | --------- | +| **Assis (teste)** | **0.8715** | 1.2624 | 1.2303 | 1.6276 | 1.0527 | +| **Alencar (teste)** | 1.9762 | **1.3355** | 1.3878 | 1.6425 | 1.5042 | +| **Castelo Branco (teste)** | 1.004 | 1.3208 | **0.8182** | 1.5202 | 1.2829 | +| **Castilho (teste)** | 1.5705 | 1.2553 | 1.0970 | **0.4518** | 0.8176 | +| **Chagas (teste)** | 1.1444 | 1.0169 | 0.9462 | 0.9864 | **0.7756** | + +Com o método Delta, pudemos inferir corretamente 100% da autoria dos documentos de teste! Alencar, que teve o pior valor nas duas outras técnicas, aqui aparece com o menor valor entre os 5 candidatos. +Ao utilizarmos autores brasileiros e portugueses, tínhamos em mente também a possibilidade de que a comparação entre ficheiros de autores de uma mesma nacionalidade pudesse ter valores mais próximos que entre autores de nacionalidades distintas, em função de particularidades linguísticas, o que parece que não foi o caso aqui. Por se tratar de obras do século XIX, poderíamos buscar explicações para isso na maior similaridade das línguas na época, na influência da Academia Portuguesa no Brasil, ou mesmo no letramento e nas influências dos autores. 
Uma nova análise com obras mais contemporâneas seria um excelente segundo passo, e fica como sugestão para o leitor. + +# Leituras adicionais e recursos + +## Estudos de caso interessantes + +Estilometria e/ou atribuição de autoria têm sido utilizadas em diversos contextos, empregando diversas técnicas. Aqui estão alguns estudos de caso interessantes: + +* Javier de la Rosa e Juan Luis Suárez procuram o autor de um famoso romance espanhol do século XVI entre uma lista considerável de candidatos.[^13] +* Maria Slautina e Mikhail Marusenko usam o reconhecimento de padrões em um conjunto de recursos sintáticos, gramaticais e lexicais, desde a contagem de palavras simples (com marcação de classe gramatical) a vários tipos de frases, a fim de estabelecer semelhanças estilísticas entre os textos medievais.[^14] +* Ellen Jordan, Hugh Craig e Alexis Antonia examinam o caso de periódicos britânicos do século XIX, nos quais os artigos geralmente não eram assinados, para determinar o autor de quatro resenhas de trabalhos de ou sobre as irmãs Brontë.[^15] Este estudo de caso aplica uma versão inicial de outro método desenvolvido por John Burrows, o método Zeta, que se concentra nas palavras favoritas de um autor em vez de palavras funcionais comuns.[^16] +* Valérie Beaudoin e François Yvon analisaram 58 peças em verso dos dramaturgos franceses Corneille, Racine e Molière, descobrindo que os dois primeiros foram muito mais consistentes na maneira como estruturaram sua escrita do que o último.[^17] +* Marcelo Luiz Brocardo, Issa Traore, Sherif Saad e Isaac Woungang aplicam [aprendizagem supervisionada](https://perma.cc/7TAQ-JECD) e [modelos n-gram](https://perma.cc/X34K-5R9X) para determinar a autoria de mensagens curtas com um grande número de autores em potencial, como e-mails e tweets.[^18] +* Moshe Koppel e Winter Yaron propõem o "método do impostor", que tenta determinar se dois textos foram escritos pelo mesmo autor, inserindo-os em um conjunto de 
textos escritos por falsos candidatos.[^19] Justin Anthony Stover _et al._ recentemente aplicou a técnica para determinar a autoria de um manuscrito do século II recém-descoberto.[^20] +* Finalmente, uma equipe liderada por David I. Holmes estudou o caso peculiar de documentos escritos por um soldado da Guerra Civil ou por sua viúva que pode ter copiado intencionalmente seu estilo de escrita.[^21] + +## Referências adicionais sobre autoria e estilometria + +A referência mais exaustiva em todos os assuntos relacionados à atribuição de autoria, incluindo a história do campo, seus fundamentos matemáticos e linguísticos e seus vários métodos, foi escrita por Patrick Juola em 2007.[^22] O Capítulo 7, em particular, mostra como a atribuição de autoria pode servir como um marcador para várias identidades de grupo (gênero, nacionalidade, dialeto, etc.), para mudanças na linguagem ao longo do tempo, e até mesmo para personalidade e saúde mental. + +Uma pesquisa mais curta pode ser encontrada em Moshe Koppel _et al._, que discute casos em que há um único autor candidato cuja autoria deve ser confirmada, um grande número de candidatos para os quais apenas pequenas amostras de escrita estão disponíveis para treinar um algoritmo de aprendizado de máquina, ou nenhum candidato conhecido.[^23] + +O artigo de Stamatatos citado anteriormente[^2] também contém uma pesquisa qualitativa do campo. + +## Varia + +*Programming historians* que desejam explorar mais a estilometria podem fazer o download do pacote [Stylo](https://cran.r-project.org/web/packages/stylo/index.html),[^24] que se tornou um padrão _de facto_. Entre outras coisas, o pacote Stylo fornece uma implementação do método Delta, funcionalidade de extração de recursos e interfaces gráficas de usuário convenientes tanto para manipulação de dados quanto para produção de resultados visualmente atraentes. 
Observe que o Stylo é escrito em [R](https://www.r-project.org/), o que significa que você precisará do R instalado no seu computador para executá-lo, mas entre a interface gráfica do usuário e os tutoriais, pouco ou nenhum conhecimento prévio de programação R deve ser necessário. + +Leitores fluentes em francês interessados em explorar as implicações [epistemológicas](https://perma.cc/6DFE-QTWV) das interações entre métodos quantitativos e qualitativos na análise do estilo de escrita devem ler Clémence Jacquot.[^25] + +Surpreendentemente, os dados obtidos por meio de [reconhecimento ótico de caracteres](https://perma.cc/R9U6-TRGE) (OCR) se mostraram adequados para fins de atribuição de autoria, mesmo quando os dados sofrem de altas taxas de erro de OCR.[^26] + +Por fim, existe um [grupo Zotero](https://www.zotero.org/groups/643516/stylometry_bibliography/items) dedicado à estilometria, onde você pode encontrar muitas outras referências a métodos e estudos. + +# Agradecimentos + +Agradecimentos a Stéfan Sinclair e Andrew Piper, em cujos seminários na Universidade McGill este projeto começou. Também agradeço à minha orientadora de tese, Susan Dalton, cuja orientação é sempre inestimável. + +# Notas finais + +[^1]: Veja, por exemplo, Justin Rice, ["What Makes Hemingway Hemingway? A statistical analysis of the data behind Hemingway's style"](https://perma.cc/W8TR-UH6S) + +[^2]: Efstathios Stamatatos, “A Survey of Modern Authorship Attribution Method,” _Journal of the American Society for Information Science and Technology_, vol. 60, no. 3 (December 2008), p. 538–56, citation on p. 540, [https://doi.org/10.1002/asi.21001](https://doi.org/10.1002/asi.21001). + +[^3]: Jan Rybicki, “Vive La Différence: Tracing the (Authorial) Gender Signal by Multivariate Analysis of Word Frequencies,” _Digital Scholarship in the Humanities_, vol. 31, no. 4 (December 2016), pp. 746–61, [https://doi.org/10.1093/llc/fqv023](https://doi.org/10.1093/llc/fqv023). Sean G. 
Weidman e James O’Sullivan, “The Limits of Distinctive Words: Re-Evaluating Literature’s Gender Marker Debate,” _Digital Scholarship in the Humanities_, 2017, [https://doi.org/10.1093/llc/fqx017](https://doi.org/10.1093/llc/fqx017). + +[^4]: Ted Underwood, David Bamman, e Sabrina Lee, “The Transformation of Gender in English-Language Fiction”, _Cultural Analytics_, Feb. 13, 2018, [https://doi.org/10.22148/16.019](https://doi.org/10.22148/16.019). + +[^5]: Sven Meyer zu Eissen e Benno Stein, “Intrinsic Plagiarism Detection,” in _ECIR 2006_, edited by Mounia Lalmas, Andy MacFarlane, Stefan Rüger, Anastasios Tombros, Theodora Tsikrika, e Alexei Yavlinsky, Berlin, Heidelberg: Springer, 2006, pp. 565–69, [https://doi.org/10.1007/11735106_66](https://doi.org/10.1007/11735106_66). + +[^6]: Cynthia Whissell, “Traditional and Emotional Stylometric Analysis of the Songs of Beatles Paul McCartney and John Lennon,” _Computers and the Humanities_, vol. 30, no. 3 (1996), pp. 257–65. + +[^7]: Douglass Adair, "The Authorship of the Disputed Federalist Papers", _The William and Mary Quarterly_, vol. 1, no. 2 (April 1944), pp. 97-122. + +[^8]: T. C. Mendenhall, "The Characteristic Curves of Composition", _Science_, vol. 9, no. 214 (Mar. 11, 1887), pp. 237-249. + +[^9]: Adam Kilgarriff, "Comparing Corpora", _International Journal of Corpus Linguistics_, vol. 6, no. 1 (2001), pp. 97-133. + +[^10]: John Burrows, "'Delta': a Measure of Stylistic Difference and a Guide to Likely Authorship", _Literary and Linguistic Computing_, vol. 17, no. 3 (2002), pp. 267-287. + +[^11]: Stefan Evert et al., "Understanding and explaining Delta measures for authorship attribution", _Digital Scholarship in the Humanities_, vol. 32, no. suppl_2 (2017), pp. ii4-ii16. + +[^12]: José Calvo Tello, “Entendiendo Delta desde las Humanidades,” [_Caracteres_, vol.5, no.1 (May 27 2016)](https://perma.cc/LNF3-QP8V), pp.140-176. 
+ +[^13]: Javier de la Rosa and Juan Luis Suárez, “The Life of Lazarillo de Tormes and of His Machine Learning Adversities,” _Lemir_, vol. 20 (2016), pp. 373-438. + +[^14]: Maria Slautina e Mikhaïl Marusenko, “L’émergence du style, The emergence of style,” _Les Cahiers du numérique_, vol. 10, no. 4 (November 2014), pp. 179–215, [https://doi.org/10.3166/LCN.10.4.179-215](https://doi.org/10.3166/LCN.10.4.179-215). + +[^15]: Ellen Jordan, Hugh Craig, e Alexis Antonia, “The Brontë Sisters and the ‘Christian Remembrancer’: A Pilot Study in the Use of the ‘Burrows Method’ to Identify the Authorship of Unsigned Articles in the Nineteenth-Century Periodical Press,” _Victorian Periodicals Review_, vol. 39, no. 1 (2006), pp. 21–45. + +[^16]: John Burrows, “All the Way Through: Testing for Authorship in Different Frequency Strata,” _Literary and Linguistic Computing_, vol. 22, no. 1 (April 2007), pp. 27–47, [https://doi.org/10.1093/llc/fqi067](https://doi.org/10.1093/llc/fqi067). + +[^17]: Valérie Beaudoin e François Yvon, “Contribution de La Métrique à La Stylométrie,” _JADT 2004: 7e Journées internationales d'Analyse statistique des Données Textuelles_, vol. 1, Louvain La Neuve, Presses Universitaires de Louvain, 2004, pp. 107–18. + +[^18]: Marcelo Luiz Brocardo, Issa Traore, Sherif Saad e Isaac Woungang, “Authorship Verification for Short Messages Using Stylometry,” _2013 International Conference on Computer, Information and Telecommunication Systems (CITS)_, 2013, [https://doi.org/10.1109/CITS.2013.6705711](https://doi.org/10.1109/CITS.2013.6705711). + +[^19]: Moshe Koppel e Winter Yaron, “Determining If Two Documents Are Written by the Same Author,” _Journal of the Association for Information Science and Technology_, vol. 65, no. 1 (October 2013), pp. 178–87, [https://doi.org/10.1002/asi.22954](https://doi.org/10.1002/asi.22954). 
+ +[^20]: Justin Anthony Stover et al., "Computational authorship verification method attributes a new work to a major 2nd century African author", _Journal of the Association for Information Science and Technology_, vol. 67, no. 1 (2016), pp. 239–242. + +[^21]: David I. Holmes, Lesley J. Gordon, e Christine Wilson, "A widow and her soldier: Stylometry and the American Civil War", _Literary and Linguistic Computing_, vol. 16, no. 4 (2001), pp. 403–420. + +[^22]: Patrick Juola, “Authorship Attribution,” _Foundations and Trends in Information Retrieval_, vol. 1, no. 3 (2007), pp. 233–334, [https://doi.org/10.1561/1500000005](https://doi.org/10.1561/1500000005). + +[^23]: Moshe Koppel, Jonathan Schler, e Shlomo Argamon, “Computational Methods in Authorship Attribution,” _Journal of the Association for Information Science and Technology_, vol. 60, no. 1 (January 2009), pp. 9–26, [https://doi.org/10.1002/asi.v60:1](https://doi.org/10.1002/asi.v60:1). + +[^24]: Maciej Eder, Jan Rybicki, e Mike Kestemont, “Stylometry with R: A Package for Computational Text Analysis,” _The R Journal_, vol. 8, no. 1 (2016), pp. 107–21. + +[^25]: Clémence Jacquot, “Rêve d'une épiphanie du style: visibilité et saillance en stylistique et en stylométrie,” _Revue d’Histoire Littéraire de la France_, vol. 116, no. 3 (2016), pp. 619–39. + +[^26]: Patrick Juola, John Noecker Jr, e Michael Ryan, "Authorship Attribution and Optical Character Recognition Errors", _TAL_, vol. 53, no. 3 (2012), pp. 101–127. diff --git a/pt/licoes/introducao-instalacao-python.md b/pt/licoes/introducao-instalacao-python.md index bce06c2e30..2ca8017807 100644 --- a/pt/licoes/introducao-instalacao-python.md +++ b/pt/licoes/introducao-instalacao-python.md @@ -1,88 +1,88 @@ ---- -title: Introdução e instalação do Python -slug: introducao-instalacao-python -layout: lesson -date: 2012-07-17 -translation_date: 2021-05-13 -authors: -- William J.
Turkel -- Adam Crymble -reviewers: -- Jim Clifford -- Amanda Morton -editors: -- Miriam Posner -translator: -- Josir C. Gomes -translation-editor: -- Danielle Sanches -translation-reviewer: -- Bruno Martins -- Renato Rocha Souza -difficulty: 1 -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/323 -activity: transforming -topics: [python, get-ready] -abstract: "Essa primeira lição em nossa seção para tratar de Fontes Online -é preparada para que você e o seu computador estejam prontos para se iniciarem na programação. -Nós iremos nos concentrar em instalar os softwares necessários – todos livres e de boa reputação -– e finalmente nós iremos te auxiliar a experimentar os primeiros passos na programação -para que você tenha resultados imediatos." -next: nocoes-basicas-paginas-web-html -python_warning: false -original: introduction-and-installation -avatar_alt: Uma cobra enrolada -doi: 10.46430/phpt0004 ---- - -{% include toc.html %} - - - - - -## Objetivos da Lição - -Essa primeira lição na nossa seção sobre Fontes Online é preparada para que você e o seu computador estejam preparados para se iniciarem na programação. -Iremos focar na instalação do software necessário, livre e de boa reputação. Posteriormente, iremos auxiliar na experimentação com os primeiros passos na programação, para que você tenha resultados rápidos. - -Neste módulo de abertura, você irá instalar a [Linguagem de Programação Python][], o [analisador de HTML/XML Beautiful Soup][], e um editor de texto. - -Os ecrãns de exemplo mostrados aqui correspondem ao [Komodo Edit][], mas você pode utilizar qualquer editor de texto apto a trabalhar com o Python. Aqui está uma lista de outras opções: [Editores Python][]. Uma vez que tudo esteja instalado, você irá escrever os seus primeiros programas, "Olá Mundo" em Python e HTML. - -## A linguagem de programação Python - -A linguagem de programação que usaremos nesta série de lições é Python, uma linguagem livre e de código aberto. 
-A menos que seja observado o contrário, usaremos a versão **Python 3** daqui em diante. -A versão 2 não é mais suportada, mas ainda pode estar sendo usada em projetos ou lições mais antigas. - -[Python 3 tem algumas poucas diferenças na formatação](http://sebastianraschka.com/Articles/2014_python_2_3_key_diff.html) (pense em regras gramaticais). Assim, fique atento se você encontrar exemplos online que ainda utilizam o Python 2. Esses exemplos podem não funcionar nas versões atuais do Python. - - -## Faça backups do seu trabalho! - -Antes de fazer o download ou instalar qualquer novo software, é crucial que você faça backups do seu trabalho. Você deve fazer backup de todo o seu computador pelo menos uma vez por semana, e de preferência com uma frequência ainda menor. É também uma boa ideia fazer backups fora do seu ambiente local pois, dessa forma, fica salvaguardado caso algo aconteça com o seu computador, com a sua casa ou o seu escritório. Sites como o [Jungle Disk][] ou o [Dropbox][] fornecem opções de backup fáceis de usar e relativamente baratas. - -### Escolha o seu Sistema Operativo - -## Passo 1 – Instale e configure o Software - -Para que você consiga trabalhar nas técnicas aqui apresentadas, você precisará descarregar e instalar software disponível gratuitamente. -Nós fornecemos instruções para o Mac, Windows e Linux. Uma vez que você tenha instalado o software no seu Sistema Operativo, siga para a lição '[Noções básicas de páginas web e HTML][]'. Se você encontrar dificuldades com as nossas instruções ou achar que algo não funciona na sua plataforma, por favor nos informe. 
- -- [Instalação do Python para Mac][] -- [Instalação do Python para Windows][] -- [Instalação do Python para Linux][] - - [Linguagem de Programação Python]: http://www.python.org/ - [Analisador de HTML/XML Beautiful Soup]: http://www.crummy.com/software/BeautifulSoup/ - [Komodo Edit]: https://github.com/ActiveState/OpenKomodoIDE - [Editores Python]: https://wiki.python.org/python/PythonEditors - [Jungle Disk]: https://www.jungledisk.com/ - [Dropbox]: https://www.dropbox.com/home - [Noções básicas de páginas web e HTML]: nocoes-basicas-paginas-web-html - [Instalação do Python para Mac]: instalacao-mac - [Instalação do Python para Windows]: instalacao-windows - [Instalação do Python para Linux]: instalacao-linux - - +--- +title: Introdução e instalação do Python +slug: introducao-instalacao-python +layout: lesson +date: 2012-07-17 +translation_date: 2021-05-13 +authors: +- William J. Turkel +- Adam Crymble +reviewers: +- Jim Clifford +- Amanda Morton +editors: +- Miriam Posner +translator: +- Josir C. Gomes +translation-editor: +- Danielle Sanches +translation-reviewer: +- Bruno Martins +- Renato Rocha Souza +difficulty: 1 +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/323 +activity: transforming +topics: [python, get-ready] +abstract: "Essa primeira lição em nossa seção para tratar de Fontes Online +é preparada para que você e o seu computador estejam prontos para se iniciarem na programação. +Nós iremos nos concentrar em instalar os softwares necessários – todos livres e de boa reputação +– e finalmente nós iremos te auxiliar a experimentar os primeiros passos na programação +para que você tenha resultados imediatos." 
+next: /pt/licoes/nocoes-basicas-paginas-web-html +python_warning: false +original: introduction-and-installation +avatar_alt: Uma cobra enrolada +doi: 10.46430/phpt0004 +--- + +{% include toc.html %} + + + + + +## Objetivos da Lição + +Essa primeira lição na nossa seção sobre Fontes Online é preparada para que você e o seu computador estejam preparados para se iniciarem na programação. +Iremos focar na instalação do software necessário, livre e de boa reputação. Posteriormente, iremos auxiliar na experimentação com os primeiros passos na programação, para que você tenha resultados rápidos. + +Neste módulo de abertura, você irá instalar a [Linguagem de Programação Python][], o [analisador de HTML/XML Beautiful Soup][], e um editor de texto. + +Os ecrãs de exemplo mostrados aqui correspondem ao [Komodo Edit][], mas você pode utilizar qualquer editor de texto apto a trabalhar com o Python. Aqui está uma lista de outras opções: [Editores Python][]. Uma vez que tudo esteja instalado, você irá escrever os seus primeiros programas, "Olá Mundo" em Python e HTML. + +## A linguagem de programação Python + +A linguagem de programação que usaremos nesta série de lições é Python, uma linguagem livre e de código aberto. +A menos que seja observado o contrário, usaremos a versão **Python 3** daqui em diante. +A versão 2 não é mais suportada, mas ainda pode estar sendo usada em projetos ou lições mais antigas. + +[Python 3 tem algumas poucas diferenças na formatação](https://sebastianraschka.com/Articles/2014_python_2_3_key_diff.html) (pense em regras gramaticais). Assim, fique atento se você encontrar exemplos online que ainda utilizam o Python 2. Esses exemplos podem não funcionar nas versões atuais do Python. + + +## Faça backups do seu trabalho! + +Antes de fazer o download ou instalar qualquer novo software, é crucial que você faça backups do seu trabalho.
Você deve fazer backup de todo o seu computador pelo menos uma vez por semana, e de preferência com uma frequência ainda maior. É também uma boa ideia fazer backups fora do seu ambiente local pois, dessa forma, fica salvaguardado caso algo aconteça com o seu computador, com a sua casa ou o seu escritório. Sites como o [Jungle Disk][] ou o [Dropbox][] fornecem opções de backup fáceis de usar e relativamente baratas. + +### Escolha o seu Sistema Operativo + +## Passo 1 – Instale e configure o Software + +Para que você consiga trabalhar nas técnicas aqui apresentadas, você precisará descarregar e instalar software disponível gratuitamente. +Nós fornecemos instruções para o Mac, Windows e Linux. Uma vez que você tenha instalado o software no seu Sistema Operativo, siga para a lição '[Noções básicas de páginas web e HTML][]'. Se você encontrar dificuldades com as nossas instruções ou achar que algo não funciona na sua plataforma, por favor nos informe. + +- [Instalação do Python para Mac][] +- [Instalação do Python para Windows][] +- [Instalação do Python para Linux][] + + [Linguagem de Programação Python]: https://www.python.org/ + [Analisador de HTML/XML Beautiful Soup]: https://www.crummy.com/software/BeautifulSoup/ + [Komodo Edit]: https://github.com/ActiveState/OpenKomodoIDE + [Editores Python]: https://wiki.python.org/python/PythonEditors + [Jungle Disk]: https://www.jungledisk.com/ + [Dropbox]: https://www.dropbox.com/home + [Noções básicas de páginas web e HTML]: /pt/licoes/nocoes-basicas-paginas-web-html + [Instalação do Python para Mac]: /pt/licoes/instalacao-mac + [Instalação do Python para Windows]: /pt/licoes/instalacao-windows + [Instalação do Python para Linux]: /pt/licoes/instalacao-linux + + diff --git a/pt/licoes/introducao-jupyter-notebooks.md b/pt/licoes/introducao-jupyter-notebooks.md index 8d821f891e..df8b611185 100644 --- a/pt/licoes/introducao-jupyter-notebooks.md +++ b/pt/licoes/introducao-jupyter-notebooks.md @@ -1,421 +1,421 @@ ---- -title:
"Introdução ao Jupyter Notebook" -slug: introducao-jupyter-notebooks -original: jupyter-notebooks -layout: lesson -collection: lessons -date: 2019-12-08 -translation_date: 2023-06-02 -authors: -- Quinn Dombrowski -- Tassie Gniady -- David Kloster -reviewers: -- Patrick Burns -- Jeri Wieringa -editors: -- Brandon Walsh -translator: -- Vânia Rosa -translation-editor: -- Jimmy Medeiros -translation-reviewer: -- Juliana Marques -- Caio Mello -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/431 -difficulty: 1 -activity: presenting -topics: [python, website] -abstract: Jupyter Notebook fornece um ambiente onde você pode trabalhar com facilidade o seu código na linguagem Python. Esta lição descreve como instalar o software Jupyter Notebook, como executar e criar ficheiros para o Jupyter Notebook. -avatar_alt: O planeta Júpiter -doi: 10.46430/phpt0043 ---- - -{% include toc.html %} - -## Introdução - -Quando a computação é uma parte intrínseca de sua prática de pesquisa, como você publica um argumento acadêmico de forma que torne o código tão acessível e legível como a prosa que o acompanha? Na área das humanidades, a publicação de uma pesquisa assume principalmente a forma de prosa escrita, artigo ou monografia. Embora as editoras estejam cada vez mais abertas à inclusão de códigos suplementares ou outros materiais, tal arranjo inerentemente os relega a um estatuto secundário relativo ao texto escrito. - -E se você pudesse publicar sua pesquisa em um formato que desse um peso equilibrado entre a prosa e o código? A realidade das atuais diretrizes de publicação acadêmica significa que a separação forçosa do seu código e da argumentação pode ser uma necessidade, e sua reunificação pode ser impossível sem que se navegue por numerosos obstáculos. 
Atualmente o código é tipicamente publicado em separado no GitHub ou em outro repositório, caso no qual os leitores têm que procurar uma nota de rodapé no texto para descobrir quais scripts estão sendo referenciados, encontrar a URL do repositório, acessar a URL, procurar os scripts, baixá-los e também os ficheiro(s) de dados associados, e então executar os códigos. No entanto, se você tiver os direitos e permissões necessários para republicar o texto de sua pesquisa em outro formato, o Jupyter Notebook fornece um ambiente onde código e prosa podem ser justapostos e apresentados com igual peso e valor. - -Os Jupyter Notebooks têm visto uma adoção entusiástica na comunidade de ciência de dados, a ponto de cada vez mais substituir o Microsoft Word como um ambiente padrão de escrita da pesquisa. Dentro da literatura de humanidades digitais, pode-se encontrar referência a Jupyter Notebooks (separados do iPython, ou Python interativo, notebooks em 2014) desde 2015. - -Os Jupyter Notebooks também ganharam força nas humanidades digitais como uma ferramenta pedagógica. Diversos tutoriais do Programming Historian, como [Mineração de texto em Python através do leitor de recursos HTRC](/en/lessons/text-mining-with-extracted-features), e [Extraindo páginas ilustradas de bibliotecas digitais com python](/pt/licoes/extrair-paginas-ilustradas-com-python#jupyter-notebooks), assim como outros materiais pedagógicos para oficinas fazem referência à colocação de código em um Jupyter Notebook ou ao uso do Jupyter Notebook para orientar os estudantes, permitindo que eles remixem e editem o código livremente. O formato do notebook é ideal para o ensino, especialmente quando os estudantes têm diferentes níveis de proficiência técnica e de conforto com escrita e edição dos códigos. - -O objetivo dos Jupyter Notebooks é fornecer uma interface mais acessível para o código usado em pesquisa ou práticas pedagógicas com suporte digital. 
Ferramentas como os Jupyter Notebook são menos significativas para aprender ou ensinar no vácuo, porque os Jupyter Notebooks em si não fazem nada para promover diretamente a pesquisa ou a pedagogia. Antes de começar esta lição, pense no que você quer obter usando Jupyter Notebooks. Deseja organizar o fluxo de trabalho do seu projeto? Você quer trabalhar analisando seus dados, acompanhando as coisas que você tenta ao longo do caminho? Você quer que os leitores da sua pesquisa possam seguir os lados teóricos e técnicos do seu argumento sem alternar entre um PDF e uma pasta de scripts? Quer ministrar oficinas de programação mais acessíveis aos participantes com uma gama de conhecimentos técnicos? Você quer usar ou adaptar notebooks que outras pessoas escreveram? Tenha seu objetivo em mente enquanto você trabalha nesta lição. Dependendo de como você imagina usar Jupyter Notebooks, você pode ser capaz de pular seções que são mais aplicáveis em outro contexto. - -## Metas de lição - -Nesta lição você aprenderá: - -- O que são Jupyter Notebooks - -- Como instalar, configurar e usar o pacote de software do Jupyter Notebook - -- Quando os cadernos podem ser úteis em pesquisas e contextos pedagógicos - -Para esta lição, vamos trabalhar em um cenário de uso de Jupyter Notebooks para analisar dados e, em seguida, adaptar esse mesmo notebook e dados para uso em sala de aula. A aula também abordará temas mais avançados relacionados aos Jupyter Notebooks, tais como: - -- Usando Jupyter Notebook para linguagens de programação que não sejam Python - -- Convertendo o código Python existente em Jupyter Notebooks - -- Usando Jupyter Notebooks para ampliar a capacidade computacional em ambientes como clusters de computação de alto desempenho - -## Pré-requisitos - -Esta lição é adequada para iniciantes intrépidos, assumindo pouca experiência técnica anterior. - -Na verdade, o Jupyter Notebook é um ótimo recurso para pessoas que estão aprendendo a escrever código. 
- -Dependendo do notebook que você quer executar, você pode precisar [instalar alguns módulos Python com pip](/pt/licoes/instalacao-modulos-python-pip), que assume alguma familiaridade com a linha de comando (para [windows aqui](/en/lessons/intro-to-powershell), ou [Mac/Linux aqui](/en/lessons/intro-to-bash) (em inglês)). - -A lição é escrita usando o Jupyter Notebook 6.0, mas a interface do usuário e a funcionalidade do software tem sido bastante consistente entre as versões. - -## Computação Letrada - -A relação entre código legível por computador e texto legível por humanos ganhou visibilidade dentro da ciência da computação na década de 1970, quando Donald Knuth propôs o paradigma da "programação letrada" (ou “programação alfabetizada”). Em vez de organizar o código de acordo com os requisitos que privilegiam a execução do código pelo computador, a programação letrada trata um programa como literatura compreensível aos seres humanos, priorizando o próprio processo de pensamento do programador. A programação letrada projetada por Knuth assume a forma de prosa escrita, com código acionável por computador incorporado em macros (um formato abreviado para escrever código). Ferramentas de programação letrada são usadas para gerar duas saídas do programa letrado: código "emaranhado" que pode ser executado pelo computador e documentação formatada "tecida".[^1] - -Fernando Pérez, o criador do ambiente de programação iPython que acabou se tornando o Projeto Jupyter, cunhou o termo computação letrada para o modelo usado pelos Jupyter Notebooks: - -> Um ambiente de computação letrado é aquele que permite aos usuários não apenas executar comandos, mas também armazenar os resultados desses comandos em um formato de documento literário, juntamente com figuras e com texto em formato livre que pode incluir expressões matemáticas formatadas. 
Na prática, ele pode ser visto como uma mistura de um ambiente de linha de comando, como o shell Unix, com um processador de texto, uma vez que os documentos resultantes podem ser lidos como texto, mas contêm blocos de código que foram executados pelo sistema computacional subjacente.[^2] - -Jupyter não é nem o primeiro e nem o único exemplo de cadernos computacionais. Já na década de 1980, interfaces de notebook estavam disponíveis através de softwares como Wolfram Mathematica e MATLAB. Em 2013, Stéfan Sinclair e Geoffrey Rockwell propuseram "cadernos Voyant" baseados no modelo de Mathematica, que exporia algumas das suposições que sustentam as [Ferramentas Voyant](https://perma.cc/9M5K-JWU7) e as tornaram configuráveis pelo usuário.[^3] Eles desenvolveram ainda esse conceito em [A Arte da Análise de Texto Literário Cadernos Spyral](https://perma.cc/53HW-GGSJ). - - -Jupyter ganhou força em muitos campos como um ambiente de código aberto compatível com inúmeras linguagens de programação. O nome Jupyter é uma referência às três linguagens principais suportadas pelo projeto (Julia, Python e R), mas [núcleos estão disponíveis que tornam o Jupyter compatível com dezenas de idiomas](https://perma.cc/B448-XMJQ), incluindo Ruby, PHP, Javascript, SQL e Node.js. Pode não fazer sentido implementar projetos em todas essas línguas usando Jupyter Notebooks (por exemplo, Omeka não permitirá que você instale um plugin escrito como um Jupyter Notebook), mas o ambiente Jupyter ainda pode ser valioso para documentar códigos, ensinar linguagens de programação e fornecer aos alunos um espaço onde eles podem facilmente experimentar com exemplos fornecidos. - - -## Instalando o Jupyter Notebooks - -Desde o final de 2019, existem dois grandes ambientes que você pode usar para executar Jupyter Notebooks: O Jupyter Notebook (não confundir com os próprios ficheiro(s) do Jupyter Notebook, que possuem uma extensão `.ipynb`), e o mais novo Jupyter Lab. 
O Jupyter Notebook é amplamente usado e bem documentado, e fornece um navegador simples de ficheiro(s), juntamente com o ambiente para criar, editar e executar os notebooks. Jupyter Lab é mais complexo, com um ambiente de usuário mais parecido com um Ambiente de Desenvolvimento Integrado (discutido em tutoriais anteriores do Programming Historian para [Windows](/pt/licoes/instalacao-windows), [Mac](/pt/licoes/instalacao-mac) e [Linux](/pt/licoes/instalacao-linux)). Embora o Jupyter Lab seja feito para, eventualmente, substituir o Jupyter Notebook, não há indicação de que o Jupyter Notebook deixará de ser suportado tão cedo. Devido à sua simplicidade comparativa e facilidade de uso para iniciantes, este tutorial usa o Jupyter Notebook como o software para executar ficheiro(s) de notebook. Ambos os pacotes de software estão incluídos na Anaconda, descrita abaixo. É mais fácil usar a Anaconda para instalar o Jupyter Notebook, mas se você já tem Python instalado em seu sistema e não quer lidar com o grande pacote Anaconda, você pode executar `pip3 install jupyter` (para Python 3). - - -## Anaconda - -Anaconda é uma distribuição gratuita de código aberto de Python e R que vem com mais de 1.400 pacotes, o gerenciador de pacotes Conda para instalação de pacotes adicionais, e o navegador Anaconda, que permite gerenciar ambientes (por exemplo, você pode instalar diferentes conjuntos de pacotes para diferentes projetos, para que eles não causem conflitos uns para os outros) usando uma interface gráfica. Após a instalação da Anaconda, você pode usar o navegador Anaconda para instalar novos pacotes (ou `conda install` através da linha de comando), mas muitos pacotes estão disponíveis apenas através de pip (ou seja, usando `pip install` através da linha de comando ou em seu Jupyter Notebook). - -Para a maioria dos propósitos, você deve optar pela versão Python 3 do Anaconda, mas alguns códigos ainda podem ser escritos em Python 2. Nesta lição, você usará Python 3. 
O instalador Anaconda tem mais de 500 MB, e após a instalação pode levar mais de 3 GB de espaço no disco rígido, por isso certifique-se de que você tem espaço suficiente no computador e uma conexão de rede rápida antes de começar. - -
    -Se o espaço do disco rígido é uma preocupação, você pode empacotar um notebook para que ele possa ser executado usando recursos gratuitos de computação em nuvem, em vez de fazer com que os usuários instalem o Anaconda. Isso pode ser especialmente útil em situações de oficina. Veja a seção abaixo. -
    - -Para baixar e instalar a Anaconda, acesse o [site da Anaconda](https://www.anaconda.com/data-science-platform). Certifique-se de ter clicado no ícone do seu sistema operacional (que deve alterar o texto Anaconda [número da versão] para [sistema operacional selecionado], de forma a indicar o seu sistema operacional) e, em seguida, clique no botão Baixar na caixa para a versão atual do Python 3. Se você estiver no Windows, deve baixar um ficheiro `.exe`; em Mac, é `.pkg`; no Linux, é `.sh`. - -Abra normalmente o ficheiro para instalar o software em seu sistema operacional. Mais detalhes de instalação estão disponíveis nos [documentos da Anaconda](https://docs.anaconda.com/anaconda/install/), incluindo como instalar a Anaconda através da linha de comando em cada sistema operacional. Se o computador não conseguir abrir o ficheiro que você baixou, certifique-se de selecionar o sistema operacional correto antes de baixar o instalador. No Windows, não deixe de escolher a opção de "Adicionar Anaconda à PATH Variable" durante o processo de instalação, ou você não poderá lançar Jupyter Notebook a partir da linha de comando. - -## Usando Jupyter Notebook para pesquisa - -Esta lição descreve como você pode inicialmente escrever um Jupyter Notebook para análise de dados como parte de um projeto de pesquisa e, em seguida, adaptá-lo para uso em sala de aula. Embora este exemplo em particular seja extraído de estudos de fãs, ele se concentra na conversão de datas, que é amplamente necessária na análise de dados históricos e literários. - -## Abrindo o Jupyter Notebook - -Supondo que você já tenha instalado a Anaconda como descrito acima, você pode abrir o Anaconda Navigator como qualquer outro aplicativo de software (você pode fechar o prompt sobre a criação de uma conta na nuvem do Anaconda; você não precisa de uma conta para trabalhar com o Anaconda). Na tela inicial, você deve ver um conjunto de ícones e breves sinopses sobre cada aplicativo incluído no Anaconda. 
- -Clique no botão "Iniciar" sob o ícone do Jupyter Notebook. - -{% include figure.html filename="tr-pt-introducao-jupyter-notebooks-1.png" alt="Imagem com captura de tela do interface do Anaconda Navigator" caption="Figura 1. Interface do Anaconda Navigator" %} - -Se você preferir usar a linha de comando em vez do navegador Anaconda, uma vez que você tenha o Anaconda instalado, você deve ser capaz de abrir uma nova janela Terminal (Mac) ou Command Prompt (Win) e executar `jupyter notebook` para iniciar o navegador web com o aplicativo Jupyter Notebook. Se você estiver usando a linha de comando para iniciar o Jupyter Notebook, preste atenção no diretório em que você está quando o iniciar. Essa pasta se torna o diretório doméstico que aparecerá imediatamente na interface Jupyter Notebook, conforme descrito abaixo. - -As duas abordagens abrirão uma nova janela ou guia no seu navegador padrão com a interface Jupyter Notebook. O Jupyter Notebook é baseado no navegador: você só interage com ele através do seu navegador, mesmo quando o Jupyter Notebook está sendo executado no seu próprio computador. - -
    Se você estiver usando notebooks que importam pacotes Python que têm dependências de versões específicas de outros pacotes, você deve configurar um ambiente para usar com esses notebooks, para que você não lide com conflitos de versão (por exemplo, se um notebook requer a versão 1.0 de um pacote, e outro requer a versão 2.0). [A documentação do Anaconda Navegador para Gerenciar Ambientes](https://perma.cc/E9TC-YMCU) (ou, se preferir usar a linha de comando, a [documentação Conda](https://perma.cc/KHB8-U3CT)) fornece instruções passo a passo para criar, atualizar e ativar um ambiente. Para lançar o Jupyter Notebook dentro de um ambiente específico, você precisa primeiro ativar esse ambiente.
    - -## Navegando na interface do Jupyter Notebook - -A interface do gerenciador de ficheiro do Jupyter Notebook é a principal maneira de abrir um ficheiro Jupyter Notebook (.ipynb). Se você tentar abrir em um editor de texto simples, o notebook será exibido como um ficheiro JSON, não com blocos interativos de código. Para visualizar um notebook através da interface Jupyter, você tem que abrir o Jupyter Notebook primeiro (que será exibido em uma janela do navegador), e abrir o ficheiro de dentro do Jupyter Notebook. Infelizmente, não há como definir o Jupyter Notebook como o aplicativo de software padrão para abrir `ficheiro.ipynb` quando você clica duas vezes neles. - -Quando você lança o Jupyter Notebook do navegador Anaconda, ele exibe automaticamente o diretório doméstico. Este é geralmente o diretório com seu nome de usuário em um Mac (/Users/seu nome de usuário). Em um PC geralmente é `C: \` . Se você abrir o Jupyter Notebook a partir da linha de comando, ele exibirá o conteúdo da pasta em que você estava quando o lançou (usando a linha de comando, você também pode lançar diretamente um notebook específico, por exemplo, `jupyter-notebook-example.ipynb`.) - -Para evitar desordenar esta pasta, você pode fazer uma nova pasta dentro deste diretório para seus notebooks. Você pode fazer isso na sua interface usual de gerenciamento de ficheiro(s)(Finder no Mac, ou File Explorer no Windows), ou dentro do próprio Jupyter Notebook, já que o Jupyter Notebook, assim como o Google Drive, fornece uma interface de gerenciamento de ficheiro(s) dentro de um navegador, bem como uma interface de menu e de barra de ferramentas para a criação de ficheiro(s). Para adicionar uma nova pasta no Jupyter Notebook, clique em Novo no canto superior direito e escolha Pasta. Isso criará uma nova pasta chamada "Pasta Sem Título". Para alterar o nome, clique na caixa de seleção à esquerda da "Pasta Sem Título", em seguida, clique no botão "Renomear" que aparece na guia "ficheiro(s)". 
Nomeie os notebooks da pasta. Clique nele para abrir essa pasta. - -## Upload dos dados do exemplo -O ficheiro CSV de exemplo para esta lição é um extrato de metadados de fan fiction de Harry Potter coletados do site de fanfic italiano https://efpfanfic.net, depois limpos usando uma combinação de [expressões regulares](/en/lessons/understanding-regular-expressions) e [OpenRefine](/pt/licoes/limpar-dados-openrefine). O CSV tem três colunas: a classificação da história (similar a uma classificação de filme), a data que foi originalmente publicada, e a data mais recente de atualização. As opções de classificação são verde (verde), giallo (amarelo), arancione (laranja), e rosso (vermelho). A publicação e as datas atualizadas são criadas automaticamente; quando consistente a história é postada no site ou atualizado, assim você pode tomá-las como consistentes. - -Baixe o [ficheiro CSV](/assets/jupyter-notebooks/ph-jupyter-notebook-example.csv). - -Dentro do navegador de ficheiro(s) Jupyter Notebook, você deve estar dentro do diretório de notebooks que acabou de criar. No canto superior direito, clique no botão "Carregar" e carregue o ficheiro CSV de amostra. Será mais fácil de acessar se estiver no mesmo diretório do Jupyter Notebook que você criará na próxima etapa a fim de converter as datas. - -{% include figure.html filename="tr-pt-introducao-jupyter-notebooks-2.png" alt="Imagem com captura de tela sobre o upload de ficheiros no interface Jupyter Notebook" caption="Figura 2. Upload de ficheiro(s) na interface Jupyter Notebook" %} - -Observe que esta não é a única maneira de fazer os ficheiro(s) aparecerem no gerenciador de ficheiro(s) do Jupyter Notebook. A pasta de notebooks que você criou é um diretório regular em seu computador, e assim você também pode usar sua interface usual de gerenciamento de ficheiro(s) (por exemplo, Finder no Mac, ou File Explorer no Windows) para colocar ficheiro(s) `.ipynb` e/ou de dados neste diretório. 
Os Jupyter Notebooks usam a localização do próprio ficheiro do notebook (o `ficheiro.ipynb`) como o caminho de partida padrão. Para oficinas e cursos, pode fazer sentido criar uma pasta onde você pode armazenar o notebook, qualquer imagem anexada e os dados com os quais você vai trabalhar, todos juntos. Se tudo não estiver na mesma pasta, você terá que incluir o caminho ao referenciá-lo ou usar o código Python dentro do notebook para alterar o diretório de trabalho. - -## Criando um novo notebook - -Dentro da pasta de notebooks, crie um novo Jupyter Notebook para converter as datas para o seu projeto de pesquisa. Clique no botão "new" no canto superior direito da interface do gerenciador de ficheiro(s) do Jupyter Notebook. Se você acabou de instalar o Anaconda como descrito acima, sua única opção será criar um Jupyter Notebook usando o _kernel_ Python 3 (o componente de backend que realmente executa o código escrito no notebook), mas vamos discutir abaixo como adicionar kernels para outras linguagens de programação. Clique em "Python 3", e o Jupyter Notebook abrirá uma nova guia com a interface para os próprios Jupyter Notebooks. Por padrão, o notebook será chamado de "Sem título"; você pode clicar nesse texto na parte superior da tela para renomeá-lo. - -{% include figure.html filename="tr-pt-introducao-jupyter-notebooks-3.png" alt="Imagem com captura de tela da interface do Jupyter Notebook para criar novo ficheiro" caption="Figura 3. Criando um novo Jupyter Notebook" %} - -## Trabalhando em Jupyter Notebooks - -Um notebook é composto de células: caixas que contêm código ou texto legível por humanos. Cada célula tem um tipo, que pode ser selecionado a partir das opções drop-down do menu (“menu deslizante”). A opção padrão é "Code"; as caixas de textos legíveis por humanos devem usar o tipo "Markdown" e precisarão ser escritas usando as convenções de formatação do Markdown. 
Para saber mais sobre Markdown, veja a lição do Programming Historian “[Introdução ao Markdown](/pt/licoes/introducao-ao-markdown)”. - -Quando você cria um novo Jupyter Notebook, a primeira célula será uma célula de código. No topo da interface do Jupyter Notebook está uma barra de ferramentas com funções que se aplicam à célula selecionada atualmente. A primeira função do menu deslizante é, por padrão, "Code". Clique nesse menu e selecione "Markdown" (você também pode usar um atalho de teclado, _esc + m_, para alterar a célula atual para Markdown, e _esc + y_ muda de volta para uma célula de código). Vamos começar este caderno com um título e uma breve explicação do que o caderno está fazendo. No momento, isso é apenas para sua própria memória e referência; você não quer investir muito em prosa e formatação nesta fase do projeto, quando você não sabe se você vai acabar usando este código como parte de seu projeto final, ou se você vai usar uma ferramenta ou método diferente. Mas ainda pode ser útil incluir algumas células de marcação com notas para ajudá-lo a reconstruir seu processo. - -Cole o seguinte na primeira célula. Se a primeira linha não aparecer com uma fonte grande (como um cabeçalho), certifique-se de ter selecionado "Markdown" no menu suspenso na parte superior. - - -``` -# Fanfic date conversion -Converting published & updated dates for Italian fanfic into days of the week. -``` - -{% include figure.html filename="tr-pt-introducao-jupyter-notebooks-4.png" alt="Imagem com captura de tela da interface do Jupyter Notebook para editar Markdown" caption="Figura 4. Editando a célula Markdown em um Jupyter Notebook" %} - -Quando você está editando uma célula, você pode usar _Ctrl + Z_ (Win) ou _Command + Z_ (Mac) para desfazer as alterações que você fez. 
Cada célula mantém seu próprio histórico de edição; mesmo que você passe para uma célula diferente e faça edições lá, você pode posteriormente clicar de volta na primeira célula e desfazer suas alterações anteriores lá, sem perder as alterações realizadas para a segunda célula. - -Para deixar o modo de edição e "executar" esta célula (para uma célula Markdown, isso não faz nada, apenas move o cursor mais para baixo no notebook), você pode clicar na barra de ferramentas ou pressione Ctrl+Enter (Ctrl+Return no Mac). Se você quiser retomar a edição mais tarde, você pode clicar duas vezes nela ou selecionar a célula (que mostrará uma linha azul vertical à esquerda uma vez selecionada) clicando-a uma vez e, em seguida, pressionando a tecla Enter (Win) ou Return (Mac). Para deixar o modo de edição, você pode clicar na barra de ferramentas ou pressionar Ctrl+Enter (Ctrl+Return no Mac). Se você quiser executar sua célula atual e adicionar uma nova célula (por padrão, uma célula de código) imediatamente abaixo dela, você pode pressionar Alt+Enter (Option+Enter no Mac). - -Em seguida, você precisa descobrir como fazer a conversão. A busca por termos relevantes pode levá-lo a essa [discussão do StackOverflow](https://perma.cc/JG6H-KZAZ), e a primeira resposta envolve o uso do módulo Python datetime. Como primeiro passo, você precisa importar datetime, usando uma célula de código. Você também sabe que o seu ficheiro de entrada é um CSV, então você deve importar o módulo csv também. - -Para adicionar uma nova célula, clique no botão + (mais) na barra de ferramentas (ou use o atalho do teclado _esc + b_). Isso criará uma nova célula de código abaixo da célula que está atualmente selecionada. 
Crie uma nova célula de código e cole o código a seguir para importar um módulo Python: - - -``` -import datetime -import csv - -``` - -Pensando desde já na possibilidade de compartilhar este notebook ou parte dele, pode ser útil dividir as importações de módulos em células individuais, e colocar o código em si em outra célula, para que você possa incluir uma célula Markdown que explique o que cada uma delas está fazendo. - -Ambos os pacotes que você está importando para este notebook já estão instalados como parte do Anaconda, mas existem muitos pacotes de nicho relevantes para a pesquisa (por exemplo, o [Classic Languages Toolkit, CLTK](https://perma.cc/Q9Q8-9TNZ), para fazer análise de texto em línguas históricas) que não estão incluídos com a Anaconda, e não estão disponíveis através do _instalador conda_. Se você precisa de um pacote como esse, você tem que instalá-lo usando _pip_. Instalar pacotes de dentro do Jupyter notebook pode ser um pouco complicado, porque pode haver diferenças entre o kernel Jupyter que o notebook está usando, e outras versões do Python que você pode ter instalado no seu computador. Você pode encontrar uma longa discussão técnica sobre esses problemas neste [post de blog](https://perma.cc/N6M6-ZF5G). - -Se você está trabalhando em um notebook que deseja compartilhar, e ele inclui pacotes menos comuns, você pode incluir uma célula Markdown instruindo os usuários a instalar os pacotes com antecedência usando conda ou pip, ou você pode usar: - -``` -import sys -!conda install --yes --prefix {sys.prefix} YourModuleNameHere - -``` - -para instalar algo do notebook usando conda; a sintaxe `!` indica que o código está executando algo da linha de comando, em vez do kernel Jupyter. 
Ou, se o pacote não estiver disponível na conda (muitos pacotes de nicho relevantes para a pesquisa não estão), você pode usar `pip`: - -``` -import sys - -!{sys.executable} -m pip install YourModuleNameHere - -``` - -Se você não tinha instalado o Python no computador antes de instalar o Anaconda para esta lição, talvez seja necessário adicionar o pacote pip para poder usá-lo para instalar outros pacotes. Você pode adicioná-lo através da GUI (interface gráfica do usuário) do navegador Anaconda, ou executar `conda install pip` a partir da linha de comando. - -Voltando ao nosso exemplo, em seguida adicione uma nova célula de código e cole o seguinte código (certifique-se de que incluiu os espaçamentos): - -``` -with open('ph-jupyter-notebook-example.csv') as f: - csv_reader = csv.reader(f, delimiter=',') - for row in csv_reader: - datetime.datetime.strptime(row[1], '%d/%m/%Y').strftime('%A') - print(row) -``` - -Clicar no botão 'play' na barra de ferramentas quando você tem uma célula de código selecionada executa o código dentro da célula (se você tentar executar este código depois de executar as declarações de importação, verá um erro: "ValueError: time data ‘1/7/18’ does not match format ‘%d/%m/%Y’". Não se preocupe, vamos depurar isso a seguir). - -Depois de executar uma célula de código, um número aparecerá entre colchetes à esquerda da célula. Este número indica a ordem em que a célula foi executada. Se você voltar e executar a célula novamente, o número é atualizado. - -Se um número não aparecer imediatamente ao lado da célula, você verá um asterisco entre os colchetes. Isso significa que a célula de código não terminou de ser executada. Isso é comum para códigos de computação intensiva (por exemplo, processamento de linguagem natural) ou tarefas de longa duração, como extração de conteúdo na web. Sempre que uma célula de código está sendo executada, o favicon na guia do navegador do notebook muda para uma ampulheta. 
Se você quiser alterar as guias e fazer outra coisa enquanto o código estiver em execução, você pode saber que a ação anterior foi concluída quando a ampulheta muda de volta para o ícone do notebook. - - -{% include figure.html filename="tr-pt-introducao-jupyter-notebooks-5.png" alt="Imagem com captura de tela sobre a execução de código no Jupyter Notebook" caption="Figura 5. Executando uma célula de código em um Jupyter Notebook" %} - -``` -O Jupyter notebook funciona melhor se você executar as células sequencialmente. Às vezes, você pode obter erros ou saídas incorretas se executar as células fora de ordem ou tentar editar e executar iterativamente diferentes partes do notebook. Se você fez muitas alterações e executou blocos de código de forma não linear e descobrir que você está recebendo uma saída estranha, você pode redefinir o Jupyter Notebook clicando no _Kernel_ no menu e escolhendo _Restart & Clear Output_. Mesmo que você não tenha notado nada de estranho, é uma boa ideia utilizar o Restart & Clear Output em seu código, uma vez que você tenha terminado de escrevê-lo, para ter certeza de que o resultado está correto. -``` - -Depois de executar a segunda célula de código, você verá um erro. Para descobrir o que está acontecendo, você pode consultar a -[documentação do datetime](https://perma.cc/S92Z-3QVM) que explica cada uma das diferentes opções de formatação. Lá, você verá que a única opção de valores para “dia” assume o uso de dois dígitos (ou seja, dias de um dígito são prefixados com um 0). Olhando para os dados do exemplo, os meses (listados em segundo lugar nesta ordem de data) já são acrescidos de zero, quando têm apenas um dígito, mas não os dias. Você tem duas opções: você pode tentar alterar os dados, ou você pode tentar alterar seu código. - -Digamos que você queira tentar uma abordagem diferente, mas quer deixar o que você fez até agora, no caso de você querer revisitar esse código, e talvez usá-lo depois de alterar os dados. 
Para lembrar do que aconteceu, adicione uma célula Markdown acima da sua segunda célula do código. Clique na primeira célula do código e clique no botão mais na barra de ferramentas. Se você clicar no botão de adição na barra de ferramentas depois de executar a última célula de código, a nova célula aparecerá na parte inferior do notebook. Você pode movê-la para onde quiser clicando no botão de seta para cima. Certifique-se de que está no modo Markdown e cole o seguinte texto: - -``` - ### Não funciona, precisa de datas precedidas por zero - [documentação do datetime](https://docs.python.org/2/library/datetime.html?highlight=strftime#strftime-and-strptime-behavior). - Modificar o ficheiro de origem? - -``` - -Lendo ainda mais na [discussão do StackOverflow](https://perma.cc/EN55-P57H), há outra abordagem que usa uma biblioteca diferente, dateutil, que parece ser mais flexível com os tipos de datas que ela aceita. Volte para a célula usada para importar módulos e edite-a para adicionar a nova biblioteca (em qualquer lugar dessa célula, desde que cada declaração de importação esteja em sua própria linha): - -``` -import dateutil - -``` - -Re-execute essa célula de código; note que o número ao lado da célula muda na segunda vez que você executá-lo. - -Agora crie uma nova célula Markdown na parte inferior do notebook e cole: - - -``` -#### tentando dateutil para analisar datas, conforme https://stackoverflow.com/a/16115575 - -``` - -Abaixo dele, adicione uma nova célula de código com o seguinte código (prestando atenção ao espaçamento, de modo que o código seja indentado assim como você vê abaixo): - -``` -with open('ph-jupyter-notebook-example.csv') as f: - csv_reader = csv.reader(f, delimiter=',') - for row in csv_reader: - parseddate = dateutil.parser.parse(row[1]) - print(parseddate) -``` - -Execute a célula com o código que você acabou de adicionar. 
Pode levar mais tempo; continue esperando até que o asterisco ao lado da célula de código se transforme em um número. O resultado deve mostrar a lista de datas de publicação, formatadas de forma diferente, com hífen em vez de barras, e com a adição das horas, minutos e segundos (como zeros, porque as datas registradas não incluem esses dados). À primeira vista, parece que funcionou, mas se você compará-lo mais de perto com o ficheiro de origem, você verá que o módulo dateutil não está sendo consistente em como analisa as datas. Datas em que o valor do dia é maior que 12 estão sendo analisadas corretamente (ele sabe que um valor maior que 12 não pode ser um mês), mas quando o valor da data é 12 ou menos, a data está sendo identificada com o mês primeiro. A primeira linha do ficheiro de origem tem a data 1/7/18, que é entendida como "2018-01-07 00:00:00". Na documentação para dateutil, você descobrirá que você pode [especificar `dayfirst=true`](https://perma.cc/W54E-SP5Z) para corrigir isso. Edite a última célula de código e altere a penúltima linha para ler: - -``` -parseddate = dateutil.parser.parse(row[1], dayfirst=True) - - ``` - -Quando você executar a linha novamente, você verá que todas as datas foram analisadas corretamente. - -Analisar a data é apenas o primeiro passo – você ainda precisa usar o módulo datetime para converter as datas em dias da semana. - -Exclua a última linha do bloco de código e substitua-a pelo seguinte (certificando-se de que você tenha o mesmo nível de recuo da última linha anterior, para ambas as linhas): - - -``` -dayofweek = datetime.date.strftime(parseddate, '%A') - -print(dayofweek) - -``` - -Execute o bloco de códigos novamente. Isso deve lhe dar uma lista de dias da semana. - -Agora que você tem código para analisar e re-formatar uma data, você precisa fazê-lo para ambas as datas em cada linha do seu ficheiro de origem. 
Porque você sabe que tem código funcionante na célula de código atual, se você não se sentir muito confortável com Python, você pode querer copiar a célula de código atual antes de fazer modificações. Selecione a célula que deseja copiar e clique no botão copiar na barra de ferramentas; o botão de colar irá colar a célula abaixo de qualquer célula atualmente selecionada. Fazer uma cópia permite que você faça livremente alterações no código, sabendo que você sempre pode voltar facilmente para uma versão que funciona. - -Se você não quiser resolver isso por conta própria, você pode copiar e colar esse código em uma nova célula de código ou substituir a célula de código atual: - -``` -#identifica o ficheiro fonte a ser aberto, chama-o f -with open('ph-jupyter-notebook-example.csv') as f: - #cria um ficheiro de saída (referido como "out" no notebook) para ser gravado - with open('ph-jupyter-notebook-example-dayofweek.csv', 'w') as out: - #define "csv_reader" como executando a função csv.reader no ficheiro - csv_reader = csv.reader(f, delimiter=',') - #define "csv_writer" como executando a função csv.writer para "out" (o ficheiro de saída) - csv_writer = csv.writer(out) - #para cada linha que está sendo lida pelo csv_reader... - for row in csv_reader: - #define "csv_reader" como executando a função csv.reader no ficheiro - csv_reader = csv.reader(f, delimiter=',') - #para cada linha que está sendo lida pelo csv_reader... 
- for row in csv_reader: - #cria uma lista chamada "values" com o conteúdo da linha - values = list(row) - #define "rating" como a primeira coisa na lista - #contagem em Python começa com 0, não 1 - rating = values[0] - #define "parseddatepub" como a segunda coisa (1, porque começamos com 0) na lista, - #convertido em um formato de data padrão usando dateutil.parser - #e quando essas datas são analisadas, o analisador deve saber - #que o primeiro valor na sequência é o dia - parseddatepub = dateutil.parser.parse(values[1], dayfirst=True) - #mesmo que acima para a data atualizada, a terceira coisa (2) na lista - parseddateupdate = dateutil.parser.parse(values[2], dayfirst=True) - #define "dayofweekpub" como parseddatepub (definido acima), convertido para o dia da semana - #%A é usado para mudar para o dia da semana - #Pode ver outros formatos aqui: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior - dayofweekpub = datetime.date.strftime(parseddatepub, '%A') - #mesma coisa para data de atualização - dayofweekupdate = datetime.date.strftime(parseddateupdate, '%A') - #cria uma lista da classificação e as novas datas formatadas - updatedvalues = [rating, dayofweekpub, dayofweekupdate] - #escreve todos os valores nesta célula de código - csv_writer.writerow(updatedvalues) - print(updatedvalues) -``` - -Depois de executar este código, você terá um novo ficheiro ph-jupyter-notebook-exemplo-dayofweek.csv, com seus dados no formato que você precisa para a análise. - -Agora que você tem um código que funciona para converter as datas do formulário que você tem para o formulário que você precisa, você pode limpar as falsas partidas e notas para si mesmo. Você vai querer manter o primeiro código com as declarações de importação, e a primeira célula Markdown com o título e a descrição, mas você deve excluir outras células de código e Markdown que não são o seu código final. 
Para excluir uma célula, clique nela e clique no botão tesoura na barra de ferramentas. Se você excluir uma célula por engano, você pode clicar em Editar no menu e escolher "Desfazer excluir células". - -## Salvando, exportando e publicando Jupyter Notebooks - -O Jupyter salva automaticamente seu trabalho de forma periódica, criando "pontos de verificação". Se algo der errado com seu notebook, você pode reverter para um ponto de verificação anterior indo em "File", em seguida, "Revert to Checkpoint", e escolhendo um horário. Dito isto, ainda é importante salvar seu notebook (usando o botão de salvar), porque se você fechar e desligar o kernel do notebook (incluindo reiniciar o kernel), os pontos de verificação serão perdidos. - -Você também pode baixar o notebook (_File> Download as_) em vários formatos de ficheiro diferentes. Baixar o formato Notebook (`.ipynb`) é útil se você quiser compartilhar seu código em seu formato completo de notebook. Você também pode baixá-lo como código em qualquer linguagem em que seu notebook estiver (por exemplo, `.r` se em R ou `.py` se Python ou `.js` se JavaScript), como um ficheiro de `.html`, como um ficheiro de marcação (`.md`) ou como um PDF via LaTeX. Se você baixá-lo como código, as células Markdown se tornam comentários (se você quiser converter um ficheiro, `ficheiro.ipynb` para outro formato depois de baixá-lo, você pode usar a ferramenta [nbconvert](https://perma.cc/6J73-KCK5)). - -Se você está trabalhando em um projeto de pesquisa, você pode usar um Jupyter notebook, ou uma série de notebooks, ao longo do caminho para acompanhar seu fluxo de trabalho. Alguns estudiosos postam esses cadernos no GitHub, juntamente com slides ou PDFs de pôsteres e dados de origem (ou metadados, se os direitos autorais permitirem), para acompanhar apresentações e palestras. O GitHub renderiza versões não interativas de ficheiro(s) de notebook, para que possam ser visualizados dentro de um repositório. 
Alternativamente, você pode colar a URL de um repositório do GitHub que tem notebooks Jupyter em [nbviewer,](https://nbviewer.jupyter.org/) o que às vezes pode ser uma visualização mais rápida e confiável. Você pode querer incluir uma célula Markdown com uma citação recomendada para o seu Jupyter notebook, e uma referência para o repositório do GitHub onde ela está armazenada, especialmente se o seu notebook inclui código que outros possam reutilizar para análises semelhantes. - -O código que você acabou de desenvolver como parte desta lição pertence a algum lugar no meio de um projeto real. Se você estiver usando notebooks para documentar seu fluxo de trabalho, você pode optar por adicionar a nova célula de código a um notebook existente, em vez de baixá-lo como um notebook separado e autônomo. Os Jupyter notebooks podem ser particularmente úteis para documentar fluxos de trabalho de projetos quando você está trabalhando com colaboradores que só podem estar envolvidos por um curto período de tempo (como estagiários de graduação no período de férias escolares). Com colaboradores de curto prazo, é importante ajudá-los a entender e começar a usar os fluxos de trabalho do projeto sem muito tempo de iniciação, e os Jupyter notebooks podem definir esses fluxos de trabalho passo a passo, explicar onde e como os ficheiro(s) são armazenados e fornecer dicas para tutoriais externos e materiais de treinamento para ajudar os colaboradores que estão menos familiarizados com os fundamentos técnicos do projeto a serem iniciados. Por exemplo, dois projetos que usaram Jupyter notebooks para publicar fluxos de trabalho são o Projeto Realismo Socialista de Sarah McEleney e a [“mineração de texto da literatura infantil inglesa 1789-1914 para a representação de insetos e outros rastejantes assustadores”](https://perma.cc/JD8N-P79G). 
- -À medida que seu projeto progride, se você estiver publicando através de canais de acesso aberto e se seus conjuntos de dados podem ser compartilhados livremente, os Jupyter notebooks podem fornecer um formato ideal para tornar o código que sustenta seu argumento acadêmico visível, testável e reutilizável. Embora os periódicos e publicações possam não aceitar os Jupyter notebooks como um formato de submissão, você pode desenvolver uma "versão" do seu artigo que inclui o texto completo (como células Markdown), com células de código integradas ao fluxo da narrativa acadêmica como uma ilustração imediatamente acessível da análise que você está descrevendo. Você também pode incluir as células de código que compõem os fluxos de trabalho de preparação de dados como um apêndice, seja no mesmo notebook, ou em um separado. Integrar o código com o texto de um artigo acadêmico torna muito mais provável que os leitores realmente se envolvam com o código, já que eles podem simplesmente executá-lo dentro do mesmo caderno onde estão lendo o argumento. Alguns estudiosos, particularmente na Europa, também postam seus cadernos no [Zenodo](https://zenodo.org/), um repositório para dados de pesquisa, independentemente do país de origem, financiador ou disciplina. O Zenodo suporta conjuntos de dados de até 50 GB (vs. o limite de tamanho de ficheiro de 100 MB no GitHub), e fornece DOIs para o material carregado, incluindo notebooks. Alguns estudiosos combinam arquivamento no Zenodo para sustentabilidade com a publicação no GitHub para facilitar a sua descoberta, incluindo o Zenodo DOI como parte do ficheiro readme.md no repositório do GitHub que inclui os notebooks. Como exemplo, o caderno de workshop ["Análise de Dados Aplicados" por Giovanni Colavizza e Matteo Romanello para o DHOxSS 2019](https://perma.cc/6S7H-LQEA) é publicado no GitHub, mas inclui [um Zenodo DOI](https://doi.org/10.5281/zenodo.3352830). 
- -Embora a argumentação e o código totalmente integrados ainda sejam difíceis de encontrar devido à falta de um local para publicar esse tipo de trabalho, os estudiosos começaram a usar os Jupyter notebooks como um passo incremental mais interativo para publicações computacionais dinâmicas. José Calvo tem um exemplo de um [caderno acompanhando um artigo sobre estilometria](https://perma.cc/Y9CK-CFK8) (em espanhol), e Jed Dobson publicou um [conjunto de cadernos](https://perma.cc/UDA3-467P) para acompanhar seu livro Critical Digital Humanities: The Search for a Methodology, que aborda diretamente os Jupyter Notebooks como objetos acadêmicos (p.39-41). - -## Usando Jupyter Notebook para ensinar - -O Jupyter Notebook é uma ótima ferramenta para ensinar programação, ou para ensinar conceitos como modelagem de tópicos ou vetores de palavras que envolvem programação. A capacidade de fornecer instruções e explicações como Markdown permite que os educadores forneçam notas detalhadas sobre o código alternando células Markdown e células de código, de modo que o texto de Markdown explique o código na célula logo abaixo. Isso é útil para oficinas práticas, pois as instruções e o código podem ser escritos com antecedência. Isso permite que os participantes abram o notebook, baixem um conjunto de dados e executem o código conforme está. Se você espera ministrar uma oficina onde os alunos terão diferentes níveis de familiaridade com a programação, você pode configurar o notebook para ter tarefas suplementares para os alunos que se sentem confortáveis em modificar o código. Ao mesmo tempo, mesmo os alunos que hesitam em tocar no código ainda poderão alcançar o resultado principal da oficina apenas executando células de código pré-escritas. - -Como outra abordagem, você também pode usar Jupyter notebooks para escrever código na medida em que o desenvolve. Em tal oficina, os alunos podem começar com um caderno em branco, e escrever o código junto com você. 
As células ajudam a segmentar o código à medida que você o escreve, em vez de usar um editor de texto ou IDE (Ambiente de Desenvolvimento Integrado) que não quebra o código de forma tão clara e pode causar confusão, especialmente quando ensina iniciantes. - -Você pode usar Jupyter notebooks para tarefas em sala de aula dando instruções em Markdown e fazendo com que os alunos escrevam código em uma célula em branco com base nas instruções. Dessa forma, você pode criar uma tarefa de programação interativa que ensina aos alunos não apenas a sintaxe e o vocabulário de uma linguagem de programação, mas também pode explicar as melhores práticas de programação em geral. - -Se você já está usando Jupyter notebooks para documentar o fluxo de trabalho do seu projeto, você pode ser capaz de reformular esses cadernos de pesquisa para uso em sala de aula, como uma maneira de trazer sua pesquisa para a sala de aula. Este [exemplo de caderno pedagógico](/assets/jupyter-notebooks/ph-jupyter-notebook-example.ipynb) é um híbrido de algumas das abordagens pedagógicas descritas acima. A primeira seção do caderno destina-se a estudantes que têm pouca ou nenhuma experiência anterior executando o código; o principal resultado do aprendizado é comparar o tempo necessário para converter manualmente formatos de dados, em comparação com fazê-lo com código. Você poderia usar este caderno para uma sessão de laboratório prática em uma introdução às humanidades digitais ou história digital, onde todos os alunos instalam Anaconda e aprendem o básico do Jupyter Notebook. Se a turma tem uma mistura de alunos sem formação técnica e alunos com exposição prévia ao Python, você pode orientar os alunos com experiência de programação a trabalhar em conjunto em grupos de dois ou três para propor soluções para os prompts na segunda parte do notebook. 
Tenha em mente que se você usar uma tarefa de classe como esta como uma forma de fazer com que os alunos de ciência da computação escrevam código que ajude seu projeto de pesquisa, eles devem ser creditados como colaboradores e reconhecidos em publicações subsequentes vindas do projeto.[^4] - -Existem muitos cursos e workshops de 'Introdução ao Python' nas Humanidades Digitais que utilizam Jupyter Notebook (incluindo [Introdução ao Python e Desenvolvimento web com Python para as Humanidades](https://perma.cc/ANL2-K7SM) de Thibault Clérice, traduzido a partir do material de Matthew Munson). O Jupyter Notebook também é comumente usado em oficinas de análise de texto, como a [oficina de vetores de palavras na DH 2018](https://perma.cc/5UZ9-25XW), ministrada por Eun Seo Jo, Javier de la Rosa e Scott Bailey. - -Ensinar com Jupyter Notebook nem sempre tem que envolver o processo demorado de baixar e instalar a Anaconda, especialmente se você está imaginando ter apenas uma ou duas lições que envolvem notebooks. Se suas atividades em sala de aula com Jupyter notebooks envolvem o uso de dados de exemplo que você já preparou, e se você já escreveu pelo menos parte do código, você pode querer explorar a execução de Jupyter Notebooks usando recursos gratuitos de computação em nuvem, desde que seus alunos tenham a garantia de ter conectividade confiável com a internet em sala de aula. Rodar notebooks na nuvem também fornece um ambiente consistente para todos os alunos, poupando você de ter que negociar diferenças entre Windows e Mac, ou fornecer uma alternativa para estudantes cujos laptops não têm espaço ou memória para executar Anaconda efetivamente. - -Como as opções estão evoluindo rapidamente, é melhor usar seu mecanismo de busca favorito para encontrar uma lista mais atualizada com opções de computação em nuvem para Jupyter Notebook. Um projeto que tem visto uma absorção particular entre usuários acadêmicos de notebooks é o [MyBinder](https://mybinder.org/). 
Ele toma um repositório do GitHub que contém ficheiro(s) `.ipynb` do Jupyter Notebook, quaisquer dados relacionados (imagens incorporadas, conjuntos de dados nos quais você deseja usar os notebooks, etc.) e as informações sobre pacotes e dependências necessários (em um ficheiro `requirements.txt` ou `environment.yml`), e o torna inicializável usando um servidor de nuvem. Uma vez que o MyBinder tenha empacotado o seu repo GitHub, você pode adicionar um "crachá" binder ao ficheiro readme para o repo. Quem estiver vendo o repositório pode lançar o notebook diretamente do seu navegador, sem ter que baixar ou instalar nada. - -Como os dados que o notebook precisa acessar devem ser incluídos no repo, isso não funcionará para todas as situações (por exemplo, se os dados não podem ser redistribuídos legalmente no GitHub, excede o tamanho máximo de ficheiro(s) do GitHub e não podem ser baixados de outros lugares como parte da configuração do ambiente Binder, ou se você quiser que as pessoas usem o notebook com seus próprios dados), mas é uma ótima opção para oficinas ou aulas onde todos estão trabalhando com os mesmos dados compartilháveis. - -Se você quiser começar a explorar opções de nuvem, Shawn Graham criou [alguns modelos para configurar notebooks Python e R Jupyter para uso no Binder](https://perma.cc/T25E-BFH4). - -Finalmente, se você precisa manter seus notebooks fora da nuvem (por exemplo, devido a dados sensíveis ou de outra forma restritos), mas quiser fornecer um ambiente consistente para todos os seus alunos, você pode explorar o [JupyterHub](https://perma.cc/8EH7-N22K), que tem sido adotado como infraestrutura técnica central para um número crescente de programas de ciência de dados. - -## Convertendo códigos Python - -Mesmo que você goste da ideia de usar Jupyter Notebooks, qualquer conversão de formato requer trabalho adicional. Se você já tem seu código escrito como scripts Python, a conversão para os Jupyter Notebooks é bastante simples. 
Você pode copiar e colar o código do seu ficheiro.py em uma única célula de código de um novo notebook e, em seguida, dividir a célula de código em segmentos e adicionar células de Markdown adicionais conforme necessário. - -Alternativamente, pode ser mais fácil segmentar à medida que você transfere o código, copiando um segmento de cada vez em uma nova célula de código. Qualquer método funciona e é uma questão de preferência pessoal. - -Há também ferramentas como o [pacote 'p2j'](https://perma.cc/5YUE-YBH7) que convertem automaticamente o código Python existente em notebooks Jupyter, seguindo um conjunto documentado de convenções (por exemplo, transformando comentários em células Markdown). - -## Cadernos Jupyter para outras linguagens de programação - -Os Jupyter Notebooks permitem que você use muitas linguagens de programação diferentes, incluindo R, Julia, JavaScript, PHP ou Ruby. Uma lista atual de linguagens disponíveis pode ser encontrada na página do [Jupyter Kernels](https://perma.cc/B448-XMJQ) GitHub. - -Enquanto o Python é suportado por padrão quando você instala o Jupyter Notebook através da Anaconda, as outras linguagens de programação precisam ter seus núcleos de linguagens instalados antes que eles possam ser executados no Jupyter Notebook. As instruções de instalação são diferentes para cada núcleo de linguagem, por isso é melhor apenas encontrar e seguir as instruções para a sua linguagem preferida. Pelo menos para R, isso é relativamente simples. A página Jupyter Kernels GitHub tem links para instruções para todos os kernels de linguagens disponíveis. - -Uma vez que você tenha o kernel para a linguagem desejada instalado, você pode executar cadernos escritos nessa linguagem de programação, ou você pode criar seus próprios cadernos que executam essa linguagem. Cada linguagem com um kernel instalado em seu computador estará disponível como uma opção quando você criar um novo notebook como descrito acima. 
- -Como exemplo de um notebook R, [veja esta adaptação jupyter do código R de Andrew Piper de "Enumerações"](https://perma.cc/656B-U9SB). - -## Dimensionando a computação com Jupyter Notebooks - -Especialmente se você é novo em programar em Python, apenas conseguir fazer qualquer coisa funcionar pode parecer uma vitória. No entanto, se você começar a trabalhar com conjuntos de dados maiores, poderá descobrir que algumas das “soluções” iniciais encontradas (como usar `readlines()` para ler um ficheiro de texto linha por linha) se tornam computacionalmente ineficientes, a ponto de causar problemas. Uma maneira de começar a entender as ineficiências em seu código é adicionar `%%timeit` ao topo de uma célula. O notebook escolherá um número de iterações para executar o código, dependendo da complexidade da tarefa, e imprimirá o número de iterações e o tempo médio. Fazer várias iterações, em vez de apenas uma, pode ser útil para contabilizar pequenos atrasos no âmbito do sistema (por exemplo, se seu laptop estiver momentaneamente atolado com outros processos). Para cronometrar apenas uma linha de uma célula, em vez da célula inteira, você pode colocar `%timeit` na frente da linha. Tenha cuidado com a forma como usa esses comandos: ordenar uma lista levará muito mais tempo na primeira iteração do que na segunda, depois que a lista já estiver em ordem. Em casos como a ordenação de listas, em que não faz sentido medir várias iterações, ou para tarefas de longa duração onde pequenos atrasos no sistema não terão um impacto significativo, você pode usar `%%time` no topo de uma célula ou `%time` na frente de uma linha, que mede o tempo que uma única execução leva. Esses comandos fazem parte de uma família de “comandos mágicos” integrados disponíveis em Jupyter Notebooks. Veja a [documentação do Jupyter](https://perma.cc/ED9F-DNDA) para saber mais detalhes. 
- -Ter alguma ideia de aumento do tempo previsto para ser implementado é um requisito necessário para aumentar o tempo dos clusters em uso, como no caso dos clusters de programação de alto desempenho (HPC) financiados de forma centralizadamente, disponíveis em muitas instituições. A maioria esmagadora dos pesquisadores que usam esses recursos está nas ciências duras, mas geralmente qualquer membro do corpo docente pode solicitar acesso. É possível que você também possa ter acesso a recursos de HPC regionais ou nacionais. Esses recursos de computação podem acelerar significativamente grandes trabalhos de computação, especialmente tarefas como modelagem 3D que podem tirar proveito de nós computacionais com poderosas unidades de processamento gráfico (GPUs). Aprender a usar clusters HPC é um tópico suficientemente grande para sua própria lição, mas os notebooks Jupyter podem permitir que você pegue um atalho. Alguns grupos de computação de pesquisa oferecem maneiras mais fáceis para os pesquisadores executarem Jupyter Notebooks usando recursos de cluster HPC, e você pode encontrar [vários guias e exemplos de uso geral](https://perma.cc/A5R4-9ZD7) para fazê-lo. Se você conseguir acesso aos recursos do HPC, vale a pena contatar a equipe de TI que com computação para uma área de e pesquisar sobre como você pode executar o Jupyter Notebook caso você não lidou com sua redação a respeito no site da sua instituição. O TI que trabalha majoritariamente com pesquisa pode se comunicar de forma brusca do que você é de forma mais pessoal, mas não permite que a maioria dos humanos querem, porque usam a diversidade da sua base de usuários é importante para suas medidas de atuação na universidade. - -## Conclusão -Desde a experimentação do código até a documentação de fluxos de trabalho, da pedagogia à publicação acadêmica, o Jupyter Notebook é uma ferramenta flexível e multiuso que pode apoiar a pesquisa digital em diversos contextos. 
Mesmo que você não tenha certeza de como exatamente você vai usá-los, é bastante fácil instalar o software Jupyter Notebook e baixar e explorar notebooks existentes, ou experimentar alguns dos seus próprios. Os Jupyter Notebooks têm uma grande promessa de fazer a ponte das facetas críticas e computacionais da pesquisa de humanidades digitais. Para concluir com uma citação de Jed Dobson's _Critical Digital Humanities: The Search for a Methodology_. - ->Notebooks são teoria - não apenas código como teoria, mas teoria como reflexo reflexivo com o trabalho teórico e implicações do próprio código. As normas disciplinares, incluindo enquadramento contextual, teoria e autocrítica, precisam acompanhar, complementar e informar qualquer crítica computacional. Revelar o máximo possível do código, dos dados e dos métodos é essencial para permitir a conversa disciplinar em curso. Compilando-os juntos em um único objeto, que pode ser exportado, compartilhado, examinado e executado por outros, produz um tipo dinâmico de teorização que é modular, mas firmemente ligado ao seu objeto.[^5] - -## Links -- Uma lista crescente de notebooks [Jupyter para DH](https://perma.cc/V5JX-VPP8), em múltiplas linguagens humanas e de programação. Obrigado a todos que enviaram sugestões no Twitter; referências adicionais são bem-vindas. - - Uma descrição técnica detalhada da [instalação de pacotes Python do Jupyter](https://perma.cc/N6M6-ZF5G). - -## Agradecimentos -- Obrigado a Stéfan Sinclair pelas referências a discussões anteriores sobre o uso de notebooks em humanidades digitais. - -- Obrigado a Rachel Midura por sugerir o uso de Jupyter Notebooks para colaboração. - -[^1]: Knuth, Donald. 1992. Literate Programming Stanford, Califórnia: Centro para o Estudo da Linguagem e da Informação. - -[^2]: Millman, KJ e Fernando Perez. 2014. “Developing open source scientific practice”. In Implementing Reproducible Research, Ed. Victoria Stodden, Friedrich Leisch, and Roger D. Peng. 
[https://osf.io/h9gsd/](https://perma.cc/M8R7-9JTL) - -[^3]: Sinclair, Stéfan & Geoffrey Rockwell. 2013. “Voyant Notebooks: Literate Programming and Programming Literacy”. Journal of Digital Humanities, Vol. 2, No. 3 Summer 2013. [http://journalofdigitalhumanities.org/2-3/voyant-notebooks-literate-programming-and-programming-literacy/](https://perma.cc/R253-BP2B) - -[^4]: Haley Di Pressi, Stephanie Gorman, Miriam Posner, Raphael Sasayama, and Tori Schmitt, with contributions from Roderic Crooks, Megan Driscoll, Amy Earhart, Spencer Keralis, Tiffany Naiman, and Todd Presner. “A Student Collaborator’s Bill of Rights”. [https://humtech.ucla.edu/news/a-student-collaborators-bill-of-rights/](https://perma.cc/A8G2-BBL9) - -[^5]: Dobson, James. 2019. Critical Digital Humanities: The Search for a Methodology. Urbana-Champaign: University of Illinois Press. p. 40. +--- +title: "Introdução ao Jupyter Notebook" +slug: introducao-jupyter-notebooks +original: jupyter-notebooks +layout: lesson +collection: lessons +date: 2019-12-08 +translation_date: 2023-06-02 +authors: +- Quinn Dombrowski +- Tassie Gniady +- David Kloster +reviewers: +- Patrick Burns +- Jeri Wieringa +editors: +- Brandon Walsh +translator: +- Vânia Rosa +translation-editor: +- Jimmy Medeiros +translation-reviewer: +- Juliana Marques +- Caio Mello +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/431 +difficulty: 1 +activity: presenting +topics: [python, website] +abstract: Jupyter Notebook fornece um ambiente onde você pode trabalhar com facilidade o seu código na linguagem Python. Esta lição descreve como instalar o software Jupyter Notebook, como executar e criar ficheiros para o Jupyter Notebook. 
+avatar_alt: O planeta Júpiter +doi: 10.46430/phpt0043 +--- + +{% include toc.html %} + +## Introdução + +Quando a computação é uma parte intrínseca de sua prática de pesquisa, como você publica um argumento acadêmico de forma que torne o código tão acessível e legível como a prosa que o acompanha? Na área das humanidades, a publicação de uma pesquisa assume principalmente a forma de prosa escrita, artigo ou monografia. Embora as editoras estejam cada vez mais abertas à inclusão de códigos suplementares ou outros materiais, tal arranjo inerentemente os relega a um estatuto secundário relativo ao texto escrito. + +E se você pudesse publicar sua pesquisa em um formato que desse um peso equilibrado entre a prosa e o código? A realidade das atuais diretrizes de publicação acadêmica significa que a separação forçosa do seu código e da argumentação pode ser uma necessidade, e sua reunificação pode ser impossível sem que se navegue por numerosos obstáculos. Atualmente o código é tipicamente publicado em separado no GitHub ou em outro repositório, caso no qual os leitores têm que procurar uma nota de rodapé no texto para descobrir quais scripts estão sendo referenciados, encontrar a URL do repositório, acessar a URL, procurar os scripts, baixá-los e também os ficheiro(s) de dados associados, e então executar os códigos. No entanto, se você tiver os direitos e permissões necessários para republicar o texto de sua pesquisa em outro formato, o Jupyter Notebook fornece um ambiente onde código e prosa podem ser justapostos e apresentados com igual peso e valor. + +Os Jupyter Notebooks têm visto uma adoção entusiástica na comunidade de ciência de dados, a ponto de cada vez mais substituir o Microsoft Word como um ambiente padrão de escrita da pesquisa. Dentro da literatura de humanidades digitais, pode-se encontrar referência a Jupyter Notebooks (separados do iPython, ou Python interativo, notebooks em 2014) desde 2015. 
+ +Os Jupyter Notebooks também ganharam força nas humanidades digitais como uma ferramenta pedagógica. Diversos tutoriais do Programming Historian, como [Mineração de texto em Python através do leitor de recursos HTRC](/en/lessons/text-mining-with-extracted-features), e [Extraindo páginas ilustradas de bibliotecas digitais com python](/pt/licoes/extrair-paginas-ilustradas-com-python#jupyter-notebooks), assim como outros materiais pedagógicos para oficinas fazem referência à colocação de código em um Jupyter Notebook ou ao uso do Jupyter Notebook para orientar os estudantes, permitindo que eles remixem e editem o código livremente. O formato do notebook é ideal para o ensino, especialmente quando os estudantes têm diferentes níveis de proficiência técnica e de conforto com escrita e edição dos códigos. + +O objetivo dos Jupyter Notebooks é fornecer uma interface mais acessível para o código usado em pesquisa ou práticas pedagógicas com suporte digital. Ferramentas como os Jupyter Notebook são menos significativas para aprender ou ensinar no vácuo, porque os Jupyter Notebooks em si não fazem nada para promover diretamente a pesquisa ou a pedagogia. Antes de começar esta lição, pense no que você quer obter usando Jupyter Notebooks. Deseja organizar o fluxo de trabalho do seu projeto? Você quer trabalhar analisando seus dados, acompanhando as coisas que você tenta ao longo do caminho? Você quer que os leitores da sua pesquisa possam seguir os lados teóricos e técnicos do seu argumento sem alternar entre um PDF e uma pasta de scripts? Quer ministrar oficinas de programação mais acessíveis aos participantes com uma gama de conhecimentos técnicos? Você quer usar ou adaptar notebooks que outras pessoas escreveram? Tenha seu objetivo em mente enquanto você trabalha nesta lição. Dependendo de como você imagina usar Jupyter Notebooks, você pode ser capaz de pular seções que são mais aplicáveis em outro contexto. 
+ +## Metas de lição + +Nesta lição você aprenderá: + +- O que são Jupyter Notebooks + +- Como instalar, configurar e usar o pacote de software do Jupyter Notebook + +- Quando os cadernos podem ser úteis em pesquisas e contextos pedagógicos + +Para esta lição, vamos trabalhar em um cenário de uso de Jupyter Notebooks para analisar dados e, em seguida, adaptar esse mesmo notebook e dados para uso em sala de aula. A aula também abordará temas mais avançados relacionados aos Jupyter Notebooks, tais como: + +- Usando Jupyter Notebook para linguagens de programação que não sejam Python + +- Convertendo o código Python existente em Jupyter Notebooks + +- Usando Jupyter Notebooks para ampliar a capacidade computacional em ambientes como clusters de computação de alto desempenho + +## Pré-requisitos + +Esta lição é adequada para iniciantes intrépidos, assumindo pouca experiência técnica anterior. + +Na verdade, o Jupyter Notebook é um ótimo recurso para pessoas que estão aprendendo a escrever código. + +Dependendo do notebook que você quer executar, você pode precisar [instalar alguns módulos Python com pip](/pt/licoes/instalacao-modulos-python-pip), que assume alguma familiaridade com a linha de comando (para [windows aqui](/en/lessons/intro-to-powershell), ou [Mac/Linux aqui](/en/lessons/intro-to-bash) (em inglês)). + +A lição é escrita usando o Jupyter Notebook 6.0, mas a interface do usuário e a funcionalidade do software tem sido bastante consistente entre as versões. + +## Computação Letrada + +A relação entre código legível por computador e texto legível por humanos ganhou visibilidade dentro da ciência da computação na década de 1970, quando Donald Knuth propôs o paradigma da "programação letrada" (ou “programação alfabetizada”). 
Em vez de organizar o código de acordo com os requisitos que privilegiam a execução do código pelo computador, a programação letrada trata um programa como literatura compreensível aos seres humanos, priorizando o próprio processo de pensamento do programador. A programação letrada projetada por Knuth assume a forma de prosa escrita, com código acionável por computador incorporado em macros (um formato abreviado para escrever código). Ferramentas de programação letrada são usadas para gerar duas saídas do programa letrado: código "emaranhado" que pode ser executado pelo computador e documentação formatada "tecida".[^1] + +Fernando Pérez, o criador do ambiente de programação iPython que acabou se tornando o Projeto Jupyter, cunhou o termo computação letrada para o modelo usado pelos Jupyter Notebooks: + +> Um ambiente de computação letrado é aquele que permite aos usuários não apenas executar comandos, mas também armazenar os resultados desses comandos em um formato de documento literário, juntamente com figuras e com texto em formato livre que pode incluir expressões matemáticas formatadas. Na prática, ele pode ser visto como uma mistura de um ambiente de linha de comando, como o shell Unix, com um processador de texto, uma vez que os documentos resultantes podem ser lidos como texto, mas contêm blocos de código que foram executados pelo sistema computacional subjacente.[^2] + +Jupyter não é nem o primeiro e nem o único exemplo de cadernos computacionais. Já na década de 1980, interfaces de notebook estavam disponíveis através de softwares como Wolfram Mathematica e MATLAB. Em 2013, Stéfan Sinclair e Geoffrey Rockwell propuseram "cadernos Voyant" baseados no modelo de Mathematica, que exporia algumas das suposições que sustentam as [Ferramentas Voyant](https://perma.cc/9M5K-JWU7) e as tornaram configuráveis pelo usuário.[^3] Eles desenvolveram ainda esse conceito em [A Arte da Análise de Texto Literário Cadernos Spyral](https://perma.cc/53HW-GGSJ). 
+ + +Jupyter ganhou força em muitos campos como um ambiente de código aberto compatível com inúmeras linguagens de programação. O nome Jupyter é uma referência às três linguagens principais suportadas pelo projeto (Julia, Python e R), mas [núcleos estão disponíveis que tornam o Jupyter compatível com dezenas de idiomas](https://perma.cc/B448-XMJQ), incluindo Ruby, PHP, Javascript, SQL e Node.js. Pode não fazer sentido implementar projetos em todas essas línguas usando Jupyter Notebooks (por exemplo, Omeka não permitirá que você instale um plugin escrito como um Jupyter Notebook), mas o ambiente Jupyter ainda pode ser valioso para documentar códigos, ensinar linguagens de programação e fornecer aos alunos um espaço onde eles podem facilmente experimentar com exemplos fornecidos. + + +## Instalando o Jupyter Notebooks + +Desde o final de 2019, existem dois grandes ambientes que você pode usar para executar Jupyter Notebooks: O Jupyter Notebook (não confundir com os próprios ficheiro(s) do Jupyter Notebook, que possuem uma extensão `.ipynb`), e o mais novo Jupyter Lab. O Jupyter Notebook é amplamente usado e bem documentado, e fornece um navegador simples de ficheiro(s), juntamente com o ambiente para criar, editar e executar os notebooks. Jupyter Lab é mais complexo, com um ambiente de usuário mais parecido com um Ambiente de Desenvolvimento Integrado (discutido em tutoriais anteriores do Programming Historian para [Windows](/pt/licoes/instalacao-windows), [Mac](/pt/licoes/instalacao-mac) e [Linux](/pt/licoes/instalacao-linux)). Embora o Jupyter Lab seja feito para, eventualmente, substituir o Jupyter Notebook, não há indicação de que o Jupyter Notebook deixará de ser suportado tão cedo. Devido à sua simplicidade comparativa e facilidade de uso para iniciantes, este tutorial usa o Jupyter Notebook como o software para executar ficheiro(s) de notebook. Ambos os pacotes de software estão incluídos na Anaconda, descrita abaixo. 
É mais fácil usar a Anaconda para instalar o Jupyter Notebook, mas se você já tem Python instalado em seu sistema e não quer lidar com o grande pacote Anaconda, você pode executar `pip3 install jupyter` (para Python 3). + + +## Anaconda + +Anaconda é uma distribuição gratuita de código aberto de Python e R que vem com mais de 1.400 pacotes, o gerenciador de pacotes Conda para instalação de pacotes adicionais, e o navegador Anaconda, que permite gerenciar ambientes (por exemplo, você pode instalar diferentes conjuntos de pacotes para diferentes projetos, para que eles não causem conflitos uns para os outros) usando uma interface gráfica. Após a instalação da Anaconda, você pode usar o navegador Anaconda para instalar novos pacotes (ou `conda install` através da linha de comando), mas muitos pacotes estão disponíveis apenas através de pip (ou seja, usando `pip install` através da linha de comando ou em seu Jupyter Notebook). + +Para a maioria dos propósitos, você deve optar pela versão Python 3 do Anaconda, mas alguns códigos ainda podem ser escritos em Python 2. Nesta lição, você usará Python 3. O instalador Anaconda tem mais de 500 MB, e após a instalação pode levar mais de 3 GB de espaço no disco rígido, por isso certifique-se de que você tem espaço suficiente no computador e uma conexão de rede rápida antes de começar. + +
    +Se o espaço do disco rígido é uma preocupação, você pode empacotar um notebook para que ele possa ser executado usando recursos gratuitos de computação em nuvem, em vez de fazer com que os usuários instalem o Anaconda. Isso pode ser especialmente útil em situações de oficina. Veja a seção abaixo. +
    + +Para baixar e instalar a Anaconda, acesse o [site da Anaconda](https://www.anaconda.com/data-science-platform). Certifique-se de ter clicado no ícone do seu sistema operacional (que deve alterar o texto Anaconda [número da versão] para [sistema operacional selecionado], de forma a indicar o seu sistema operacional) e, em seguida, clique no botão Baixar na caixa para a versão atual do Python 3. Se você estiver no Windows, deve baixar um ficheiro `.exe`; em Mac, é `.pkg`; no Linux, é `.sh`. + +Abra normalmente o ficheiro para instalar o software em seu sistema operacional. Mais detalhes de instalação estão disponíveis nos [documentos da Anaconda](https://docs.anaconda.com/anaconda/install/), incluindo como instalar a Anaconda através da linha de comando em cada sistema operacional. Se o computador não conseguir abrir o ficheiro que você baixou, certifique-se de selecionar o sistema operacional correto antes de baixar o instalador. No Windows, não deixe de escolher a opção de "Adicionar Anaconda à PATH Variable" durante o processo de instalação, ou você não poderá lançar Jupyter Notebook a partir da linha de comando. + +## Usando Jupyter Notebook para pesquisa + +Esta lição descreve como você pode inicialmente escrever um Jupyter Notebook para análise de dados como parte de um projeto de pesquisa e, em seguida, adaptá-lo para uso em sala de aula. Embora este exemplo em particular seja extraído de estudos de fãs, ele se concentra na conversão de datas, que é amplamente necessária na análise de dados históricos e literários. + +## Abrindo o Jupyter Notebook + +Supondo que você já tenha instalado a Anaconda como descrito acima, você pode abrir o Anaconda Navigator como qualquer outro aplicativo de software (você pode fechar o prompt sobre a criação de uma conta na nuvem do Anaconda; você não precisa de uma conta para trabalhar com o Anaconda). Na tela inicial, você deve ver um conjunto de ícones e breves sinopses sobre cada aplicativo incluído no Anaconda. 
+ +Clique no botão "Iniciar" sob o ícone do Jupyter Notebook. + +{% include figure.html filename="tr-pt-introducao-jupyter-notebooks-1.png" alt="Imagem com captura de tela do interface do Anaconda Navigator" caption="Figura 1. Interface do Anaconda Navigator" %} + +Se você preferir usar a linha de comando em vez do navegador Anaconda, uma vez que você tenha o Anaconda instalado, você deve ser capaz de abrir uma nova janela Terminal (Mac) ou Command Prompt (Win) e executar `jupyter notebook` para iniciar o navegador web com o aplicativo Jupyter Notebook. Se você estiver usando a linha de comando para iniciar o Jupyter Notebook, preste atenção no diretório em que você está quando o iniciar. Essa pasta se torna o diretório doméstico que aparecerá imediatamente na interface Jupyter Notebook, conforme descrito abaixo. + +As duas abordagens abrirão uma nova janela ou guia no seu navegador padrão com a interface Jupyter Notebook. O Jupyter Notebook é baseado no navegador: você só interage com ele através do seu navegador, mesmo quando o Jupyter Notebook está sendo executado no seu próprio computador. + +
    Se você estiver usando notebooks que importam pacotes Python que têm dependências de versões específicas de outros pacotes, você deve configurar um ambiente para usar com esses notebooks, para que você não lide com conflitos de versão (por exemplo, se um notebook requer a versão 1.0 de um pacote, e outro requer a versão 2.0). [A documentação do Anaconda Navigator sobre gerenciamento de ambientes](https://perma.cc/E9TC-YMCU) (ou, se preferir usar a linha de comando, a [documentação Conda](https://perma.cc/KHB8-U3CT)) fornece instruções passo a passo para criar, atualizar e ativar um ambiente. Para lançar o Jupyter Notebook dentro de um ambiente específico, você precisa primeiro ativar esse ambiente.
    + +## Navegando na interface do Jupyter Notebook + +A interface do gerenciador de ficheiro do Jupyter Notebook é a principal maneira de abrir um ficheiro Jupyter Notebook (.ipynb). Se você tentar abrir em um editor de texto simples, o notebook será exibido como um ficheiro JSON, não com blocos interativos de código. Para visualizar um notebook através da interface Jupyter, você tem que abrir o Jupyter Notebook primeiro (que será exibido em uma janela do navegador), e abrir o ficheiro de dentro do Jupyter Notebook. Infelizmente, não há como definir o Jupyter Notebook como o aplicativo de software padrão para abrir `ficheiro.ipynb` quando você clica duas vezes neles. + +Quando você lança o Jupyter Notebook do navegador Anaconda, ele exibe automaticamente o diretório doméstico. Este é geralmente o diretório com seu nome de usuário em um Mac (/Users/seu nome de usuário). Em um PC geralmente é `C: \` . Se você abrir o Jupyter Notebook a partir da linha de comando, ele exibirá o conteúdo da pasta em que você estava quando o lançou (usando a linha de comando, você também pode lançar diretamente um notebook específico, por exemplo, `jupyter-notebook-example.ipynb`.) + +Para evitar desordenar esta pasta, você pode fazer uma nova pasta dentro deste diretório para seus notebooks. Você pode fazer isso na sua interface usual de gerenciamento de ficheiro(s)(Finder no Mac, ou File Explorer no Windows), ou dentro do próprio Jupyter Notebook, já que o Jupyter Notebook, assim como o Google Drive, fornece uma interface de gerenciamento de ficheiro(s) dentro de um navegador, bem como uma interface de menu e de barra de ferramentas para a criação de ficheiro(s). Para adicionar uma nova pasta no Jupyter Notebook, clique em Novo no canto superior direito e escolha Pasta. Isso criará uma nova pasta chamada "Pasta Sem Título". Para alterar o nome, clique na caixa de seleção à esquerda da "Pasta Sem Título", em seguida, clique no botão "Renomear" que aparece na guia "ficheiro(s)". 
Dê à pasta o nome notebooks. Clique nela para abrir essa pasta. + +## Upload dos dados do exemplo +O ficheiro CSV de exemplo para esta lição é um extrato de metadados de fan fiction de Harry Potter coletados do site de fanfic italiano https://efpfanfic.net, depois limpos usando uma combinação de [expressões regulares](/en/lessons/understanding-regular-expressions) e [OpenRefine](/pt/licoes/limpar-dados-openrefine). O CSV tem três colunas: a classificação da história (similar a uma classificação de filme), a data que foi originalmente publicada, e a data mais recente de atualização. As opções de classificação são verde (verde), giallo (amarelo), arancione (laranja), e rosso (vermelho). As datas de publicação e de atualização são criadas automaticamente quando a história é postada no site ou atualizada, assim você pode tomá-las como consistentes. + +Baixe o [ficheiro CSV](/assets/jupyter-notebooks/ph-jupyter-notebook-example.csv). + +Dentro do navegador de ficheiro(s) Jupyter Notebook, você deve estar dentro do diretório de notebooks que acabou de criar. No canto superior direito, clique no botão "Carregar" e carregue o ficheiro CSV de amostra. Será mais fácil de acessar se estiver no mesmo diretório do Jupyter Notebook que você criará na próxima etapa a fim de converter as datas. + +{% include figure.html filename="tr-pt-introducao-jupyter-notebooks-2.png" alt="Imagem com captura de tela sobre o upload de ficheiros no interface Jupyter Notebook" caption="Figura 2. Upload de ficheiro(s) na interface Jupyter Notebook" %} + +Observe que esta não é a única maneira de fazer os ficheiro(s) aparecerem no gerenciador de ficheiro(s) do Jupyter Notebook. A pasta de notebooks que você criou é um diretório regular em seu computador, e assim você também pode usar sua interface usual de gerenciamento de ficheiro(s) (por exemplo, Finder no Mac, ou File Explorer no Windows) para colocar ficheiro(s) `.ipynb` e/ou de dados neste diretório.
Os Jupyter Notebooks usam a localização do próprio ficheiro do notebook (o `ficheiro.ipynb`) como o caminho de partida padrão. Para oficinas e cursos, pode fazer sentido criar uma pasta onde você pode armazenar o notebook, qualquer imagem anexada e os dados com os quais você vai trabalhar, todos juntos. Se tudo não estiver na mesma pasta, você terá que incluir o caminho ao referenciá-lo ou usar o código Python dentro do notebook para alterar o diretório de trabalho. + +## Criando um novo notebook + +Dentro da pasta de notebooks, crie um novo Jupyter Notebook para converter as datas para o seu projeto de pesquisa. Clique no botão "new" no canto superior direito da interface do gerenciador de ficheiro(s) do Jupyter Notebook. Se você acabou de instalar o Anaconda como descrito acima, sua única opção será criar um Jupyter Notebook usando o _kernel_ Python 3 (o componente de backend que realmente executa o código escrito no notebook), mas vamos discutir abaixo como adicionar kernels para outras linguagens de programação. Clique em "Python 3", e o Jupyter Notebook abrirá uma nova guia com a interface para os próprios Jupyter Notebooks. Por padrão, o notebook será chamado de "Sem título"; você pode clicar nesse texto na parte superior da tela para renomeá-lo. + +{% include figure.html filename="tr-pt-introducao-jupyter-notebooks-3.png" alt="Imagem com captura de tela da interface do Jupyter Notebook para criar novo ficheiro" caption="Figura 3. Criando um novo Jupyter Notebook" %} + +## Trabalhando em Jupyter Notebooks + +Um notebook é composto de células: caixas que contêm código ou texto legível por humanos. Cada célula tem um tipo, que pode ser selecionado a partir das opções drop-down do menu (“menu deslizante”). A opção padrão é "Code"; as caixas de textos legíveis por humanos devem usar o tipo "Markdown" e precisarão ser escritas usando as convenções de formatação do Markdown. 
Para saber mais sobre Markdown, veja a lição do Programming Historian “[Introdução ao Markdown](/pt/licoes/introducao-ao-markdown)”. + +Quando você cria um novo Jupyter Notebook, a primeira célula será uma célula de código. No topo da interface do Jupyter Notebook está uma barra de ferramentas com funções que se aplicam à célula selecionada atualmente. A primeira função do menu deslizante é, por padrão, "Code". Clique nesse menu e selecione "Markdown" (você também pode usar um atalho de teclado, _esc + m_, para alterar a célula atual para Markdown, e _esc + y_ muda de volta para uma célula de código). Vamos começar este caderno com um título e uma breve explicação do que o caderno está fazendo. No momento, isso é apenas para sua própria memória e referência; você não quer investir muito em prosa e formatação nesta fase do projeto, quando você não sabe se você vai acabar usando este código como parte de seu projeto final, ou se você vai usar uma ferramenta ou método diferente. Mas ainda pode ser útil incluir algumas células de marcação com notas para ajudá-lo a reconstruir seu processo. + +Cole o seguinte na primeira célula. Se a primeira linha não aparecer com uma fonte grande (como um cabeçalho), certifique-se de ter selecionado "Markdown" no menu suspenso na parte superior. + + +``` +# Fanfic date conversion +Converting published & updated dates for Italian fanfic into days of the week. +``` + +{% include figure.html filename="tr-pt-introducao-jupyter-notebooks-4.png" alt="Imagem com captura de tela da interface do Jupyter Notebook para editar Markdown" caption="Figura 4. Editando a célula Markdown em um Jupyter Notebook" %} + +Quando você está editando uma célula, você pode usar _Ctrl + Z_ (Win) ou _Command + Z_ (Mac) para desfazer as alterações que você fez. 
Cada célula mantém seu próprio histórico de edição; mesmo que você passe para uma célula diferente e faça edições lá, você pode posteriormente clicar de volta na primeira célula e desfazer suas alterações anteriores lá, sem perder as alterações realizadas para a segunda célula. + +Para deixar o modo de edição e "executar" esta célula (para uma célula Markdown, isso não faz nada, apenas move o cursor mais para baixo no notebook), você pode clicar na barra de ferramentas ou pressione Ctrl+Enter (Ctrl+Return no Mac). Se você quiser retomar a edição mais tarde, você pode clicar duas vezes nela ou selecionar a célula (que mostrará uma linha azul vertical à esquerda uma vez selecionada) clicando-a uma vez e, em seguida, pressionando a tecla Enter (Win) ou Return (Mac). Para deixar o modo de edição, você pode clicar na barra de ferramentas ou pressionar Ctrl+Enter (Ctrl+Return no Mac). Se você quiser executar sua célula atual e adicionar uma nova célula (por padrão, uma célula de código) imediatamente abaixo dela, você pode pressionar Alt+Enter (Option+Enter no Mac). + +Em seguida, você precisa descobrir como fazer a conversão. A busca por termos relevantes pode levá-lo a essa [discussão do StackOverflow](https://perma.cc/JG6H-KZAZ), e a primeira resposta envolve o uso do módulo Python datetime. Como primeiro passo, você precisa importar datetime, usando uma célula de código. Você também sabe que o seu ficheiro de entrada é um CSV, então você deve importar o módulo csv também. + +Para adicionar uma nova célula, clique no botão + (mais) na barra de ferramentas (ou use o atalho do teclado _esc + b_). Isso criará uma nova célula de código abaixo da célula que está atualmente selecionada. 
Crie uma nova célula de código e cole o código a seguir para importar um módulo Python: + + +``` +import datetime +import csv + +``` + +Pensando desde já na possibilidade de compartilhar este notebook ou parte dele, pode ser útil dividir as importações de módulos em células individuais, e colocar o código em si em outra célula, para que você possa incluir uma célula Markdown que explique o que cada uma delas está fazendo. + +Ambos os pacotes que você está importando para este notebook já estão instalados como parte do Anaconda, mas existem muitos pacotes de nicho relevantes para a pesquisa (por exemplo, o [Classic Languages Toolkit, CLTK](https://perma.cc/Q9Q8-9TNZ), para fazer análise de texto em línguas históricas) que não estão incluídos com a Anaconda, e não estão disponíveis através do _instalador conda_. Se você precisa de um pacote como esse, você tem que instalá-lo usando _pip_. Instalar pacotes de dentro do Jupyter notebook pode ser um pouco complicado, porque pode haver diferenças entre o kernel Jupyter que o notebook está usando, e outras versões do Python que você pode ter instalado no seu computador. Você pode encontrar uma longa discussão técnica sobre esses problemas neste [post de blog](https://perma.cc/N6M6-ZF5G). + +Se você está trabalhando em um notebook que deseja compartilhar, e ele inclui pacotes menos comuns, você pode incluir uma célula Markdown instruindo os usuários a instalar os pacotes com antecedência usando conda ou pip, ou você pode usar: + +``` +import sys +!conda install --yes --prefix {sys.prefix} YourModuleNameHere + +``` + +para instalar algo do notebook usando conda; a sintaxe `!` indica que o código está executando algo da linha de comando, em vez do kernel Jupyter. 
Ou, se o pacote não estiver disponível na conda (muitos pacotes de nicho relevantes para a pesquisa não estão), você pode usar `pip`: + +``` +import sys + +!{sys.executable} -m pip install YourModuleNameHere + +``` + +Se você não tinha instalado o Python no computador antes de instalar o Anaconda para esta lição, talvez seja necessário adicionar o pacote pip para poder usá-lo para instalar outros pacotes. Você pode adicioná-lo através da GUI (interface gráfica do usuário) do navegador Anaconda, ou executar `conda install pip` a partir da linha de comando. + +Voltando ao nosso exemplo, em seguida adicione uma nova célula de código e cole o seguinte código (certifique-se de que incluiu os espaçamentos): + +``` +with open('ph-jupyter-notebook-example.csv') as f: + csv_reader = csv.reader(f, delimiter=',') + for row in csv_reader: + datetime.datetime.strptime(row[1], '%d/%m/%Y').strftime('%A') + print(row) +``` + +Clicar no botão 'play' na barra de ferramentas quando você tem uma célula de código selecionada executa o código dentro da célula (se você tentar executar este código depois de executar as declarações de importação, verá um erro: "ValueError: time data ‘1/7/18’ does not match format ‘%d/%m/%Y’". Não se preocupe, vamos depurar isso a seguir). + +Depois de executar uma célula de código, um número aparecerá entre colchetes à esquerda da célula. Este número indica a ordem em que a célula foi executada. Se você voltar e executar a célula novamente, o número é atualizado. + +Se um número não aparecer imediatamente ao lado da célula, você verá um asterisco entre os colchetes. Isso significa que a célula de código não terminou de funcionar. Isso é comum para códigos de computação intensiva (por exemplo, processamento de linguagem natural) ou tarefas de longa duração, como extração de conteúdo na web. Sempre que uma célula de código está sendo executada, o favicon na guia do navegador do notebook muda para uma ampulheta.
Se você quiser alterar as guias e fazer outra coisa enquanto o código estiver em execução, você pode saber que a ação anterior foi concluída quando a ampulheta muda de volta para o ícone do notebook. + + +{% include figure.html filename="tr-pt-introducao-jupyter-notebooks-5.png" alt="Imagem com captura de tela sobre a execução de código no Jupyter Notebook" caption="Figura 5. Executando uma célula de código em um Jupyter Notebook" %} + +``` +O Jupyter notebook funciona melhor se você executar as células sequencialmente. Às vezes, você pode obter erros ou saídas incorretas se executar as células fora de ordem ou tentar editar e executar iterativamente diferentes partes do notebook. Se você fez muitas alterações e executou blocos de código de forma não linear e descobrir que você está recebendo uma saída estranha, você pode redefinir o Jupyter Notebook clicando no _Kernel_ no menu e escolhendo _Restart & Clear Output_. Mesmo que você não tenha notado nada de estranho, é uma boa ideia utilizar o Restart & Clear Output em seu código, uma vez que você tenha terminado de escrevê-lo, para ter certeza de que o resultado está correto. +``` + +Depois de executar a segunda célula de código, você verá um erro. Para descobrir o que está acontecendo, você pode consultar a +[documentação para datação](https://perma.cc/S92Z-3QVM) que explica cada uma das diferentes opções de formatação. Lá, você verá que a única opção de valores para “dia” assume o uso de dois dígitos (ou seja, dias de um dígito são prefixados com um 0). Olhando para os dados do exemplo, os meses (listados em segundo lugar nesta ordem de data) já são acrescidos de zero, quando tem apenas um dígito, mas não os dias. Você tem duas opções: você pode tentar alterar os dados, ou você pode tentar alterar seu código. + +Digamos que você queira tentar uma abordagem diferente, mas quer deixar o que você fez até agora, no caso de você querer revisitar esse código, e talvez usá-lo depois de alterar os dados. 
Para lembrar do que aconteceu, adicione uma célula Markdown acima da sua segunda célula do código. Clique na primeira célula do código e clique no botão mais na barra de ferramentas. Se você clicar no botão de adição na barra de ferramentas depois de executar a última célula de código, a nova célula aparecerá na parte inferior do notebook. Você pode movê-la para onde quiser clicando no botão de seta para cima. Certifique-se de que está no modo Markdown e cole o seguinte texto: + +``` + ### Não funciona, precisa de datas precedidas por zero + [documentação do datetime](https://docs.python.org/2/library/datetime.html?highlight=strftime#strftime-and-strptime-behavior). + Modificar o ficheiro de origem? + +``` + +Lendo ainda mais na [discussão do StackOverflow](https://perma.cc/EN55-P57H), há outra abordagem que usa uma biblioteca diferente, dateutil, que parece ser mais flexível com os tipos de datas que ela aceita. Volte para a célula usada para importar módulos e edite-a para adicionar a nova biblioteca (em qualquer lugar dessa célula, desde que cada declaração de importação esteja em sua própria linha): + +``` +import dateutil + +``` + +Re-execute essa célula de código; note que o número ao lado da célula muda na segunda vez que você executá-lo. + +Agora crie uma nova célula Markdown na parte inferior do notebook e cole: + + +``` +#### tentando dateutil para analisar datas, conforme https://stackoverflow.com/a/16115575 + +``` + +Abaixo dele, adicione uma nova célula de código com o seguinte código (prestando atenção ao espaçamento, de modo que o código seja indentado assim como você vê abaixo): + +``` +with open('ph-jupyter-notebook-example.csv') as f: + csv_reader = csv.reader(f, delimiter=',') + for row in csv_reader: + parseddate = dateutil.parser.parse(row[1]) + print(parseddate) +``` + +Execute a célula com o código que você acabou de adicionar. 
Pode levar mais tempo; continue esperando até que o asterisco ao lado da célula de código se transforme em um número. O resultado deve mostrar a lista de datas de publicação, formatadas de forma diferente, com hífen em vez de barras, e com a adição das horas, minutos e segundos (como zeros, porque as datas registradas não incluem esses dados). À primeira vista, parece que funcionou, mas se você compará-lo mais de perto com o ficheiro de origem, você verá que o módulo dateutil não está sendo consistente em como analisa as datas. Datas em que o valor do dia é maior que 12 estão sendo analisadas corretamente (ele sabe que um valor maior que 12 não pode ser um mês), mas quando o valor da data é 12 ou menos, a data está sendo identificada com o mês primeiro. A primeira linha do ficheiro de origem tem a data 1/7/18, que é entendida como "2018-01-07 00:00:00". Na documentação para dateutil, você descobrirá que você pode [especificar `dayfirst=True`](https://perma.cc/W54E-SP5Z) para corrigir isso. Edite a última célula de código e altere a penúltima linha para ler: + +``` +parseddate = dateutil.parser.parse(row[1], dayfirst=True) + + ``` + +Quando você executar a linha novamente, você verá que todas as datas foram analisadas corretamente. + +Analisar a data é apenas o primeiro passo – você ainda precisa usar o módulo datetime para converter as datas em dias da semana. + +Exclua a última linha do bloco de código e substitua-a pelo seguinte (certificando-se de que você tenha o mesmo nível de recuo da última linha anterior, para ambas as linhas): + + +``` +dayofweek = datetime.date.strftime(parseddate, '%A') + +print(dayofweek) + +``` + +Execute o bloco de códigos novamente. Isso deve lhe dar uma lista de dias da semana. + +Agora que você tem código para analisar e re-formatar uma data, você precisa fazê-lo para ambas as datas em cada linha do seu ficheiro de origem.
Porque você sabe que tem código funcional na célula de código atual, se você não se sentir muito confortável com Python, você pode querer copiar a célula de código atual antes de fazer modificações. Selecione a célula que deseja copiar e clique no botão copiar na barra de ferramentas; o botão de colar irá colar a célula abaixo de qualquer célula atualmente selecionada. Fazer uma cópia permite que você faça livremente alterações no código, sabendo que você sempre pode voltar facilmente para uma versão que funciona. + +Se você não quiser resolver isso por conta própria, você pode copiar e colar esse código em uma nova célula de código ou substituir a célula de código atual: + +``` +#identifica o ficheiro fonte a ser aberto, chama-o f +with open('ph-jupyter-notebook-example.csv') as f: + #cria um ficheiro de saída (referido como "out" no notebook) para ser gravado + with open('ph-jupyter-notebook-example-dayofweek.csv', 'w') as out: + #define "csv_reader" como executando a função csv.reader no ficheiro + csv_reader = csv.reader(f, delimiter=',') + #define "csv_writer" como executando a função csv.writer para "out" (o ficheiro de saída) + csv_writer = csv.writer(out) + #para cada linha que está sendo lida pelo csv_reader... 
+ for row in csv_reader: + #cria uma lista chamada "values" com o conteúdo da linha + values = list(row) + #define "rating" como a primeira coisa na lista + #contagem em Python começa com 0, não 1 + rating = values[0] + #define "parseddatepub" como a segunda coisa (1, porque começamos com 0) na lista, + #convertido em um formato de data padrão usando dateutil.parser + #e quando essas datas são analisadas, o analisador deve saber + #que o primeiro valor na sequência é o dia + parseddatepub = dateutil.parser.parse(values[1], dayfirst=True) + #mesmo que acima para a data atualizada, a terceira coisa (2) na lista + parseddateupdate = dateutil.parser.parse(values[2], dayfirst=True) + #define "dayofweekpub" como parseddatepub (definido acima), convertido para o dia da semana + #%A é usado para mudar para o dia da semana + #Pode ver outros formatos aqui: https://docs.python.org/3/library/datetime.html#strftime-and-strptime-behavior + dayofweekpub = datetime.date.strftime(parseddatepub, '%A') + #mesma coisa para data de atualização + dayofweekupdate = datetime.date.strftime(parseddateupdate, '%A') + #cria uma lista da classificação e as novas datas formatadas + updatedvalues = [rating, dayofweekpub, dayofweekupdate] + #escreve todos os valores nesta célula de código + csv_writer.writerow(updatedvalues) + print(updatedvalues) +``` + +Depois de executar este código, você terá um novo ficheiro ph-jupyter-notebook-example-dayofweek.csv, com seus dados no formato que você precisa para a análise. + +Agora que você tem um código que funciona para converter as datas do formato que você tem para o formato que você precisa, você pode limpar as falsas partidas e notas para si mesmo. Você vai querer manter o primeiro código com as declarações de importação, e a primeira célula Markdown com o título e a descrição, mas você deve excluir outras células de código e Markdown que não são o seu código final.
Para excluir uma célula, clique nela e clique no botão tesoura na barra de ferramentas. Se você excluir uma célula por engano, você pode clicar em Editar no menu e escolher "Desfazer excluir células". + +## Salvando, exportando e publicando Jupyter Notebooks + +O Jupyter salva automaticamente seu trabalho de forma periódica, criando "pontos de verificação". Se algo der errado com seu notebook, você pode reverter para um ponto de verificação anterior indo em "File", em seguida, "Revert to Checkpoint", e escolhendo um horário. Dito isto, ainda é importante salvar seu notebook (usando o botão de salvar), porque se você fechar e desligar o kernel do notebook (incluindo reiniciar o kernel), os pontos de verificação serão perdidos. + +Você também pode baixar o notebook (_File> Download as_) em vários formatos de ficheiro diferentes. Baixar o formato Notebook (`.ipynb`) é útil se você quiser compartilhar seu código em seu formato completo de notebook. Você também pode baixá-lo como código em qualquer linguagem em que seu notebook estiver (por exemplo, `.r` se em R ou `.py` se Python ou `.js` se JavaScript), como um ficheiro de `.html`, como um ficheiro de marcação (`.md`) ou como um PDF via LaTeX. Se você baixá-lo como código, as células Markdown se tornam comentários (se você quiser converter um ficheiro, `ficheiro.ipynb` para outro formato depois de baixá-lo, você pode usar a ferramenta [nbconvert](https://perma.cc/6J73-KCK5)). + +Se você está trabalhando em um projeto de pesquisa, você pode usar um Jupyter notebook, ou uma série de notebooks, ao longo do caminho para acompanhar seu fluxo de trabalho. Alguns estudiosos postam esses cadernos no GitHub, juntamente com slides ou PDFs de pôsteres e dados de origem (ou metadados, se os direitos autorais permitirem), para acompanhar apresentações e palestras. O GitHub renderiza versões não interativas de ficheiro(s) de notebook, para que possam ser visualizados dentro de um repositório. 
Alternativamente, você pode colar a URL de um repositório do GitHub que tem notebooks Jupyter em [nbviewer,](https://nbviewer.jupyter.org/) o que às vezes pode ser uma visualização mais rápida e confiável. Você pode querer incluir uma célula Markdown com uma citação recomendada para o seu Jupyter notebook, e uma referência para o repositório do GitHub onde ela está armazenada, especialmente se o seu notebook inclui código que outros possam reutilizar para análises semelhantes. + +O código que você acabou de desenvolver como parte desta lição pertence a algum lugar no meio de um projeto real. Se você estiver usando notebooks para documentar seu fluxo de trabalho, você pode optar por adicionar a nova célula de código a um notebook existente, em vez de baixá-lo como um notebook separado e autônomo. Os Jupyter notebooks podem ser particularmente úteis para documentar fluxos de trabalho de projetos quando você está trabalhando com colaboradores que só podem estar envolvidos por um curto período de tempo (como estagiários de graduação no período de férias escolares). Com colaboradores de curto prazo, é importante ajudá-los a entender e começar a usar os fluxos de trabalho do projeto sem muito tempo de iniciação, e os Jupyter notebooks podem definir esses fluxos de trabalho passo a passo, explicar onde e como os ficheiro(s) são armazenados e fornecer dicas para tutoriais externos e materiais de treinamento para ajudar os colaboradores que estão menos familiarizados com os fundamentos técnicos do projeto a serem iniciados. Por exemplo, dois projetos que usaram Jupyter notebooks para publicar fluxos de trabalho são o Projeto Realismo Socialista de Sarah McEleney e a [“mineração de texto da literatura infantil inglesa 1789-1914 para a representação de insetos e outros rastejantes assustadores”](https://perma.cc/JD8N-P79G). 
+ +À medida que seu projeto progride, se você estiver publicando através de canais de acesso aberto e se seus conjuntos de dados podem ser compartilhados livremente, os Jupyter notebooks podem fornecer um formato ideal para tornar o código que sustenta seu argumento acadêmico visível, testável e reutilizável. Embora os periódicos e publicações possam não aceitar os Jupyter notebooks como um formato de submissão, você pode desenvolver uma "versão" do seu artigo que inclui o texto completo (como células Markdown), com células de código integradas ao fluxo da narrativa acadêmica como uma ilustração imediatamente acessada da análise que você está descrevendo. Você também pode incluir as células de código que compõem os fluxos de trabalho de preparação de dados como um apêndice, seja no mesmo notebook, ou em um separado. Integrar o código com o texto de um artigo acadêmico torna muito mais provável que os leitores realmente se envolvam com o código, já que eles podem simplesmente executá-lo dentro do mesmo caderno onde estão lendo o argumento. Alguns estudiosos, particularmente na Europa, também postam seus cadernos no [Zenodo](https://zenodo.org/), um repositório para dados de pesquisa, independentemente do país de origem, financiador ou disciplina. O Zenodo suporta conjuntos de dados de até 50 GB (vs. o limite de tamanho de ficheiro de 100 MB no GitHub), e fornece DOIs para o material carregado, incluindo notebooks. Alguns estudiosos combinam arquivamento no Zenodo para sustentabilidade com a publicação no GitHub para a possibilidade de encontrar, incluindo o Zenodo DOI como parte do ficheiro readme.md no repositório do GitHub que inclui os notebooks. Como exemplo, o caderno de workshop ["Análise de Dados Aplicados" por Giovanni Colavizza e Matteo Romanello para o DHOxSS 2019](https://perma.cc/6S7H-LQEA) é publicado no GitHub, mas inclui [um Zenodo DOI](https://doi.org/10.5281/zenodo.3352830). 
+ +Embora a argumentação e o código totalmente integrados ainda sejam difíceis de encontrar devido à falta de um local para publicar esse tipo de trabalho, os estudiosos começaram a usar os Jupyter notebooks como um passo incremental mais interativo para publicações computacionais dinâmicas. José Calvo tem um exemplo de um [caderno acompanhando um artigo sobre estilizometria](https://perma.cc/Y9CK-CFK8) (em espanhol), e Jed Dobson publicou um [conjunto de cadernos](https://perma.cc/UDA3-467P) para acompanhar seu livro Critical Digital Humanities: The Search for a Methodology, que aborda diretamente os Jupyter Notebooks como objetos acadêmicos (p.39-41). + +## Usando Jupyter Notebook para ensinar + +O Jupyter Notebook é uma ótima ferramenta para ensinar programação, ou para ensinar conceitos como modelagem de tópicos ou vetores de palavras que envolvem programação. A capacidade de fornecer instruções e explicações como Markdown permite que os educadores forneçam notas detalhadas sobre o código através de marcação alternada e células de código, de modo que o texto de Markdown explique o código na célula logo abaixo. Isso é útil para oficinas práticas, pois as instruções e o código podem ser escritos com antecedência. Isso permite que os participantes abram o notebook, baixem um conjunto de dados e executem o código conforme está. Se você espera ministrar uma oficina onde os alunos terão diferentes níveis de familiaridade com a programação, você pode configurar o notebook para ter tarefas suplementares para os alunos que se sentem confortáveis em modificar o código. Ao mesmo tempo, mesmo os alunos que hesitam em tocar no código ainda poderão alcançar o resultado principal da oficina apenas executando células de código pré-escritas. + +Como outra abordagem, você também pode usar Jupyter notebooks para escrever código na medida em que o desenvolve. Em tal oficina, os alunos podem começar com um caderno em branco, e escrever o código junto com você. 
As células ajudam a segmentar o código como você o escreve, em vez de usar um editor de texto ou IDE (Ambiente de Desenvolvimento Integrado) que não quebra o código de forma tão clara e pode causar confusão, especialmente quando ensina iniciantes. + +Você pode usar Jupyter notebooks para tarefas em sala de aula dando instruções em Markdown e fazendo com que os alunos escrevam código em uma célula em branco com base nas instruções. Dessa forma, você pode criar uma tarefa de programação interativa que ensina aos alunos não apenas a sintaxe e o vocabulário de uma linguagem de programação, mas também pode explicar as melhores práticas de programação em geral. + +Se você já está usando Jupyter notebooks para documentar o fluxo de trabalho do seu projeto, você pode ser capaz de reformular esses cadernos de pesquisa para uso em sala de aula, como uma maneira de trazer sua pesquisa para a sala de aula. Este [exemplo de caderno pedagógico](/assets/jupyter-notebooks/ph-jupyter-notebook-example.ipynb) é um híbrido de algumas das abordagens pedagógicas descritas acima. A primeira seção do caderno destina-se a estudantes que têm pouca ou nenhuma experiência anterior executando o código; o principal resultado do aprendizado é comparar o tempo necessário para converter manualmente formatos de dados, em comparação com fazê-lo com código. Você poderia usar este caderno para uma sessão de laboratório prática em uma introdução à humanidades digitais ou história digital, onde todos os alunos instalam Anaconda e aprendem o básico do Jupyter Notebook. Se a turma tem uma mistura de alunos sem formação técnica e alunos com exposição prévia ao Python, você pode orientar os alunos com experiência de programação a trabalhar em conjunto em grupos de dois ou três para propor soluções para os prompts na segunda parte do notebook. 
Tenha em mente que se você usar uma tarefa de classe como esta como uma forma de fazer com que os alunos de ciência da computação escrevam código que ajude seu projeto de pesquisa, eles devem ser creditados como colaboradores e reconhecidos em publicações subsequentes vindas do projeto.[^4] + +Existem muitos cursos e workshops de 'Introdução ao Python' nas Humanidades Digitais que utilizam Jupyter Notebook (incluindo [Introdução ao Python e Desenvolvimento web com Python para as Humanidades](https://perma.cc/ANL2-K7SM) por Thibault Clérice, traduzido a partir de material de Matthew Munson). O Jupyter Notebook também é comumente usado em oficinas de análise de texto, como a [oficina de vetores de palavras na DH 2018](https://perma.cc/5UZ9-25XW), ministrada por Eun Seo Jo, Javier de la Rosa e Scott Bailey. + +Ensinar com Jupyter Notebook nem sempre tem que envolver o processo demorado de baixar e instalar a Anaconda, especialmente se você está imaginando ter apenas uma ou duas lições que envolvem notebooks. Se suas atividades em sala de aula com Jupyter notebooks envolvem o uso de dados de exemplo que você já preparou, e se você já escreveu pelo menos parte do código, você pode querer explorar a execução de Jupyter Notebooks usando recursos gratuitos de computação em nuvem, desde que seus alunos tenham a garantia de ter conectividade confiável com a internet em sala de aula. Rodar notebooks na nuvem também fornece um ambiente consistente para todos os alunos, poupando você de ter que negociar diferenças entre Windows e Mac, ou fornecer uma alternativa para estudantes cujos laptops não têm espaço ou memória para executar Anaconda efetivamente. + +Como as opções estão evoluindo rapidamente, é melhor usar seu mecanismo de busca favorito para encontrar uma lista mais atualizada com opções de computação em nuvem para Jupyter Notebook. Um projeto que tem visto uma absorção particular entre usuários acadêmicos de notebooks é o [MyBinder](https://mybinder.org/).
Ele pega um repositório do GitHub que contém ficheiro(s) `.ipynb` do Jupyter Notebook, dados relacionados (imagens incorporadas, conjuntos de dados que você deseja usar com os notebooks, etc.) e as informações sobre pacotes e dependências necessários (em um ficheiro `requirements.txt` ou `environment.yml`) e torna-o inicializável usando um servidor de nuvem. Uma vez que o MyBinder tenha empacotado o seu repo GitHub, você pode adicionar um "crachá" Binder ao ficheiro readme do repo. Quem estiver vendo o repo pode lançar o notebook diretamente do seu navegador, sem ter que baixar ou instalar nada. + +Como os dados que o notebook precisa acessar devem ser incluídos no repo, isso não funcionará para todas as situações (por exemplo, se os dados não podem ser redistribuídos legalmente no GitHub, excede o tamanho máximo de ficheiro(s) do GitHub e não podem ser baixados de outros lugares como parte da configuração do ambiente Binder, ou se você quiser que as pessoas usem o notebook com seus próprios dados), mas é uma ótima opção para oficinas ou aulas onde todos estão trabalhando com os mesmos dados compartilháveis. + +Se você quiser começar a explorar opções de nuvem, Shawn Graham criou [alguns modelos para configurar notebooks Python e R Jupyter para uso no Binder](https://perma.cc/T25E-BFH4). + +Finalmente, se você precisa manter seus notebooks fora da nuvem (por exemplo, devido a dados sensíveis ou de outra forma restritos), mas quiser fornecer um ambiente consistente para todos os seus alunos, você pode explorar o [JupyterHub,](https://perma.cc/8EH7-N22K) que tem sido adotado como infraestrutura técnica central para um número crescente de programas de ciência de dados. + +## Convertendo códigos Python + +Mesmo que você goste da ideia de usar Jupyter Notebooks, qualquer conversão de formato requer trabalho adicional. Se você já tem seu código escrito como scripts Python, a conversão para Jupyter Notebooks é bastante simples.
Você pode copiar e colar o código do seu ficheiro.py em uma única célula de código de um novo notebook e, em seguida, dividir a célula de código em segmentos e adicionar células de Markdown adicionais conforme necessário. + +Alternativamente, pode ser mais fácil segmentar à medida que você transfere o código, copiando um segmento de cada vez em uma nova célula de código. Qualquer método funciona e é uma questão de preferência pessoal. + +Há também ferramentas como o [pacote 'p2j'](https://perma.cc/5YUE-YBH7) que convertem automaticamente o código Python existente em notebooks Jupyter, seguindo um conjunto documentado de convenções (por exemplo, transformando comentários em células Markdown). + +## Cadernos Jupyter para outras linguagens de programação + +Os Jupyter Notebooks permitem que você use muitas linguagens de programação diferentes, incluindo R, Julia, JavaScript, PHP ou Ruby. Uma lista atual de linguagens disponíveis pode ser encontrada na página do [Jupyter Kernels](https://perma.cc/B448-XMJQ) GitHub. + +Enquanto o Python é suportado por padrão quando você instala o Jupyter Notebook através da Anaconda, as outras linguagens de programação precisam ter seus núcleos de linguagens instalados antes que eles possam ser executados no Jupyter Notebook. As instruções de instalação são diferentes para cada núcleo de linguagem, por isso é melhor apenas encontrar e seguir as instruções para a sua linguagem preferida. Pelo menos para R, isso é relativamente simples. A página Jupyter Kernels GitHub tem links para instruções para todos os kernels de linguagens disponíveis. + +Uma vez que você tenha o kernel para a linguagem desejada instalado, você pode executar cadernos escritos nessa linguagem de programação, ou você pode criar seus próprios cadernos que executam essa linguagem. Cada linguagem com um kernel instalado em seu computador estará disponível como uma opção quando você criar um novo notebook como descrito acima. 
+ +Como exemplo de um notebook R, [veja esta adaptação jupyter do código R de Andrew Piper de "Enumerações"](https://perma.cc/656B-U9SB). + +## Dimensionando a computação com Jupyter Notebooks + +Especialmente se você é novo em programar em Python, apenas conseguir fazer qualquer coisa funcionar pode parecer uma vitória. No entanto, se você começar a trabalhar com conjuntos de dados maiores, poderá descobrir que algumas das “soluções” iniciais encontradas (como usar `readlines()` para ler um ficheiro de texto linha por linha) se tornam computacionalmente ineficientes, a ponto de causar problemas. Uma maneira de começar a entender as ineficiências em seu código é adicionar `%%timeit` ao topo de uma célula. O notebook escolherá um número de iterações para executar o código, dependendo da complexidade da tarefa, imprimirá o número de iterações e o tempo médio. Fazer várias iterações, em vez de apenas uma, pode ser útil para contabilizar pequenos atrasos no âmbito do sistema (por exemplo, se seu laptop estiver momentaneamente atolado com outros processos). Se quiser medir várias iterações de uma única linha, em vez da célula inteira, você pode colocar `%timeit` na frente da linha. Tenha cuidado com a forma como o utiliza: ordenar uma lista levará muito mais tempo na primeira iteração do que na segunda, depois que a lista já estiver em ordem. Em casos como a classificação de listas em que não faz sentido medir várias iterações ou para tarefas de longa duração onde pequenos atrasos no sistema não terão um impacto significativo, você pode usar `%%time` no topo de uma célula ou `%time` na frente de uma linha, que mede o tempo que uma única execução leva. Esses comandos fazem parte de uma família de “comandos mágicos” integrados disponíveis em Jupyter Notebooks. Veja a [documentação do Jupyter](https://perma.cc/ED9F-DNDA) para saber de mais detalhes. 
+ +Ter alguma ideia de quanto tempo o seu código leva para ser executado é um pré-requisito importante para passar a usar recursos computacionais maiores, como os clusters de computação de alto desempenho (HPC) financiados de forma centralizada, disponíveis em muitas instituições. A maioria esmagadora dos pesquisadores que usam esses recursos está nas ciências duras, mas geralmente qualquer membro do corpo docente pode solicitar acesso. É possível que você também possa ter acesso a recursos de HPC regionais ou nacionais. Esses recursos de computação podem acelerar significativamente grandes trabalhos de computação, especialmente tarefas como modelagem 3D que podem tirar proveito de nós computacionais com poderosas unidades de processamento gráfico (GPUs). Aprender a usar clusters HPC é um tópico suficientemente grande para sua própria lição, mas os notebooks Jupyter podem permitir que você pegue um atalho. Alguns grupos de computação de pesquisa oferecem maneiras mais fáceis para os pesquisadores executarem Jupyter Notebooks usando recursos de cluster HPC, e você pode encontrar [vários guias e exemplos de uso geral](https://perma.cc/A5R4-9ZD7) para fazê-lo. Se você conseguir acesso aos recursos de HPC, vale a pena contatar a equipe de TI de computação para pesquisa e perguntar como você pode executar Jupyter Notebooks no cluster, caso não encontre documentação a respeito no site da sua instituição. A equipe de TI que trabalha majoritariamente com cientistas pode se comunicar de forma mais brusca do que você está acostumado, mas não deixe que isso o desencoraje: a maioria dos grupos de computação para pesquisa tem interesse em que humanistas usem seus recursos e quer ajudar, porque a diversidade disciplinar da sua base de usuários é importante para suas métricas no âmbito da universidade. + +## Conclusão +Desde a experimentação do código até a documentação de fluxos de trabalho, da pedagogia à publicação acadêmica, o Jupyter Notebook é uma ferramenta flexível e multiuso que pode apoiar a pesquisa digital em diversos contextos.
Mesmo que você não tenha certeza de como exatamente você vai usá-los, é bastante fácil instalar o software Jupyter Notebook e baixar e explorar notebooks existentes, ou experimentar alguns dos seus próprios. Os Jupyter Notebooks têm uma grande promessa de fazer a ponte das facetas críticas e computacionais da pesquisa de humanidades digitais. Para concluir com uma citação de _Critical Digital Humanities: The Search for a Methodology_, de Jed Dobson: + +>Notebooks são teoria - não apenas código como teoria, mas teoria como engajamento reflexivo com o trabalho teórico e implicações do próprio código. As normas disciplinares, incluindo enquadramento contextual, teoria e autocrítica, precisam acompanhar, complementar e informar qualquer crítica computacional. Revelar o máximo possível do código, dos dados e dos métodos é essencial para permitir a conversa disciplinar em curso. Compilando-os juntos em um único objeto, que pode ser exportado, compartilhado, examinado e executado por outros, produz um tipo dinâmico de teorização que é modular, mas firmemente ligado ao seu objeto.[^5] + +## Links +- Uma lista crescente de notebooks [Jupyter para DH](https://perma.cc/V5JX-VPP8), em múltiplas linguagens humanas e de programação. Obrigado a todos que enviaram sugestões no Twitter; referências adicionais são bem-vindas. + - Uma descrição técnica detalhada da [instalação de pacotes Python do Jupyter](https://perma.cc/N6M6-ZF5G). + +## Agradecimentos +- Obrigado a Stéfan Sinclair pelas referências a discussões anteriores sobre o uso de notebooks em humanidades digitais. + +- Obrigado a Rachel Midura por sugerir o uso de Jupyter Notebooks para colaboração. + +[^1]: Knuth, Donald. 1992. Literate Programming. Stanford, Califórnia: Centro para o Estudo da Linguagem e da Informação. + +[^2]: Millman, KJ e Fernando Perez. 2014. “Developing open source scientific practice”. In Implementing Reproducible Research, Ed. Victoria Stodden, Friedrich Leisch, and Roger D. Peng.
[https://osf.io/h9gsd/](https://perma.cc/M8R7-9JTL) + +[^3]: Sinclair, Stéfan & Geoffrey Rockwell. 2013. “Voyant Notebooks: Literate Programming and Programming Literacy”. Journal of Digital Humanities, Vol. 2, No. 3 Summer 2013. [https://journalofdigitalhumanities.org/2-3/voyant-notebooks-literate-programming-and-programming-literacy/](https://perma.cc/R253-BP2B) + +[^4]: Haley Di Pressi, Stephanie Gorman, Miriam Posner, Raphael Sasayama, and Tori Schmitt, with contributions from Roderic Crooks, Megan Driscoll, Amy Earhart, Spencer Keralis, Tiffany Naiman, and Todd Presner. “A Student Collaborator’s Bill of Rights”. [https://humtech.ucla.edu/news/a-student-collaborators-bill-of-rights/](https://perma.cc/A8G2-BBL9) + +[^5]: Dobson, James. 2019. Critical Digital Humanities: The Search for a Methodology. Urbana-Champaign: University of Illinois Press. p. 40. diff --git a/pt/licoes/introducao-linha-comando-bash.md b/pt/licoes/introducao-linha-comando-bash.md index c5e80e0d16..1cf314c8dd 100644 --- a/pt/licoes/introducao-linha-comando-bash.md +++ b/pt/licoes/introducao-linha-comando-bash.md @@ -38,7 +38,7 @@ Muitas das lições do *Programming Historian* exigem que você insira comandos {% include figure.html filename="en-or-intro-to-bash-01.png" alt="Screenshot mostrando interface gráfica de um computador" caption="Figura 1. GUI do computador de Ian Milligan" %} -Interfaces de linha de comando possuem vantagens para usuários de computador que precisam de maior precisão em seu trabalho – tal como historiadores(as) digitais. Elas permitem maior detalhamento quando executando alguns programas, ao passo que você pode adicionar modificações para especificar exatamente como deseja que o programa seja executado. Além do mais, elas podem ser facilmente automatizadas através de [scripts](http://www.tldp.org/LDP/Bash-Beginners-Guide/html/chap_01.html), que são basicamente conjuntos de comandos baseados em texto. 
+Interfaces de linha de comando possuem vantagens para usuários de computador que precisam de maior precisão em seu trabalho – tal como historiadores(as) digitais. Elas permitem maior detalhamento quando executando alguns programas, ao passo que você pode adicionar modificações para especificar exatamente como deseja que o programa seja executado. Além do mais, elas podem ser facilmente automatizadas através de [scripts](https://www.tldp.org/LDP/Bash-Beginners-Guide/html/chap_01.html), que são basicamente conjuntos de comandos baseados em texto. Existem duas interfaces de linha de comando principais, ou "shells", que muitos historiadores e historiadoras digitais utilizam. No macOS[^1] e muitas distribuições Linux, o shell é conhecido como `bash`, ou o "bourne-again shell" (shell renascido). Para usuários(as) de sistemas baseados no Windows, a interface de linha de comando é por norma baseada em `MS-DOS`, que utiliza comandos e [sintaxe](https://perma.cc/WPA6-LJG8) distinta, mas que comumente pode desempenhar tarefas similares. Essa lição oferece uma introdução básica ao terminal `bash`, e usuários Windows podem acompanhá-la instalando shells populares como [Cygwin](https://www.cygwin.com/) ou Git Bash (veja abaixo). @@ -60,7 +60,7 @@ Quando você o executa, verá esta janela. {% include figure.html filename="en-or-intro-to-bash-03.png" alt="Screenshot de uma tela vazia do Terminal" caption="Figura 3. Uma tela vazia do terminal em nosso macOS" %} -Você pode querer alterar a aparência padrão de seu terminal, pois os olhos podem se cansar ao olhar repetidamente para um texto preto em fundo branco. Na aplicação padrão do macOS, você pode abrir o menu **Settings** nas **Preferences** no Terminal. Clique na guia **Settings** e altere-a para um novo esquema de cores. Pessoalmente, preferimos algo com um pouco menos de contraste entre o fundo e o texto, já que você estará olhando para esta tela por muito tempo. 
"Novel" é agradável, assim como o popular conjunto de paleta de cores "[Solarized]"(http://ethanschoonover.com/solarized). Para usuários Windows, um efeito similar pode ser alcançado utilizando a aba **Properties** do Git Bash. Para alcançá-la, clique com o botão direito do mouse em qualquer lugar na barra superior e selecione **Properties**. +Você pode querer alterar a aparência padrão de seu terminal, pois os olhos podem se cansar ao olhar repetidamente para um texto preto em fundo branco. Na aplicação padrão do macOS, você pode abrir o menu **Settings** nas **Preferences** no Terminal. Clique na guia **Settings** e altere-a para um novo esquema de cores. Pessoalmente, preferimos algo com um pouco menos de contraste entre o fundo e o texto, já que você estará olhando para esta tela por muito tempo. "Novel" é agradável, assim como o popular conjunto de paleta de cores "[Solarized](https://ethanschoonover.com/solarized)". Para usuários Windows, um efeito similar pode ser alcançado utilizando a aba **Properties** do Git Bash. Para alcançá-la, clique com o botão direito do mouse em qualquer lugar na barra superior e selecione **Properties**. {% include figure.html filename="en-or-intro-to-bash-04.png" alt="Screenshot da tela de configurações do Terminal do macOS" caption="Figura 4. A tela de configurações da Aplicação Shell Terminal do macOS" %} @@ -198,7 +198,7 @@ Esse comando cria um diretório com o nome, como você pode imaginar, `ProgHist- Mas espere! Há um truque para tornar as coisas um pouco mais rápidas. Vá para o diretório anterior (`cd ..` - o que o levará de volta para a área de trabalho). Para navegar até o diretório `ProgHist-Text`, você poderia digitar `cd ProgHist-Text`. Alternativamente, você poderia digitar `cd Prog` e depois pressionar a tecla Tab. Você notará que a interface completa automaticamente a linha para `cd ProgHist-Text`. 
Pressionar a tecla tab a qualquer momento no shell irá tentar completar a linha com base nos ficheiros ou subdiretórios no diretório atual. No entanto, é sensível a maiúsculas e minúsculas. No exemplo anterior, `cd prog` não seria autocompletado para `ProgHist-Text`. Quando dois ou mais ficheiros têm os mesmos caracteres, o completar preencherá apenas até o primeiro ponto de diferença. Encorajamos o uso desse método ao longo da lição para ver como ele funciona. -Agora você precisa encontrar um ficheiro de texto simples para nos ajudar com o exemplo. Porque não usar um livro que sabemos ser longo, tal como o épico *Guerra e Paz* (em inglês), de Leon Tolstoy? O ficheiro de texto está disponível no [Projeto Gutenberg](http://www.gutenberg.org/ebooks/2600). Se você já instalou o [wget](/en/lessons/automated-downloading-with-wget) (em inglês), pode simplesmente digitar: +Agora você precisa encontrar um ficheiro de texto simples para nos ajudar com o exemplo. Porque não usar um livro que sabemos ser longo, tal como o épico *Guerra e Paz* (em inglês), de Leon Tolstoy? O ficheiro de texto está disponível no [Projeto Gutenberg](https://www.gutenberg.org/ebooks/2600). Se você já instalou o [wget](/en/lessons/automated-downloading-with-wget) (em inglês), pode simplesmente digitar: `wget http://www.gutenberg.org/files/2600/2600-0.txt` diff --git a/pt/licoes/introducao-map-warper.md b/pt/licoes/introducao-map-warper.md index a3ec9f2e17..ed3edf1f18 100644 --- a/pt/licoes/introducao-map-warper.md +++ b/pt/licoes/introducao-map-warper.md @@ -38,7 +38,7 @@ Nota Explicativa: a versão lusófona desta lição distingue-se da versão orig ## O Map Warper -O [Map Warper](http://mapwarper.net/) (em inglês) é uma ferramenta de código aberto e de acesso livre. 
Desde 2008, o site tem sido desenvolvido e mantido por [Tim Waters](https://perma.cc/WEW2-9WTH) (em inglês) para georreferenciar imagens de áreas geográficas ao compará-las com o [OpenStreetMap](https://perma.cc/52L2-Q9W8) (em inglês) sem a necessidade de se instalar um software. A ferramenta já foi utilizada em vários projetos digitais, por diversas instituições, e é utilizada por profissionais que não são necessariamente especializados na área da cartografia. +O [Map Warper](https://mapwarper.net/) (em inglês) é uma ferramenta de código aberto e de acesso livre. Desde 2008, o site tem sido desenvolvido e mantido por [Tim Waters](https://perma.cc/WEW2-9WTH) (em inglês) para georreferenciar imagens de áreas geográficas ao compará-las com o [OpenStreetMap](https://perma.cc/52L2-Q9W8) (em inglês) sem a necessidade de se instalar um software. A ferramenta já foi utilizada em vários projetos digitais, por diversas instituições, e é utilizada por profissionais que não são necessariamente especializados na área da cartografia. Esta ferramenta foi criada para que fosse possível georreferenciar mapas antigos (mapas-múndi, portulanos, cartas marítimas, mapas topográficos, mapas de cadastro, etc.), fotografias aéreas e outros documentos que expressam o espaço. O Map Warper permite, então, que o usuário gere uma imagem georreferenciada de tipo raster (ex. TIFF). Os Sistemas de Informação Geográfica (de que são exemplo: o [QGIS ](https://qgis.org/) (em inglês), o [Map Server](https://perma.cc/W2B8-QD4U) (em inglês), [JOSM ](https://perma.cc/YQ6S-62BA) (em inglês), [ArcGIS](https://www.arcgis.com/index.html) (em inglês), o [Google Earth](https://earth.google.com/web/), o [WorldMap](https://worldmap.maps.arcgis.com/home/index.html) (em inglês), são então capazes de trabalhar com este conjunto de dados georreferenciados na imagem raster. A capacidade colaborativa do Map Warper permite descentralizar o processo de georreferenciamento, catalogação e visualização. 
Contudo, é necessário que se realize o preenchimento dos metadados, aspeto crucial para o trabalho colaborativo. diff --git a/pt/licoes/introducao-mysql-r.md b/pt/licoes/introducao-mysql-r.md index 1696e597b7..70561a954c 100644 --- a/pt/licoes/introducao-mysql-r.md +++ b/pt/licoes/introducao-mysql-r.md @@ -1,944 +1,944 @@ ---- -title: Introdução ao MySQL com R -layout: lesson -slug: introducao-mysql-r -authors: -- Jeff Blackadar -date: 2018-05-03 -translation_date: 2021-12-18 -editors: -- Amanda Visconti -reviewers: -- Jesse Sadler -- Simon Appleford -translator: -- Jéssica Evelyn Santos -translation-editor: -- Daniel Alves -translation-reviewer: -- Dália Guerreiro -- Leonardo F. Nascimento -difficulty: 2 -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/439 -collection: lessons -activity: transforming -topics: [data-manipulation, distant-reading, r, data-visualization] -abstract: "Esta lição ajudará a armazenar grandes quantidades de dados históricos de maneira estruturada, pesquisar e filtrar esses dados e visualizar alguns dos dados como um gráfico." -original: getting-started-with-mysql-using-r -avatar_alt: Uma mão a segurar um jornal -doi: 10.46430/phpt0025 ---- - -Esta lição é direcionada aos que desejam armazenar grandes quantidades de dados de projetos de história digital de uma forma estruturada. Usaremos um sistema de gerenciamento de dados chamado MySQL para armazenar os dados. - -A linguagem R permite realizar análises e armazenar dados sem que um banco de dados relacional seja utilizado. No entanto, há situações nas quais a inclusão de bancos de dados é muito útil, dentre elas: - -- Publicar os resultados de um script em R num *web site* com dados interativos -- Manipular mais dados do que o R pode armazenar em sua própria memória -- Quando os dados já estão armazenados num banco de dados relacional -- Trabalhar com dados de entidades diferentes que são relacionados uns com os outros. 
Um exemplo seria um banco de dados de soldados de dois exércitos distintos que lutaram numa batalha, onde gostaríamos de saber qual esquadrão, pelotão, companhia e brigada cada soldado fazia parte. - -Uma breve discussão do tema pode ser encontrada no [*blog* de Jason A. French's](hhttps://perma.cc/5VYV-L5PG)[^1]. - -Ao final desta lição, será possível instalar um sistema de gerenciamento de banco de dados em seu computador, criar uma tabela de banco de dados, armazenar informações na tabela e realizar consultas dos dados. Na conclusão da lição, utilizaremos uma consulta do banco de dados para construir um gráfico. - -Usaremos a linguagem de programação R para os exemplos, mas as técnicas podem ser utilizadas com outras linguagens, como Python. - -Para fazer essa lição será necessário um computador com permissão para instalar os programas R e RStudio, entre outros, se já não estiverem instalados. Além da programação em R, também instalaremos alguns componentes de um sistema de gerenciamento de banco de dados chamado MySQL, que funciona nos sistemas operacionais Windows, Mac e Linux. - -Possuir algum conhecimento de instalação de programas e organização de dados em campos é útil para essa lição, cujo nível de dificuldade é mediano. - -{% include toc.html %} - -# Introdução - -O MySQL é um banco de dados relacional usado para armazenar e consultar informações. Esta lição utilizará a linguagem R para fornecer um tutorial e exemplos para: - -- Configurar e realizar uma conexão a uma tabela no MySQL -- Armazenar registros em tabelas -- Consultar informações de tabelas - -Neste tutorial, construiremos um banco de dados de artigos de periódicos que contém palavras de uma busca numa hemeroteca digital. O script armazenará o título, a data publicada e a URL de cada artigo num banco de dados. Utilizaremos outro script para realizar consultas no banco de dados e procurar por padrões historicamente relevantes. 
Os dados de amostra serão fornecidos pelo arquivo de periódicos [Welsh Newspapers Online](https://perma.cc/9EHD-EVEX). Estamos trabalhando com o objetivo de produzir uma lista de artigos à qual possamos consultar informações. Ao final da lição, vamos executar uma consulta para gerar um gráfico do número de artigos de periódicos no banco de dados, para verificar se há um padrão relevante. - -# Programas necessários - -R, R Studio, MySQL Server e MySQL Workbench são os programas necessários para esta lição. Algumas notas sobre a instalação desses pacotes de programas podem ser encontradas abaixo. - -## R - -Na lição [Processamento Básico de Texto em R](/pt/licoes/processamento-basico-texto-r)[^2], Taylor Arnold e Lauren Tilton fornecem um resumo excelente do conhecimento da linguagem R necessária para esta lição. Apenas um conhecimento básico de R é esperado. A lição [Noções básicas de R com dados tabulares](/pt/licoes/nocoes-basicas-R-dados-tabulares), de Taryn Dewar,[^3] aborda como instalar o R e se familiarizar com a linguagem. - -### Download do R - -Você pode realizar o download do R no [Comprehensive R Archive Network](https://cran.r-project.org/). Clique no link que corresponde ao sistema operacional do seu computador. Selecione *base* para instalar o R pela primeira vez. Uma vez que o ficheiro foi baixado, clique no ficheiro para executar o instalador. - -## RStudio - -Os exemplos desta lição utilizam o RStudio, que é uma interface de desenvolvimento para escrever e executar scripts em R. Esta lição usou a versão 1.4.1717 do RStudio. - -### Download do RStudio - -Faça o download do RStudio através do [rstudio.com](https://www.rstudio.com/products/rstudio/#Desktop) e instale-o. Já que o RStudio é de código aberto, você pode selecionar a versão gratuita do RStudio Desktop, rolar a página para baixo e clicar num dos instaladores que corresponda ao sistema operacional de seu computador. 
Uma vez que o download foi realizado, clique no ficheiro para executar o instalador. - -## MySQL - -SQL significa *Structured Query Language* (Linguagem estruturada de consulta), que é um conjunto de comandos para armazenar e recuperar informações a partir de um banco de dados relacional. MySQL é um tipo de sistema de gerenciamento de banco de dados relacionais. Há muitos outros, como Microsoft SQL Server, IBM DB2 e Microsoft Access. Esta lição utiliza o MySQL porque é um programa de código aberto, utilizado por uma grande comunidade, tem uma longa trajetória e possui uma versão gratuita que pode ser utilizada. - -### Realizando o download e instalando o MySQL - -Nesta seção, iremos instalar o MySQL, que é o sistema que mantém o banco de dados, assim como o MySQL Workbench, que é onde se trabalha para configurar a estrutura do banco de dados. Para usar o MySQL,o MySQL Workbench não é necessário, podem ser utilizados apenas comandos digitados. Esta lição utiliza o MySQL Workbench porque é uma *GUI* (Interface gráfica do usuário) que facilita o aprendizado de MySQL. - -Conclua essas instruções para instalar o MySQL Community Server e o MySQL Workbench em seu computador. - -### MySQL Community Server - -Este é o servidor onde o banco de dados é armazenado. Sua instalação é necessária para que seja possível conectar e armazenar os dados. Abaixo, faremos o download dos ficheiros, a instalação e iniciaremos o servidor. Esta lição utilizou a versão 8.0.21 do MySQL e 8.0.26 do MySQL Workbench. - -#### Fazendo o download do ficheiro de instalação do MySQL Community Server - -Clique neste link: [https://dev.mysql.com/downloads/mysql/](https://dev.mysql.com/downloads/mysql/). Role a página para baixo e selecione o sistema operacional que corresponde ao seu computador. Se necessário, clique em **Select Operating System** para selecionar o sistema operacional. Uma vez feita essa operação, clique no botão azul **Go to Download Page**. Depois clique no botão azul **Download**. 
Na página de download, role para baixo e terá a opção de começar o download clicando em **No thanks, just start my download** (Não, obrigado, apenas inicie o download). - -#### Instalação do MySQL Community Server - -Abaixo se encontram as dicas de instalação para PC e Mac: - -##### Dicas de instalação para PC - -A maneira recomendada de instalar os componentes do MySQL é através do instalador do MySQL para Windows. Com o ficheiro já baixado, clique duas vezes no ficheiro para instalá-lo. Siga as instruções para aceitar a licença (nota de tradução: com o instalador MySQL para Windows pode optar por fazer de uma vez só a instalação do MySQL Server e do MySQL Workbench; para isso, escolha os respectivos componentes e siga as instruções abaixo). -Depois que os componentes forem instalados, serão solicitadas as seguintes opções: - -###### 1. Escolhendo um tipo de configuração - -Selecione: **Developer Default** (Padrão do desenvolvedor). Esta opção *instala o MySQL Server e as ferramentas necessárias para o desenvolvimento da aplicação. Isto é útil se pretendes desenvolver aplicações para um servidor existente.* -(Ver abaixo) - -{% include figure.html filename="introducao-ao-mysql-e-r-1.PNG" caption="Configure o tipo de padrão do desenvolvedor" %} - -###### 2. Verificar Requisitos - -Clique no botão **Execute** caso haja requisitos pendentes (*failing requirements*) listados na checagem de requisitos. A lista de requisitos pode ser diferente da mostrada aqui. Uma vez que o processo de executar instalar os requisitos pendentes, clique no botão *Next* . -(Ver abaixo) - -{% include figure.html filename="introducao-ao-mysql-e-r-2.PNG" caption="Clique no botão *Execute* se necessário" %} - -###### 3. Tipo e Rede (1) - -Selecione: **Standalone MySQL Server** -(Ver abaixo) - -{% include figure.html filename="getting-started-with-mysql-7.png" caption="Select Standalone MySQL Server" %} - -###### 4. 
Tipo e Rede (2) - -*Config type*: Selecione: **Development Computer** -Checar: TCP/IP. Port number (Número da porta): 3306. -(Ver abaixo) - -{% include figure.html filename="introducao-ao-mysql-e-r-4.png" caption="Development Computer TCPIP Port 3306" %} - -###### 5. Contas e Funções - -{% include figure.html filename="introducao-ao-mysql-e-r-5.png" caption="Digite a senha *root* e depois guarde-a em local seguro" %} - -###### 6. Serviço do Windows - -As configurações aqui são opcionais, mas achamos mais fácil configurar o MySQL como um serviço do Windows e inclui-lo na inicialização automática. Um serviço do Windows é um processo que é executado no computador enquanto se está trabalhando. É possível mudar as configurações do serviço do Windows posteriormente, para iniciar o MySQL manualmente, para impedir que o programa inicialize quando não for necessário. - -{% include figure.html filename="introducao-ao-mysql-e-r-6.png" caption="MySQL como um serviço do Windows" %} - -Clique nos botões *Execute* e *Next* para finalizar a instalação e inicializar o servidor. - -###### 7. MySQL Workbench e Senha Root - -Procure por MySQL Workbench no menu de inicialização do Windows, sob o item MySQL. Se está lá, clique para iniciar. Caso não esteja, clique no instalador do MySQL - Community para executar novamente a instalação e adicionar o MySQL Workbench aos componentes instalados. -Depois de aberto o MySQL Workbench, clique na instância local do seu MySQL Server. -Quando a senha *root* for solicitada, digite a senha criada na etapa *5. Accounts and Roles*. -(Ver abaixo) - -{% include figure.html filename="introducao-ao-mysql-e-r-7.png" caption="Senha Root" %} - -##### Dicas de instalação para um Mac - -###### 1. Instalação do MySQL Community Server - -Com o ficheiro de instalação do MySQL Community Server baixado, clique duas vezes no ficheiro para instalá-lo. 
(Ver abaixo) - -{% include figure.html filename="introducao-ao-mysql-e-r-8.png" caption="Ficheiro de instalação" %} - -###### 2. Guarde a senha temporária - -Siga as instruções para aceitar a licença e o local de instalação. **Importante: Uma senha temporária será solicitada. Guarde-a cuidadosamente.** (Veja o exemplo abaixo. Sua senha temporária será diferente da mostrada abaixo.) Se um erro for cometido, é possível remover o servidor instalado e reinstalá-lo, mas essa é uma pequena complicação. Um dos revisores dessa lição achou que [essa resposta do StackOverflow](https://perma.cc/J4Q5-SLK5) pode auxiliar nesta etapa. - -{% include figure.html filename="getting-started-with-mysql-18.png" caption="Senha temporária" %} - -Concluída a instalação, iremos alterar a senha *root* para o servidor do MySQL. - -###### 3. Modifique a senha do servidor do MySQL - -**Esta seção da lição causou dificuldade para algumas pessoas. Leve o tempo que for necessário e note, por favor, que os comandos do MySQL terminam com um ponto e vírgula. Observe-os em alguns dos comandos abaixo.** - -3.1. Abra uma janela do terminal - -3.2. Adicione /usr/local/mysql/bin ao PATH através do comando abaixo. O PATH é uma lista de diretórios que o computador considera quando um comando é digitado para executar um programa. No próximo passo abaixo, ao executar o *mysql*, o PATH busca pelos diretórios que contém o programa *mysql*. O PATH procura pelo *mysql* no diretório */usr/local/mysql/bin* e o executa. O PATH apenas salva o caminho completo que for digitado, nesse caso, */usr/local/mysql/bin/mysql*, para um programa quando se quer executá-lo. - -``` -export PATH=${PATH}:/usr/local/mysql/bin -``` - -3.3. Inicie o servidor do MySQL. - -Vá até System Preferences > imagem do MySQL > clique em "Start MySQL server". - -3.4. Inicie uma sessão no MySQL. No comando abaixo, depois de *--password*, digite a senha guardada no passo *2. Guarde a senha temporária*. 
- -``` -mysql --user=root --password=senha_root_guardada_acima -``` - -3.5. Configure a senha *root* para uma **nova** senha. Escolha e guarde a nova senha cuidadosamente. No *prompt* mysql> , digite o seguinte comando, substituindo a nova senha entre aspas simples no comando SET PASSWORD=PASSWORD('password') com a nova senha criada. - -``` -SET PASSWORD=PASSWORD('nova_senha_criada_na_etapa_3.5'); -``` - -3.6. Reinicie o computador. Depois de reiniciar, é possível que seja necessário repetir a etapa *3.3 Inicie o servidor do MySQL* acima. - -###### 4. Download do MySQL Workbench - -Clique nesse link: [http://dev.mysql.com/downloads/workbench/](http://dev.mysql.com/downloads/workbench/). Role a página para baixo e clique em **Select Operating System** para selecionar o sistema operacional que corresponde ao seu computador. Se necessário, clique em **Select OS Version** para selecionar a versão do sistema operacional. Feito isso, clique no botão azul de **Download**. Na página de download, role para baixo e terá a opção de iniciar o download ao clicar em **No thanks, just start my download.** (Não, obrigado, apenas inicie o download.) - -Com o ficheiro baixado, clique duas vezes para instalá-lo. Feita a instalação do MySQL Workbench de acordo com as instruções na tela, arraste o ícone para a pasta de aplicações da esquerda. (Ver abaixo) - -{% include figure.html filename="introducao-ao-mysql-e-r-10.png" caption="MySQL Workbench" %} - -# Crie um banco de dados - -Aqui iremos criar um banco de dados que serve como um contentor para as tabelas nas quais armazenaremos informações. Uma tabela é a estrutura que mantém os dados que queremos armazenar. Tabelas contém muitas linhas de registros. Um exemplo de informações básicas de contato conteria campos para nome, número de telefone e endereço de e-mail. Numa tabela, os campos são organizados por *colunas*. 
- -Aqui está uma tabela de amostra com uma linha de dados que representa um registro: - -| nome | número de telefone | endereço de e-mail | -| ----------- | ------------------ | ------------------ | -| Pat Abraham | 613-555-1212 | pat@zmail.ca | - -## Abra o MySQL Workbench - -Abra o MySQL Workbench. Clique duas vezes em *Local Instance MySQL80* (num Mac isto pode aparecer como *Local Instance 3306*). É possível que a senha *root* criada nas etapas acima seja solicitada. Em alguns Macs, uma aba de *Query* será aberta; se não for, abra uma aba de *Query* utilizando: *File > New Query Tab*. - -## Crie um banco de dados - -Agora iremos criar um novo banco de dados. Utilizando o MySQL Workbench, realize os seguintes passos: - -1. Na janela de **Query**, digite: - - ``` - CREATE DATABASE periodicos_resultados_pesquisa; - ``` - -2. Execute o comando CREATE DATABASE. Clique no **relâmpago/raio** ou, utilizando o menu, clique em *Query* e então em *Execute Current Statement*. - -3. O novo banco de dados **periodicos_resultados_pesquisa** deve estar visível na aba **SCHEMAS**, no canto superior esquerdo da tela. Se não conseguir visualizar um item chamado periodicos_resultados_pesquisa, clique no botão de atualizar. - -(Ver abaixo:) - -{% include figure.html filename="introducao-ao-mysql-e-r-11.png" caption="Crie um banco de dados no MySQL Workbench" %} - -## USE o banco de dados - -Em seguida, iremos inserir uma declaração USE para informar ao MySQL qual banco de dados será usado. Isto se torna mais importante quando se tem mais de um banco de dados no computador. - -Na janela de **Query**, apague todo o comando CREATE DATABASE e digite: - -``` -USE periodicos_resultados_pesquisa; -``` - -Novamente, clique no **relâmpago/raio** ou, usando o menu, clique em *Query* e então em *Execute Current Statement*. É possível usar a tecla de teclado para isso. Num Mac, use *Command+Return*. Num PC, use *Ctrl+Shift+Enter*. 
A partir desse ponto da lição, todas as vezes que um comando for digitado na janela de *Query* será executado desta maneira. - -(Ver abaixo:) - -{% include figure.html filename="introducao-ao-mysql-e-r-12.png" caption="USE um banco de dados no MySQL Workbench" %} - -# Adicione uma tabela - -1. No MySQL Workbench, procure no lado esquerdo no painel **Navigator**, na aba **SCHEMAS**, por **periodicos_resultados_pesquisa**. -2. Clique em **Tables** com o lado direito do mouse e depois clique em **Create Table**. -3. Para **Table Name:** digite **tbl_periodicos_resultados_pesquisa** - -## Adicione colunas à tabela - -Adicione essas colunas: - -1. **id** Data type: **INT**. Clique PK (Primary Key), NN (Not Null) e AI (Auto Increment). Esta coluna de *id* será usada para relacionar registros nesta tabela com registros em outras tabelas. -2. **titulo_artigo** Data type: **VARCHAR(99)**. Esta coluna armazenará o título de cada resultado de artigo que coletarmos da busca. -3. **data_publicacao_artigo** Data type: **DATETIME**. Esta coluna armazenará a data em que o periódico foi publicado. -4. **url_artigo** Data type: **VARCHAR(99)**. Esta coluna armazenará a url de cada resultado que coletarmos da pesquisa. -5. **termo_busca_usado** Data type: **VARCHAR(45)**. Esta coluna irá armazenar a palavra que usamos para buscar os periódicos. - Clique no botão **Apply**. - -Se preferir, todas as etapas acima podem ser realizadas com um comando. Este comando pode ser executado na janela de *Query* para criar a tabela com as colunas indicadas acima. 
- -``` -CREATE TABLE periodicos_resultados_pesquisa.tbl_periodicos_resultados_pesquisa ( -id INT NOT NULL AUTO_INCREMENT, -titulo_artigo VARCHAR(99) NULL, -data_publicacao_artigo DATETIME NULL, -url_artigo VARCHAR(99) NULL, -termo_busca_usado VARCHAR(45) NULL, -PRIMARY KEY (id)); -``` - -*Dica: Leve o tempo que for necessário para pensar sobre a elaboração da tabela e sua nomeação, uma vez que um banco de dados bem elaborado será mais fácil de trabalhar e entender.* - -## Adicione um usuário para se conectar ao banco de dados - -Um usuário é uma conta que tem permissão para se conectar a um banco de dados. Abaixo, adicionaremos um novo usuário para que essa conta apenas se conecte a esse novo banco de dados. Usar essa conta de usuário para uma conexão com esse banco de dados limita a exposição a outros bancos de dados, caso a senha para este usuário seja comprometida. Dar ao usuário os privilégios mínimos requeridos para realizar o necessário reduz o risco, caso outra pessoa tiver acesso à senha de usuário. Por exemplo, se um usuário pode apenas ler um banco de dados, é um risco menor se a senha for descoberta do que um usuário que também pode alterar ou apagar o banco de dados. - -No menu do MySQL Workbench, clique em **Server** e depois em **Users and Privileges** - -**Usuários de Mac** - Em alguns computadores Mac, como meu laptop de teste, o painel de **Schema Privileges** não é exibido corretamente. Veja a nota abaixo da captura de tela se isso ocorrer. - -Clique no botão **Add Account** e complete os detalhes para a nova conta de usuário na caixa de diálogo: - -1. Login Name: **periodicos_pesquisa_usuario** -2. Authentication Type: selecione **Standard** -3. Limit to Hosts Matching: **localhost** -4. Tecle *Enter* e confirme uma senha *AlgoDificil* -5. Clique na aba **Administrative Roles**. Certifique-se de que nada está marcado. Esta conta é apenas para acessar o banco de dados. -6. Clique na aba **Schema Privileges** e clique **Add Entry**. -7. 
Na caixa de diálogo **New Schema Privilege Definition**, clique na caixa de seleção **Selected schema:** e selecione **periodicos_resultados_pesquisa**. Clique OK. -8. Clique em todas as opções de *Object Rights*: SELECT, INSERT, UPDATE, DELETE, EXECUTE, SHOW VIEW, como mostrado na imagem abaixo. (Este usuário precisará fazer muitas coisas posteriormente na lição, por isso, estamos lhe concendendo várias permissões.) -9. Clique em **Apply**. - -{% include figure.html filename="introducao-ao-mysql-e-r-13.PNG" caption="Configurando permissões para a nova conta" %} - -### Schema Privileges não exibidos corretamente - -Alguns computadores Mac, como meu laptop de teste, não exibem corretamente o painel de **Schema Privileges**. Nesse caso, é possível realizar a tarefa acima através de um *script* usando uma janela de Query. - -Se o usuário já foi criado acima, execute o seguinte comando para lhe conceder privilégios de usuário: - -``` -GRANT SELECT, INSERT, UPDATE, DELETE, EXECUTE, SHOW VIEW ON periodicos_resultados_pesquisa.* TO 'periodicos_pesquisa_usuario'@'localhost'; -``` - -Se o usuário não foi criado ainda, execute estes dois comandos para criar um usuário e depois lhe conceder privilégios de usuário: - -``` -CREATE USER 'periodicos_pesquisa_usuario'@'localhost' IDENTIFIED BY 'AlgoDificil'; -GRANT SELECT, INSERT, UPDATE, DELETE, EXECUTE, SHOW VIEW ON periodicos_resultados_pesquisa.* TO 'periodicos_pesquisa_usuario'@'localhost'; -``` - -### MySQL versão 8 e tipo de autenticação de usuário. - -Quando um usuário é criado no MySQL 8 Workbench o **Authentication Type** (tipo de autenticação) é configurado para o padrão **caching_sha2_password**. Esse tipo de autenticação causa um erro para o pacote R que usaremos para conectar o banco de dados mais tarde nesta lição. O erro é *Authentication plugin 'caching_sha2_password' cannot be loaded* e é descrito no [Stack Overflow](https://perma.cc/7NVR-TSYT). 
- -Para evitar esse erro, podemos modificar o tipo de autenticação do usuário para padrão (Standard). Para fazer isso, execute o seguinte comando na janela de *Query*: - -``` -ALTER USER 'periodicos_pesquisa_usuario'@'localhost' IDENTIFIED WITH mysql_native_password BY 'AlgoDificil'; -``` - -# Crie um R Script que se conecte ao banco de dados - -Abra o RStudio, que foi instalado anteriormente na lição. Veja a seção [RStudio](#rstudio). - -Agora usaremos o RStudio para escrever um novo R Script e salvá-lo com o nome periodicos_resultados_pesquisa.R. - -Vá em File > New File > R Script e depois salve o novo ficheiro com o nome periodicos_resultados_pesquisa.R. - -Usaremos o pacote RMariaDB para realizar a conexão com o MySQL. (Se tiver curiosidade, a documentação para o pacote RMariaDB pode ser encontrada [aqui](https://perma.cc/FX5P-DAW7).) - -Se não possui o pacote RMariaDB instalado (o que é provável, caso seja a primeira vez que usa o RStudio), instale-o utilizando o _console_ do RStudio. Após abrir o RStudio, copie e cole o seguinte para a janela da esquerda no >, e depois dê enter: - -``` -install.packages("RMariaDB") -``` - -Adicione o seguinte comando ao script periodicos_resultados_pesquisa.R (janela de cima, à esquerda) - -``` -library(RMariaDB) -``` - -## Conectando a um banco de dados com uma senha - -Primeiro, nos conectaremos ao banco de dados usando uma senha. (Depois utilizaremos um meio de conexão melhor). Por hora, usaremos uma variável para armazenar a senha. Cada vez que iniciar o R, será necessário apagar esta variável, mas isso é melhor do que publicar uma senha *hardcoded* caso compartilhe seus scripts, como pode fazer usando o GitHub. - -No console do RStudio, digite o comando abaixo, substituindo *AlgoDificil* com a senha criada para periodicos_pesquisa_usuario nos passos realizados acima para adicionar um usuário ao banco de dados. 
- -``` -senhadeusuariolocal <- "AlgoDificil" -``` - -Adicione as seguintes declarações em R ao ficheiro periodicos_resultados_pesquisa.R file e salve-o. - -Para executar este script, selecione todo o texto e clique no botão *Run* (Executar). (Há outras maneiras de executar apenas uma parte do script ou o script inteiro. Se tiver curiosidade, procure no menu abaixo de Code > Run Region. O comando CTRL+ALT+R executa todo o código em R no script.) - -``` -library(RMariaDB) -# O método de conexão abaixo utiliza uma senha armazenada numa variável. -# Para utilizar isto, configure senhadeusuariolocal="A senha de periodicos_pesquisa_usuario" - -artigosDb <- dbConnect(RMariaDB::MariaDB(), user='periodicos_pesquisa_usuario', password=senhadeusuariolocal, dbname='periodicos_resultados_pesquisa', host='localhost') -dbListTables(artigosDb) -dbDisconnect(artigosDb) -``` - -No console, deverá visualizar: - -``` -> dbListTables(artigosDb) -[1] "tbl_periodicos_resultados_pesquisa" -> dbDisconnect(artigosDb) -``` - -Sucesso! O que conseguiu: - -1. Conectar ao banco de dados com dbConnect. -2. Listar a tabela no banco de dados com dbListTables. -3. Desconectar do banco de dados usando dbDisconnect. - -### Conectar-se ao banco de dados com uma senha armazenada num ficheiro de configuração - -O exemplo acima de conexão é uma das maneiras de conectar-se. O método de conexão descrito abaixo armazena a informação da conexão do banco de dados num ficheiro de configuração, para que não seja necessário digitar uma senha numa variável todas as vezes que uma sessão no R for iniciada. Acredito que esse é um processo minucioso, mas é uma maneira mais padronizada e segura de proteger as credenciais usadas para acessar seu banco de dados. Esse método de conexão será usado no código para o restante desse tutorial, mas pode ser substituído pelo método de conexão mais simples mostrado acima se preferir. 
- -#### Crie o ficheiro .cnf para armazenar a informação de conexão com o banco de dados MySQL - -1. Abra um editor de texto, como o notepad, nano ou TextEdit e cole os itens abaixo, modificando a senha para a criada para periodicos_pesquisa_usuario nas etapas acima para adicionar um usuário e conectá-lo ao banco de dados. - -``` -[periodicos_resultados_pesquisa] -user=periodicos_pesquisa_usuario -password=AlgoDificil -host=127.0.0.1 -port=3306 -database=periodicos_resultados_pesquisa -``` - -2. Salve este ficheiro em algum local fora do diretório de trabalho do R. Salvei o meu no mesmo diretório de outros ficheiros de configuração do MySQL. No PC, o caminho foi o seguinte: C:\Program Files\MySQL\MySQL Server 8.0. Dependendo de seu sistema operacional e da versão do MySQL, esse local pode estar em outro lugar. No Mac, usei /Users/blackadar/Documents/ como a pasta de destino. Testei colocar este ficheiro em lugares diferentes, apenas é necessário que o R possa localizá-lo quando o script for executado. Nomeie o ficheiro como **periodicos_resultados_pesquisa.cnf**. - -3. Atualize o script periodicos_resultados_pesquisa.R acima para conectar-se ao banco de dados usando o ficheiro de configuração. - -``` -library(RMariaDB) -# O método de conexão abaixo utiliza uma senha armazenada num ficheiro de configuração. - -# O R precisa de um caminho completo para encontrar o ficheiro de configuração. -rmariadb.settingsfile<-"C:/Program Files/MySQL/MySQL Server 8.0/periodicos_resultados_pesquisa.cnf" - -rmariadb.db<-"periodicos_resultados_pesquisa" -artigosDb<-dbConnect(RMariaDB::MariaDB(),default.file=rmariadb.settingsfile,group=rmariadb.db) - -# Lista a tabela. Isso confirma que estamos conectados ao banco de dados. -dbListTables(artigosDb) - -# Desconecta para limpar a conexão com o banco de dados. -dbDisconnect(artigosDb) -``` - -4. Execute seu script. 
- -No console, entre outras linhas, deverá ver novamente: - -``` -> dbListTables(artigosDb) -[1] "tbl_periodicos_resultados_pesquisa" -``` - -De maneira bem sucedida, a conexão com o banco de dados foi realizada utilizando um ficheiro de configuração. - -# Armazenando dados numa tabela com o SQL - -Nesta seção da lição, criaremos uma declaração no SQL para inserir uma linha de dados no banco de dados sobre esse [artigo de periódico](https://perma.cc/C8MR-WYV2). Iremos inserir primeiro o registro no MySQL Workbench e depois faremos isso no R. - -1. No MySQL Workbench, clique na imagem categorizada como SQL+ para criar uma nova aba para o SQL executar consultas (ou vá ao menu "File" e escolha a opção "New Query Tab"). - -2. Cole a declaração abaixo na janela de Query. Esta ação irá inserir um registro na tabela. - - ``` - INSERT INTO tbl_periodicos_resultados_pesquisa ( - titulo_artigo, - data_publicacao_artigo, - url_artigo, - termo_busca_usado) - VALUES('THE LOST LUSITANIA.', - '1915-05-21', - LEFT(RTRIM('http://newspapers.library.wales/view/4121281/4121288/94/'),99), - 'German+Submarine'); - ``` - -3. Clique na imagem de relâmpago/raio na aba do SQL para executar a declaração SQL. 
- -{% include figure.html filename="introducao-ao-mysql-e-r-14.png" caption="Inserindo um registro numa tabela usando MySQL Workbench" %} - -## Explicação da declaração INSERT - -| SQL | Significado | -| --------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| INSERT INTO tbl_periodicos_resultados_pesquisa ( | Insere um registro na tabela nomeada tbl_periodicos_resultados_pesquisa | -| titulo_artigo, | nome do campo a ser preenchido por um valor | -| data_publicacao_artigo, | " | -| url_artigo, | " | -| termo_busca_usado) | " | -| VALUES('THE LOST LUSITANIA.', | O valor a ser inserido no campo titulo_artigo | -| '1915-05-21', | campo data_publicacao_artigo | -| LEFT(RTRIM('http://newspapers.library.wales/view/4121281/4121288/94/'),99), | campo url_artigo. Este campo é um VARCHAR(99), portanto tem um máximo de 99 caracteres. Inserir uma URL mais longa que 99 caracteres causaria um erro, portanto, duas funções são utilizadas para controlar isso. RTRIM() reduz espaços residuais à direita da URL. LEFT(value,99) retorna apenas os 99 caracteres mais à esquerda da URL reduzida. Esta URL é mais curta que isso, então essas funções estão aqui apenas como exemplo. | -| 'German+Submarine'); | campo termo_busca_usado | - -Opcional: Modifique a declaração INSERT acima e execute-a algumas vezes. 
Por exemplo: - -``` -INSERT INTO tbl_periodicos_resultados_pesquisa ( -titulo_artigo, -data_publicacao_artigo, -url_artigo, -termo_busca_usado) -VALUES('test insert.', -'1916-07-01', -LEFT(RTRIM('http://newspapers.library.wales/view/4121281/4121288/94/'),99), -'German+Submarine'); -``` - -## Consultando dados numa tabela com o SQL - -Nesta seção da lição, criaremos uma declaração no SQL para selecionar uma linha de dados do banco de dados que inserimos. Selecionaremos o primeiro registro no MySQL Workbench e depois faremos isso no R. - -1. Cole a declaração abaixo numa janela de query no MySQL Workbench. Isto irá selecionar registros da tabela. - - ``` - SELECT titulo_artigo FROM tbl_periodicos_resultados_pesquisa; - ``` - -2. Clique na imagem de relâmpago/raio na aba do SQL para executá-la. Deverá visualizar o título do artigo "THE LOST LUSITANIA." na grade de resultados. Ver abaixo. - -{% include figure.html filename="introducao-ao-mysql-e-r-15.png" caption="Selecionando registros de uma tabela usando MySQL Workbench" %} - -Opcional: Modifique a declaração SELECT acima alterando os campos selecionados e execute novamente. Adicione mais de um campo para a declaração SELECT e execute: - -``` -SELECT titulo_artigo, data_publicacao_artigo FROM tbl_periodicos_resultados_pesquisa; -``` - -## Armazenando dados numa tabela com SQL usando R - -Vamos fazer isso usando R! Abaixo se encontra uma versão expandida do R Script que usamos para nos conectar ao banco de dados. Para sermos concisos, os três primeiros comentários que tínhamos no R Script mostrado acima foram removidos. Não são mais necessários. - -Na linha 4 do script abaixo, lembre-se de modificar o caminho do rmariadb.settingsfile que corresponde à localização desse ficheiro em seu computador. - -``` -library(RMariaDB) -# O método de conexão abaixo utiliza uma senha armazenada num ficheiro de configuração. - -# O R precisa de um caminho completo para encontrar o ficheiro de configuração. 
-rmariadb.settingsfile<-"C:/Program Files/MySQL/MySQL Server 8.0/periodicos_resultados_pesquisa.cnf" - -rmariadb.db<-"periodicos_resultados_pesquisa" -artigosDb<-dbConnect(RMariaDB::MariaDB(),default.file=rmariadb.settingsfile,group=rmariadb.db) - -# Opcional. Liste a tabela. Isso confirma que nos conectamos ao banco de dados. -dbListTables(artigosDb) - -# Cria a declaração de query. -query<-"INSERT INTO tbl_periodicos_resultados_pesquisa ( -titulo_artigo, -data_publicacao_artigo, -url_artigo, -termo_busca_usado) -VALUES('THE LOST LUSITANIA.', -'1915-05-21', -LEFT(RTRIM('http://newspapers.library.wales/view/4121281/4121288/94/'),99), -'German+Submarine');" - -# Opcional. Exibe o query para o caso de ser necessário solucionar problemas. -print(query) - -# Executa o query no artigoDb que conectamos abaixo. -rsInsert <- dbSendQuery(artigosDb, query) - -# Limpa o resultado. -dbClearResult(rsInsert) - -# Desconecta para limpar a conexão com o banco de dados. -dbDisconnect(artigosDb) -``` - -No script acima, realizamos duas etapas para inserir um registro: - -1. Defina a declaração INSERT na linha com: query <- "INSERT INTO tbl_periodicos_resultados_pesquisa ( -2. Execute a declaração INSERT armazenada na variável da consulta com: rsInsert <- dbSendQuery(artigosDb, query) - -Execute o script acima no R Studio e depois execute uma declaração SELECT no MySQL Workbench. Consegue visualizar o novo registro adicionado? - -### Realize uma limpeza nos dados de teste - -Neste ponto é provável que haja mais de um registro com o título de artigo "THE LOST LUSITANIA.", o que é razoável para a testagem, mas não queremos dados duplicados. Iremos remover os dados de teste e começar novamente. 
Usando a janela de query no MySQL Workbench, execute a declaração SQL: - -``` -TRUNCATE tbl_periodicos_resultados_pesquisa; -``` - -No painel Action Output do MySQL Workbench deverá visualizar: - -``` -TRUNCATE tbl_periodicos_resultados_pesquisa; 0 row(s) affected 0.093 sec -``` - -Para praticar o que acabamos de fazer: - -1. Execute uma declaração SELECT novamente. Não deverá receber linhas de retorno. -2. Execute novamente o script em R acima para inserir um registro. -3. Realize uma declaração SELECT. Deverás visualizar uma linha de dados. - -### Modifique a declaração INSERT para usar variáveis - -Iremos inserir muitos dados na tabela usando o R, então mudaremos a declaração INSERT para usar variáveis. Veja no código abaixo o destaque *# Compila o query.* - -``` -library(RMariaDB) -# O método de conexão abaixo utiliza uma senha armazenada num ficheiro de configuração. - -# O R precisa de um caminho completo para encontrar o ficheiro de configuração. -rmariadb.settingsfile<-"C:/Program Files/MySQL/MySQL Server 8.0/periodicos_resultados_pesquisa.cnf" - -rmariadb.db<-"periodicos_resultados_pesquisa" -artigosDb<-dbConnect(RMariaDB::MariaDB(),default.file=rmariadb.settingsfile,group=rmariadb.db) - -# Opcional. Lista a tabela. Isso confirma que nos conectamos ao banco de dados. -dbListTables(artigosDb) - -# Compila o query. - -# Atribui variáveis. -entradaTitulo <- "THE LOST LUSITANIA." -entradaPublicacao <- "21 05 1916" -# Converte o valor da string para uma data para armazená-la no banco de dados. -entradaDataPublicacao <- as.Date(entradaPublicacao, "%d %M %Y") -entradaUrl <- "http://newspapers.library.wales/view/4121281/4121288/94/" -buscaSimplesTermos <- "German+Submarine" - -# Cria a declaração de query. 
-query<-paste( - "INSERT INTO tbl_periodicos_resultados_pesquisa ( - titulo_artigo, - data_publicacao_artigo, - url_artigo, - termo_busca_usado) - VALUES('",entradaTitulo,"', - '",entradaDataPublicacao,"', - LEFT(RTRIM('",entradaUrl,"'),99), - '",buscaSimplesTermos,"')", - sep = '' -) - -# Opcional. Exibe o query para o caso de ser necessário solucionar problemas. -print(query) - -# Executa o query no banco de dados artigosDb que conectamos acima. -rsInsert <- dbSendQuery(artigosDb, query) - -# Limpa o resultado. -dbClearResult(rsInsert) - -# Desconecta para limpar a conexão com o banco de dados. -dbDisconnect(artigosDb) -``` - -Vamos testar esse script: - -1. Execute uma declaração SELECT e observe as linhas que possui. -2. Execute o script em R acima para inserir outro registro. -3. Realize a declaração SELECT. Deverá visualizar uma linha adicional de dados. - -### Erros do SQL - -Vamos criar um simples erro no SQL para visualizar o que acontece. - -No R, modifique: - -``` -entradaTitulo <- "THE LOST LUSITANIA." -``` - -para - -``` -entradaTitulo <- "THE LOST LUSITANIA'S RUDDER." -``` - -e execute novamente o script. - -No console R, há um erro: - -``` -> rsInsert <- dbSendQuery(artigosDb, query) -Error in result_create(conn@ptr, statement, is_statement) : - You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near 'S RUDDER.', - '1916-05-21', - LEFT(RTRIM('http://newspapers.library.wales/view/4' at line 6 [1064] -``` - -É possível verificar, com uma declaração SELECT, se não há registro na tabela com um título de artigo denominado *THE LOST LUSITANIA'S RUDDER*. - -As aspas simples fazem parte da sintaxe do SQL e indicam uma entrada textual. Se estiverem no lugar errado, provocam um erro. Temos que lidar com casos nos quais há dados com aspas. O SQL aceita duas aspas numa declaração de inserção para representar aspas em dados(''). 
- -Lidaremos com as aspas utilizando uma função `gsub` para substituir aspas simples por aspas duplas, como mostrado abaixo. - -``` -entradaTitulo <- "THE LOST LUSITANIA'S RUDDER." -# altera aspas simples para aspas duplas -entradaTitulo <- gsub("'", "''", entradaTitulo) -``` - -Agora que a questão das aspas no título do artigo está resolvida, execute novamente o script e depois confira com uma declaração SELECT no MySQL Workbench. - -``` -SELECT * FROM periodicos_resultados_pesquisa.tbl_periodicos_resultados_pesquisa WHERE titulo_artigo = "THE LOST LUSITANIA'S RUDDER."; -``` - -Uma vez que o registro teste foi visualizado, digite TRUNCATE tbl_periodicos_resultados_pesquisa para remover esses dados de teste. - -# Armazenando um ficheiro de valores separados por vírgulas (.csv) no banco de dados MySQL - -Na próxima parte da lição, vamos realizar consultas na tabela do banco de dados. Nosso objetivo é obter dados suficientes na tabela para construir um gráfico. Para nos prepararmos para isso, carregaremos alguns dados de amostra de um ficheiro de valores separados por vírgulas (.csv). - -Faça o download dos ficheiros .csv para o seu diretório de trabalho do R. Esses ficheiros estão armazenados no GitHub, então faça o download da versão *Raw* dos ficheiros. - -1. [dados-amostra-jardim.csv](/assets/getting-started-with-mysql-using-r/dados-amostra-jardim.csv) Esta é uma lista de artigos de periódicos galeses publicados durante a Primeira Guerra Mundial que correspondem aos termos de busca "*allotment*"(loteamento) e "*garden*"(jardim). -2. [dados-amostra-submarino.csv](/assets/getting-started-with-mysql-using-r/dados-amostra-submarino.csv) Esta é uma lista de artigos de periódicos galeses publicados durante a Primeira Guerra Mundial que correspondem aos termos de busca "*German*"(alemão) e "*submarine*"(submarino). - -No R, execute a função read.csv() e depois visualize o data frame com os dados amostrais. 
- -``` -dadosAmostraJardim <- read.csv(file="dados-amostra-jardim.csv", header=TRUE, sep=",") -dadosAmostraJardim -``` - -Muitos dados serão visualizados, incluindo os que se encontram abaixo. Cheque a aba "Environment" (ambiente) na parte direita do RStudio. O Data Frame dadosAmostraJardim deve conter "1242 obs. of 4 variables". - -``` - titulo_artigo -1 -."e;'N'III GARDEN REQUISITES. -<...the result of the data frame results have been removed...> - data_publicacao_artigo url_artigo termo_busca_usado -1 1918-05-11 http://newspapers.library.wales/view/3581057/3581061/27/ AllotmentAndGarden -<...the result of the data frame results have been removed...> -``` - -Observe que nesses dados de amostra, os nomes dos campos estão incluídos no cabeçalho por conveniência: titulo_artigo, data_publicacao_artigo, url_artigo e termo_busca_usado. - -Como observado acima, nosso objetivo aqui é inserir os dados de amostra que estão armazenados no data frame dadosAmostraJardim na tabela MySQL periodicos_resultados_pesquisa. Podemos fazer isso de diferentes maneiras. Uma delas é repetir para cada linha de dado do data frame e executar um comando INSERT, como fizemos acima. Aqui, no entanto, usaremos um comando para inserir todas as linhas em dadosAmostraJardim de uma vez: *dbWriteTable*. Não execute essa declaração ainda, apenas a leia. - -``` -dbWriteTable(artigosDb, value = dadosAmostraJardim, row.names = FALSE, name = "tbl_periodicos_resultados_pesquisa", append = TRUE ) -``` - -| Função | Significado | -| -------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| dbWriteTable(artigosDb, | Use a conexão do banco de dados MySQL artigosDb. | -| value = dadosAmostraJardim, | Insere os valores do data frame dadosAmostraJardim para a tabela. | -| row.names = FALSE, | Nenhum nome de linha foi especificado. 
| -| name = "tbl_periodicos_resultados_pesquisa", | Insere os valores de dadosAmostraJardim para a tabela tbl_periodicos_resultados_pesquisa | -| append = TRUE ) | Adiciona os valores ao que já existe na tabela. Se esse script rodar novamente, todas as linhas em dadosAmostraJardim serão adicionadas à mesma tabela novamente. | - -Ainda não estamos preparados para executar o comando dbWriteTable(). Primeiro precisamos nos conectar ao banco de dados. Aqui está o script para fazer isso, assim como para carregar o data frame dados-amostra-submarino.csv. Leia-o e execute-o. - -``` -library(RMariaDB) -rmariadb.settingsfile<-"/Program Files/MySQL/MySQL Server 8.0/periodicos_resultados_pesquisa.cnf" - -rmariadb.db<-"periodicos_resultados_pesquisa" -artigosDb<-dbConnect(RMariaDB::MariaDB(),default.file=rmariadb.settingsfile,group=rmariadb.db) - -# A função "setwd" define o directório de trabalho. Deve mudar o caminho desse directório para o directório onde guardou os ficheiros .csv. -setwd("C:/Users/User/Documents") - -# Realiza uma busca nos dados de amostra dos periódicos pelos termos "Allotment" e "Garden" -dadosAmostraJardim <- read.csv(file="dados-amostra-jardim.csv", header=TRUE, sep=",") - -# Uma coluna titulo_artigo na tabela do banco de dados pode armazenar valores até 99 caracteres. -# Esta declaração reduz qualquer título de artigo maior que 99 caracteres. -dadosAmostraJardim$titulo_artigo <- substr(dadosAmostraJardim$titulo_artigo,0,99) - -# Esta declaração formata data_publicacao_artigo para representar o tipo de dado DATETIME. -dadosAmostraJardim$data_publicacao_artigo <- paste(dadosAmostraJardim$data_publicacao_artigo," 00:00:00",sep="") - -dbWriteTable(artigosDb, value = dadosAmostraJardim, row.names = FALSE, name = "tbl_periodicos_resultados_pesquisa", append = TRUE ) - -# Realiza um busca nos dados de amostra dos periódicos pelos termos German+Submarine. 
-dadosAmostraSubmarino <- read.csv(file="dados-amostra-submarino.csv", header=TRUE, sep=",") - -dadosAmostraSubmarino$titulo_artigo <- substr(dadosAmostraSubmarino$titulo_artigo,0,99) -dadosAmostraSubmarino$data_publicacao_artigo <- paste(dadosAmostraSubmarino$data_publicacao_artigo," 00:00:00",sep="") - -dbWriteTable(artigosDb, value = dadosAmostraSubmarino, row.names = FALSE, name = "tbl_periodicos_resultados_pesquisa", append = TRUE ) - -# Desconecta para limpar a conexão com o banco de dados. -dbDisconnect(artigosDb) -``` - -Se o script for executado mais de uma vez, serão gerados registros duplicados. Se isso acontecer, apenas execute o comando TRUNCATE na tabela e execute o script novamente, mas apenas uma vez. É possível verificar se o número de registros é o correto. No MySQL Workbench, execute o seguinte na janela de Query: - -``` -SELECT COUNT(*) FROM tbl_periodicos_resultados_pesquisa; -``` - -A contagem deve retornar 2880 registros. 1242 de dadosAmostraJardim e 1638 de dadosAmostraSubmarino. - -# Selecionado dados de uma tabela com SQL usando R - -Nosso objetivo aqui é usar a tabela de artigos que importamos e criar um gráfico do número de artigos publicados nos *Welsh Newspapers* (jornais galeses) ao longo de cada mês da Primeira Guerra Mundial que corresponda aos termos de busca *allotment*(loteamento) e *garden* (jardim), e *German* (alemão) e *submarine*(submarino). - -O script abaixo consulta o banco de dados e produz o gráfico de linha abaixo. Leia o script e observe o que está acontecendo. Segue uma explicação do script. - -``` -library(RMariaDB) -rmariadb.settingsfile<-"/Program Files/MySQL/MySQL Server 8.0/periodicos_resultados_pesquisa.cnf" - -rmariadb.db<-"periodicos_resultados_pesquisa" -artigosDb<-dbConnect(RMariaDB::MariaDB(),default.file=rmariadb.settingsfile,group=rmariadb.db) - -termoBuscaUsado = "German+Submarine" -# Solicita uma contagem do número de artigos que correspondem ao termoBuscaUsado que foram publicados a cada mês. 
-query<-paste("SELECT ( COUNT(CONCAT(MONTH(data_publicacao_artigo), ' ',YEAR(data_publicacao_artigo)))) as 'count' - FROM tbl_periodicos_resultados_pesquisa - WHERE termo_busca_usado ='", termoBuscaUsado,"' - GROUP BY YEAR(data_publicacao_artigo),MONTH(data_publicacao_artigo) - ORDER BY YEAR(data_publicacao_artigo),MONTH(data_publicacao_artigo);",sep="") - -print(query) -rs = dbSendQuery(artigosDb,query) -dbRows<-dbFetch(rs) - -contagemArtigos<-c(as.integer(dbRows$count)) - -# Coloca os resultados da consulta numa série temporal. -qts1 = ts(contagemArtigos, frequency = 12, start = c(1914, 8)) -print(qts1) - -# Plota a série temporal qts1 dos dados com uma linha de espessura 3 na cor vermelha. -plot(qts1, - lwd=3, - col = "red", - xlab="Mês da Guerra", - ylab="Números de artigos de periódicos", - xlim=c(1914,1919), - ylim=c(0,150), - main=paste("Número de artigos nos jornais galeses (Welsh newspapers) que correspondem aos termos de busca listados.",sep=""), - sub="Legenda do termo de busca: Vermelho = German+Submarine. Verde = Allotment And Garden.") - -termoBuscaUsado="AllotmentAndGarden" - -# Solicita uma contagem do número de artigos que correspondem ao termoBuscaUsado que foram publicados a cada mês. -query<-paste("SELECT ( COUNT(CONCAT(MONTH(data_publicacao_artigo),' ',YEAR(data_publicacao_artigo)))) as 'count' FROM tbl_periodicos_resultados_pesquisa WHERE termo_busca_usado='",termoBuscaUsado,"' GROUP BY YEAR(data_publicacao_artigo),MONTH(data_publicacao_artigo) ORDER BY YEAR(data_publicacao_artigo),MONTH(data_publicacao_artigo);",sep="") -print(query) -rs = dbSendQuery(artigosDb,query) -dbRows<-dbFetch(rs) - -contagemArtigos<-c(as.integer(dbRows$count)) - -# Coloca os resultados da consulta numa série temporal. -qts2 = ts(contagemArtigos, frequency = 12, start = c(1914, 8)) - -# Adiciona esta linha com a série temporal qts2 à plotagem existente. -lines(qts2, lwd=3,col="darkgreen") - -# Limpa o resultado. 
-dbClearResult(rs) - -# Desconecta para limpar a conexão com o banco de dados. -dbDisconnect(artigosDb) -``` - -## Explicação do script de seleção de dados e criação do gráfico. - -O método que conecta o banco de dados é explicado [acima](#Conectando-a-um-banco-de-dados-com-uma-senha). - -Este script seleciona dois resultados de um conjunto de dados e cria um gráfico com esses dados. Um dos resultados é a combinação dos artigos de periódicos com a busca pelos termos "German+Submarine". Eles são consultados através da declaração SELECT: - -``` -SELECT ( - COUNT(CONCAT(MONTH(data_publicacao_artigo),' ',YEAR(data_publicacao_artigo)))) as 'count' - FROM tbl_periodicos_resultados_pesquisa - WHERE termo_busca_usado='",termoBuscaUsado,"' - GROUP BY YEAR(data_publicacao_artigo),MONTH(data_publicacao_artigo) - ORDER BY YEAR(data_publicacao_artigo),MONTH(data_publicacao_artigo); -``` - -| SQL | Significado | -| ----------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| SELECT ( | SELECT - Seleciona os dados que correspondem à condição na cláusula WHERE na tabela do banco de dados nomeado . | -| COUNT(CONCAT(MONTH(data_publicacao_artigo),' ',YEAR(data_publicacao_artigo)))) as 'count' | Fornece uma contagem do número de artigos publicados que compartilham o mesmo mês e ano de publicação. CONCAT representa a ação concatenar, que cria um único valor textual de dois ou mais valores textuais, nesse caso, o mês e o ano. | -| FROM tbl_periodicos_resultados_pesquisa | Este é o banco de dados a partir do qual estamos selecionando os dados. | -| GROUP BY YEAR(data_publicacao_artigo),MONTH(data_publicacao_artigo) | Esta declaração GROUP BY é importante para a contagem (COUNT) acima. 
Aqui os dados estão agrupados por mês e ano, para que seja possível contar todos os registros no grupo. | -| ORDER BY YEAR(data_publicacao_artigo),MONTH(data_publicacao_artigo); | Coloca os resultados ordenados por data, o que é útil já que queremos construir um gráfico por data. | - -As declarações abaixo executam a consulta e colocam o resultado *rs* num data frame *dbRows*: - -``` -rs = dbSendQuery(artigosDb,query) -dbRows<-dbFetch(rs) -``` - -Abaixo, o data frame *dbRows* é colocado numa série temporal com a função *ts()*, para que seja possível plotar para cada mês, iniciando de agosto de 1914. - -``` -# Coloca os resultados da consulta numa série temporal. -qts1 = ts(contagemArtigos, frequency = 12, start = c(1914, 8)) -``` - -Abaixo, os dados na série temporal *qts1* são plotados num gráfico: - -``` -plot(qts1, - lwd=3, - col = "red", - xlab="Mês da Guerra", - ylab="Números de artigos de periódicos", - xlim=c(1914,1919), - ylim=c(0,150), - main=paste("Número de artigos nos jornais galeses (Welsh newspapers) que correspondem aos termos de busca listados.",sep=""), - sub="Legenda do termo de busca: Vermelho = German+Submarine. Verde = Allotment And Garden.") -``` - -O que isso difere da parte do script que gera o gráfico dos artigos correspondentes à busca dos termos "Allotment And Garden"? Não muito, definitivamente. Apenas usamos a função *lines()* para plotar os resultados no mesmo gráfico que construímos acima. - -``` -lines(qts2, lwd=3,col="darkgreen") -``` - -### Resultados da seleção de dados e da criação do gráfico - -Aqui abaixo está o gráfico que deveria aparecer: - -{% include figure.html filename="introducao-ao-mysql-e-r-16.png" caption="Plotagem do número de artigos de periódicos publicados cada mês que correspondem aos termos de busca" %} - -# Indo mais longe com o MySQL - -Se deseja colocar um banco de dados num website, uma maneira de fazê-lo é usando MySQL e a linguagem PHP para construir as páginas do site. 
Um exemplo deste tipo de website é o que construí para [buscar edições do "the Equity newspaper"](https://perma.cc/237N-DD9E). O livro de Larry Ullman's, *PHP and MySQL for Dynamic Web Sites*, aborda como configurar e conectar um banco de dados usando MySQL e PHP de uma maneira resistente à hackers. - -Para exemplos do uso de SQL para ordenar e agrupar dados, assim com também realizar cálculos, veja: [MySQL by Examples for Beginners](http://web.archive.org/web/20171228130133/https://www.ntu.edu.sg/home/ehchua/programming/sql/MySQL_Beginner.html) ou MySQL [Examples of Common Queries](https://perma.cc/84HN-9DBL). - -# Conclusão - -Espero que tenha obtido o conhecimento para configurar uma tabela de banco de dados, conectá-lo e armazenar registros. Embora tenhamos abordado apenas uma pequena parte das diferentes maneiras de realizar consultas nos dados, espero também que saiba a técnica de uso das declarações SELECT para que possa utilizá-las em seus futuros projetos de história digital. - -# Créditos - -Finalizei esta lição graças ao suporte do [George Garth Graham Undergraduate Digital History Research Fellowship](https://perma.cc/S7PP-FY5U). - -Agradeço à Drª. Amanda Visconti pelo suporte e orientação ao longo da preparação desta lição. - -# Referências - -Ullman, L. 2005. *PHP and MySQL for Dynamic Web Sites, 2nd ed.* Berkeley, Calif: Peachpit. - -# Notas - -[^1]: Jason A. French, "Using R With MySQL Databases," blog (3 July 2014), [http://www.jason-french.com/blog/2014/07/03/using-r-with-mysql-databases/](https://perma.cc/5VYV-L5PG). - -[^2]: Taylor Arnold and Lauren Tilton, "Basic Text Processing in R," Programming Historian (27 March 2017), [tradução para português](/pt/licoes/processamento-basico-texto-r). - -[^3]: Taryn Dewar, "R Basics with Tabular Data," Programming Historian (05 September 2016), [tradução para português](/pt/licoes/nocoes-basicas-R-dados-tabulares). 
- -O script em R usado para recolher dados de amostra se encontra [aqui](https://perma.cc/87AE-LJRG). +--- +title: Introdução ao MySQL com R +layout: lesson +slug: introducao-mysql-r +authors: +- Jeff Blackadar +date: 2018-05-03 +translation_date: 2021-12-18 +editors: +- Amanda Visconti +reviewers: +- Jesse Sadler +- Simon Appleford +translator: +- Jéssica Evelyn Santos +translation-editor: +- Daniel Alves +translation-reviewer: +- Dália Guerreiro +- Leonardo F. Nascimento +difficulty: 2 +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/439 +collection: lessons +activity: transforming +topics: [data-manipulation, distant-reading, r, data-visualization] +abstract: "Esta lição ajudará a armazenar grandes quantidades de dados históricos de maneira estruturada, pesquisar e filtrar esses dados e visualizar alguns dos dados como um gráfico." +original: getting-started-with-mysql-using-r +avatar_alt: Uma mão a segurar um jornal +doi: 10.46430/phpt0025 +--- + +Esta lição é direcionada aos que desejam armazenar grandes quantidades de dados de projetos de história digital de uma forma estruturada. Usaremos um sistema de gerenciamento de dados chamado MySQL para armazenar os dados. + +A linguagem R permite realizar análises e armazenar dados sem que um banco de dados relacional seja utilizado. No entanto, há situações nas quais a inclusão de bancos de dados é muito útil, dentre elas: + +- Publicar os resultados de um script em R num *web site* com dados interativos +- Manipular mais dados do que o R pode armazenar em sua própria memória +- Quando os dados já estão armazenados num banco de dados relacional +- Trabalhar com dados de entidades diferentes que são relacionados uns com os outros. Um exemplo seria um banco de dados de soldados de dois exércitos distintos que lutaram numa batalha, onde gostaríamos de saber qual esquadrão, pelotão, companhia e brigada cada soldado fazia parte. 
+ +Uma breve discussão do tema pode ser encontrada no [*blog* de Jason A. French](https://perma.cc/5VYV-L5PG)[^1]. + +Ao final desta lição, será possível instalar um sistema de gerenciamento de banco de dados em seu computador, criar uma tabela de banco de dados, armazenar informações na tabela e realizar consultas dos dados. Na conclusão da lição, utilizaremos uma consulta do banco de dados para construir um gráfico. + +Usaremos a linguagem de programação R para os exemplos, mas as técnicas podem ser utilizadas com outras linguagens, como Python. + +Para fazer essa lição será necessário um computador com permissão para instalar os programas R e RStudio, entre outros, se já não estiverem instalados. Além da programação em R, também instalaremos alguns componentes de um sistema de gerenciamento de banco de dados chamado MySQL, que funciona nos sistemas operacionais Windows, Mac e Linux. + +Possuir algum conhecimento de instalação de programas e organização de dados em campos é útil para essa lição, cujo nível de dificuldade é mediano. + +{% include toc.html %} + +# Introdução + +O MySQL é um banco de dados relacional usado para armazenar e consultar informações. Esta lição utilizará a linguagem R para fornecer um tutorial e exemplos para: + +- Configurar e realizar uma conexão a uma tabela no MySQL +- Armazenar registros em tabelas +- Consultar informações de tabelas + +Neste tutorial, construiremos um banco de dados de artigos de periódicos que contém palavras de uma busca numa hemeroteca digital. O script armazenará o título, a data publicada e a URL de cada artigo num banco de dados. Utilizaremos outro script para realizar consultas no banco de dados e procurar por padrões historicamente relevantes. Os dados de amostra serão fornecidos pelo arquivo de periódicos [Welsh Newspapers Online](https://perma.cc/9EHD-EVEX). Estamos trabalhando com o objetivo de produzir uma lista de artigos à qual possamos consultar informações. 
Ao final da lição, vamos executar uma consulta para gerar um gráfico do número de artigos de periódicos no banco de dados, para verificar se há um padrão relevante. + +# Programas necessários + +R, R Studio, MySQL Server e MySQL Workbench são os programas necessários para esta lição. Algumas notas sobre a instalação desses pacotes de programas podem ser encontradas abaixo. + +## R + +Na lição [Processamento Básico de Texto em R](/pt/licoes/processamento-basico-texto-r)[^2], Taylor Arnold e Lauren Tilton fornecem um resumo excelente do conhecimento da linguagem R necessária para esta lição. Apenas um conhecimento básico de R é esperado. A lição [Noções básicas de R com dados tabulares](/pt/licoes/nocoes-basicas-R-dados-tabulares), de Taryn Dewar,[^3] aborda como instalar o R e se familiarizar com a linguagem. + +### Download do R + +Você pode realizar o download do R no [Comprehensive R Archive Network](https://cran.r-project.org/). Clique no link que corresponde ao sistema operacional do seu computador. Selecione *base* para instalar o R pela primeira vez. Uma vez que o ficheiro foi baixado, clique no ficheiro para executar o instalador. + +## RStudio + +Os exemplos desta lição utilizam o RStudio, que é uma interface de desenvolvimento para escrever e executar scripts em R. Esta lição usou a versão 1.4.1717 do RStudio. + +### Download do RStudio + +Faça o download do RStudio através do [rstudio.com](https://www.rstudio.com/products/rstudio/#Desktop) e instale-o. Já que o RStudio é de código aberto, você pode selecionar a versão gratuita do RStudio Desktop, rolar a página para baixo e clicar num dos instaladores que corresponda ao sistema operacional de seu computador. Uma vez que o download foi realizado, clique no ficheiro para executar o instalador. + +## MySQL + +SQL significa *Structured Query Language* (Linguagem estruturada de consulta), que é um conjunto de comandos para armazenar e recuperar informações a partir de um banco de dados relacional. 
MySQL é um tipo de sistema de gerenciamento de banco de dados relacionais. Há muitos outros, como Microsoft SQL Server, IBM DB2 e Microsoft Access. Esta lição utiliza o MySQL porque é um programa de código aberto, utilizado por uma grande comunidade, tem uma longa trajetória e possui uma versão gratuita que pode ser utilizada. + +### Realizando o download e instalando o MySQL + +Nesta seção, iremos instalar o MySQL, que é o sistema que mantém o banco de dados, assim como o MySQL Workbench, que é onde se trabalha para configurar a estrutura do banco de dados. Para usar o MySQL,o MySQL Workbench não é necessário, podem ser utilizados apenas comandos digitados. Esta lição utiliza o MySQL Workbench porque é uma *GUI* (Interface gráfica do usuário) que facilita o aprendizado de MySQL. + +Conclua essas instruções para instalar o MySQL Community Server e o MySQL Workbench em seu computador. + +### MySQL Community Server + +Este é o servidor onde o banco de dados é armazenado. Sua instalação é necessária para que seja possível conectar e armazenar os dados. Abaixo, faremos o download dos ficheiros, a instalação e iniciaremos o servidor. Esta lição utilizou a versão 8.0.21 do MySQL e 8.0.26 do MySQL Workbench. + +#### Fazendo o download do ficheiro de instalação do MySQL Community Server + +Clique neste link: [https://dev.mysql.com/downloads/mysql/](https://dev.mysql.com/downloads/mysql/). Role a página para baixo e selecione o sistema operacional que corresponde ao seu computador. Se necessário, clique em **Select Operating System** para selecionar o sistema operacional. Uma vez feita essa operação, clique no botão azul **Go to Download Page**. Depois clique no botão azul **Download**. Na página de download, role para baixo e terá a opção de começar o download clicando em **No thanks, just start my download** (Não, obrigado, apenas inicie o download). 
+ +#### Instalação do MySQL Community Server + +Abaixo se encontram as dicas de instalação para PC e Mac: + +##### Dicas de instalação para PC + +A maneira recomendada de instalar os componentes do MySQL é através do instalador do MySQL para Windows. Com o ficheiro já baixado, clique duas vezes no ficheiro para instalá-lo. Siga as instruções para aceitar a licença (nota de tradução: com o instalador MySQL para Windows pode optar por fazer de uma vez só a instalação do MySQL Server e do MySQL Workbench; para isso, escolha os respectivos componentes e siga as instruções abaixo). +Depois que os componentes forem instalados, serão solicitadas as seguintes opções: + +###### 1. Escolhendo um tipo de configuração + +Selecione: **Developer Default** (Padrão do desenvolvedor). Esta opção *instala o MySQL Server e as ferramentas necessárias para o desenvolvimento da aplicação. Isto é útil se pretendes desenvolver aplicações para um servidor existente.* +(Ver abaixo) + +{% include figure.html filename="introducao-ao-mysql-e-r-1.PNG" caption="Configure o tipo de padrão do desenvolvedor" %} + +###### 2. Verificar Requisitos + +Clique no botão **Execute** caso haja requisitos pendentes (*failing requirements*) listados na checagem de requisitos. A lista de requisitos pode ser diferente da mostrada aqui. Uma vez concluído o processo de instalação dos requisitos pendentes, clique no botão *Next*. +(Ver abaixo) + +{% include figure.html filename="introducao-ao-mysql-e-r-2.PNG" caption="Clique no botão *Execute* se necessário" %} + +###### 3. Tipo e Rede (1) + +Selecione: **Standalone MySQL Server** +(Ver abaixo) + +{% include figure.html filename="getting-started-with-mysql-7.png" caption="Select Standalone MySQL Server" %} + +###### 4. Tipo e Rede (2) + +*Config type*: Selecione: **Development Computer** +Checar: TCP/IP. Port number (Número da porta): 3306. 
+(Ver abaixo) + +{% include figure.html filename="introducao-ao-mysql-e-r-4.png" caption="Development Computer TCPIP Port 3306" %} + +###### 5. Contas e Funções + +{% include figure.html filename="introducao-ao-mysql-e-r-5.png" caption="Digite a senha *root* e depois guarde-a em local seguro" %} + +###### 6. Serviço do Windows + +As configurações aqui são opcionais, mas achamos mais fácil configurar o MySQL como um serviço do Windows e inclui-lo na inicialização automática. Um serviço do Windows é um processo que é executado no computador enquanto se está trabalhando. É possível mudar as configurações do serviço do Windows posteriormente, para iniciar o MySQL manualmente, para impedir que o programa inicialize quando não for necessário. + +{% include figure.html filename="introducao-ao-mysql-e-r-6.png" caption="MySQL como um serviço do Windows" %} + +Clique nos botões *Execute* e *Next* para finalizar a instalação e inicializar o servidor. + +###### 7. MySQL Workbench e Senha Root + +Procure por MySQL Workbench no menu de inicialização do Windows, sob o item MySQL. Se está lá, clique para iniciar. Caso não esteja, clique no instalador do MySQL - Community para executar novamente a instalação e adicionar o MySQL Workbench aos componentes instalados. +Depois de aberto o MySQL Workbench, clique na instância local do seu MySQL Server. +Quando a senha *root* for solicitada, digite a senha criada na etapa *5. Accounts and Roles*. +(Ver abaixo) + +{% include figure.html filename="introducao-ao-mysql-e-r-7.png" caption="Senha Root" %} + +##### Dicas de instalação para um Mac + +###### 1. Instalação do MySQL Community Server + +Com o ficheiro de instalação do MySQL Community Server baixado, clique duas vezes no ficheiro para instalá-lo. (Ver abaixo) + +{% include figure.html filename="introducao-ao-mysql-e-r-8.png" caption="Ficheiro de instalação" %} + +###### 2. Guarde a senha temporária + +Siga as instruções para aceitar a licença e o local de instalação. 
**Importante: Uma senha temporária será solicitada. Guarde-a cuidadosamente.** (Veja o exemplo abaixo. Sua senha temporária será diferente da mostrada abaixo.) Se um erro for cometido, é possível remover o servidor instalado e reinstalá-lo, mas essa é uma pequena complicação. Um dos revisores dessa lição achou que [essa resposta do StackOverflow](https://perma.cc/J4Q5-SLK5) pode auxiliar nesta etapa. + +{% include figure.html filename="getting-started-with-mysql-18.png" caption="Senha temporária" %} + +Concluída a instalação, iremos alterar a senha *root* para o servidor do MySQL. + +###### 3. Modifique a senha do servidor do MySQL + +**Esta seção da lição causou dificuldade para algumas pessoas. Leve o tempo que for necessário e note, por favor, que os comandos do MySQL terminam com um ponto e vírgula. Observe-os em alguns dos comandos abaixo.** + +3.1. Abra uma janela do terminal + +3.2. Adicione /usr/local/mysql/bin ao PATH através do comando abaixo. O PATH é uma lista de diretórios que o computador considera quando um comando é digitado para executar um programa. No próximo passo abaixo, ao executar o *mysql*, o PATH busca pelos diretórios que contém o programa *mysql*. O PATH procura pelo *mysql* no diretório */usr/local/mysql/bin* e o executa. O PATH apenas salva o caminho completo que for digitado, nesse caso, */usr/local/mysql/bin/mysql*, para um programa quando se quer executá-lo. + +``` +export PATH=${PATH}:/usr/local/mysql/bin +``` + +3.3. Inicie o servidor do MySQL. + +Vá até System Preferences > imagem do MySQL > clique em "Start MySQL server". + +3.4. Inicie uma sessão no MySQL. No comando abaixo, depois de *--password*, digite a senha guardada no passo *2. Guarde a senha temporária*. + +``` +mysql --user=root --password=senha_root_guardada_acima +``` + +3.5. Configure a senha *root* para uma **nova** senha. Escolha e guarde a nova senha cuidadosamente. 
No *prompt* mysql> , digite o seguinte comando, substituindo a nova senha entre aspas simples no comando SET PASSWORD=PASSWORD('password') com a nova senha criada. + +``` +SET PASSWORD=PASSWORD('nova_senha_criada_na_etapa_3.5'); +``` + +3.6. Reinicie o computador. Depois de reiniciar, é possível que seja necessário repetir a etapa *3.3 Inicie o servidor do MySQL* acima. + +###### 4. Download do MySQL Workbench + +Clique nesse link: [https://dev.mysql.com/downloads/workbench/](https://dev.mysql.com/downloads/workbench/). Role a página para baixo e clique em **Select Operating System** para selecionar o sistema operacional que corresponde ao seu computador. Se necessário, clique em **Select OS Version** para selecionar a versão do sistema operacional. Feito isso, clique no botão azul de **Download**. Na página de download, role para baixo e terá a opção de iniciar o download ao clicar em **No thanks, just start my download.** (Não, obrigado, apenas inicie o download.) + +Com o ficheiro baixado, clique duas vezes para instalá-lo. Feita a instalação do MySQL Workbench de acordo com as instruções na tela, arraste o ícone para a pasta de aplicações da esquerda. (Ver abaixo) + +{% include figure.html filename="introducao-ao-mysql-e-r-10.png" caption="MySQL Workbench" %} + +# Crie um banco de dados + +Aqui iremos criar um banco de dados que serve como um contentor para as tabelas nas quais armazenaremos informações. Uma tabela é a estrutura que mantém os dados que queremos armazenar. Tabelas contém muitas linhas de registros. Um exemplo de informações básicas de contato conteria campos para nome, número de telefone e endereço de e-mail. Numa tabela, os campos são organizados por *colunas*. 
+ +Aqui está uma tabela de amostra com uma linha de dados que representa um registro: + +| nome | número de telefone | endereço de e-mail | +| ----------- | ------------------ | ------------------ | +| Pat Abraham | 613-555-1212 | pat@zmail.ca | + +## Abra o MySQL Workbench + +Abra o MySQL Workbench. Clique duas vezes em *Local Instance MySQL80* (num Mac isto pode aparecer como *Local Instance 3306*). É possível que a senha *root* criada nas etapas acima seja solicitada. Em alguns Macs, uma aba de *Query* será aberta; se não for, abra uma aba de *Query* utilizando: *File > New Query Tab*. + +## Crie um banco de dados + +Agora iremos criar um novo banco de dados. Utilizando o MySQL Workbench, realize os seguintes passos: + +1. Na janela de **Query**, digite: + + ``` + CREATE DATABASE periodicos_resultados_pesquisa; + ``` + +2. Execute o comando CREATE DATABASE. Clique no **relâmpago/raio** ou, utilizando o menu, clique em *Query* e então em *Execute Current Statement*. + +3. O novo banco de dados **periodicos_resultados_pesquisa** deve estar visível na aba **SCHEMAS**, no canto superior esquerdo da tela. Se não conseguir visualizar um item chamado periodicos_resultados_pesquisa, clique no botão de atualizar. + +(Ver abaixo:) + +{% include figure.html filename="introducao-ao-mysql-e-r-11.png" caption="Crie um banco de dados no MySQL Workbench" %} + +## USE o banco de dados + +Em seguida, iremos inserir uma declaração USE para informar ao MySQL qual banco de dados será usado. Isto se torna mais importante quando se tem mais de um banco de dados no computador. + +Na janela de **Query**, apague todo o comando CREATE DATABASE e digite: + +``` +USE periodicos_resultados_pesquisa; +``` + +Novamente, clique no **relâmpago/raio** ou, usando o menu, clique em *Query* e então em *Execute Current Statement*. É possível usar a tecla de teclado para isso. Num Mac, use *Command+Return*. Num PC, use *Ctrl+Shift+Enter*. 
A partir desse ponto da lição, todas as vezes que um comando for digitado na janela de *Query* será executado desta maneira. + +(Ver abaixo:) + +{% include figure.html filename="introducao-ao-mysql-e-r-12.png" caption="USE um banco de dados no MySQL Workbench" %} + +# Adicione uma tabela + +1. No MySQL Workbench, procure no lado esquerdo no painel **Navigator**, na aba **SCHEMAS**, por **periodicos_resultados_pesquisa**. +2. Clique em **Tables** com o lado direito do mouse e depois clique em **Create Table**. +3. Para **Table Name:** digite **tbl_periodicos_resultados_pesquisa** + +## Adicione colunas à tabela + +Adicione essas colunas: + +1. **id** Data type: **INT**. Clique PK (Primary Key), NN (Not Null) e AI (Auto Increment). Esta coluna de *id* será usada para relacionar registros nesta tabela com registros em outras tabelas. +2. **titulo_artigo** Data type: **VARCHAR(99)**. Esta coluna armazenará o título de cada resultado de artigo que coletarmos da busca. +3. **data_publicacao_artigo** Data type: **DATETIME**. Esta coluna armazenará a data em que o periódico foi publicado. +4. **url_artigo** Data type: **VARCHAR(99)**. Esta coluna armazenará a url de cada resultado que coletarmos da pesquisa. +5. **termo_busca_usado** Data type: **VARCHAR(45)**. Esta coluna irá armazenar a palavra que usamos para buscar os periódicos. + Clique no botão **Apply**. + +Se preferir, todas as etapas acima podem ser realizadas com um comando. Este comando pode ser executado na janela de *Query* para criar a tabela com as colunas indicadas acima. 
+ +``` +CREATE TABLE periodicos_resultados_pesquisa.tbl_periodicos_resultados_pesquisa ( +id INT NOT NULL AUTO_INCREMENT, +titulo_artigo VARCHAR(99) NULL, +data_publicacao_artigo DATETIME NULL, +url_artigo VARCHAR(99) NULL, +termo_busca_usado VARCHAR(45) NULL, +PRIMARY KEY (id)); +``` + +*Dica: Leve o tempo que for necessário para pensar sobre a elaboração da tabela e sua nomeação, uma vez que um banco de dados bem elaborado será mais fácil de trabalhar e entender.* + +## Adicione um usuário para se conectar ao banco de dados + +Um usuário é uma conta que tem permissão para se conectar a um banco de dados. Abaixo, adicionaremos um novo usuário para que essa conta apenas se conecte a esse novo banco de dados. Usar essa conta de usuário para uma conexão com esse banco de dados limita a exposição a outros bancos de dados, caso a senha para este usuário seja comprometida. Dar ao usuário os privilégios mínimos requeridos para realizar o necessário reduz o risco, caso outra pessoa tiver acesso à senha de usuário. Por exemplo, se um usuário pode apenas ler um banco de dados, é um risco menor se a senha for descoberta do que um usuário que também pode alterar ou apagar o banco de dados. + +No menu do MySQL Workbench, clique em **Server** e depois em **Users and Privileges** + +**Usuários de Mac** - Em alguns computadores Mac, como meu laptop de teste, o painel de **Schema Privileges** não é exibido corretamente. Veja a nota abaixo da captura de tela se isso ocorrer. + +Clique no botão **Add Account** e complete os detalhes para a nova conta de usuário na caixa de diálogo: + +1. Login Name: **periodicos_pesquisa_usuario** +2. Authentication Type: selecione **Standard** +3. Limit to Hosts Matching: **localhost** +4. Tecle *Enter* e confirme uma senha *AlgoDificil* +5. Clique na aba **Administrative Roles**. Certifique-se de que nada está marcado. Esta conta é apenas para acessar o banco de dados. +6. Clique na aba **Schema Privileges** e clique **Add Entry**. +7. 
Na caixa de diálogo **New Schema Privilege Definition**, clique na caixa de seleção **Selected schema:** e selecione **periodicos_resultados_pesquisa**. Clique OK. +8. Clique em todas as opções de *Object Rights*: SELECT, INSERT, UPDATE, DELETE, EXECUTE, SHOW VIEW, como mostrado na imagem abaixo. (Este usuário precisará fazer muitas coisas posteriormente na lição, por isso, estamos lhe concedendo várias permissões.) +9. Clique em **Apply**. + +{% include figure.html filename="introducao-ao-mysql-e-r-13.PNG" caption="Configurando permissões para a nova conta" %} + +### Schema Privileges não exibidos corretamente + +Alguns computadores Mac, como meu laptop de teste, não exibem corretamente o painel de **Schema Privileges**. Nesse caso, é possível realizar a tarefa acima através de um *script* usando uma janela de Query. + +Se o usuário já foi criado acima, execute o seguinte comando para lhe conceder privilégios de usuário: + +``` +GRANT SELECT, INSERT, UPDATE, DELETE, EXECUTE, SHOW VIEW ON periodicos_resultados_pesquisa.* TO 'periodicos_pesquisa_usuario'@'localhost'; +``` + +Se o usuário não foi criado ainda, execute estes dois comandos para criar um usuário e depois lhe conceder privilégios de usuário: + +``` +CREATE USER 'periodicos_pesquisa_usuario'@'localhost' IDENTIFIED BY 'AlgoDificil'; +GRANT SELECT, INSERT, UPDATE, DELETE, EXECUTE, SHOW VIEW ON periodicos_resultados_pesquisa.* TO 'periodicos_pesquisa_usuario'@'localhost'; +``` + +### MySQL versão 8 e tipo de autenticação de usuário. + +Quando um usuário é criado no MySQL 8 Workbench o **Authentication Type** (tipo de autenticação) é configurado para o padrão **caching_sha2_password**. Esse tipo de autenticação causa um erro para o pacote R que usaremos para conectar o banco de dados mais tarde nesta lição. O erro é *Authentication plugin 'caching_sha2_password' cannot be loaded* e é descrito no [Stack Overflow](https://perma.cc/7NVR-TSYT). 
+ +Para evitar esse erro, podemos modificar o tipo de autenticação do usuário para padrão (Standard). Para fazer isso, execute o seguinte comando na janela de *Query*: + +``` +ALTER USER 'periodicos_pesquisa_usuario'@'localhost' IDENTIFIED WITH mysql_native_password BY 'AlgoDificil'; +``` + +# Crie um R Script que se conecte ao banco de dados + +Abra o RStudio, que foi instalado anteriormente na lição. Veja a seção [RStudio](#rstudio). + +Agora usaremos o RStudio para escrever um novo R Script e salvá-lo com o nome periodicos_resultados_pesquisa.R. + +Vá em File > New File > R Script e depois salve o novo ficheiro com o nome periodicos_resultados_pesquisa.R. + +Usaremos o pacote RMariaDB para realizar a conexão com o MySQL. (Se tiver curiosidade, a documentação para o pacote RMariaDB pode ser encontrada [aqui](https://perma.cc/FX5P-DAW7).) + +Se não possui o pacote RMariaDB instalado (o que é provável, caso seja a primeira vez que usa o RStudio), instale-o utilizando o _console_ do RStudio. Após abrir o RStudio, copie e cole o seguinte para a janela da esquerda no >, e depois dê enter: + +``` +install.packages("RMariaDB") +``` + +Adicione o seguinte comando ao script periodicos_resultados_pesquisa.R (janela de cima, à esquerda) + +``` +library(RMariaDB) +``` + +## Conectando a um banco de dados com uma senha + +Primeiro, nos conectaremos ao banco de dados usando uma senha. (Depois utilizaremos um meio de conexão melhor). Por hora, usaremos uma variável para armazenar a senha. Cada vez que iniciar o R, será necessário apagar esta variável, mas isso é melhor do que publicar uma senha *hardcoded* caso compartilhe seus scripts, como pode fazer usando o GitHub. + +No console do RStudio, digite o comando abaixo, substituindo *AlgoDificil* com a senha criada para periodicos_pesquisa_usuario nos passos realizados acima para adicionar um usuário ao banco de dados. 
+ +``` +senhadeusuariolocal <- "AlgoDificil" +``` + +Adicione as seguintes declarações em R ao ficheiro periodicos_resultados_pesquisa.R e salve-o. + +Para executar este script, selecione todo o texto e clique no botão *Run* (Executar). (Há outras maneiras de executar apenas uma parte do script ou o script inteiro. Se tiver curiosidade, procure no menu abaixo de Code > Run Region. O comando CTRL+ALT+R executa todo o código em R no script.) + +``` +library(RMariaDB) +# O método de conexão abaixo utiliza uma senha armazenada numa variável. +# Para utilizar isto, configure senhadeusuariolocal="A senha de periodicos_pesquisa_usuario" + +artigosDb <- dbConnect(RMariaDB::MariaDB(), user='periodicos_pesquisa_usuario', password=senhadeusuariolocal, dbname='periodicos_resultados_pesquisa', host='localhost') +dbListTables(artigosDb) +dbDisconnect(artigosDb) +``` + +No console, deverá visualizar: + +``` +> dbListTables(artigosDb) +[1] "tbl_periodicos_resultados_pesquisa" +> dbDisconnect(artigosDb) +``` + +Sucesso! O que conseguiu: + +1. Conectar ao banco de dados com dbConnect. +2. Listar a tabela no banco de dados com dbListTables. +3. Desconectar do banco de dados usando dbDisconnect. + +### Conectar-se ao banco de dados com uma senha armazenada num ficheiro de configuração + +O exemplo acima de conexão é uma das maneiras de conectar-se. O método de conexão descrito abaixo armazena a informação da conexão do banco de dados num ficheiro de configuração, para que não seja necessário digitar uma senha numa variável todas as vezes que uma sessão no R for iniciada. Acredito que esse é um processo minucioso, mas é uma maneira mais padronizada e segura de proteger as credenciais usadas para acessar seu banco de dados. Esse método de conexão será usado no código para o restante desse tutorial, mas pode ser substituído pelo método de conexão mais simples mostrado acima se preferir. 
+ +#### Crie o ficheiro .cnf para armazenar a informação de conexão com o banco de dados MySQL + +1. Abra um editor de texto, como o notepad, nano ou TextEdit e cole os itens abaixo, modificando a senha para a criada para periodicos_pesquisa_usuario nas etapas acima para adicionar um usuário e conectá-lo ao banco de dados. + +``` +[periodicos_resultados_pesquisa] +user=periodicos_pesquisa_usuario +password=AlgoDificil +host=127.0.0.1 +port=3306 +database=periodicos_resultados_pesquisa +``` + +2. Salve este ficheiro em algum local fora do diretório de trabalho do R. Salvei o meu no mesmo diretório de outros ficheiros de configuração do MySQL. No PC, o caminho foi o seguinte: C:\Program Files\MySQL\MySQL Server 8.0. Dependendo de seu sistema operacional e da versão do MySQL, esse local pode estar em outro lugar. No Mac, usei /Users/blackadar/Documents/ como a pasta de destino. Testei colocar este ficheiro em lugares diferentes, apenas é necessário que o R possa localizá-lo quando o script for executado. Nomeie o ficheiro como **periodicos_resultados_pesquisa.cnf**. + +3. Atualize o script periodicos_resultados_pesquisa.R acima para conectar-se ao banco de dados usando o ficheiro de configuração. + +``` +library(RMariaDB) +# O método de conexão abaixo utiliza uma senha armazenada num ficheiro de configuração. + +# O R precisa de um caminho completo para encontrar o ficheiro de configuração. +rmariadb.settingsfile<-"C:/Program Files/MySQL/MySQL Server 8.0/periodicos_resultados_pesquisa.cnf" + +rmariadb.db<-"periodicos_resultados_pesquisa" +artigosDb<-dbConnect(RMariaDB::MariaDB(),default.file=rmariadb.settingsfile,group=rmariadb.db) + +# Lista a tabela. Isso confirma que estamos conectados ao banco de dados. +dbListTables(artigosDb) + +# Desconecta para limpar a conexão com o banco de dados. +dbDisconnect(artigosDb) +``` + +4. Execute seu script. 
+ +No console, entre outras linhas, deverá ver novamente: + +``` +> dbListTables(artigosDb) +[1] "tbl_periodicos_resultados_pesquisa" +``` + +De maneira bem sucedida, a conexão com o banco de dados foi realizada utilizando um ficheiro de configuração. + +# Armazenando dados numa tabela com o SQL + +Nesta seção da lição, criaremos uma declaração no SQL para inserir uma linha de dados no banco de dados sobre esse [artigo de periódico](https://perma.cc/C8MR-WYV2). Iremos inserir primeiro o registro no MySQL Workbench e depois faremos isso no R. + +1. No MySQL Workbench, clique na imagem categorizada como SQL+ para criar uma nova aba para o SQL executar consultas (ou vá ao menu "File" e escolha a opção "New Query Tab"). + +2. Cole a declaração abaixo na janela de Query. Esta ação irá inserir um registro na tabela. + + ``` + INSERT INTO tbl_periodicos_resultados_pesquisa ( + titulo_artigo, + data_publicacao_artigo, + url_artigo, + termo_busca_usado) + VALUES('THE LOST LUSITANIA.', + '1915-05-21', + LEFT(RTRIM('http://newspapers.library.wales/view/4121281/4121288/94/'),99), + 'German+Submarine'); + ``` + +3. Clique na imagem de relâmpago/raio na aba do SQL para executar a declaração SQL. 
+ +{% include figure.html filename="introducao-ao-mysql-e-r-14.png" caption="Inserindo um registro numa tabela usando MySQL Workbench" %} + +## Explicação da declaração INSERT + +| SQL | Significado | +| --------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| INSERT INTO tbl_periodicos_resultados_pesquisa ( | Insere um registro na tabela nomeada tbl_periodicos_resultados_pesquisa | +| titulo_artigo, | nome do campo a ser preenchido por um valor | +| data_publicacao_artigo, | " | +| url_artigo, | " | +| termo_busca_usado) | " | +| VALUES('THE LOST LUSITANIA.', | O valor a ser inserido no campo titulo_artigo | +| '1915-05-21', | campo data_publicacao_artigo | +| LEFT(RTRIM('https://newspapers.library.wales/view/4121281/4121288/94/'),99), | campo url_artigo. Este campo é um VARCHAR(99), portanto tem um máximo de 99 caracteres. Inserir uma URL mais longa que 99 caracteres causaria um erro, portanto, duas funções são utilizadas para controlar isso. RTRIM() reduz espaços residuais à direita da URL. LEFT(value,99) retorna apenas os 99 caracteres mais à esquerda da URL reduzida. Esta URL é mais curta que isso, então essas funções estão aqui apenas como exemplo. | +| 'German+Submarine'); | campo termo_busca_usado | + +Opcional: Modifique a declaração INSERT acima e execute-a algumas vezes. 
Por exemplo: + +``` +INSERT INTO tbl_periodicos_resultados_pesquisa ( +titulo_artigo, +data_publicacao_artigo, +url_artigo, +termo_busca_usado) +VALUES('test insert.', +'1916-07-01', +LEFT(RTRIM('http://newspapers.library.wales/view/4121281/4121288/94/'),99), +'German+Submarine'); +``` + +## Consultando dados numa tabela com o SQL + +Nesta seção da lição, criaremos uma declaração no SQL para selecionar uma linha de dados do banco de dados que inserimos. Selecionaremos o primeiro registro no MySQL Workbench e depois faremos isso no R. + +1. Cole a declaração abaixo numa janela de query no MySQL Workbench. Isto irá selecionar registros da tabela. + + ``` + SELECT titulo_artigo FROM tbl_periodicos_resultados_pesquisa; + ``` + +2. Clique na imagem de relâmpago/raio na aba do SQL para executá-la. Deverá visualizar o título do artigo "THE LOST LUSITANIA." na grade de resultados. Ver abaixo. + +{% include figure.html filename="introducao-ao-mysql-e-r-15.png" caption="Selecionando registros de uma tabela usando MySQL Workbench" %} + +Opcional: Modifique a declaração SELECT acima alterando os campos selecionados e execute novamente. Adicione mais de um campo para a declaração SELECT e execute: + +``` +SELECT titulo_artigo, data_publicacao_artigo FROM tbl_periodicos_resultados_pesquisa; +``` + +## Armazenando dados numa tabela com SQL usando R + +Vamos fazer isso usando R! Abaixo se encontra uma versão expandida do R Script que usamos para nos conectar ao banco de dados. Para sermos concisos, os três primeiros comentários que tínhamos no R Script mostrado acima foram removidos. Não são mais necessários. + +Na linha 4 do script abaixo, lembre-se de modificar o caminho do rmariadb.settingsfile que corresponde à localização desse ficheiro em seu computador. + +``` +library(RMariaDB) +# O método de conexão abaixo utiliza uma senha armazenada num ficheiro de configuração. + +# O R precisa de um caminho completo para encontrar o ficheiro de configuração. 
+rmariadb.settingsfile<-"C:/Program Files/MySQL/MySQL Server 8.0/periodicos_resultados_pesquisa.cnf" + +rmariadb.db<-"periodicos_resultados_pesquisa" +artigosDb<-dbConnect(RMariaDB::MariaDB(),default.file=rmariadb.settingsfile,group=rmariadb.db) + +# Opcional. Liste a tabela. Isso confirma que nos conectamos ao banco de dados. +dbListTables(artigosDb) + +# Cria a declaração de query. +query<-"INSERT INTO tbl_periodicos_resultados_pesquisa ( +titulo_artigo, +data_publicacao_artigo, +url_artigo, +termo_busca_usado) +VALUES('THE LOST LUSITANIA.', +'1915-05-21', +LEFT(RTRIM('http://newspapers.library.wales/view/4121281/4121288/94/'),99), +'German+Submarine');" + +# Opcional. Exibe o query para o caso de ser necessário solucionar problemas. +print(query) + +# Executa o query no artigosDb que conectamos acima. +rsInsert <- dbSendQuery(artigosDb, query) + +# Limpa o resultado. +dbClearResult(rsInsert) + +# Desconecta para limpar a conexão com o banco de dados. +dbDisconnect(artigosDb) +``` + +No script acima, realizamos duas etapas para inserir um registro: + +1. Defina a declaração INSERT na linha com: query <- "INSERT INTO tbl_periodicos_resultados_pesquisa ( +2. Execute a declaração INSERT armazenada na variável da consulta com: rsInsert <- dbSendQuery(artigosDb, query) + +Execute o script acima no R Studio e depois execute uma declaração SELECT no MySQL Workbench. Consegue visualizar o novo registro adicionado? + +### Realize uma limpeza nos dados de teste + +Neste ponto é provável que haja mais de um registro com o título de artigo "THE LOST LUSITANIA.", o que é razoável para a testagem, mas não queremos dados duplicados. Iremos remover os dados de teste e começar novamente. 
Usando a janela de query no MySQL Workbench, execute a declaração SQL: + +``` +TRUNCATE tbl_periodicos_resultados_pesquisa; +``` + +No painel Action Output do MySQL Workbench deverá visualizar: + +``` +TRUNCATE tbl_periodicos_resultados_pesquisa; 0 row(s) affected 0.093 sec +``` + +Para praticar o que acabamos de fazer: + +1. Execute uma declaração SELECT novamente. Não deverá receber linhas de retorno. +2. Execute novamente o script em R acima para inserir um registro. +3. Realize uma declaração SELECT. Deverás visualizar uma linha de dados. + +### Modifique a declaração INSERT para usar variáveis + +Iremos inserir muitos dados na tabela usando o R, então mudaremos a declaração INSERT para usar variáveis. Veja no código abaixo o destaque *# Compila o query.* + +``` +library(RMariaDB) +# O método de conexão abaixo utiliza uma senha armazenada num ficheiro de configuração. + +# O R precisa de um caminho completo para encontrar o ficheiro de configuração. +rmariadb.settingsfile<-"C:/Program Files/MySQL/MySQL Server 8.0/periodicos_resultados_pesquisa.cnf" + +rmariadb.db<-"periodicos_resultados_pesquisa" +artigosDb<-dbConnect(RMariaDB::MariaDB(),default.file=rmariadb.settingsfile,group=rmariadb.db) + +# Opcional. Lista a tabela. Isso confirma que nos conectamos ao banco de dados. +dbListTables(artigosDb) + +# Compila o query. + +# Atribui variáveis. +entradaTitulo <- "THE LOST LUSITANIA." +entradaPublicacao <- "21 05 1916" +# Converte o valor da string para uma data para armazená-la no banco de dados. +entradaDataPublicacao <- as.Date(entradaPublicacao, "%d %m %Y") +entradaUrl <- "http://newspapers.library.wales/view/4121281/4121288/94/" +buscaSimplesTermos <- "German+Submarine" + +# Cria a declaração de query. 
+query<-paste( + "INSERT INTO tbl_periodicos_resultados_pesquisa ( + titulo_artigo, + data_publicacao_artigo, + url_artigo, + termo_busca_usado) + VALUES('",entradaTitulo,"', + '",entradaDataPublicacao,"', + LEFT(RTRIM('",entradaUrl,"'),99), + '",buscaSimplesTermos,"')", + sep = '' +) + +# Opcional. Exibe o query para o caso de ser necessário solucionar problemas. +print(query) + +# Executa o query no banco de dados artigosDb que conectamos acima. +rsInsert <- dbSendQuery(artigosDb, query) + +# Limpa o resultado. +dbClearResult(rsInsert) + +# Desconecta para limpar a conexão com o banco de dados. +dbDisconnect(artigosDb) +``` + +Vamos testar esse script: + +1. Execute uma declaração SELECT e observe as linhas que possui. +2. Execute o script em R acima para inserir outro registro. +3. Realize a declaração SELECT. Deverá visualizar uma linha adicional de dados. + +### Erros do SQL + +Vamos criar um simples erro no SQL para visualizar o que acontece. + +No R, modifique: + +``` +entradaTitulo <- "THE LOST LUSITANIA." +``` + +para + +``` +entradaTitulo <- "THE LOST LUSITANIA'S RUDDER." +``` + +e execute novamente o script. + +No console R, há um erro: + +``` +> rsInsert <- dbSendQuery(artigosDb, query) +Error in result_create(conn@ptr, statement, is_statement) : + You have an error in your SQL syntax; check the manual that corresponds to your MySQL server version for the right syntax to use near 'S RUDDER.', + '1916-05-21', + LEFT(RTRIM('http://newspapers.library.wales/view/4' at line 6 [1064] +``` + +É possível verificar, com uma declaração SELECT, se não há registro na tabela com um título de artigo denominado *THE LOST LUSITANIA'S RUDDER*. + +As aspas simples fazem parte da sintaxe do SQL e indicam uma entrada textual. Se estiverem no lugar errado, provocam um erro. Temos que lidar com casos nos quais há dados com aspas. O SQL aceita duas aspas numa declaração de inserção para representar aspas em dados(''). 
+ +Lidaremos com as aspas utilizando uma função `gsub` para substituir cada aspa simples por duas aspas simples (''), como mostrado abaixo. + +``` +entradaTitulo <- "THE LOST LUSITANIA'S RUDDER." +# substitui cada aspa simples por duas aspas simples +entradaTitulo <- gsub("'", "''", entradaTitulo) +``` + +Agora que a questão das aspas no título do artigo está resolvida, execute novamente o script e depois confira com uma declaração SELECT no MySQL Workbench. + +``` +SELECT * FROM periodicos_resultados_pesquisa.tbl_periodicos_resultados_pesquisa WHERE titulo_artigo = "THE LOST LUSITANIA'S RUDDER."; +``` + +Uma vez que o registro teste foi visualizado, digite TRUNCATE tbl_periodicos_resultados_pesquisa para remover esses dados de teste. + +# Armazenando um ficheiro de valores separados por vírgulas (.csv) no banco de dados MySQL + +Na próxima parte da lição, vamos realizar consultas na tabela do banco de dados. Nosso objetivo é obter dados suficientes na tabela para construir um gráfico. Para nos prepararmos para isso, carregaremos alguns dados de amostra de um ficheiro de valores separados por vírgulas (.csv). + +Faça o download dos ficheiros .csv para o seu diretório de trabalho do R. Esses ficheiros estão armazenados no GitHub, então faça o download da versão *Raw* dos ficheiros. + +1. [dados-amostra-jardim.csv](/assets/getting-started-with-mysql-using-r/dados-amostra-jardim.csv) Esta é uma lista de artigos de periódicos galeses publicados durante a Primeira Guerra Mundial que correspondem aos termos de busca "*allotment*"(loteamento) e "*garden*"(jardim). +2. [dados-amostra-submarino.csv](/assets/getting-started-with-mysql-using-r/dados-amostra-submarino.csv) Esta é uma lista de artigos de periódicos galeses publicados durante a Primeira Guerra Mundial que correspondem aos termos de busca "*German*"(alemão) e "*submarine*"(submarino). + +No R, execute a função read.csv() e depois visualize o data frame com os dados amostrais. 
+ +``` +dadosAmostraJardim <- read.csv(file="dados-amostra-jardim.csv", header=TRUE, sep=",") +dadosAmostraJardim +``` + +Muitos dados serão visualizados, incluindo os que se encontram abaixo. Cheque a aba "Environment" (ambiente) na parte direita do RStudio. O Data Frame dadosAmostraJardim deve conter "1242 obs. of 4 variables". + +``` + titulo_artigo +1 -."e;'N'III GARDEN REQUISITES. +<...the result of the data frame results have been removed...> + data_publicacao_artigo url_artigo termo_busca_usado +1 1918-05-11 http://newspapers.library.wales/view/3581057/3581061/27/ AllotmentAndGarden +<...the result of the data frame results have been removed...> +``` + +Observe que nesses dados de amostra, os nomes dos campos estão incluídos no cabeçalho por conveniência: titulo_artigo, data_publicacao_artigo, url_artigo e termo_busca_usado. + +Como observado acima, nosso objetivo aqui é inserir os dados de amostra que estão armazenados no data frame dadosAmostraJardim na tabela MySQL periodicos_resultados_pesquisa. Podemos fazer isso de diferentes maneiras. Uma delas é repetir para cada linha de dado do data frame e executar um comando INSERT, como fizemos acima. Aqui, no entanto, usaremos um comando para inserir todas as linhas em dadosAmostraJardim de uma vez: *dbWriteTable*. Não execute essa declaração ainda, apenas a leia. + +``` +dbWriteTable(artigosDb, value = dadosAmostraJardim, row.names = FALSE, name = "tbl_periodicos_resultados_pesquisa", append = TRUE ) +``` + +| Função | Significado | +| -------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| dbWriteTable(artigosDb, | Use a conexão do banco de dados MySQL artigosDb. | +| value = dadosAmostraJardim, | Insere os valores do data frame dadosAmostraJardim para a tabela. | +| row.names = FALSE, | Nenhum nome de linha foi especificado. 
| +| name = "tbl_periodicos_resultados_pesquisa", | Insere os valores de dadosAmostraJardim para a tabela tbl_periodicos_resultados_pesquisa | +| append = TRUE ) | Adiciona os valores ao que já existe na tabela. Se esse script rodar novamente, todas as linhas em dadosAmostraJardim serão adicionadas à mesma tabela novamente. | + +Ainda não estamos preparados para executar o comando dbWriteTable(). Primeiro precisamos nos conectar ao banco de dados. Aqui está o script para fazer isso, assim como para carregar o data frame dados-amostra-submarino.csv. Leia-o e execute-o. + +``` +library(RMariaDB) +rmariadb.settingsfile<-"/Program Files/MySQL/MySQL Server 8.0/periodicos_resultados_pesquisa.cnf" + +rmariadb.db<-"periodicos_resultados_pesquisa" +artigosDb<-dbConnect(RMariaDB::MariaDB(),default.file=rmariadb.settingsfile,group=rmariadb.db) + +# A função "setwd" define o directório de trabalho. Deve mudar o caminho desse directório para o directório onde guardou os ficheiros .csv. +setwd("C:/Users/User/Documents") + +# Realiza uma busca nos dados de amostra dos periódicos pelos termos "Allotment" e "Garden" +dadosAmostraJardim <- read.csv(file="dados-amostra-jardim.csv", header=TRUE, sep=",") + +# Uma coluna titulo_artigo na tabela do banco de dados pode armazenar valores até 99 caracteres. +# Esta declaração reduz qualquer título de artigo maior que 99 caracteres. +dadosAmostraJardim$titulo_artigo <- substr(dadosAmostraJardim$titulo_artigo,0,99) + +# Esta declaração formata data_publicacao_artigo para representar o tipo de dado DATETIME. +dadosAmostraJardim$data_publicacao_artigo <- paste(dadosAmostraJardim$data_publicacao_artigo," 00:00:00",sep="") + +dbWriteTable(artigosDb, value = dadosAmostraJardim, row.names = FALSE, name = "tbl_periodicos_resultados_pesquisa", append = TRUE ) + +# Realiza um busca nos dados de amostra dos periódicos pelos termos German+Submarine. 
+dadosAmostraSubmarino <- read.csv(file="dados-amostra-submarino.csv", header=TRUE, sep=",") + +dadosAmostraSubmarino$titulo_artigo <- substr(dadosAmostraSubmarino$titulo_artigo,0,99) +dadosAmostraSubmarino$data_publicacao_artigo <- paste(dadosAmostraSubmarino$data_publicacao_artigo," 00:00:00",sep="") + +dbWriteTable(artigosDb, value = dadosAmostraSubmarino, row.names = FALSE, name = "tbl_periodicos_resultados_pesquisa", append = TRUE ) + +# Desconecta para limpar a conexão com o banco de dados. +dbDisconnect(artigosDb) +``` + +Se o script for executado mais de uma vez, serão gerados registros duplicados. Se isso acontecer, apenas execute o comando TRUNCATE na tabela e execute o script novamente, mas apenas uma vez. É possível verificar se o número de registros é o correto. No MySQL Workbench, execute o seguinte na janela de Query: + +``` +SELECT COUNT(*) FROM tbl_periodicos_resultados_pesquisa; +``` + +A contagem deve retornar 2880 registros. 1242 de dadosAmostraJardim e 1638 de dadosAmostraSubmarino. + +# Selecionado dados de uma tabela com SQL usando R + +Nosso objetivo aqui é usar a tabela de artigos que importamos e criar um gráfico do número de artigos publicados nos *Welsh Newspapers* (jornais galeses) ao longo de cada mês da Primeira Guerra Mundial que corresponda aos termos de busca *allotment*(loteamento) e *garden* (jardim), e *German* (alemão) e *submarine*(submarino). + +O script abaixo consulta o banco de dados e produz o gráfico de linha abaixo. Leia o script e observe o que está acontecendo. Segue uma explicação do script. + +``` +library(RMariaDB) +rmariadb.settingsfile<-"/Program Files/MySQL/MySQL Server 8.0/periodicos_resultados_pesquisa.cnf" + +rmariadb.db<-"periodicos_resultados_pesquisa" +artigosDb<-dbConnect(RMariaDB::MariaDB(),default.file=rmariadb.settingsfile,group=rmariadb.db) + +termoBuscaUsado = "German+Submarine" +# Solicita uma contagem do número de artigos que correspondem ao termoBuscaUsado que foram publicados a cada mês. 
+query<-paste("SELECT ( COUNT(CONCAT(MONTH(data_publicacao_artigo), ' ',YEAR(data_publicacao_artigo)))) as 'count' + FROM tbl_periodicos_resultados_pesquisa + WHERE termo_busca_usado ='", termoBuscaUsado,"' + GROUP BY YEAR(data_publicacao_artigo),MONTH(data_publicacao_artigo) + ORDER BY YEAR(data_publicacao_artigo),MONTH(data_publicacao_artigo);",sep="") + +print(query) +rs = dbSendQuery(artigosDb,query) +dbRows<-dbFetch(rs) + +contagemArtigos<-c(as.integer(dbRows$count)) + +# Coloca os resultados da consulta numa série temporal. +qts1 = ts(contagemArtigos, frequency = 12, start = c(1914, 8)) +print(qts1) + +# Plota a série temporal qts1 dos dados com uma linha de espessura 3 na cor vermelha. +plot(qts1, + lwd=3, + col = "red", + xlab="Mês da Guerra", + ylab="Números de artigos de periódicos", + xlim=c(1914,1919), + ylim=c(0,150), + main=paste("Número de artigos nos jornais galeses (Welsh newspapers) que correspondem aos termos de busca listados.",sep=""), + sub="Legenda do termo de busca: Vermelho = German+Submarine. Verde = Allotment And Garden.") + +termoBuscaUsado="AllotmentAndGarden" + +# Solicita uma contagem do número de artigos que correspondem ao termoBuscaUsado que foram publicados a cada mês. +query<-paste("SELECT ( COUNT(CONCAT(MONTH(data_publicacao_artigo),' ',YEAR(data_publicacao_artigo)))) as 'count' FROM tbl_periodicos_resultados_pesquisa WHERE termo_busca_usado='",termoBuscaUsado,"' GROUP BY YEAR(data_publicacao_artigo),MONTH(data_publicacao_artigo) ORDER BY YEAR(data_publicacao_artigo),MONTH(data_publicacao_artigo);",sep="") +print(query) +rs = dbSendQuery(artigosDb,query) +dbRows<-dbFetch(rs) + +contagemArtigos<-c(as.integer(dbRows$count)) + +# Coloca os resultados da consulta numa série temporal. +qts2 = ts(contagemArtigos, frequency = 12, start = c(1914, 8)) + +# Adiciona esta linha com a série temporal qts2 à plotagem existente. +lines(qts2, lwd=3,col="darkgreen") + +# Limpa o resultado. 
+dbClearResult(rs) + +# Desconecta para limpar a conexão com o banco de dados. +dbDisconnect(artigosDb) +``` + +## Explicação do script de seleção de dados e criação do gráfico. + +O método que conecta o banco de dados é explicado [acima](#conectando-a-um-banco-de-dados-com-uma-senha). + +Este script seleciona dois resultados de um conjunto de dados e cria um gráfico com esses dados. Um dos resultados é a combinação dos artigos de periódicos com a busca pelos termos "German+Submarine". Eles são consultados através da declaração SELECT: + +``` +SELECT ( + COUNT(CONCAT(MONTH(data_publicacao_artigo),' ',YEAR(data_publicacao_artigo)))) as 'count' + FROM tbl_periodicos_resultados_pesquisa + WHERE termo_busca_usado='",termoBuscaUsado,"' + GROUP BY YEAR(data_publicacao_artigo),MONTH(data_publicacao_artigo) + ORDER BY YEAR(data_publicacao_artigo),MONTH(data_publicacao_artigo); +``` + +| SQL | Significado | +| ----------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| SELECT ( | SELECT - Seleciona os dados que correspondem à condição na cláusula WHERE na tabela do banco de dados nomeado . | +| COUNT(CONCAT(MONTH(data_publicacao_artigo),' ',YEAR(data_publicacao_artigo)))) as 'count' | Fornece uma contagem do número de artigos publicados que compartilham o mesmo mês e ano de publicação. CONCAT representa a ação concatenar, que cria um único valor textual de dois ou mais valores textuais, nesse caso, o mês e o ano. | +| FROM tbl_periodicos_resultados_pesquisa | Este é o banco de dados a partir do qual estamos selecionando os dados. | +| GROUP BY YEAR(data_publicacao_artigo),MONTH(data_publicacao_artigo) | Esta declaração GROUP BY é importante para a contagem (COUNT) acima. 
Aqui os dados estão agrupados por mês e ano, para que seja possível contar todos os registros no grupo. | +| ORDER BY YEAR(data_publicacao_artigo),MONTH(data_publicacao_artigo); | Coloca os resultados ordenados por data, o que é útil já que queremos construir um gráfico por data. | + +As declarações abaixo executam a consulta e colocam o resultado *rs* num data frame *dbRows*: + +``` +rs = dbSendQuery(artigosDb,query) +dbRows<-dbFetch(rs) +``` + +Abaixo, o data frame *dbRows* é colocado numa série temporal com a função *ts()*, para que seja possível plotar para cada mês, iniciando de agosto de 1914. + +``` +# Coloca os resultados da consulta numa série temporal. +qts1 = ts(contagemArtigos, frequency = 12, start = c(1914, 8)) +``` + +Abaixo, os dados na série temporal *qts1* são plotados num gráfico: + +``` +plot(qts1, + lwd=3, + col = "red", + xlab="Mês da Guerra", + ylab="Números de artigos de periódicos", + xlim=c(1914,1919), + ylim=c(0,150), + main=paste("Número de artigos nos jornais galeses (Welsh newspapers) que correspondem aos termos de busca listados.",sep=""), + sub="Legenda do termo de busca: Vermelho = German+Submarine. Verde = Allotment And Garden.") +``` + +O que isso difere da parte do script que gera o gráfico dos artigos correspondentes à busca dos termos "Allotment And Garden"? Não muito, definitivamente. Apenas usamos a função *lines()* para plotar os resultados no mesmo gráfico que construímos acima. + +``` +lines(qts2, lwd=3,col="darkgreen") +``` + +### Resultados da seleção de dados e da criação do gráfico + +Aqui abaixo está o gráfico que deveria aparecer: + +{% include figure.html filename="introducao-ao-mysql-e-r-16.png" caption="Plotagem do número de artigos de periódicos publicados cada mês que correspondem aos termos de busca" %} + +# Indo mais longe com o MySQL + +Se deseja colocar um banco de dados num website, uma maneira de fazê-lo é usando MySQL e a linguagem PHP para construir as páginas do site. 
Um exemplo deste tipo de website é o que construí para [buscar edições do "the Equity newspaper"](https://perma.cc/237N-DD9E). O livro de Larry Ullman, *PHP and MySQL for Dynamic Web Sites*, aborda como configurar e conectar um banco de dados usando MySQL e PHP de uma maneira resistente a hackers. + +Para exemplos do uso de SQL para ordenar e agrupar dados, assim como para realizar cálculos, veja: [MySQL by Examples for Beginners](https://web.archive.org/web/20171228130133/https://www.ntu.edu.sg/home/ehchua/programming/sql/MySQL_Beginner.html) ou MySQL [Examples of Common Queries](https://perma.cc/84HN-9DBL). + +# Conclusão + +Espero que tenha obtido o conhecimento para configurar uma tabela de banco de dados, conectá-lo e armazenar registros. Embora tenhamos abordado apenas uma pequena parte das diferentes maneiras de realizar consultas nos dados, espero também que saiba a técnica de uso das declarações SELECT para que possa utilizá-las em seus futuros projetos de história digital. + +# Créditos + +Finalizei esta lição graças ao suporte do [George Garth Graham Undergraduate Digital History Research Fellowship](https://perma.cc/S7PP-FY5U). + +Agradeço à Drª. Amanda Visconti pelo suporte e orientação ao longo da preparação desta lição. + +# Referências + +Ullman, L. 2005. *PHP and MySQL for Dynamic Web Sites, 2nd ed.* Berkeley, Calif: Peachpit. + +# Notas + +[^1]: Jason A. French, "Using R With MySQL Databases," blog (3 July 2014), [https://www.jason-french.com/blog/2014/07/03/using-r-with-mysql-databases/](https://perma.cc/5VYV-L5PG). + +[^2]: Taylor Arnold and Lauren Tilton, "Basic Text Processing in R," Programming Historian (27 March 2017), [tradução para português](/pt/licoes/processamento-basico-texto-r). + +[^3]: Taryn Dewar, "R Basics with Tabular Data," Programming Historian (05 September 2016), [tradução para português](/pt/licoes/nocoes-basicas-R-dados-tabulares). 
+ +O script em R usado para recolher dados de amostra se encontra [aqui](https://perma.cc/87AE-LJRG). diff --git a/pt/licoes/introducao-omeka-net.md b/pt/licoes/introducao-omeka-net.md index e4680bbeba..14c15ff2b7 100644 --- a/pt/licoes/introducao-omeka-net.md +++ b/pt/licoes/introducao-omeka-net.md @@ -1,188 +1,188 @@ ---- -title: Introdução ao Omeka.net -slug: introducao-omeka-net -layout: lesson -date: 2016-02-17 -translation_date: 2021-06-07 -authors: -- Miriam Posner -editors: -- Adam Crymble -translator: -- Gabriela Kucuruza -translation-editor: -- Daniel Alves -translation-reviewer: -- Ângela Pité -- Rômulo Predes -difficulty: 1 -exclude_from_check: - - reviewers -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/379 -activity: presenting -topics: [website] -abstract: "Com o Omeka.net é fácil criar sites na web para mostrar coleções de itens." -original: up-and-running-with-omeka -avatar_alt: Esqueleto de dinossauro num museu -doi: 10.46430/phpt0011 ---- - -{% include toc.html %} - - - - - -O [Omeka.net](http://www.omeka.net) facilita a criação de websites para mostrar coleções de itens. - -## Cadastre-se numa conta do Omeka - -{% include figure.html filename="intro-omeka-net-1.png" caption="Cadastre-se na conta de teste" %} - -Entre em www.omeka.net e clique em **Sign Up** (Cadastre-se). Escolha o Plano de Teste. Preencha o formulário de cadastro. Verifique o seu e-mail pelo link de ativação da conta. - -## Crie um novo site do Omeka - -{% include figure.html filename="intro-omeka-net-2.png" caption="Página da conta do Omeka.net" %} - -Depois de clicar no link no seu e-mail, clique em **Add a Site** (Adicionar um site). - -Preencha a informação sobre o URL do seu site, o título que quer usar e a descrição que preferir. Clique em **Add Your New Site** (Adicione o seu novo site). - -## Você tem um novo site do Omeka! 
- - -{% include figure.html filename="intro-omeka-net-3.png" caption="Veja o seu site" %} - -Para ver o seu site, clique em **View Site** (Ver Site). - -## Um site vazio no Omeka - -{% include figure.html filename="intro-omeka-net-4.png" caption="Vista pública do site" %} - -Esse é o seu site vazio do Omeka esperando para ser preenchido. Para retornar ao painel de controle (*dashboard*) clique no botão **Back** (Retornar) ou escreva `http://www.omeka.net/dashboard`. Agora, clique em **Manage Site** (Administre o site). - -## Instale alguns plugins - -{% include figure.html filename="intro-omeka-net-5.png" caption="Página dos Plugins" %} - -O seu site do Omeka vem com plugins que oferecem funções adicionais. Precisamos ativá-los. Para fazer isso, clique no item **Plugins** no menu, no canto superior direito. Na página seguinte, clique no botão **Install** (Instalar) em **Exhibit Builder** (construtor de exposições) (deixe as opções como estão na página seguinte) e em **Simple Pages** (Páginas simples). - -## Configurar o seu site para português (nota da tradução) - -A configuração padrão do Omeka é em inglês. Porém, podemos mudar a língua do seu site para português (pt-BR e pt-PT) através de um Plugin. Para realizar essa configuração, siga os passos a seguir: - -1. Clique em **Manage Site** (Administrar Site) no Menu Principal. -2. Clique em Plugins no menu superior ou acesse os Plugins através do link `https://nome_do_seu_site.omeka.net/admin/plugins`, sendo `nome_do_seu_site` o nome escolhido para o seu site. - -3. Encontre o Plugin **Locale** e clique no botão **Install** (Instalar). Ao clicar, a sua tela ficará parecida com a imagem abaixo. - -4. Ao clicar em instalar, aparecerá uma página com as opções de tradução. Escolha **Português - Brasil (pt_BR)** ou **Português - Portugal (pt_PT)**. - -5. Clique em **Save Changes** (Salvar Mudanças) - -{% include figure.html filename="intro-omeka-net-6.png" caption="A sua tela ficará parecida com a imagem acima. 
Nela, o Plugin Locale está indicado." %} - -Agora, o seu site e o painel de controle estarão em português. - -## Trocar temas - -{% include figure.html filename="intro-omeka-net-7.png" caption="Página de Configuração dos Temas" %} - -O Omeka permite que a aparência do site público seja alterada por meio dos temas. Para fazer isso, clique em **Aparência** (Appearence, à direita do canto superior do seu painel de controle). Mude os temas selecionando uma das opções disponíveis na página. Clique o botão verde **Utilizar este tema** (Use this theme) para atualizar o seu novo tema. Visite, então, o seu site público clicando no nome do seu site, no canto superior esquerdo da página. - -## Temos um novo tema! - -{% include figure.html filename="intro-omeka-net-8.png" caption="Vista pública com o novo tema" %} - -Confira o seu novo tema e volte para o seu painel de controle. É possível retornar para o seu antigo tema, continuar com esse ou selecionar uma das outras opções. - - -## Adicione um item - -{% include figure.html filename="intro-omeka-net-9.png" caption="Adicione um item" %} - -Clique em **Itens** no lado esquerdo do menu e depois (naturalmente!) **Adicione um item** (Add an item). - -## Descreva o seu novo item - -{% include figure.html filename="intro-omeka-net-10.png" caption="Torne o seu item público usando a caixa de seleção assinalada" %} - -Lembre, **Dublin Core** refere-se às informações descritivas (metadados) que você insere sobre um item. Todas essas informações são opcionais e não há como inseri-las incorretamente. Tente, porém, ser consistente. - -Não se esqueça de clicar na caixa de seleção **Público** (Public) para que o seu item fique visível para o público em geral. Se você não clicar nessa caixa, apenas pessoas cadastradas no seu site poderão ver o item. - -Para adicionar múltiplos campos - por exemplo, se você quiser adicionar vários assuntos ao seu item - use o botão verde **Adicionar informação** (Add input) à esquerda das caixas de texto. 
- -## Uma questão complexa - -{% include figure.html filename="intro-omeka-net-11.png" caption="O que é isto?" %} - -Eu estou a criar um registo de item para o meu cachorro, Boris. Mas eu estou a descrever o Boris _ele mesmo_ ou uma _fotografia_ do Boris? No caso da primeira opção, o **Criador** seria... bem, suponho que isso dependa das suas crenças religiosas. Se é o segundo caso, o criador seria Brad Wallace, quem tirou a foto. - -A decisão sobre descrever um objeto ou a representação de um objeto é sua. Uma vez que tenha decidido, seja consistente. - -## Anexe um ficheiro ao registo do seu item - -{% include figure.html filename="intro-omeka-net-12.png" caption="Adicionando ficheiros a um item" %} - -Uma vez que terminamos de adicionar os metadados do Dublin Core, podemos anexar um ficheiro ao registo do seu item clicando em **Arquivos** (Ficheiros em PT_PT / Files), no topo do formulário de Dublin Core. (Não é necessário clicar em **Adicionar Item** antes de fazer isso; o Omeka irá salvar automaticamente essa informação). Podemos adicionar múltiplos ficheiros, mas saiba que o plano Básico apenas vem com 500 MB de espaço de armazenamento. - -Tendo adicionado o ficheiro ou os ficheiros, podemos adicionar **Tags** (Etiquetas em PT_PT) clicando no botão. Também podemos clicar em **Metadados** (Meta-dados do Tipo de Item em PT_PT / Item Type Metadata) para escolher a tipologia - pessoa, lugar, animal, vegetal, mineral - do seu item. Se não encontrar um tipo de item apropriado para o seu item, não se preocupe. Nós podemos adicionar um novo tipo de item depois. - -Quando tudo estiver pronto, clique no botão verde **Adicionar item**. - -## Você tem um item! - -{% include figure.html filename="intro-omeka-net-13.png" caption="Explorar itens, vista de administrador" %} - -Esta lista contém todos os itens que foram adicionados. Se o item não fosse público, estaria escrito _Privado_ depois do título. 
Para ver como a página do seu novo item se parece, clique no nome do item. - -## Esta não é a página pública para o seu item - -{% include figure.html filename="intro-omeka-net-14.png" caption="Página de Item, vista de administrador" %} - -Pode parecer, mas essa página não é o que um usuário não-cadastrado irá ver quando navegar para a página do seu item. Para ver o que um usuário veria, clique no botão azul **Ver a Página Pública**, à direita. (Ou você pode editar o item clicando em **Editar** na direita). - -## Esta é a página pública para o seu item - -{% include figure.html filename="intro-omeka-net-15.png" caption="Página do item, vista pública" %} - -Isso é o que o usuário geral verá se ele navegar pela sua página. - -## Crie uma coleção - -{% include figure.html filename="intro-omeka-net-16.png" caption="Criar uma coleção" %} - -É possível começar a ordenar a sua lista de itens agrupando-os em coleções. Para fazer isso, retorne para o painel de controle (Dashboard), clique na aba de **Coleções** (Collections) e clique em **Adicionar uma coleção**. - -## Insira informações sobre a sua coleção - -{% include figure.html filename="intro-omeka-net-17.png" caption="Adicionar metadados da coleção" %} - -No Omeka, os metadados são fundamentais! Insira alguma informação sobre a sua nova coleção e lembre-se de clicar no botão **Público** perto do fim da página. Então salve a coleção. - -## Adicione itens à sua coleção - -{% include figure.html filename="intro-omeka-net-18.png" caption="Clique na caixa seleção de cada item para editar" %} - -Para preencher a coleção que acabou de criar, clique na aba de *Itens*. Da sua lista **Ver Itens** (Explorar Itens em PT_PT), clique nas caixas de verificação dos itens que pertencem à sua nova coleção. Então clique no botão **Editar**. 
- -## Escolha a coleção - -{% include figure.html filename="intro-omeka-net-19.png" caption="Escolha uma coleção do menu suspenso" %} - -Na página Editar Itens (Editar Itens em Lote em PT_PT), selecione a Coleção à qual gostaria de adicionar os seus itens. (Além disso, tenha atenção a todas as outras coisas que podem ser feitas nessa página). - -## Veja a sua nova coleção - -{% include figure.html filename="intro-omeka-net-20.png" caption="Ver coleção, vista pública" %} - -Retorne para o seu site público. Se clicarmos na aba de **Ver Coleções** (Explorar as Colecções em PT_PT) na face pública do seu site, deverá haver agora uma nova coleção contendo os itens que foram identificados. - -Agora que alguns itens foram adicionados e agrupados numa coleção, dedique algum tempo para editar ainda mais o seu site. Ele está a começar a tomar forma agora que há tanto itens individuais como unidades temáticas, mas o Omeka pode fazer ainda mais. Iremos falar sobre isso numa próxima lição. - -## Recursos Adicionais - -A equipe do Omeka compilou ótimos recursos nas [páginas de ajuda](http://info.omeka.net/)(em inglês) do software. - +--- +title: Introdução ao Omeka.net +slug: introducao-omeka-net +layout: lesson +date: 2016-02-17 +translation_date: 2021-06-07 +authors: +- Miriam Posner +editors: +- Adam Crymble +translator: +- Gabriela Kucuruza +translation-editor: +- Daniel Alves +translation-reviewer: +- Ângela Pité +- Rômulo Predes +difficulty: 1 +exclude_from_check: + - reviewers +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/379 +activity: presenting +topics: [website] +abstract: "Com o Omeka.net é fácil criar sites na web para mostrar coleções de itens." +original: up-and-running-with-omeka +avatar_alt: Esqueleto de dinossauro num museu +doi: 10.46430/phpt0011 +--- + +{% include toc.html %} + + + + + +O [Omeka.net](https://www.omeka.net) facilita a criação de websites para mostrar coleções de itens. 
+ +## Cadastre-se numa conta do Omeka + +{% include figure.html filename="intro-omeka-net-1.png" caption="Cadastre-se na conta de teste" %} + +Entre em www.omeka.net e clique em **Sign Up** (Cadastre-se). Escolha o Plano de Teste. Preencha o formulário de cadastro. Verifique o seu e-mail pelo link de ativação da conta. + +## Crie um novo site do Omeka + +{% include figure.html filename="intro-omeka-net-2.png" caption="Página da conta do Omeka.net" %} + +Depois de clicar no link no seu e-mail, clique em **Add a Site** (Adicionar um site). + +Preencha a informação sobre o URL do seu site, o título que quer usar e a descrição que preferir. Clique em **Add Your New Site** (Adicione o seu novo site). + +## Você tem um novo site do Omeka! + + +{% include figure.html filename="intro-omeka-net-3.png" caption="Veja o seu site" %} + +Para ver o seu site, clique em **View Site** (Ver Site). + +## Um site vazio no Omeka + +{% include figure.html filename="intro-omeka-net-4.png" caption="Vista pública do site" %} + +Esse é o seu site vazio do Omeka esperando para ser preenchido. Para retornar ao painel de controle (*dashboard*) clique no botão **Back** (Retornar) ou escreva `http://www.omeka.net/dashboard`. Agora, clique em **Manage Site** (Administre o site). + +## Instale alguns plugins + +{% include figure.html filename="intro-omeka-net-5.png" caption="Página dos Plugins" %} + +O seu site do Omeka vem com plugins que oferecem funções adicionais. Precisamos ativá-los. Para fazer isso, clique no item **Plugins** no menu, no canto superior direito. Na página seguinte, clique no botão **Install** (Instalar) em **Exhibit Builder** (construtor de exposições) (deixe as opções como estão na página seguinte) e em **Simple Pages** (Páginas simples). + +## Configurar o seu site para português (nota da tradução) + +A configuração padrão do Omeka é em inglês. Porém, podemos mudar a língua do seu site para português (pt-BR e pt-PT) através de um Plugin. 
Para realizar essa configuração, siga os passos a seguir: + +1. Clique em **Manage Site** (Administrar Site) no Menu Principal. +2. Clique em Plugins no menu superior ou acesse os Plugins através do link `https://nome_do_seu_site.omeka.net/admin/plugins`, sendo `nome_do_seu_site` o nome escolhido para o seu site. + +3. Encontre o Plugin **Locale** e clique no botão **Install** (Instalar). Ao clicar, a sua tela ficará parecida com a imagem abaixo. + +4. Ao clicar em instalar, aparecerá uma página com as opções de tradução. Escolha **Português - Brasil (pt_BR)** ou **Português - Portugal (pt_PT)**. + +5. Clique em **Save Changes** (Salvar Mudanças) + +{% include figure.html filename="intro-omeka-net-6.png" caption="A sua tela ficará parecida com a imagem acima. Nela, o Plugin Locale está indicado." %} + +Agora, o seu site e o painel de controle estarão em português. + +## Trocar temas + +{% include figure.html filename="intro-omeka-net-7.png" caption="Página de Configuração dos Temas" %} + +O Omeka permite que a aparência do site público seja alterada por meio dos temas. Para fazer isso, clique em **Aparência** (Appearence, à direita do canto superior do seu painel de controle). Mude os temas selecionando uma das opções disponíveis na página. Clique o botão verde **Utilizar este tema** (Use this theme) para atualizar o seu novo tema. Visite, então, o seu site público clicando no nome do seu site, no canto superior esquerdo da página. + +## Temos um novo tema! + +{% include figure.html filename="intro-omeka-net-8.png" caption="Vista pública com o novo tema" %} + +Confira o seu novo tema e volte para o seu painel de controle. É possível retornar para o seu antigo tema, continuar com esse ou selecionar uma das outras opções. + + +## Adicione um item + +{% include figure.html filename="intro-omeka-net-9.png" caption="Adicione um item" %} + +Clique em **Itens** no lado esquerdo do menu e depois (naturalmente!) **Adicione um item** (Add an item). 
+ +## Descreva o seu novo item + +{% include figure.html filename="intro-omeka-net-10.png" caption="Torne o seu item público usando a caixa de seleção assinalada" %} + +Lembre, **Dublin Core** refere-se às informações descritivas (metadados) que você insere sobre um item. Todas essas informações são opcionais e não há como inseri-las incorretamente. Tente, porém, ser consistente. + +Não se esqueça de clicar na caixa de seleção **Público** (Public) para que o seu item fique visível para o público em geral. Se você não clicar nessa caixa, apenas pessoas cadastradas no seu site poderão ver o item. + +Para adicionar múltiplos campos - por exemplo, se você quiser adicionar vários assuntos ao seu item - use o botão verde **Adicionar informação** (Add input) à esquerda das caixas de texto. + +## Uma questão complexa + +{% include figure.html filename="intro-omeka-net-11.png" caption="O que é isto?" %} + +Eu estou a criar um registo de item para o meu cachorro, Boris. Mas eu estou a descrever o Boris _ele mesmo_ ou uma _fotografia_ do Boris? No caso da primeira opção, o **Criador** seria... bem, suponho que isso dependa das suas crenças religiosas. Se é o segundo caso, o criador seria Brad Wallace, quem tirou a foto. + +A decisão sobre descrever um objeto ou a representação de um objeto é sua. Uma vez que tenha decidido, seja consistente. + +## Anexe um ficheiro ao registo do seu item + +{% include figure.html filename="intro-omeka-net-12.png" caption="Adicionando ficheiros a um item" %} + +Uma vez que terminamos de adicionar os metadados do Dublin Core, podemos anexar um ficheiro ao registo do seu item clicando em **Arquivos** (Ficheiros em PT_PT / Files), no topo do formulário de Dublin Core. (Não é necessário clicar em **Adicionar Item** antes de fazer isso; o Omeka irá salvar automaticamente essa informação). Podemos adicionar múltiplos ficheiros, mas saiba que o plano Básico apenas vem com 500 MB de espaço de armazenamento. 
+ +Tendo adicionado o ficheiro ou os ficheiros, podemos adicionar **Tags** (Etiquetas em PT_PT) clicando no botão. Também podemos clicar em **Metadados** (Meta-dados do Tipo de Item em PT_PT / Item Type Metadata) para escolher a tipologia - pessoa, lugar, animal, vegetal, mineral - do seu item. Se não encontrar um tipo de item apropriado para o seu item, não se preocupe. Nós podemos adicionar um novo tipo de item depois. + +Quando tudo estiver pronto, clique no botão verde **Adicionar item**. + +## Você tem um item! + +{% include figure.html filename="intro-omeka-net-13.png" caption="Explorar itens, vista de administrador" %} + +Esta lista contém todos os itens que foram adicionados. Se o item não fosse público, estaria escrito _Privado_ depois do título. Para ver como a página do seu novo item se parece, clique no nome do item. + +## Esta não é a página pública para o seu item + +{% include figure.html filename="intro-omeka-net-14.png" caption="Página de Item, vista de administrador" %} + +Pode parecer, mas essa página não é o que um usuário não-cadastrado irá ver quando navegar para a página do seu item. Para ver o que um usuário veria, clique no botão azul **Ver a Página Pública**, à direita. (Ou você pode editar o item clicando em **Editar** na direita). + +## Esta é a página pública para o seu item + +{% include figure.html filename="intro-omeka-net-15.png" caption="Página do item, vista pública" %} + +Isso é o que o usuário geral verá se ele navegar pela sua página. + +## Crie uma coleção + +{% include figure.html filename="intro-omeka-net-16.png" caption="Criar uma coleção" %} + +É possível começar a ordenar a sua lista de itens agrupando-os em coleções. Para fazer isso, retorne para o painel de controle (Dashboard), clique na aba de **Coleções** (Collections) e clique em **Adicionar uma coleção**. 
+ +## Insira informações sobre a sua coleção + +{% include figure.html filename="intro-omeka-net-17.png" caption="Adicionar metadados da coleção" %} + +No Omeka, os metadados são fundamentais! Insira alguma informação sobre a sua nova coleção e lembre-se de clicar no botão **Público** perto do fim da página. Então salve a coleção. + +## Adicione itens à sua coleção + +{% include figure.html filename="intro-omeka-net-18.png" caption="Clique na caixa de seleção de cada item para editar" %} + +Para preencher a coleção que acabou de criar, clique na aba de *Itens*. Da sua lista **Ver Itens** (Explorar Itens em PT_PT), clique nas caixas de verificação dos itens que pertencem à sua nova coleção. Então clique no botão **Editar**. + +## Escolha a coleção + +{% include figure.html filename="intro-omeka-net-19.png" caption="Escolha uma coleção do menu suspenso" %} + +Na página Editar Itens (Editar Itens em Lote em PT_PT), selecione a Coleção à qual gostaria de adicionar os seus itens. (Além disso, tenha atenção a todas as outras coisas que podem ser feitas nessa página). + +## Veja a sua nova coleção + +{% include figure.html filename="intro-omeka-net-20.png" caption="Ver coleção, vista pública" %} + +Retorne para o seu site público. Se clicarmos na aba de **Ver Coleções** (Explorar as Colecções em PT_PT) na face pública do seu site, deverá haver agora uma nova coleção contendo os itens que foram identificados. + +Agora que alguns itens foram adicionados e agrupados numa coleção, dedique algum tempo para editar ainda mais o seu site. Ele está a começar a tomar forma agora que há tanto itens individuais como unidades temáticas, mas o Omeka pode fazer ainda mais. Iremos falar sobre isso numa próxima lição. + +## Recursos Adicionais + +A equipe do Omeka compilou ótimos recursos nas [páginas de ajuda](https://info.omeka.net/) (em inglês) do software. 
+ diff --git a/pt/licoes/limpar-dados-openrefine.md b/pt/licoes/limpar-dados-openrefine.md index 57290f8184..d4cb1b409a 100644 --- a/pt/licoes/limpar-dados-openrefine.md +++ b/pt/licoes/limpar-dados-openrefine.md @@ -1,155 +1,155 @@ ---- -title: "Limpar dados com o OpenRefine" -slug: limpar-dados-openrefine -original: cleaning-data-with-openrefine -layout: lesson -collection: lessons -date: 2013-08-05 -translation_date: 2023-04-29 -tested-date: 2024-03-14 -lesson-testers: Antonin Delpeuch -authors: -- Seth van Hooland -- Ruben Verborgh -- Max De Wilde -reviewers: -- Patrick Burns -- Nora McGregor -editors: -- Adam Crymble -translator: -- Francisco Nabais -translation-editor: -- Aracele Torres -translation-reviewer: -- Eric Brasil -- Joana Vieira Paulino -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/427 -difficulty: 2 -activity: transforming -topics: [data-manipulation] -abstract: Este tutorial foca-se na forma como o usuário pode diagnosticar e agir perante a precisão dos dados -avatar_alt: Dois homens a lavar a roupa ao ar livre -doi: 10.46430/phpt0038 ---- - -{% include toc.html %} - -
    -Nota de Tradução: Alguns termos, por aparecerem constantemente e facilitarem a interpretação das imagens, apenas foram propositadamente traduzidos uma vez e serão colocados entre parênteses em português na primeira vez que surgem. -
    - -## Objetivos da lição - -Não aceite os dados tal como são apresentados. Esta é a principal mensagem deste tutorial que se foca na forma como os usuários podem diagnosticar e agir perante a precisão dos dados. Nesta lição, o usuário vai aprender os princípios e a prática da limpeza de dados, ao mesmo tempo que aprende como é que o [*OpenRefine*](http://openrefine.org) (em inglês) pode ser utilizado para realizar quatro tarefas essenciais que vão ajudar na limpeza de dados: - -1. Remover registos duplicados -2. Separar múltiplos valores contidos no mesmo campo -3. Analisar a distribuição de valores ao longo do Dataset -4. Agrupar diferentes representações da mesma realidade - -Estes passos são explicitados com a ajuda de uma série de exercicios baseados na coleção de metadados do *[Powerhouse](https://powerhouse.com.au/)* (em inglês), demonstrando, assim, como métodos (semi)automáticos podem ajudar na correção de erros dos dados. - -## Porque é que os historiadores devem se preocupar com a qualidade dos dados? - -Registros duplicados, valores vazios e formatos incossistentes são fenómenos com os quais devemos estar preparados para lidar quando utilizamos data sets históricos. Esta lição vai ensinar o usuário a descobrir inconsistências nos dados contidos em tabelas ou bases de dados. À medida que, cada vez mais, partilhamos, agregamos e reutilizamos dados na web, os historiadores terão uma maior necessidade de responder a problemas inevitáveis associados à qualidade dos dados. Utilizando um programa chamado *OpenRefine*, o usuário será capaz de identificar facilmente erros sistemáticos, tais como células em branco, duplicações, inconsistências ortográficas, etc. O *OpenRefine* não só permite um diagnóstico rápido da precisão dos dados, mas também age perante certos erros de forma automática. 
- -## Descrição da ferramenta: *OpenRefine* - -No passado, os historiadores dependiam de especialistas em tecnologias da informação para diagnosticar a qualidade dos dados e para executar tarefas de limpeza dos mesmos. Isto exigia programas computacionais personalizados quando se trabalhava com data sets consideráveis. Felizmente, o surgimento de Ferramentas Interativas de Transformação de Dados (Interactive Data Transformation tools, ou IDTs em inglês), permite que até profissionais sem habilidades técnicas aprofundadas possam realizar operações rápidas e baratas em grandes data sets. - -As Ferramentas Interativas de Transformação de Dados assemelham-se às tabelas de dados do desktop com as quais estamos familiarizados, chegando a partilhar funcionalidades com as mesmas. O usuário pode, por exemplo, usar aplicações como o Microsoft Excel para organizar os seus dados com base em vários filtros, sejam eles numéricos, alfabéticos ou até personalizados, o que permite detetar erros mais facilmente. Configurar estes filtros em tabelas de dados pode ser complicado, já que estes são uma função secundária do software. Geralmente, podemos dizer que as tabelas de dados são projetadas para funcionar em linhas ou células individuais, enquanto as Ferramentas Interativas de Transformação de Dados operam em grandes intervalos de dados ao mesmo tempo. Estas "Tabelas de dados em esteroides" fornecem uma interface integrada e amigável através da qual os usuários finais podem detetar e corrigir erros. - -Nos últimos anos, têm sido desenvolvidas várias ferramentas para a transformação de dados interativos, tais como [*Potter’s Wheel ABC*](https://perma.cc/Q6QD-E64N) (em inglês) e [*Wrangler*](https://perma.cc/Y45B-6ZLU) (em inglês). 
Aqui queremos concentrar-nos, sobretudo, no *OpenRefine* (anteriormente *Freebase Gridworks* e *Google Refine*), já que, na opinião dos autores, esta é a ferramenta mais amigável para processar e limpar eficazmente grandes quantidades de dados numa interface baseada no navegador de internet. - -Além do *[data profiling](https://perma.cc/32Z8-8EMT)* (perfil de dados) (em inglês) e das operações de limpeza, as extensões do *OpenRefine* permitem aos usuários identificar conceitos num texto desestruturado através de um processo denominado *[Named-Entity Recognition](https://perma.cc/FCB6-9DU2)* (Reconhecimento de Entidade Nomeada) (em inglês) (NER) e reconciliar os seus próprios dados com bases de conhecimento existentes. Ao fazê-lo, o *OpenRefine* pode ser uma ferramenta prática de ligação dos dados com conceitos e autoridades que já foram declarados na web por entidades como a *[Library of Congress](https://perma.cc/24QD-NP6Y)* (Biblioteca do Congresso dos Estados Unidos da América) (em inglês) ou o [OCLC](https://perma.cc/48KR-ZTAJ) (Centro de Bibliotecas de Computadores Online) (em inglês). A limpeza de dados é um pré-requisito para estas etapas; A taxa de sucesso do NER e o êxito do processo de correspondência entre os dados do usuário e as entidades externas, dependem da habilidade do mesmo de tornar estes dados o mais concretos possível. - -## Descrição do exercício: *Powerhouse* - -O *Powerhouse*, em Sydney, permite-lhe exportar gratuitamente os metadados da sua coleção no seu [sítio Web](https://powerhouse.com.au/). Este museu é um dos maiores do mundo na área da ciência e tecnologia, fornecendo acesso a quase 90,000 objetos, que vão desde motores a vapor até vidros finos e de peças de alta-costura a chips de computadores. - -O museu divulgou ativamente a sua coleção em linha e disponibilizou gratuitamente a maior parte dos seus dados. 
No seu sítio Web, era possível descarregar um ficheiro de texto separado por separadores denominado `phm-collection.tsv` e abri-lo como uma tabela de dados. O ficheiro descomprimido (58MB) contém metadados básicos (17 campos) para 75,823 objetos, sob a licença *[Creative Commons Attribution Share Alike (CCASA)](https://perma.cc/M3QW-RLW6)* (em inglês). Neste tutorial utilizaremos uma cópia dos dados que está arquivada para o usuário fazer o download (mais à frente). Isto garante que se o *Powerhouse* atualizar os seus dados, o usuário ainda vai conseguir acompanhar esta lição. - -Ao longo do processo de limpeza e de criação do perfil dos dados, a lição vai focar o campo das `'Categorias'`, que é preenchido com termos do [*Powerhouse Object Names Thesaurus* (BARTOC)](https://perma.cc/PEP6-X2LD) (em inglês). O BARTOC reconhece o uso e a ortografia australiana e reflete, de uma maneira muito direta, os pontos fortes da coleção. Nesta coleção, o usuário vai encontrar, por exemplo, mais e melhores representações da história social e das artes decorativas e menos objetos com nomes associados às belas-artes e à história natural. - -Os termos no campo das Categorias compreendem o que chamamos de [Vocabulário Controlado](https://perma.cc/FEW7-CFDB). Um Vocabulário Controlado consiste em palavras-chave que, ao utilizarem um número limitado de termos, descrevem o conteúdo de uma coleção, sendo, normalmente, um ponto de entrada importante para historiadores em data sets de bibliotecas, arquivos e museus. É, por isso, que será dada uma importância especial ao campo das 'Categorias'. Depois de ser feita a limpeza dos dados, deverá ser possível reutilizar os termos do Vocabulário Controlado para encontrar informação adicional sobre esses termos num outro lugar online. Isto é conhecido como a criação de *[Linked Data](https://perma.cc/5SRF-V3UR)* (Dados Vinculados). 
- -### Primeiros passos: instalação do *OpenRefine* e importação de dados - -Deverá ser feito o [Download do *OpenRefine*](https://openrefine.org/download) (em inglês) e seguidas as instruções. O *OpenRefine* funciona em todas as plataformas: Windows, Mac, e Linux. Este será aberto no navegador de internet do usuário, mas é importante entender que a aplicação é executada localmente e que os dados não serão guardados online. Com o *OpenRefine* aberto no seu navegador de internet, clique em '**Language Settings**', presente no canto superior esquerdo, e altere a linguagem para '**Português**'. Os arquivos de dados estão disponíveis no *Programming Historian* como *[phm-collection](/assets/cleaning-data-with-openrefine/phm-collection.tsv)*. Por favor, faça o Download do ficheiro *phm-collection.tsv* que serão utilizados ao longo deste tutorial antes de continuar. - -Na página inicial do *OpenRefine* crie um novo projeto utilizando o ficheiro de dados que fez o download e clique '**Próximo**' . A primeira linha será processada como o nome da coluna por defeito, mas será preciso desmarcar a caixa de seleção 'Usar caracter " encerrar células contendo separadores de colunas', já que as aspas dentro do ficheiro não têm qualquer significado para o *OpenRefine*. Além disto, deverá selecionar a caixa de seleção 'Tentar analisar texto de células como números' para que o *OpenRefine* detete automaticamente números. Agora deverá clicar em '**Criar projeto**'. Se tudo correr como planejado, deverá ver no canto superior esquerdo 75,814 linhas. - -O data set do *Powerhouse* consiste em metadados detalhados sobre todos os objetos da coleção incluindo o título, a descrição, as várias categorias às quais o item pertence, informação sobre a proveniência do mesmo e um link persistente para a página que hospeda o objeto dentro do site do museu. Para ter uma ideia do objeto a que corresponde os metadados, clique no link persistente e o site será aberto. 
- -{% include figure.html filename="en-or-cleaning-data-with-openrefine-01.png" alt="Imagem de um objeto de amostra no site *Powerhouse* onde é possível observar um carro de brincar com desenhos de palhaços" caption="Figura 1: Captura de tela de um objeto de amostra no site *Powerhouse*" %} - -### Conheça os seus dados - -A primeira coisa a fazer é observar e conhecer os seus dados. Poderá inspecionar os diferentes valores de dados exibindo-os em `facetas e filtros`. Poderá considerar a [faceta](https://perma.cc/HKN9-NYXZ) (em inglês) uma lente através da qual é possível ver um subconjunto específico de dados baseados no critério da sua escolha. Clique no triângulo em frente ao nome da coluna, selecione Faceta e crie uma Faceta. Por exemplo, experimente o `Faceta de texto` ou o `Faceta numérica`, dependendo da natureza dos valores contidos nesses campos (os valores numéricos estão expostos a verde). No entanto, tenha em atenção que estas Facetas de texto têm uma maior eficácia em campos com valores redundantes (*Categories* (categorias), por exemplo); Se ocorrer o erro 'Muitas para mostrar' você pode escolher aumentar o limite da contagem de opções a cima do padrão dos 2,000. Todavia, um limite muito alto pode tornar o aplicativo mais lento (por norma, 5,000 é uma escolha segura). Facetas numéricas não têm esta restrição. Para mais opções, selecione Facetas personalizadas : Faceta por valores em branco, por exemplo, torna-se útil na procura de quantos valores foram preenchidos em cada campo. Vamos explorar mais detalhadamente estas funcionalidades nos exercícios a seguir. - -### Remoção de linhas em branco - -Uma coisa que irá reparar quando criar Facetas numéricas para a coluna do *Record ID* (Identificador do registo), é que existem três linhas sem dados. Poderá encontrá-las ao desmarcar a caixa de seleção numérica, deixando apenas valores não-numéricos. 
Na verdade, estes valores não estão realmente a branco, mas contêm apenas um caractere de espaço em branco, que pode ser visível se mover o seu cursor para onde deveria estar esse valor e clicar no botão '**edit**' (Editar) que aparece. Para remover estas linhas, clique no triângulo em frente à primeira coluna denominada por '**Todos**' , selecione '**Editar linhas**' e depois '**Remover as linhas que corresponderam**'. Feche a faceta numerica para verificar que permanecem agora 75,811 linhas. - -### Remoção de duplicações - -O segundo passo é detetar e remover duplicações. Estas podem ser identificadas ao classificar colunas, como o *Record ID*, por um valor único (neste caso vamos assumir que o *Record ID* é, de facto, único para cada entrada). Esta operação pode ser realizada ao clicar no triângulo à esquerda do *Record ID*, depois devemos selecionar a opção '**Ordenar**…' e escolher o marcador '**números**'. No *OpenRefine*, ordenar é apenas uma ajuda visual, a não ser que torne a reordenação permanente. Para o fazer, clique na opção Ordenar por cima do *Marks* (Marcas) e, em seguida, deverá escolher a opção '**Reordenar linhas permanentemente**'. Se se esquecer de fazer isto, posteriormente, irá ter resultados imprevisíveis neste tutorial. - -Linhas idênticas estão agora adjacentes umas às outras. Em seguida, deixe em branco as linhas do *Record ID* que têm o mesmo *Record ID* que as a cima delas, marcando-as como duplicações. Para o fazer, deve clicar no triângulo do *Record ID*, escolher **Editar células** \> **Transformar em vazias abaixo**. A mensagem de *status* dirá que 84 colunas foram afetadas (se se esqueceu de reordenar as linhas permanentemente, apenas vão ser afetadas 19 colunas; em caso afirmativo, desfaça a operação Transformar em vazias abaixo no separador 'Desfazer/Refazer' e volte ao parágrafo anterior refazendo-o de modo a ter a certeza que as linhas estão reordenadas e não apenas classificadas). 
Elimine essas linhas ao criar uma faceta em '**Transformar em vazias abaixo**' na coluna do *Record ID* ('**Faceta**' \> '**Facetas personalizadas**' \> '**Faceta por valores em branco**') em seguida deverá selecionar as 84 linhas a branco clicando em '**true**' (Verdade) e removê-las usando o triângulo da coluna '**Todos**' ('**Editar linhas**' \> **Remover as linhas que corresponderam**'). Quando fechar a faceta deverá observar que existem agora 75,727 linhas únicas. - -O usuário deverá ter uma atenção especial ao eliminar duplicações. Na etapa mencionada acima, assumimos que o data set possui um campo com valores únicos, indicando que uma linha inteira representa uma duplicação. Este não é necessariamente o caso e, por isso, devemos ter cuidado e verificar manualmente se a linha inteira representa uma duplicação ou não. - -### Atomização - -Depois de remover os registos duplicados, podemos focar-nos na coluna *Categories*. Em média, foram atribuídas 2.25 categorias a cada objeto. Estas categorias estão contidas no mesmo campo, separadas por uma barra vertical '\|'. O registo 9, por exemplo, contém três: 'Mineral samples\|Specimens\|Mineral Samples-Geological' (Amostras minerais\|Espécimes\|Amostras minerais-Geológicas). Para analisar em detalhe o uso destas palavras-chave, os valores do campo das categorias devem ser separados em células individuais com base na barra vertical, expandindo os 75,727 registos em 170,167 linhas. Escolha '**Editar células**', '**Dividir células com múltiplos valores**', digitando '**\|**' como separador de valores. O *OpenRefine* irá informá-lo que tem agora 170,167 linhas. - -É importante compreender totalmente o paradigma das linhas/entradas. Torne a coluna *Record ID* visível para ver o que se passa. Pode mudar entre a opção de visualização 'linhas' e 'entradas' ao clicar nos links que dão pelos mesmos nomes, logo em cima do cabeçalho das colunas. 
Na opção 'linhas', cada linha representa um par de *Record IDs* e uma única categoria, permitindo a manipulação de cada uma individualmente. A opção 'entradas' tem uma entrada para cada *Record ID*, que pode ter categorias diferentes em linhas diferentes (agrupadas a cinzento ou branco), mas cada registo é manipulado como um todo. Concretamente, existem agora 170,167 atribuições de categorias (Linhas), separadas em 75,736 itens de coleção (Entradas). Pode também ter reparado que estamos com mais 9 registos do que os originais 75,727, mas não se preocupe com isso agora, iremos voltar a esta pequena diferença mais tarde. - -### Facetting e agrupamento - -Um dos conteúdos do campo foi devidamente atomizado, filtros, facetas e agrupamentos podem ser aplicados para fornecer uma visão rápida e geral dos problemas clássicos dos metadados. Ao aplicar a faceta customizada '`Faceta por valores em branco`' à coluna *Categories*, é possível identificar imediatamente os 461 registos que não têm uma categoria, representando 0.6% da coleção. Ao aplicar uma faceta de texto ao campo das categorias podemos ter uma visão geral das 4,935 diferentes categorias utilizadas na coleção (o limite padrão é 2,000, mas poderá clicar na opção '**Definir o limite da contagem da escolha**' para aumentá-la para 5,000). Os títulos podem ser ordenados alfabeticamente (nome') ou por frequência ('quantidade'), fornecendo ao utilizador uma lista dos termos mais usados para indexar a coleção. Os três títulos principais são 'Numismática' (*Numismatics*) (8,041), 'Cerâmica' (*Ceramics*) (7,390) e 'Roupas e vestuário' (*Clothing and dress*) (7,279). - -Após aplicar a faceta, o *OpenRefine* propõe aglomerar as escolhas da faceta com base em vários métodos de similaridade. Tal como a Figura 2 demonstra, o agrupamento permite ao usuário resolver problemas relacionados com inconsistências, o uso incoerente tanto da forma singular como plural e erros de ortografia simples. 
O *OpenRefine* apresenta os valores relacionados e propõe uma fusão resultante no valor mais recorrente. Deverá selecionar a opção '**Agrupar**' para abrir o comando de uniformização dos termos, em seguida, escolha os valores que deseja agrupar ao selecionar as caixas individualmente ou ao clicar '**Marcar todos**' na parte inferior e, por fim, '**Unir selecionados e Re-agrupar**'. - -{% include figure.html filename="tr-pt-cleaning-data-with-openrefine-2.png" alt="Interface do *OpenRefine* referente ao agrupamento e edição da coluna 'Categories' em que é possível observar os métodos de agrupamento e ainda as diferenças detetadas nesta coluna" caption="Figura 2: Visão geral de alguns agrupamentos" %} - -O método padrão de aglomeramento não é muito complexo, portanto ainda não encontra todos os aglomerados. Experimente com diferentes métodos para ver quais são os resultados que estes produzem. Deverá ter cuidado: alguns métodos podem ser muito agressivos e alguns valores, que não deverão estar juntos, podem acabar agrupados. Agora que os valores foram agrupados individualmente, podemos colocá-los de volta numa única célula. Clique no triângulo das *categories* e escolha **Editar células**, **Unir células com múltiplos valores**, escolha a barra vertical ('\|') como separador, **OK**. As linhas têm agora a mesma aparência que tinham antes, com um campo de categorias com vários valores. - -### Aplicação de transformações *ad-hoc* através do uso de expressões GREL - -Relembre-se que existiu um aumento no número de registos depois do processo de divisão: nove registos apareceram do nada. Para encontrar a causa desta disparidade, precisamos de voltar atrás, antes da divisão das categorias em linhas separadas. Para fazer isso, altere o separador 'Desfazer / Refazer' à direita do separador 'Faceta / Filtro' e vai obter um histórico de todas as ações que executou desde que o projeto foi criado. 
Selecione o passo antes de '*Split multi-valued cells in column Categories*' (Dividir células com vários valores na coluna Categorias) (se seguiu o nosso exemplo deverá ser '*Remove 84 rows*' (Remover 84 linhas)). Depois volte para o separador 'Faceta / Filtro'. - -O problema surgiu durante a operação de divisão no caractere de barra vertical, portanto há uma grande probabilidade do que correu mal estar relacionado com esse caractere. Vamos aplicar um filtro na coluna Categorias ao selecionar '**Filtro de texto**' no menu. Primeiro, digite um único `|` no campo da esquerda: o *OpenRefine* deverá informá-lo que existem 71,064 registos correspondentes (i.e. registos que contenham uma barra vertical) num total de 75,727. Células que não contenham a barra vertical podem ser células em branco ou células apenas com uma categoria, não tendo assim um separador. Tal como o registo 29 que apenas tem '*Scientific instruments*' (Instrumentos científicos). - -Agora insira um segundo '\|' depois do primeiro para obter '\|\|' (dupla barra vertical): poderá observar que existem 9 registos que correspondem a este padrão. Estes são, provavelmente, os 9 registos culpados pela nossa discrepância: quando o *OpenRefine* divide os registos, a dupla barra vertical é interpretada como uma quebra entre dois registos em vez de um separador duplo sem sentido. Agora, como é que corrigimos estes valores? Vá ao menu do campo das categorias e escolha '**Editar células**' \> '**Transformar**…. Bem-vindo à interface de transformação de texto personalizada, uma funcionalidade poderosa do *OpenRefine* que usa a *Google Refine Expression Language* (GREL). - -A palavra '*value*' (valor) no campo de texto representa o valor atual de cada célula, valor esse visível em baixo. Podemos modificar este valor ao aplicar-lhe funções (ver a *[GREL documentation](https://perma.cc/A228-FFBE)* (documentação da GREL, em inglês) para uma lista completa). 
Neste caso, queremos substituir a dupla barra vertical por uma única barra. Isto pode ser realizado ao inserir a seguinte expressão GREL (certifique-se que não se esquece das aspas): - -``` -value.replace('||', '|') -``` - -Em baixo do campo de texto 'Expressão', terá uma pré-visualização dos valores modificados com as duplas barras verticais removidas. Clique em **OK** e tente dividir as categorias de novo com a opção '**Editar células**' \> '**Dividir células com múltiplos valores...**'. O número de registos ficará agora nos 75,727 (clique no link '**entradas**' para verificar). - -\* \* \*\ -Outro problema que pode ser resolvido com a ajuda da GREL é o dos registos para os quais a mesma categoria é listada duas vezes. Observe o registo 41 por exemplo, cujas categorias são '*Models|Botanical specimens|Botanical Specimens|Didactic Displays|Models*' (Modelos|Espécimes botânicos|Espécimes Botânicos|Expositores Didáticos|Modelos). A categoria '*Models*' aparece duas vezes sem nenhuma razão aparente, pelo que vamos querer remover esta duplicação. Clique no triângulo da coluna das '*Categories*' e escolha 'Editar células', 'Unir células com múltiplos valores', OK. Escolha a barra vertical como separador. Agora as categorias estão listadas como antes. Em seguida selecione '**Editar células**' \> '**Transformar**', também na coluna das categorias. Ao usar a GREL podemos dividir sucessivamente as categorias na barra vertical, procurar categorias únicas e juntá-las de novo. Para isso, basta digitar a seguinte expressão: - - -``` -value.split('|').uniques().join('|') -``` - -Ao fazê-lo irá reparar que 33.006 células foram afetadas, mais de metade da coleção. - -### Exportação dos seus dados limpos - -Desde que carregou os seus dados no *OpenRefine*, todas as operações de limpeza foram executadas na memória do software, deixando os dados originais intocados. 
Se desejar salvar os dados que limpou, terá de os exportar ao clicar no menu '**Exportar**' no canto superior direito do ecrã. O *OpenRefine* suporta uma larga variedade de formatos, tais como [CSV](https://perma.cc/SVC7-TH2C) (em inglês), HTML ou Excel: selecione o que melhor se adapta a si e acrescente o seu próprio modelo de exportação ao clicar 'Criando modelo'. Poderá também exportar o seu projeto num formato interno do *OpenRefine* de modo a partilhá-lo com os outros. - -### Construção sob os dados limpos - -Depois de limpar os seus dados, poderá dar o próximo passo e explorar outros recursos interessantes do *OpenRefine*. A comunidade de utilizadores do *OpenRefine* desenvolveu duas interessantes extensões que permitem ligar os seus dados a dados que já foram publicados na web. A *[RDF Transform extension](https://perma.cc/9RTF-S6LT)* (em inglês) transforma palavras-chave de texto simples em URLs. A [NER extension](https://perma.cc/SM98-U7GG) (em inglês) permite ao usuário aplicar a *named-entity recognition* (NER) que identifica palavras chave em texto corrido e atribui-lhes um URL. - -## Conclusões - -Se apenas se lembrar de uma coisa desta lição, deverá ser o seguinte: *Todos os dados são sujos, mas poderá fazer algo quanto a isso*. Tal como mostrámos aqui, já existe muito que pode ser feito para aumentar significativamente a qualidade dos dados. Em primeiro lugar, aprendemos como é que podemos ter uma visão geral e rápida de quantos valores vazios existem no nosso data set e com que frequência é que um valor particular (e.g. uma palavra-chave) é usada ao longo da coleção. Esta lição também demonstra como resolver problemas recorrentes, tais como duplicações e inconsistências ortográficas de maneira automática com a ajuda do *OpenRefine*. Não hesite em experimentar as ferramentas de limpeza enquanto executa estas etapas numa cópia dos seus data sets, já que o *OpenRefine* permite-lhe rastrear e refazer todos os passos caso tenha cometido um erro. 
+--- +title: "Limpar dados com o OpenRefine" +slug: limpar-dados-openrefine +original: cleaning-data-with-openrefine +layout: lesson +collection: lessons +date: 2013-08-05 +translation_date: 2023-04-29 +tested-date: 2024-03-14 +lesson-testers: Antonin Delpeuch +authors: +- Seth van Hooland +- Ruben Verborgh +- Max De Wilde +reviewers: +- Patrick Burns +- Nora McGregor +editors: +- Adam Crymble +translator: +- Francisco Nabais +translation-editor: +- Aracele Torres +translation-reviewer: +- Eric Brasil +- Joana Vieira Paulino +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/427 +difficulty: 2 +activity: transforming +topics: [data-manipulation] +abstract: Este tutorial foca-se na forma como o usuário pode diagnosticar e agir perante a precisão dos dados +avatar_alt: Dois homens a lavar a roupa ao ar livre +doi: 10.46430/phpt0038 +--- + +{% include toc.html %} + +
    +Nota de Tradução: Alguns termos, por aparecerem constantemente e facilitarem a interpretação das imagens, apenas foram propositadamente traduzidos uma vez e serão colocados entre parênteses em português na primeira vez que surgem. +
    + +## Objetivos da lição + +Não aceite os dados tal como são apresentados. Esta é a principal mensagem deste tutorial que se foca na forma como os usuários podem diagnosticar e agir perante a precisão dos dados. Nesta lição, o usuário vai aprender os princípios e a prática da limpeza de dados, ao mesmo tempo que aprende como é que o [*OpenRefine*](https://openrefine.org) (em inglês) pode ser utilizado para realizar quatro tarefas essenciais que vão ajudar na limpeza de dados: + +1. Remover registos duplicados +2. Separar múltiplos valores contidos no mesmo campo +3. Analisar a distribuição de valores ao longo do Dataset +4. Agrupar diferentes representações da mesma realidade + +Estes passos são explicitados com a ajuda de uma série de exercícios baseados na coleção de metadados do *[Powerhouse](https://powerhouse.com.au/)* (em inglês), demonstrando, assim, como métodos (semi)automáticos podem ajudar na correção de erros dos dados. + +## Porque é que os historiadores devem se preocupar com a qualidade dos dados? + +Registros duplicados, valores vazios e formatos inconsistentes são fenómenos com os quais devemos estar preparados para lidar quando utilizamos data sets históricos. Esta lição vai ensinar o usuário a descobrir inconsistências nos dados contidos em tabelas ou bases de dados. À medida que, cada vez mais, partilhamos, agregamos e reutilizamos dados na web, os historiadores terão uma maior necessidade de responder a problemas inevitáveis associados à qualidade dos dados. Utilizando um programa chamado *OpenRefine*, o usuário será capaz de identificar facilmente erros sistemáticos, tais como células em branco, duplicações, inconsistências ortográficas, etc. O *OpenRefine* não só permite um diagnóstico rápido da precisão dos dados, mas também age perante certos erros de forma automática. 
+ +## Descrição da ferramenta: *OpenRefine* + +No passado, os historiadores dependiam de especialistas em tecnologias da informação para diagnosticar a qualidade dos dados e para executar tarefas de limpeza dos mesmos. Isto exigia programas computacionais personalizados quando se trabalhava com data sets consideráveis. Felizmente, o surgimento de Ferramentas Interativas de Transformação de Dados (Interactive Data Transformation tools, ou IDTs em inglês), permite que até profissionais sem habilidades técnicas aprofundadas possam realizar operações rápidas e baratas em grandes data sets. + +As Ferramentas Interativas de Transformação de Dados assemelham-se às tabelas de dados do desktop com as quais estamos familiarizados, chegando a partilhar funcionalidades com as mesmas. O usuário pode, por exemplo, usar aplicações como o Microsoft Excel para organizar os seus dados com base em vários filtros, sejam eles numéricos, alfabéticos ou até personalizados, o que permite detetar erros mais facilmente. Configurar estes filtros em tabelas de dados pode ser complicado, já que estes são uma função secundária do software. Geralmente, podemos dizer que as tabelas de dados são projetadas para funcionar em linhas ou células individuais, enquanto as Ferramentas Interativas de Transformação de Dados operam em grandes intervalos de dados ao mesmo tempo. Estas "Tabelas de dados em esteroides" fornecem uma interface integrada e amigável através da qual os usuários finais podem detetar e corrigir erros. + +Nos últimos anos, têm sido desenvolvidas várias ferramentas para a transformação de dados interativos, tais como [*Potter’s Wheel ABC*](https://perma.cc/Q6QD-E64N) (em inglês) e [*Wrangler*](https://perma.cc/Y45B-6ZLU) (em inglês). 
Aqui queremos concentrar-nos, sobretudo, no *OpenRefine* (anteriormente *Freebase Gridworks* e *Google Refine*), já que, na opinião dos autores, esta é a ferramenta mais amigável para processar e limpar eficazmente grandes quantidades de dados numa interface baseada no navegador de internet. + +Além do *[data profiling](https://perma.cc/32Z8-8EMT)* (perfil de dados) (em inglês) e das operações de limpeza, as extensões do *OpenRefine* permitem aos usuários identificar conceitos num texto desestruturado através de um processo denominado *[Named-Entity Recognition](https://perma.cc/FCB6-9DU2)* (Reconhecimento de Entidade Nomeada) (em inglês) (NER) e reconciliar os seus próprios dados com bases de conhecimento existentes. Ao fazê-lo, o *OpenRefine* pode ser uma ferramenta prática de ligação dos dados com conceitos e autoridades que já foram declarados na web por entidades como a *[Library of Congress](https://perma.cc/24QD-NP6Y)* (Biblioteca do Congresso dos Estados Unidos da América) (em inglês) ou o [OCLC](https://perma.cc/48KR-ZTAJ) (Centro de Bibliotecas de Computadores Online) (em inglês). A limpeza de dados é um pré-requisito para estas etapas; A taxa de sucesso do NER e o êxito do processo de correspondência entre os dados do usuário e as entidades externas, dependem da habilidade do mesmo de tornar estes dados o mais concretos possível. + +## Descrição do exercício: *Powerhouse* + +O *Powerhouse*, em Sydney, permite-lhe exportar gratuitamente os metadados da sua coleção no seu [sítio Web](https://powerhouse.com.au/). Este museu é um dos maiores do mundo na área da ciência e tecnologia, fornecendo acesso a quase 90,000 objetos, que vão desde motores a vapor até vidros finos e de peças de alta-costura a chips de computadores. + +O museu divulgou ativamente a sua coleção em linha e disponibilizou gratuitamente a maior parte dos seus dados. 
No seu sítio Web, era possível descarregar um ficheiro de texto separado por separadores denominado `phm-collection.tsv` e abri-lo como uma tabela de dados. O ficheiro descomprimido (58MB) contém metadados básicos (17 campos) para 75,823 objetos, sob a licença *[Creative Commons Attribution Share Alike (CCASA)](https://perma.cc/M3QW-RLW6)* (em inglês). Neste tutorial utilizaremos uma cópia dos dados que está arquivada para o usuário fazer o download (mais à frente). Isto garante que se o *Powerhouse* atualizar os seus dados, o usuário ainda vai conseguir acompanhar esta lição. + +Ao longo do processo de limpeza e de criação do perfil dos dados, a lição vai focar o campo das `'Categorias'`, que é preenchido com termos do [*Powerhouse Object Names Thesaurus* (BARTOC)](https://perma.cc/PEP6-X2LD) (em inglês). O BARTOC reconhece o uso e a ortografia australiana e reflete, de uma maneira muito direta, os pontos fortes da coleção. Nesta coleção, o usuário vai encontrar, por exemplo, mais e melhores representações da história social e das artes decorativas e menos objetos com nomes associados às belas-artes e à história natural. + +Os termos no campo das Categorias compreendem o que chamamos de [Vocabulário Controlado](https://perma.cc/FEW7-CFDB). Um Vocabulário Controlado consiste em palavras-chave que, ao utilizarem um número limitado de termos, descrevem o conteúdo de uma coleção, sendo, normalmente, um ponto de entrada importante para historiadores em data sets de bibliotecas, arquivos e museus. É, por isso, que será dada uma importância especial ao campo das 'Categorias'. Depois de ser feita a limpeza dos dados, deverá ser possível reutilizar os termos do Vocabulário Controlado para encontrar informação adicional sobre esses termos num outro lugar online. Isto é conhecido como a criação de *[Linked Data](https://perma.cc/5SRF-V3UR)* (Dados Vinculados). 
+ +### Primeiros passos: instalação do *OpenRefine* e importação de dados + +Deverá ser feito o [Download do *OpenRefine*](https://openrefine.org/download) (em inglês) e seguidas as instruções. O *OpenRefine* funciona em todas as plataformas: Windows, Mac, e Linux. Este será aberto no navegador de internet do usuário, mas é importante entender que a aplicação é executada localmente e que os dados não serão guardados online. Com o *OpenRefine* aberto no seu navegador de internet, clique em '**Language Settings**', presente no canto superior esquerdo, e altere a linguagem para '**Português**'. Os arquivos de dados estão disponíveis no *Programming Historian* como *[phm-collection](/assets/cleaning-data-with-openrefine/phm-collection.tsv)*. Por favor, faça o Download do ficheiro *phm-collection.tsv* que será utilizado ao longo deste tutorial antes de continuar. + +Na página inicial do *OpenRefine* crie um novo projeto utilizando o ficheiro de dados que fez o download e clique '**Próximo**' . A primeira linha será processada como o nome da coluna por defeito, mas será preciso desmarcar a caixa de seleção 'Usar caracter " encerrar células contendo separadores de colunas', já que as aspas dentro do ficheiro não têm qualquer significado para o *OpenRefine*. Além disto, deverá selecionar a caixa de seleção 'Tentar analisar texto de células como números' para que o *OpenRefine* detete automaticamente números. Agora deverá clicar em '**Criar projeto**'. Se tudo correr como planejado, deverá ver no canto superior esquerdo 75,814 linhas. + +O data set do *Powerhouse* consiste em metadados detalhados sobre todos os objetos da coleção incluindo o título, a descrição, as várias categorias às quais o item pertence, informação sobre a proveniência do mesmo e um link persistente para a página que hospeda o objeto dentro do site do museu. Para ter uma ideia do objeto a que correspondem os metadados, clique no link persistente e o site será aberto. 
+ +{% include figure.html filename="en-or-cleaning-data-with-openrefine-01.png" alt="Imagem de um objeto de amostra no site *Powerhouse* onde é possível observar um carro de brincar com desenhos de palhaços" caption="Figura 1: Captura de tela de um objeto de amostra no site *Powerhouse*" %} + +### Conheça os seus dados + +A primeira coisa a fazer é observar e conhecer os seus dados. Poderá inspecionar os diferentes valores de dados exibindo-os em `facetas e filtros`. Poderá considerar a [faceta](https://perma.cc/HKN9-NYXZ) (em inglês) uma lente através da qual é possível ver um subconjunto específico de dados baseados no critério da sua escolha. Clique no triângulo em frente ao nome da coluna, selecione Faceta e crie uma Faceta. Por exemplo, experimente o `Faceta de texto` ou o `Faceta numérica`, dependendo da natureza dos valores contidos nesses campos (os valores numéricos estão expostos a verde). No entanto, tenha em atenção que estas Facetas de texto têm uma maior eficácia em campos com valores redundantes (*Categories* (categorias), por exemplo); Se ocorrer o erro 'Muitas para mostrar' você pode escolher aumentar o limite da contagem de opções acima do padrão dos 2,000. Todavia, um limite muito alto pode tornar o aplicativo mais lento (por norma, 5,000 é uma escolha segura). Facetas numéricas não têm esta restrição. Para mais opções, selecione Facetas personalizadas : Faceta por valores em branco, por exemplo, torna-se útil na procura de quantos valores foram preenchidos em cada campo. Vamos explorar mais detalhadamente estas funcionalidades nos exercícios a seguir. + +### Remoção de linhas em branco + +Uma coisa que irá reparar quando criar Facetas numéricas para a coluna do *Record ID* (Identificador do registo), é que existem três linhas sem dados. Poderá encontrá-las ao desmarcar a caixa de seleção numérica, deixando apenas valores não-numéricos. 
Na verdade, estes valores não estão realmente a branco, mas contêm apenas um caractere de espaço em branco, que pode ser visível se mover o seu cursor para onde deveria estar esse valor e clicar no botão '**edit**' (Editar) que aparece. Para remover estas linhas, clique no triângulo em frente à primeira coluna denominada por '**Todos**' , selecione '**Editar linhas**' e depois '**Remover as linhas que corresponderam**'. Feche a faceta numérica para verificar que permanecem agora 75,811 linhas. + +### Remoção de duplicações + +O segundo passo é detetar e remover duplicações. Estas podem ser identificadas ao classificar colunas, como o *Record ID*, por um valor único (neste caso vamos assumir que o *Record ID* é, de facto, único para cada entrada). Esta operação pode ser realizada ao clicar no triângulo à esquerda do *Record ID*, depois devemos selecionar a opção '**Ordenar**…' e escolher o marcador '**números**'. No *OpenRefine*, ordenar é apenas uma ajuda visual, a não ser que torne a reordenação permanente. Para o fazer, clique na opção Ordenar por cima do *Marks* (Marcas) e, em seguida, deverá escolher a opção '**Reordenar linhas permanentemente**'. Se se esquecer de fazer isto, posteriormente, irá ter resultados imprevisíveis neste tutorial. + +Linhas idênticas estão agora adjacentes umas às outras. Em seguida, deixe em branco as linhas do *Record ID* que têm o mesmo *Record ID* que as acima delas, marcando-as como duplicações. Para o fazer, deve clicar no triângulo do *Record ID*, escolher **Editar células** \> **Transformar em vazias abaixo**. A mensagem de *status* dirá que 84 colunas foram afetadas (se se esqueceu de reordenar as linhas permanentemente, apenas vão ser afetadas 19 colunas; em caso afirmativo, desfaça a operação Transformar em vazias abaixo no separador 'Desfazer/Refazer' e volte ao parágrafo anterior refazendo-o de modo a ter a certeza que as linhas estão reordenadas e não apenas classificadas). 
Elimine essas linhas ao criar uma faceta em '**Transformar em vazias abaixo**' na coluna do *Record ID* ('**Faceta**' \> '**Facetas personalizadas**' \> '**Faceta por valores em branco**') em seguida deverá selecionar as 84 linhas a branco clicando em '**true**' (Verdade) e removê-las usando o triângulo da coluna '**Todos**' ('**Editar linhas**' \> '**Remover as linhas que corresponderam**'). Quando fechar a faceta deverá observar que existem agora 75,727 linhas únicas. + +O usuário deverá ter uma atenção especial ao eliminar duplicações. Na etapa mencionada acima, assumimos que o data set possui um campo com valores únicos, indicando que uma linha inteira representa uma duplicação. Este não é necessariamente o caso e, por isso, devemos ter cuidado e verificar manualmente se a linha inteira representa uma duplicação ou não. + +### Atomização + +Depois de remover os registos duplicados, podemos focar-nos na coluna *Categories*. Em média, foram atribuídas 2.25 categorias a cada objeto. Estas categorias estão contidas no mesmo campo, separadas por uma barra vertical '\|'. O registo 9, por exemplo, contém três: 'Mineral samples\|Specimens\|Mineral Samples-Geological' (Amostras minerais\|Espécimes\|Amostras minerais-Geológicas). Para analisar em detalhe o uso destas palavras-chave, os valores do campo das categorias devem ser separados em células individuais com base na barra vertical, expandindo os 75,727 registos em 170,167 linhas. Escolha '**Editar células**', '**Dividir células com múltiplos valores**', digitando '**\|**' como separador de valores. O *OpenRefine* irá informá-lo que tem agora 170,167 linhas. + +É importante compreender totalmente o paradigma das linhas/entradas. Torne a coluna *Record ID* visível para ver o que se passa. Pode mudar entre a opção de visualização 'linhas' e 'entradas' ao clicar nos links que dão pelos mesmos nomes, logo em cima do cabeçalho das colunas. 
Na opção 'linhas', cada linha representa um par de *Record IDs* e uma única categoria, permitindo a manipulação de cada uma individualmente. A opção 'entradas' tem uma entrada para cada *Record ID*, que pode ter categorias diferentes em linhas diferentes (agrupadas a cinzento ou branco), mas cada registo é manipulado como um todo. Concretamente, existem agora 170,167 atribuições de categorias (Linhas), separadas em 75,736 itens de coleção (Entradas). Pode também ter reparado que estamos com mais 9 registos do que os originais 75,727, mas não se preocupe com isso agora, iremos voltar a esta pequena diferença mais tarde. + +### Facetting e agrupamento + +Uma vez que o conteúdo do campo foi devidamente atomizado, filtros, facetas e agrupamentos podem ser aplicados para fornecer uma visão rápida e geral dos problemas clássicos dos metadados. Ao aplicar a faceta customizada '`Faceta por valores em branco`' à coluna *Categories*, é possível identificar imediatamente os 461 registos que não têm uma categoria, representando 0.6% da coleção. Ao aplicar uma faceta de texto ao campo das categorias podemos ter uma visão geral das 4,935 diferentes categorias utilizadas na coleção (o limite padrão é 2,000, mas poderá clicar na opção '**Definir o limite da contagem da escolha**' para aumentá-la para 5,000). Os títulos podem ser ordenados alfabeticamente ('nome') ou por frequência ('quantidade'), fornecendo ao utilizador uma lista dos termos mais usados para indexar a coleção. Os três títulos principais são 'Numismática' (*Numismatics*) (8,041), 'Cerâmica' (*Ceramics*) (7,390) e 'Roupas e vestuário' (*Clothing and dress*) (7,279). + +Após aplicar a faceta, o *OpenRefine* propõe aglomerar as escolhas da faceta com base em vários métodos de similaridade. Tal como a Figura 2 demonstra, o agrupamento permite ao usuário resolver problemas relacionados com inconsistências, o uso incoerente tanto da forma singular como plural e erros de ortografia simples. 
O *OpenRefine* apresenta os valores relacionados e propõe uma fusão resultante no valor mais recorrente. Deverá selecionar a opção '**Agrupar**' para abrir o comando de uniformização dos termos, em seguida, escolha os valores que deseja agrupar ao selecionar as caixas individualmente ou ao clicar '**Marcar todos**' na parte inferior e, por fim, '**Unir selecionados e Re-agrupar**'. + +{% include figure.html filename="tr-pt-cleaning-data-with-openrefine-2.png" alt="Interface do *OpenRefine* referente ao agrupamento e edição da coluna 'Categories' em que é possível observar os métodos de agrupamento e ainda as diferenças detetadas nesta coluna" caption="Figura 2: Visão geral de alguns agrupamentos" %} + +O método padrão de aglomeramento não é muito complexo, portanto ainda não encontra todos os aglomerados. Experimente com diferentes métodos para ver quais são os resultados que estes produzem. Deverá ter cuidado: alguns métodos podem ser muito agressivos e alguns valores, que não deverão estar juntos, podem acabar agrupados. Agora que os valores foram agrupados individualmente, podemos colocá-los de volta numa única célula. Clique no triângulo das *categories* e escolha **Editar células**, **Unir células com múltiplos valores**, escolha a barra vertical ('\|') como separador, **OK**. As linhas têm agora a mesma aparência que tinham antes, com um campo de categorias com vários valores. + +### Aplicação de transformações *ad-hoc* através do uso de expressões GREL + +Relembre-se que existiu um aumento no número de registos depois do processo de divisão: nove registos apareceram do nada. Para encontrar a causa desta disparidade, precisamos de voltar atrás, antes da divisão das categorias em linhas separadas. Para fazer isso, altere o separador 'Desfazer / Refazer' à direita do separador 'Faceta / Filtro' e vai obter um histórico de todas as ações que executou desde que o projeto foi criado. 
Selecione o passo antes de '*Split multi-valued cells in column Categories*' (Dividir células com vários valores na coluna Categorias) (se seguiu o nosso exemplo deverá ser '*Remove 84 rows*' (Remover 84 linhas)). Depois volte para o separador 'Faceta / Filtro'. + +O problema surgiu durante a operação de divisão no caractere de barra vertical, portanto há uma grande probabilidade do que correu mal estar relacionado com esse caractere. Vamos aplicar um filtro na coluna Categorias ao selecionar '**Filtro de texto**' no menu. Primeiro, digite um único `|` no campo da esquerda: o *OpenRefine* deverá informá-lo que existem 71,064 registos correspondentes (i.e. registos que contenham uma barra vertical) num total de 75,727. Células que não contenham a barra vertical podem ser células em branco ou células apenas com uma categoria, não tendo assim um separador. Tal como o registo 29 que apenas tem '*Scientific instruments*' (Instrumentos científicos). + +Agora insira um segundo '\|' depois do primeiro para obter '\|\|' (dupla barra vertical): poderá observar que existem 9 registos que correspondem a este padrão. Estes são, provavelmente, os 9 registos culpados pela nossa discrepância: quando o *OpenRefine* divide os registos, a dupla barra vertical é interpretada como uma quebra entre dois registos em vez de um separador duplo sem sentido. Agora, como é que corrigimos estes valores? Vá ao menu do campo das categorias e escolha '**Editar células**' \> '**Transformar**…'. Bem-vindo à interface de transformação de texto personalizada, uma funcionalidade poderosa do *OpenRefine* que usa a *Google Refine Expression Language* (GREL). + +A palavra '*value*' (valor) no campo de texto representa o valor atual de cada célula, valor esse visível em baixo. Podemos modificar este valor ao aplicar-lhe funções (ver a *[GREL documentation](https://perma.cc/A228-FFBE)* (documentação da GREL, em inglês) para uma lista completa). 
Neste caso, queremos substituir a dupla barra vertical por uma única barra. Isto pode ser realizado ao inserir a seguinte expressão GREL (certifique-se que não se esquece das aspas): + +``` +value.replace('||', '|') +``` + +Em baixo do campo de texto 'Expressão', terá uma pré-visualização dos valores modificados com as duplas barras verticais removidas. Clique em **OK** e tente dividir as categorias de novo com a opção '**Editar células**' \> '**Dividir células com múltiplos valores...**'. O número de registos ficará agora nos 75,727 (clique no link '**entradas**' para verificar). + +\* \* \*\ +Outro problema que pode ser resolvido com a ajuda da GREL é o dos registos para os quais a mesma categoria é listada duas vezes. Observe o registo 41 por exemplo, cujas categorias são '*Models|Botanical specimens|Botanical Specimens|Didactic Displays|Models*' (Modelos|Espécimes botânicos|Espécimes Botânicos|Expositores Didáticos|Modelos). A categoria '*Models*' aparece duas vezes sem nenhuma razão aparente, pelo que vamos querer remover esta duplicação. Clique no triângulo da coluna das '*Categories*' e escolha 'Editar células', 'Unir células com múltiplos valores', OK. Escolha a barra vertical como separador. Agora as categorias estão listadas como antes. Em seguida selecione '**Editar células**' \> '**Transformar**', também na coluna das categorias. Ao usar a GREL podemos dividir sucessivamente as categorias na barra vertical, procurar categorias únicas e juntá-las de novo. Para isso, basta digitar a seguinte expressão: + + +``` +value.split('|').uniques().join('|') +``` + +Ao fazê-lo irá reparar que 33.006 células foram afetadas, mais de metade da coleção. + +### Exportação dos seus dados limpos + +Desde que carregou os seus dados no *OpenRefine*, todas as operações de limpeza foram executadas na memória do software, deixando os dados originais intocados. 
Se desejar salvar os dados que limpou, terá de os exportar ao clicar no menu '**Exportar**' no canto superior direito do ecrã. O *OpenRefine* suporta uma larga variedade de formatos, tais como [CSV](https://perma.cc/SVC7-TH2C) (em inglês), HTML ou Excel: selecione o que melhor se adapta a si e acrescente o seu próprio modelo de exportação ao clicar 'Criando modelo'. Poderá também exportar o seu projeto num formato interno do *OpenRefine* de modo a partilhá-lo com os outros. + +### Construção sobre os dados limpos + +Depois de limpar os seus dados, poderá dar o próximo passo e explorar outros recursos interessantes do *OpenRefine*. A comunidade de utilizadores do *OpenRefine* desenvolveu duas interessantes extensões que permitem ligar os seus dados a dados que já foram publicados na web. A *[RDF Transform extension](https://perma.cc/9RTF-S6LT)* (em inglês) transforma palavras-chave de texto simples em URLs. A [NER extension](https://perma.cc/SM98-U7GG) (em inglês) permite ao usuário aplicar a *named-entity recognition* (NER) que identifica palavras-chave em texto corrido e atribui-lhes um URL. + +## Conclusões + +Se apenas se lembrar de uma coisa desta lição, deverá ser o seguinte: *Todos os dados são sujos, mas poderá fazer algo quanto a isso*. Tal como mostrámos aqui, já existe muito que pode ser feito para aumentar significativamente a qualidade dos dados. Em primeiro lugar, aprendemos como é que podemos ter uma visão geral e rápida de quantos valores vazios existem no nosso data set e com que frequência é que um valor particular (e.g. uma palavra-chave) é usada ao longo da coleção. Esta lição também demonstra como resolver problemas recorrentes, tais como duplicações e inconsistências ortográficas de maneira automática com a ajuda do *OpenRefine*. Não hesite em experimentar as ferramentas de limpeza enquanto executa estas etapas numa cópia dos seus data sets, já que o *OpenRefine* permite-lhe rastrear e refazer todos os passos caso tenha cometido um erro. 
diff --git a/pt/licoes/manipulacao-transformacao-dados-r.md b/pt/licoes/manipulacao-transformacao-dados-r.md index f61205339e..51fa6284ea 100644 --- a/pt/licoes/manipulacao-transformacao-dados-r.md +++ b/pt/licoes/manipulacao-transformacao-dados-r.md @@ -1,422 +1,422 @@ ---- -title: Manipulação e transformação de dados com R -slug: manipulacao-transformacao-dados-r -layout: lesson -collection: lessons -date: 2017-08-01 -translation_date: 2022-11-26 -authors: -- Nabeel Siddiqui -editors: -- Ian Milligan -reviewers: -- Lauren Tilton -- Ryan Deschamps -translator: -- Ian Araujo -translation-editor: -- Jimmy Medeiros -translation-reviewer: -- Suemi Higuchi -- Joana Paulino -difficulty: 2 -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/397 -activity: transforming -topics: [data-manipulation, data-management, distant-reading, r, data-visualization] -abstract: "Esta lição explora como os investigadores podem tornar seus dados organizados, entender os pacotes do R para manipulação de dados e conduzir análises de dados básicas usando esta linguagem." -original: data-wrangling-and-management-in-r -avatar_alt: Barra de sabão -doi: 10.46430/phpt0035 ---- - -{% include toc.html %} - -## Requisitos -Nesta lição consideramos que já possui algum conhecimento da linguagem R. Se ainda não completou a lição [Noções básicas de R com dados tabulares](/pt/licoes/nocoes-basicas-R-dados-tabulares), recomendamos que o faça primeiro. Ter experiência com outras linguagens de programação também pode ser benéfico. Se está buscando por onde começar aprendendo outras linguagens, recomendamos os excelentes tutoriais de Python do *Programming Historian*. - -Nota da tradução: o conteúdo da programação utilizado na lição original foi alterado para esta versão em português para que o contexto e os exemplos sejam próximos da realidade da comunidade que fala o idioma. 
Por conta disso, parte do texto da lição traduzida, bem como os exemplos e as interpretações dos dados são diferente da lição original. No entanto, o conteúdo e a estrutura da lição são fidedignos à lição original, como os tipos de dados e as análises desenvolvidas. Mudamos, por exemplo, a comparação entre Mississipi e Virgínia por Brasil e Argentina, mantendo os recursos e procedimentos realizados por Nabeel Siddiqui. - -## Objetivos da lição -Ao fim desta lição, você: - -1. Saberá como tornar seus dados bem ordenados (*tidy*) e entenderá por que isso é importante. -2. Terá assimilado o uso do pacote [dplyr](https://cran.r-project.org/web/packages/dplyr/index.html) (em inglês) e sua aplicação na manipulação e controle de dados. -3. Estará familiarizado com o operador *pipe* `%>%` na linguagem R e verá como ele pode auxiliar na criação de códigos mais legíveis. -4. Terá ganho experiência com análise exploratória de dados através de exemplos básicos de manipulação de dados. - -## Introdução -Os dados que encontra disponíveis nas diversas plataformas raramente estão no formato adequado para serem analisados, e precisará manipulá-los antes de explorar as perguntas de seu interesse. Isso pode tomar mais tempo que a própria análise dos dados! Neste tutorial, vamos aprender técnicas básicas para manipulação, gestão e controle de dados usando R. Especificamente, nos debruçaremos sobre a filosofia do ["*tidy data*"](https://www.jstatsoft.org/article/view/v059i10) (em inglês) conforme apresentada por Hadley Wickham. - -De acordo com [Wickham](http://hadley.nz/) (em inglês), os dados estão *tidy* ou bem-organizados quando satisfazem três critérios chave: - -1. Cada unidade de observação está em uma linha -2. Cada variável está em uma coluna -3. Cada valor possui a sua própria célula. - -Estar atento a estes critérios nos permite reconhecer quando os nossos dados estão adequados ou não. 
Também nos fornece um esquema padrão e um conjunto de soluções para lidar com alguns dos problemas mais comuns encontrados em *datasets* "mal-arranjados", como por exemplo: - -1. Nomes de colunas como valores ao invés de nomes de variáveis -2. Múltiplas variáveis contidas em uma única coluna -3. Variáveis armazenadas tanto em linhas quanto em colunas -4. Unidades de observação de diferentes categorias armazenadas na mesma tabela -5. Uma única unidade de observação armazenada em múltiplas tabelas. - -Talvez o mais importante seja que manter os dados nesse formato nos permite utilizar uma série de pacotes do ["tidyverse,"](http://tidyverse.org/) (em inglês), concebidos para trabalhar especificamente com dados neste formato *tidy*. Dessa forma, assegurando-nos de que os dados de entrada e de saída estão bem organizados, precisaremos apenas de um pequeno conjunto de ferramentas para resolver um grande número de questões. Podemos combinar, manipular e dividir os *datasets* que criamos, conforme considerarmos mais adequado. - -Neste tutorial focaremos no pacote [dplyr](https://cran.r-project.org/web/packages/dplyr/index.html) (em inglês) presente no tidyverse, mas também é importante mencionar alguns outros que serão vistos na lição: - -* [**magittr**](http://magrittr.tidyverse.org) (em inglês) -- Este pacote nos garante acesso ao operador *pipe* `%>%`, que torna o nosso código mais legível. -* [**ggplot2**](http://ggplot2.tidyverse.org/) (em inglês) -- Este pacote utiliza a ["Gramática de Gráficos"](http://www.springer.com/us/book/9780387245447) (em inglês) para fornecer uma forma fácil de visualizar nossos dados. -* [**tibble**](http://tibble.tidyverse.org/) (em inglês) -- Este pacote nos fornece uma releitura dos tradicionais *data frames*, mais fáceis de serem trabalhados e visualizados. - -Instale o "tidyverse", se ainda não o fez, e carregue-o antes de começarmos. 
Além disso, certifique-se de que possui instaladas a -[versão mais recente do R](https://cran.rstudio.com/) e a [versão mais recente do RStudio](https://www.rstudio.com/products/rstudio/download/) compatíveis com o seu sistema operacional. - -Copie o código a seguir para o seu RStudio. Para executá-lo, precisa selecionar as linhas e pressionar Ctrl+Enter (Command+Enter no Mac OS): - - # Instala e carrega a biblioteca tidyverse - # Não se preocupe caso demore um pouco - - install.packages("tidyverse") - library(tidyverse) - - -## Um exemplo do dplyr em ação -Vejamos um exemplo de como o dplyr pode auxiliar historiadores. Vamos utilizar o pacote "dados" [^1] e importar alguns indicadores socioeconômicos de países entre 1952 e 2007. - -O pacote "remotes" permite a instalação de pacotes R a partir de repositórios remotos, incluindo o GitHub, como é o caso de "dados". - - # Instala e carrega as bibliotecas "remotes" e "dados" - - install.packages("remotes") - library(remotes) - - remotes::install_github("cienciadedatos/dados") - library(dados) - -Em seguida, para termos acesso ao *dataset* "dados_gapminder", que se encontra no pacote "dados", basta executar o seguinte código: - - # Cria o objeto dados_socioeconomicos_paises e atribui a ele os elementos de dados_gapminder - - dados_socioeconomicos_paises <- dados_gapminder - -Os dados do [Gapminder](https://www.gapminder.org/) (em inglês) contêm o progresso de países ao longo do tempo, observando as estatísticas de alguns índices. Após importar o *dataset*, notará que ele possui seis variáveis: país, continente, ano, expectativa de vida, população e PIB *per capita*. Os dados já estão em formato *tidy*, possibilitando uma infinidade de opções para exploração futura. - -Neste exemplo, vamos visualizar o crescimento populacional de Brasil e Argentina ao longo dos anos. Para isso utilizaremos o pacote dplyr a fim de filtrar os dados que contenham apenas informações dos países de nosso interesse. 
Em seguida, utilizaremos o ggplot2 para visualizar tais dados. Este exercício é apenas uma breve demonstração do que é possível fazer com o dplyr, portanto, não se preocupe se não entender o código por enquanto. - - # Filtra os países desejados (Brasil e Argentina) - - dados_brasil_argentina <- dados_socioeconomicos_paises %>% - filter(pais %in% c("Brasil", "Argentina")) - - # Visualiza a população dos dois países - - ggplot(data = dados_brasil_argentina, aes(x = ano, y = populacao, color = pais)) + - geom_line() + - geom_point() - -{% include figure.html filename="pt-tr-manipulacao-transformacao-dados-r-01.png" alt="Imagem com a representação de um gráfico de linhas com dados da população por anos para o Brasil e a Argentina" caption="Gráfico da população de Brasil e Argentina, ao longo dos anos" %} - -Como podemos observar, a população absoluta do Brasil é consideravelmente maior em comparação com a população da Argentina. Embora isso pareça óbvio devido ao tamanho do território brasileiro, o código nos fornece uma base sobre a qual podemos formular uma infinidade de questões similares. Por exemplo, com uma pequena mudança no código podemos criar um gráfico similar com dois países diferentes, como Portugal e Bélgica. - - # Filtra os países desejados (Portugal e Bélgica) - - dados_portugal_belgica <- dados_socioeconomicos_paises %>% - filter(pais %in% c("Portugal", "Bélgica")) - - # Visualiza a população dos dois países - - ggplot(data = dados_portugal_belgica, aes(x = ano, y = populacao, color = pais)) + - geom_line() + - geom_point() - -{% include figure.html filename="pt-tr-manipulacao-transformacao-dados-r-02.png" alt="Imagem com a representação de um gráfico de linhas com dados da população por anos para a Bélgica e Portugal" caption="Gráfico da população de Portugal e Bégica, ao longo dos anos" %} - -Promover mudanças rápidas no código e revisar nossos dados é parte fundamental do processo de análise exploratória de dados (AED). 
Ao invés de tentar "provar" uma hipótese, a análise exploratória nos ajuda a entender melhor os dados e a levantar questões sobre eles. Para os historiadores, a AED fornece uma forma fácil de saber quando aprofundar mais em um tema e quando voltar atrás, e esta é uma área onde o R se sobressai. - -## Operador Pipe - -Antes de olharmos para o dplyr, precisamos entender o que é o operador *pipe* `%>%` no R, uma vez que iremos utilizá-lo em muitos exemplos adiante. Como mencionado anteriormente, este operador é parte do pacote [magrittr](https://cran.r-project.org/web/packages/magrittr/vignettes/magrittr.html) (em inglês), criado por [Stefan Milton Bache](http://stefanbache.dk/) e [Hadley Wickham](http://hadley.nz/), e está incluída no tidyverse. O seu nome é uma referência ao pintor surrealista Rene Magritte, criador da obra ["A Traição das Imagens"](https://www.renemagritte.org/the-treachery-of-images.jsp), que mostra um cachimbo com a frase "isto não é um cachimbo" (*ceci n'est pas une pipe*, em francês). - -O operador *pipe* `%>%` permite passar o que está à sua esquerda como a primeira variável em uma função especificada à sua direita. Embora possa parecer estranho no início, uma vez que aprende a usar o *pipe* descobrirá que ele torna seu código mais legível, evitando instruções aninhadas. Não se preocupe se estiver um pouco confuso por agora. Tudo ficará mais claro à medida que observarmos os exemplos. - -Vamos dizer que estamos interessados em obter a raiz quadrada de cada população e, então, somar todas as raízes antes de calcular a média. Obviamente, essa não é uma medição útil, mas demonstra a rapidez com que o código do R pode se tornar difícil de ler. Normalmente, usaríamos declarações aninhadas: - - mean(sum(sqrt(dados_socioeconomicos_paises$populacao))) - - ## [1] 6328339 - -Veja que com tantos comandos aninhados fica difícil lembrar quantos parênteses são necessários no final da linha, tornando o código complicado de ler. 
Para atenuar esse problema, algumas pessoas criam vetores temporários entre cada chamada de função. - - # Obtém a raiz quadrada da população de todos os países - - vetor_raiz_populacao <- sqrt(dados_socioeconomicos_paises$populacao) - - # Obtém a soma de todas as raízes da variável temporária - - soma_vetor_raizes_populacao <- sum(vetor_raiz_populacao) - - # Obtém a média da variável temporária - - media_soma_vetor_raizes_populacao <- mean(soma_vetor_raizes_populacao) - - # Exibe a média - - media_soma_vetor_raizes_populacao - - ## [1] 6328339 - -Embora obtenha o mesmo resultado, este código é muito mais legível. No entanto, se esquecer de excluir os vetores temporários, seu espaço de trabalho pode se tornar confuso. O operador *pipe* faz esse trabalho por você. Aqui está o mesmo código usando o operador *pipe*: - - dados_socioeconomicos_paises$populacao %>% sqrt %>% sum %>% mean - - ## [1] 6328339 - -Este código é mais fácil de ler que os anteriores e pode torná-lo ainda mais limpo escrevendo em linhas diferentes. - - # Certifique-se de colocar o operador no final da linha - - dados_socioeconomicos_paises$populacao %>% - sqrt %>% - sum %>% - mean - - ## [1] 6328339 - -Note que os vetores ou *data frames* criados pelo operador pipe são descartados quando se completa a operação. Se quiser salvar o resultado da operação, será preciso atribuí-lo a uma nova variável: - - vetor_permanente_media_soma_populacao <- dados_socioeconomicos_paises$populacao %>% - sqrt %>% - sum %>% - mean - - vetor_permanente_media_soma_populacao - - ## [1] 6328339 - -Agora que adquirimos uma compreensão do operador *pipe*, estamos prontos para começar a analisar e manipular alguns dados. 
Ao longo da lição vamos continuar trabalhando com o *dataset* dados_gapminder: - - # Certifique-se de que o pacote "dados" está instalado e carregado aantes de proceder conforme abaixo - - dados_gapminder - - ## # A tibble: 1,704 x 6 - ## pais continente ano expectativa_de_vida populacao pib_per_capita - ## - ## 1 Afeganistão Ásia 1952 28.8 8425333 779. - ## 2 Afeganistão Ásia 1957 30.3 9240934 821. - ## 3 Afeganistão Ásia 1962 32.0 10267083 853. - ## 4 Afeganistão Ásia 1967 34.0 11537966 836. - ## 5 Afeganistão Ásia 1972 36.1 13079460 740. - ## 6 Afeganistão Ásia 1977 38.4 14880372 786. - ## 7 Afeganistão Ásia 1982 39.9 12881816 978. - ## 8 Afeganistão Ásia 1987 40.8 13867957 852. - ## 9 Afeganistão Ásia 1992 41.7 16317921 649. - ## 10 Afeganistão Ásia 1997 41.8 22227415 635. - ## # … with 1,694 more rows - -Como pode observar, este *dataset* contém o nome do país, seu continente e o ano de registro, além dos indicadores de expectativa de vida, total da população e PIB *per capita*, em determinados anos. Conforme mencionamos acima, antes de analisar os dados é importante verificar se estes estão bem ordenados no formato *tidy*. Relembrando os três critérios discutidos, podemos dizer que sim, o *dataset* encontra-se organizado e pronto para ser trabalhado com o pacote dplyr. - -## O que é dplyr? -[Dplyr](https://cran.r-project.org/web/packages/dplyr/vignettes/dplyr.html) (em inglês) também é parte do tidyverse, fornecendo funções para manipulação e transformação dos dados. Porque estamos mantendo nossos dados bem organizados, precisaremos apenas de um pequeno conjunto de ferramentas para explorá-los. Em comparação com o pacote básico do R, usando o dplyr em nosso código, fica geralmente mais rápido e há a garantia de que os dados resultantes (*output*) estarão bem ordenados uma vez que os dados de entrada (*input*) também estarão. 
Talvez o mais importante seja que o dplyr torna o nosso código mais fácil de ser lido e utiliza "verbos" que são, na maioria das vezes, intuitivos. Cada função do dplyr corresponde a um desses verbos, sendo cinco principais: filtrar (`filter`), selecionar (`select`), ordenar (`arrange`), modificar (`mutate`) e sumarizar (`summarise`). Vamos observar individualmente como cada uma dessas funções funciona na prática. - -### Selecionar (select) - -Se olharmos para o *dataset* dados_gapminder, vamos observar a presença de seis colunas, cada uma contendo diferentes informações. Podemos escolher, para a nossa análise, visualizar apenas algumas dessas colunas. A função `select()` do dplyr nos permite fazer isso. O primeiro argumento da função é o *data frame* que desejamos manipular e os seguintes são os nomes das colunas que queremos manter: - - # Remove as colunas de dados_gapminder usando select() - # Note que não é necessário acrescentar o nome da coluna com o símbolo $ (dólar) ao final de dados_gapminder visto que o dplyr automaticamente assume que "," (vírgula) representa E (AND em inglês) - - select(dados_gapminder, pais, ano, expectativa_de_vida) - - ## # A tibble: 1,704 x 3 - ## pais ano expectativa_de_vida - ## - ## 1 Afeganistão 1952 28.8 - ## 2 Afeganistão 1957 30.3 - ## 3 Afeganistão 1962 32.0 - ## 4 Afeganistão 1967 34.0 - ## 5 Afeganistão 1972 36.1 - ## 6 Afeganistão 1977 38.4 - ## 7 Afeganistão 1982 39.9 - ## 8 Afeganistão 1987 40.8 - ## 9 Afeganistão 1992 41.7 - ## 10 Afeganistão 1997 41.8 - ## # … with 1,694 more rows - -Vejamos como escrever o mesmo código utilizando o operador *pipe* `%>%`: - - dados_gapminder %>% - select(pais, ano, expectativa_de_vida) - - ## # A tibble: 1,704 x 3 - ## pais ano expectativa_de_vida - ## - ## 1 Afeganistão 1952 28.8 - ## 2 Afeganistão 1957 30.3 - ## 3 Afeganistão 1962 32.0 - ## 4 Afeganistão 1967 34.0 - ## 5 Afeganistão 1972 36.1 - ## 6 Afeganistão 1977 38.4 - ## 7 Afeganistão 1982 39.9 - ## 8 Afeganistão 1987 40.8 - ## 
9 Afeganistão 1992 41.7 - ## 10 Afeganistão 1997 41.8 - ## # … with 1,694 more rows - -Fazer referência a cada uma das colunas que desejamos manter apenas para nos livrar de uma é um tanto tedioso. Podemos usar o símbolo de menos (-) para demonstrar que queremos remover uma coluna. - - dados_gapminder %>% - select(-continente) - - ## # A tibble: 1,704 x 5 - ## pais ano expectativa_de_vida populacao pib_per_capita - ## - ## 1 Afeganistão 1952 28.8 8425333 779. - ## 2 Afeganistão 1957 30.3 9240934 821. - ## 3 Afeganistão 1962 32.0 10267083 853. - ## 4 Afeganistão 1967 34.0 11537966 836. - ## 5 Afeganistão 1972 36.1 13079460 740. - ## 6 Afeganistão 1977 38.4 14880372 786. - ## 7 Afeganistão 1982 39.9 12881816 978. - ## 8 Afeganistão 1987 40.8 13867957 852. - ## 9 Afeganistão 1992 41.7 16317921 649. - ## 10 Afeganistão 1997 41.8 22227415 635. - ## # … with 1,694 more rows - -### Filtrar (filter) - -A função `filter()` faz o mesmo que a função select, mas ao invés de escolher o nome da coluna, podemos usá-lo para filtrar linhas usando um teste de requisito. Por exemplo, se quisermos selecionar somente os registros dos países em 2007: - - dados_gapminder %>% - filter(ano == 2007) - - ## # A tibble: 142 x 6 - ## pais continente ano expectativa_de_vida populacao pib_per_capita - ## - ## 1 Afeganistão Ásia 2007 43.8 31889923 975. - ## 2 Albânia Europa 2007 76.4 3600523 5937. - ## 3 Argélia África 2007 72.3 33333216 6223. - ## 4 Angola África 2007 42.7 12420476 4797. - ## 5 Argentina Américas 2007 75.3 40301927 12779. - ## 6 Austrália Oceania 2007 81.2 20434176 34435. - ## 7 Áustria Europa 2007 79.8 8199783 36126. - ## 8 Bahrein Ásia 2007 75.6 708573 29796. - ## 9 Bangladesh Ásia 2007 64.1 150448339 1391. - ## 10 Bélgica Europa 2007 79.4 10392226 33693. - ## # … with 132 more rows - -### Modificar (mutate) - -A função `mutate()` permite adicionar uma coluna ao seu *dataset*. No momento, temos país e continente em duas colunas separadas. 
Podemos utilizar a função `paste()` para combinar as duas informações e especificar um separador. Vamos colocá-las em uma única coluna chamada "localizacao". - - dados_gapminder %>% - mutate(localizacao = paste(pais, continente, sep = ", ")) - - ## # A tibble: 1,704 x 7 - ## pais continente ano expectativa_de_vida populacao pib_per_capita localizacao - ## - ## 1 Afeganistão Ásia 1952 28.8 8425333 779. Afeganistão, Ásia - ## 2 Afeganistão Ásia 1957 30.3 9240934 821. Afeganistão, Ásia - ## 3 Afeganistão Ásia 1962 32.0 10267083 853. Afeganistão, Ásia - ## 4 Afeganistão Ásia 1967 34.0 11537966 836. Afeganistão, Ásia - ## 5 Afeganistão Ásia 1972 36.1 13079460 740. Afeganistão, Ásia - ## 6 Afeganistão Ásia 1977 38.4 14880372 786. Afeganistão, Ásia - ## 7 Afeganistão Ásia 1982 39.9 12881816 978. Afeganistão, Ásia - ## 8 Afeganistão Ásia 1987 40.8 13867957 852. Afeganistão, Ásia - ## 9 Afeganistão Ásia 1992 41.7 16317921 649. Afeganistão, Ásia - ## 10 Afeganistão Ásia 1997 41.8 22227415 635. Afeganistão, Ásia - ## # … with 1,694 more rows - -Novamente, é preciso lembrar que o dplyr não salva os dados, nem transforma o original. Em vez disso, ele cria um *data frame* temporário em cada etapa. Se deseja manter os dados, é necessário criar uma variável permanente. - - dados_gapminder_localizacao <- dados_gapminder %>% - mutate(localizacao = paste(pais, continente, sep = ", ")) - - # Visualiza a nova tabela criada com a localização adicionada - - dados_gapminder_localizacao - - ## # A tibble: 1,704 x 7 - ## pais continente ano expectativa_de_vida populacao pib_per_capita localizacao - ## - ## 1 Afeganistão Ásia 1952 28.8 8425333 779. Afeganistão, Ásia - ## 2 Afeganistão Ásia 1957 30.3 9240934 821. Afeganistão, Ásia - ## 3 Afeganistão Ásia 1962 32.0 10267083 853. Afeganistão, Ásia - ## 4 Afeganistão Ásia 1967 34.0 11537966 836. Afeganistão, Ásia - ## 5 Afeganistão Ásia 1972 36.1 13079460 740. Afeganistão, Ásia - ## 6 Afeganistão Ásia 1977 38.4 14880372 786. 
Afeganistão, Ásia - ## 7 Afeganistão Ásia 1982 39.9 12881816 978. Afeganistão, Ásia - ## 8 Afeganistão Ásia 1987 40.8 13867957 852. Afeganistão, Ásia - ## 9 Afeganistão Ásia 1992 41.7 16317921 649. Afeganistão, Ásia - ## 10 Afeganistão Ásia 1997 41.8 22227415 635. Afeganistão, Ásia - ## # … with 1,694 more rows - -### Ordenar (arrange) - -A função `arrange()` nos permite ordenar as colunas de novas formas. Atualmente, o nosso conjunto de dados está organizado em ordem alfabética pelo nome do país. Vamos ordená-lo em ordem decrescente de acordo com o total da população. - - dados_gapminder %>% - arrange(desc(populacao)) - - ## # A tibble: 1,704 x 6 - ## pais continente ano expectativa_de_vida populacao pib_per_capita - ## - ## 1 China Ásia 2007 73.0 1318683096 4959. - ## 2 China Ásia 2002 72.0 1280400000 3119. - ## 3 China Ásia 1997 70.4 1230075000 2289. - ## 4 China Ásia 1992 68.7 1164970000 1656. - ## 5 Índia Ásia 2007 64.7 1110396331 2452. - ## 6 China Ásia 1987 67.3 1084035000 1379. - ## 7 Índia Ásia 2002 62.9 1034172547 1747. - ## 8 China Ásia 1982 65.5 1000281000 962. - ## 9 Índia Ásia 1997 61.8 959000000 1459. - ## 10 China Ásia 1977 64.0 943455000 741. - ## # … with 1,694 more rows - -### Sumarizar (summarise) - -A última função do dplyr que veremos é a `summarise()`, usada geralmente para criar uma tabela contendo dados estatísticos resumidos que podemos plotar. Vamos utilizar a função `summarise()` para calcular a média da expectativa de vida nos países, considerando todo o conjunto dados_gapminder. - - dados_gapminder %>% - summarise(mean(expectativa_de_vida)) - - ## # A tibble: 1 x 1 - ## `mean(expectativa_de_vida)` - ## - ## 1 59.5 - -## Juntando tudo - -Agora, após termos visto os cinco principais verbos do dplyr, podemos criar rapidamente uma visualização dos nossos dados. Vamos criar um gráfico de barras mostrando o número de países com expectativa de vida maior que 50 anos, em 2007. 
- - expectativa_vida_2007 <- dados_gapminder %>% - filter(ano == 2007) %>% - mutate(expectativa_2007 = ifelse(expectativa_de_vida >= 50, "Maior ou igual a 50 anos", "Menor que 50 anos")) - - ggplot(expectativa_vida_2007) + - geom_bar(aes(x = expectativa_2007, fill = expectativa_2007)) + - labs(x = "A expectativa de vida é maior que 50 anos?") - -{% include figure.html filename="pt-tr-manipulacao-transformacao-dados-r-03.png" alt="Imagem com a representação de um gráfico de barras com dados sobre o número de países onde a expectativa de vida é maior ou menor que 50 anos, em 2007" caption="Expectativa de vida nos países em 2007" %} - -Novamente, fazendo uma pequena mudança no nosso código, podemos ver também o número de países com expectativa de vida maior que 50 anos, em 1952. - - expectativa_vida_1952 <- dados_gapminder %>% - filter(ano == 1952) %>% - mutate(expectativa_1952 = ifelse(expectativa_de_vida >= 50, "Maior ou igual a 50 anos", "Menor que 50 anos")) - - ggplot(expectativa_vida_1952) + - geom_bar(aes(x = expectativa_1952, fill = expectativa_1952)) + - labs(x = "A expectativa de vida é maior que 50 anos?") - -({% include figure.html filename="pt-tr-manipulacao-transformacao-dados-r-04.png" alt="Imagem com a representação de um gráfico de barras com dados sobre o número de países onde a expectativa de vida é maior ou menor que 50 anos, em 1952" caption="Expectativa de vida nos países em 1952" %} - -## Conclusão - -Este tutorial deve encaminhar seus conhecimentos para pensar sobre como organizar e manipular dados usando R. Posteriormente, provavelmente vai querer visualizar esses dados de alguma forma, usando gráficos, como fizemos em partes desta lição. Recomendamos que comece a estudar o [ggplot2](https://ggplot2.tidyverse.org/) (em inglês), pacote com uma coleção de ferramentas que funcionam bem em conjunto com o dplyr. 
Além disso, você deve buscar conhecer as outras funções do pacote dplyr que não vimos aqui, para aprimorar suas habilidades de manipulação de dados. Por enquanto, esta lição deve proporcionar um bom ponto de partida, cobrindo muitos dos principais problemas que poderá encontrar. - - -### Notas - -[^1]: O pacote "dados" disponibiliza a tradução de conjuntos de dados originalmente em inglês encontrados em outros pacotes de R. Está disponível em https://github.com/cienciadedatos/dados +--- +title: Manipulação e transformação de dados com R +slug: manipulacao-transformacao-dados-r +layout: lesson +collection: lessons +date: 2017-08-01 +translation_date: 2022-11-26 +authors: +- Nabeel Siddiqui +editors: +- Ian Milligan +reviewers: +- Lauren Tilton +- Ryan Deschamps +translator: +- Ian Araujo +translation-editor: +- Jimmy Medeiros +translation-reviewer: +- Suemi Higuchi +- Joana Paulino +difficulty: 2 +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/397 +activity: transforming +topics: [data-manipulation, data-management, distant-reading, r, data-visualization] +abstract: "Esta lição explora como os investigadores podem tornar seus dados organizados, entender os pacotes do R para manipulação de dados e conduzir análises de dados básicas usando esta linguagem." +original: data-wrangling-and-management-in-r +avatar_alt: Barra de sabão +doi: 10.46430/phpt0035 +--- + +{% include toc.html %} + +## Requisitos +Nesta lição consideramos que já possui algum conhecimento da linguagem R. Se ainda não completou a lição [Noções básicas de R com dados tabulares](/pt/licoes/nocoes-basicas-R-dados-tabulares), recomendamos que o faça primeiro. Ter experiência com outras linguagens de programação também pode ser benéfico. Se está buscando por onde começar aprendendo outras linguagens, recomendamos os excelentes tutoriais de Python do *Programming Historian*. 
+ +Nota da tradução: o conteúdo da programação utilizado na lição original foi alterado para esta versão em português para que o contexto e os exemplos sejam próximos da realidade da comunidade que fala o idioma. Por conta disso, parte do texto da lição traduzida, bem como os exemplos e as interpretações dos dados, são diferentes da lição original. No entanto, o conteúdo e a estrutura da lição são fidedignos à lição original, como os tipos de dados e as análises desenvolvidas. Mudamos, por exemplo, a comparação entre Mississipi e Virgínia por Brasil e Argentina, mantendo os recursos e procedimentos realizados por Nabeel Siddiqui. + +## Objetivos da lição +Ao fim desta lição, você: + +1. Saberá como tornar seus dados bem ordenados (*tidy*) e entenderá por que isso é importante. +2. Terá assimilado o uso do pacote [dplyr](https://cran.r-project.org/web/packages/dplyr/index.html) (em inglês) e sua aplicação na manipulação e controle de dados. +3. Estará familiarizado com o operador *pipe* `%>%` na linguagem R e verá como ele pode auxiliar na criação de códigos mais legíveis. +4. Terá ganho experiência com análise exploratória de dados através de exemplos básicos de manipulação de dados. + +## Introdução +Os dados que encontra disponíveis nas diversas plataformas raramente estão no formato adequado para serem analisados, e precisará manipulá-los antes de explorar as perguntas de seu interesse. Isso pode tomar mais tempo que a própria análise dos dados! Neste tutorial, vamos aprender técnicas básicas para manipulação, gestão e controle de dados usando R. Especificamente, nos debruçaremos sobre a filosofia do ["*tidy data*"](https://www.jstatsoft.org/article/view/v059i10) (em inglês) conforme apresentada por Hadley Wickham. + +De acordo com [Wickham](https://hadley.nz/) (em inglês), os dados estão *tidy* ou bem-organizados quando satisfazem três critérios chave: + +1. Cada unidade de observação está em uma linha +2. Cada variável está em uma coluna +3. 
Cada valor possui a sua própria célula. + +Estar atento a estes critérios nos permite reconhecer quando os nossos dados estão adequados ou não. Também nos fornece um esquema padrão e um conjunto de soluções para lidar com alguns dos problemas mais comuns encontrados em *datasets* "mal-arranjados", como por exemplo: + +1. Nomes de colunas como valores ao invés de nomes de variáveis +2. Múltiplas variáveis contidas em uma única coluna +3. Variáveis armazenadas tanto em linhas quanto em colunas +4. Unidades de observação de diferentes categorias armazenadas na mesma tabela +5. Uma única unidade de observação armazenada em múltiplas tabelas. + +Talvez o mais importante seja que manter os dados nesse formato nos permite utilizar uma série de pacotes do ["tidyverse"](https://tidyverse.org/) (em inglês), concebidos para trabalhar especificamente com dados neste formato *tidy*. Dessa forma, assegurando-nos de que os dados de entrada e de saída estão bem organizados, precisaremos apenas de um pequeno conjunto de ferramentas para resolver um grande número de questões. Podemos combinar, manipular e dividir os *datasets* que criamos, conforme considerarmos mais adequado. + +Neste tutorial focaremos no pacote [dplyr](https://cran.r-project.org/web/packages/dplyr/index.html) (em inglês) presente no tidyverse, mas também é importante mencionar alguns outros que serão vistos na lição: + +* [**magrittr**](https://magrittr.tidyverse.org) (em inglês) -- Este pacote nos garante acesso ao operador *pipe* `%>%`, que torna o nosso código mais legível. +* [**ggplot2**](https://ggplot2.tidyverse.org/) (em inglês) -- Este pacote utiliza a ["Gramática de Gráficos"](https://www.springer.com/us/book/9780387245447) (em inglês) para fornecer uma forma fácil de visualizar nossos dados. +* [**tibble**](https://tibble.tidyverse.org/) (em inglês) -- Este pacote nos fornece uma releitura dos tradicionais *data frames*, mais fáceis de serem trabalhados e visualizados. 
+ +Instale o "tidyverse", se ainda não o fez, e carregue-o antes de começarmos. Além disso, certifique-se de que possui instaladas a +[versão mais recente do R](https://cran.rstudio.com/) e a [versão mais recente do RStudio](https://www.rstudio.com/products/rstudio/download/) compatíveis com o seu sistema operacional. + +Copie o código a seguir para o seu RStudio. Para executá-lo, precisa selecionar as linhas e pressionar Ctrl+Enter (Command+Enter no Mac OS): + + # Instala e carrega a biblioteca tidyverse + # Não se preocupe caso demore um pouco + + install.packages("tidyverse") + library(tidyverse) + + +## Um exemplo do dplyr em ação +Vejamos um exemplo de como o dplyr pode auxiliar historiadores. Vamos utilizar o pacote "dados" [^1] e importar alguns indicadores socioeconômicos de países entre 1952 e 2007. + +O pacote "remotes" permite a instalação de pacotes R a partir de repositórios remotos, incluindo o GitHub, como é o caso de "dados". + + # Instala e carrega as bibliotecas "remotes" e "dados" + + install.packages("remotes") + library(remotes) + + remotes::install_github("cienciadedatos/dados") + library(dados) + +Em seguida, para termos acesso ao *dataset* "dados_gapminder", que se encontra no pacote "dados", basta executar o seguinte código: + + # Cria o objeto dados_socioeconomicos_paises e atribui a ele os elementos de dados_gapminder + + dados_socioeconomicos_paises <- dados_gapminder + +Os dados do [Gapminder](https://www.gapminder.org/) (em inglês) contêm o progresso de países ao longo do tempo, observando as estatísticas de alguns índices. Após importar o *dataset*, notará que ele possui seis variáveis: país, continente, ano, expectativa de vida, população e PIB *per capita*. Os dados já estão em formato *tidy*, possibilitando uma infinidade de opções para exploração futura. + +Neste exemplo, vamos visualizar o crescimento populacional de Brasil e Argentina ao longo dos anos. 
Para isso utilizaremos o pacote dplyr a fim de filtrar os dados que contenham apenas informações dos países de nosso interesse. Em seguida, utilizaremos o ggplot2 para visualizar tais dados. Este exercício é apenas uma breve demonstração do que é possível fazer com o dplyr, portanto, não se preocupe se não entender o código por enquanto. + + # Filtra os países desejados (Brasil e Argentina) + + dados_brasil_argentina <- dados_socioeconomicos_paises %>% + filter(pais %in% c("Brasil", "Argentina")) + + # Visualiza a população dos dois países + + ggplot(data = dados_brasil_argentina, aes(x = ano, y = populacao, color = pais)) + + geom_line() + + geom_point() + +{% include figure.html filename="pt-tr-manipulacao-transformacao-dados-r-01.png" alt="Imagem com a representação de um gráfico de linhas com dados da população por anos para o Brasil e a Argentina" caption="Gráfico da população de Brasil e Argentina, ao longo dos anos" %} + +Como podemos observar, a população absoluta do Brasil é consideravelmente maior em comparação com a população da Argentina. Embora isso pareça óbvio devido ao tamanho do território brasileiro, o código nos fornece uma base sobre a qual podemos formular uma infinidade de questões similares. Por exemplo, com uma pequena mudança no código podemos criar um gráfico similar com dois países diferentes, como Portugal e Bélgica. 
+ + # Filtra os países desejados (Portugal e Bélgica) + + dados_portugal_belgica <- dados_socioeconomicos_paises %>% + filter(pais %in% c("Portugal", "Bélgica")) + + # Visualiza a população dos dois países + + ggplot(data = dados_portugal_belgica, aes(x = ano, y = populacao, color = pais)) + + geom_line() + + geom_point() + +{% include figure.html filename="pt-tr-manipulacao-transformacao-dados-r-02.png" alt="Imagem com a representação de um gráfico de linhas com dados da população por anos para a Bélgica e Portugal" caption="Gráfico da população de Portugal e Bélgica, ao longo dos anos" %} + +Promover mudanças rápidas no código e revisar nossos dados é parte fundamental do processo de análise exploratória de dados (AED). Ao invés de tentar "provar" uma hipótese, a análise exploratória nos ajuda a entender melhor os dados e a levantar questões sobre eles. Para os historiadores, a AED fornece uma forma fácil de saber quando aprofundar mais em um tema e quando voltar atrás, e esta é uma área onde o R se sobressai. + +## Operador Pipe + +Antes de olharmos para o dplyr, precisamos entender o que é o operador *pipe* `%>%` no R, uma vez que iremos utilizá-lo em muitos exemplos adiante. Como mencionado anteriormente, este operador é parte do pacote [magrittr](https://cran.r-project.org/web/packages/magrittr/vignettes/magrittr.html) (em inglês), criado por [Stefan Milton Bache](https://stefanbache.dk/) e [Hadley Wickham](https://hadley.nz/), e está incluído no tidyverse. O seu nome é uma referência ao pintor surrealista René Magritte, criador da obra ["A Traição das Imagens"](https://www.renemagritte.org/the-treachery-of-images.jsp), que mostra um cachimbo com a frase "isto não é um cachimbo" (*ceci n'est pas une pipe*, em francês). + +O operador *pipe* `%>%` permite passar o que está à sua esquerda como a primeira variável em uma função especificada à sua direita. 
Embora possa parecer estranho no início, uma vez que aprende a usar o *pipe* descobrirá que ele torna seu código mais legível, evitando instruções aninhadas. Não se preocupe se estiver um pouco confuso por agora. Tudo ficará mais claro à medida que observarmos os exemplos. + +Vamos dizer que estamos interessados em obter a raiz quadrada de cada população e, então, somar todas as raízes antes de calcular a média. Obviamente, essa não é uma medição útil, mas demonstra a rapidez com que o código do R pode se tornar difícil de ler. Normalmente, usaríamos declarações aninhadas: + + mean(sum(sqrt(dados_socioeconomicos_paises$populacao))) + + ## [1] 6328339 + +Veja que com tantos comandos aninhados fica difícil lembrar quantos parênteses são necessários no final da linha, tornando o código complicado de ler. Para atenuar esse problema, algumas pessoas criam vetores temporários entre cada chamada de função. + + # Obtém a raiz quadrada da população de todos os países + + vetor_raiz_populacao <- sqrt(dados_socioeconomicos_paises$populacao) + + # Obtém a soma de todas as raízes da variável temporária + + soma_vetor_raizes_populacao <- sum(vetor_raiz_populacao) + + # Obtém a média da variável temporária + + media_soma_vetor_raizes_populacao <- mean(soma_vetor_raizes_populacao) + + # Exibe a média + + media_soma_vetor_raizes_populacao + + ## [1] 6328339 + +Embora obtenha o mesmo resultado, este código é muito mais legível. No entanto, se esquecer de excluir os vetores temporários, seu espaço de trabalho pode se tornar confuso. O operador *pipe* faz esse trabalho por você. Aqui está o mesmo código usando o operador *pipe*: + + dados_socioeconomicos_paises$populacao %>% sqrt %>% sum %>% mean + + ## [1] 6328339 + +Este código é mais fácil de ler que os anteriores e pode torná-lo ainda mais limpo escrevendo em linhas diferentes. 
+ + # Certifique-se de colocar o operador no final da linha + + dados_socioeconomicos_paises$populacao %>% + sqrt %>% + sum %>% + mean + + ## [1] 6328339 + +Note que os vetores ou *data frames* criados pelo operador pipe são descartados quando se completa a operação. Se quiser salvar o resultado da operação, será preciso atribuí-lo a uma nova variável: + + vetor_permanente_media_soma_populacao <- dados_socioeconomicos_paises$populacao %>% + sqrt %>% + sum %>% + mean + + vetor_permanente_media_soma_populacao + + ## [1] 6328339 + +Agora que adquirimos uma compreensão do operador *pipe*, estamos prontos para começar a analisar e manipular alguns dados. Ao longo da lição vamos continuar trabalhando com o *dataset* dados_gapminder: + + # Certifique-se de que o pacote "dados" está instalado e carregado antes de proceder conforme abaixo + + dados_gapminder + + ## # A tibble: 1,704 x 6 + ## pais continente ano expectativa_de_vida populacao pib_per_capita + ## + ## 1 Afeganistão Ásia 1952 28.8 8425333 779. + ## 2 Afeganistão Ásia 1957 30.3 9240934 821. + ## 3 Afeganistão Ásia 1962 32.0 10267083 853. + ## 4 Afeganistão Ásia 1967 34.0 11537966 836. + ## 5 Afeganistão Ásia 1972 36.1 13079460 740. + ## 6 Afeganistão Ásia 1977 38.4 14880372 786. + ## 7 Afeganistão Ásia 1982 39.9 12881816 978. + ## 8 Afeganistão Ásia 1987 40.8 13867957 852. + ## 9 Afeganistão Ásia 1992 41.7 16317921 649. + ## 10 Afeganistão Ásia 1997 41.8 22227415 635. + ## # … with 1,694 more rows + +Como pode observar, este *dataset* contém o nome do país, seu continente e o ano de registro, além dos indicadores de expectativa de vida, total da população e PIB *per capita*, em determinados anos. Conforme mencionamos acima, antes de analisar os dados é importante verificar se estes estão bem ordenados no formato *tidy*. Relembrando os três critérios discutidos, podemos dizer que sim, o *dataset* encontra-se organizado e pronto para ser trabalhado com o pacote dplyr. + +## O que é dplyr? 
+[Dplyr](https://cran.r-project.org/web/packages/dplyr/vignettes/dplyr.html) (em inglês) também é parte do tidyverse, fornecendo funções para manipulação e transformação dos dados. Porque estamos mantendo nossos dados bem organizados, precisaremos apenas de um pequeno conjunto de ferramentas para explorá-los. Em comparação com o pacote básico do R, usando o dplyr em nosso código, fica geralmente mais rápido e há a garantia de que os dados resultantes (*output*) estarão bem ordenados uma vez que os dados de entrada (*input*) também estarão. Talvez o mais importante seja que o dplyr torna o nosso código mais fácil de ser lido e utiliza "verbos" que são, na maioria das vezes, intuitivos. Cada função do dplyr corresponde a um desses verbos, sendo cinco principais: filtrar (`filter`), selecionar (`select`), ordenar (`arrange`), modificar (`mutate`) e sumarizar (`summarise`). Vamos observar individualmente como cada uma dessas funções funciona na prática. + +### Selecionar (select) + +Se olharmos para o *dataset* dados_gapminder, vamos observar a presença de seis colunas, cada uma contendo diferentes informações. Podemos escolher, para a nossa análise, visualizar apenas algumas dessas colunas. A função `select()` do dplyr nos permite fazer isso. 
O primeiro argumento da função é o *data frame* que desejamos manipular e os seguintes são os nomes das colunas que queremos manter: + + # Remove as colunas de dados_gapminder usando select() + # Note que não é necessário acrescentar o nome da coluna com o símbolo $ (dólar) ao final de dados_gapminder visto que o dplyr automaticamente assume que "," (vírgula) representa E (AND em inglês) + + select(dados_gapminder, pais, ano, expectativa_de_vida) + + ## # A tibble: 1,704 x 3 + ## pais ano expectativa_de_vida + ## + ## 1 Afeganistão 1952 28.8 + ## 2 Afeganistão 1957 30.3 + ## 3 Afeganistão 1962 32.0 + ## 4 Afeganistão 1967 34.0 + ## 5 Afeganistão 1972 36.1 + ## 6 Afeganistão 1977 38.4 + ## 7 Afeganistão 1982 39.9 + ## 8 Afeganistão 1987 40.8 + ## 9 Afeganistão 1992 41.7 + ## 10 Afeganistão 1997 41.8 + ## # … with 1,694 more rows + +Vejamos como escrever o mesmo código utilizando o operador *pipe* `%>%`: + + dados_gapminder %>% + select(pais, ano, expectativa_de_vida) + + ## # A tibble: 1,704 x 3 + ## pais ano expectativa_de_vida + ## + ## 1 Afeganistão 1952 28.8 + ## 2 Afeganistão 1957 30.3 + ## 3 Afeganistão 1962 32.0 + ## 4 Afeganistão 1967 34.0 + ## 5 Afeganistão 1972 36.1 + ## 6 Afeganistão 1977 38.4 + ## 7 Afeganistão 1982 39.9 + ## 8 Afeganistão 1987 40.8 + ## 9 Afeganistão 1992 41.7 + ## 10 Afeganistão 1997 41.8 + ## # … with 1,694 more rows + +Fazer referência a cada uma das colunas que desejamos manter apenas para nos livrar de uma é um tanto tedioso. Podemos usar o símbolo de menos (-) para demonstrar que queremos remover uma coluna. + + dados_gapminder %>% + select(-continente) + + ## # A tibble: 1,704 x 5 + ## pais ano expectativa_de_vida populacao pib_per_capita + ## + ## 1 Afeganistão 1952 28.8 8425333 779. + ## 2 Afeganistão 1957 30.3 9240934 821. + ## 3 Afeganistão 1962 32.0 10267083 853. + ## 4 Afeganistão 1967 34.0 11537966 836. + ## 5 Afeganistão 1972 36.1 13079460 740. + ## 6 Afeganistão 1977 38.4 14880372 786. 
+ ## 7 Afeganistão 1982 39.9 12881816 978. + ## 8 Afeganistão 1987 40.8 13867957 852. + ## 9 Afeganistão 1992 41.7 16317921 649. + ## 10 Afeganistão 1997 41.8 22227415 635. + ## # … with 1,694 more rows + +### Filtrar (filter) + +A função `filter()` faz o mesmo que a função select, mas ao invés de escolher o nome da coluna, podemos usá-lo para filtrar linhas usando um teste de requisito. Por exemplo, se quisermos selecionar somente os registros dos países em 2007: + + dados_gapminder %>% + filter(ano == 2007) + + ## # A tibble: 142 x 6 + ## pais continente ano expectativa_de_vida populacao pib_per_capita + ## + ## 1 Afeganistão Ásia 2007 43.8 31889923 975. + ## 2 Albânia Europa 2007 76.4 3600523 5937. + ## 3 Argélia África 2007 72.3 33333216 6223. + ## 4 Angola África 2007 42.7 12420476 4797. + ## 5 Argentina Américas 2007 75.3 40301927 12779. + ## 6 Austrália Oceania 2007 81.2 20434176 34435. + ## 7 Áustria Europa 2007 79.8 8199783 36126. + ## 8 Bahrein Ásia 2007 75.6 708573 29796. + ## 9 Bangladesh Ásia 2007 64.1 150448339 1391. + ## 10 Bélgica Europa 2007 79.4 10392226 33693. + ## # … with 132 more rows + +### Modificar (mutate) + +A função `mutate()` permite adicionar uma coluna ao seu *dataset*. No momento, temos país e continente em duas colunas separadas. Podemos utilizar a função `paste()` para combinar as duas informações e especificar um separador. Vamos colocá-las em uma única coluna chamada "localizacao". + + dados_gapminder %>% + mutate(localizacao = paste(pais, continente, sep = ", ")) + + ## # A tibble: 1,704 x 7 + ## pais continente ano expectativa_de_vida populacao pib_per_capita localizacao + ## + ## 1 Afeganistão Ásia 1952 28.8 8425333 779. Afeganistão, Ásia + ## 2 Afeganistão Ásia 1957 30.3 9240934 821. Afeganistão, Ásia + ## 3 Afeganistão Ásia 1962 32.0 10267083 853. Afeganistão, Ásia + ## 4 Afeganistão Ásia 1967 34.0 11537966 836. Afeganistão, Ásia + ## 5 Afeganistão Ásia 1972 36.1 13079460 740. 
Afeganistão, Ásia + ## 6 Afeganistão Ásia 1977 38.4 14880372 786. Afeganistão, Ásia + ## 7 Afeganistão Ásia 1982 39.9 12881816 978. Afeganistão, Ásia + ## 8 Afeganistão Ásia 1987 40.8 13867957 852. Afeganistão, Ásia + ## 9 Afeganistão Ásia 1992 41.7 16317921 649. Afeganistão, Ásia + ## 10 Afeganistão Ásia 1997 41.8 22227415 635. Afeganistão, Ásia + ## # … with 1,694 more rows + +Novamente, é preciso lembrar que o dplyr não salva os dados, nem transforma o original. Em vez disso, ele cria um *data frame* temporário em cada etapa. Se deseja manter os dados, é necessário criar uma variável permanente. + + dados_gapminder_localizacao <- dados_gapminder %>% + mutate(localizacao = paste(pais, continente, sep = ", ")) + + # Visualiza a nova tabela criada com a localização adicionada + + dados_gapminder_localizacao + + ## # A tibble: 1,704 x 7 + ## pais continente ano expectativa_de_vida populacao pib_per_capita localizacao + ## + ## 1 Afeganistão Ásia 1952 28.8 8425333 779. Afeganistão, Ásia + ## 2 Afeganistão Ásia 1957 30.3 9240934 821. Afeganistão, Ásia + ## 3 Afeganistão Ásia 1962 32.0 10267083 853. Afeganistão, Ásia + ## 4 Afeganistão Ásia 1967 34.0 11537966 836. Afeganistão, Ásia + ## 5 Afeganistão Ásia 1972 36.1 13079460 740. Afeganistão, Ásia + ## 6 Afeganistão Ásia 1977 38.4 14880372 786. Afeganistão, Ásia + ## 7 Afeganistão Ásia 1982 39.9 12881816 978. Afeganistão, Ásia + ## 8 Afeganistão Ásia 1987 40.8 13867957 852. Afeganistão, Ásia + ## 9 Afeganistão Ásia 1992 41.7 16317921 649. Afeganistão, Ásia + ## 10 Afeganistão Ásia 1997 41.8 22227415 635. Afeganistão, Ásia + ## # … with 1,694 more rows + +### Ordenar (arrange) + +A função `arrange()` nos permite ordenar as colunas de novas formas. Atualmente, o nosso conjunto de dados está organizado em ordem alfabética pelo nome do país. Vamos ordená-lo em ordem decrescente de acordo com o total da população. 
+ + dados_gapminder %>% + arrange(desc(populacao)) + + ## # A tibble: 1,704 x 6 + ## pais continente ano expectativa_de_vida populacao pib_per_capita + ## + ## 1 China Ásia 2007 73.0 1318683096 4959. + ## 2 China Ásia 2002 72.0 1280400000 3119. + ## 3 China Ásia 1997 70.4 1230075000 2289. + ## 4 China Ásia 1992 68.7 1164970000 1656. + ## 5 Índia Ásia 2007 64.7 1110396331 2452. + ## 6 China Ásia 1987 67.3 1084035000 1379. + ## 7 Índia Ásia 2002 62.9 1034172547 1747. + ## 8 China Ásia 1982 65.5 1000281000 962. + ## 9 Índia Ásia 1997 61.8 959000000 1459. + ## 10 China Ásia 1977 64.0 943455000 741. + ## # … with 1,694 more rows + +### Sumarizar (summarise) + +A última função do dplyr que veremos é a `summarise()`, usada geralmente para criar uma tabela contendo dados estatísticos resumidos que podemos plotar. Vamos utilizar a função `summarise()` para calcular a média da expectativa de vida nos países, considerando todo o conjunto dados_gapminder. + + dados_gapminder %>% + summarise(mean(expectativa_de_vida)) + + ## # A tibble: 1 x 1 + ## `mean(expectativa_de_vida)` + ## + ## 1 59.5 + +## Juntando tudo + +Agora, após termos visto os cinco principais verbos do dplyr, podemos criar rapidamente uma visualização dos nossos dados. Vamos criar um gráfico de barras mostrando o número de países com expectativa de vida maior que 50 anos, em 2007. 
+ + expectativa_vida_2007 <- dados_gapminder %>% + filter(ano == 2007) %>% + mutate(expectativa_2007 = ifelse(expectativa_de_vida >= 50, "Maior ou igual a 50 anos", "Menor que 50 anos")) + + ggplot(expectativa_vida_2007) + + geom_bar(aes(x = expectativa_2007, fill = expectativa_2007)) + + labs(x = "A expectativa de vida é maior que 50 anos?") + +{% include figure.html filename="pt-tr-manipulacao-transformacao-dados-r-03.png" alt="Imagem com a representação de um gráfico de barras com dados sobre o número de países onde a expectativa de vida é maior ou menor que 50 anos, em 2007" caption="Expectativa de vida nos países em 2007" %} + +Novamente, fazendo uma pequena mudança no nosso código, podemos ver também o número de países com expectativa de vida maior que 50 anos, em 1952. + + expectativa_vida_1952 <- dados_gapminder %>% + filter(ano == 1952) %>% + mutate(expectativa_1952 = ifelse(expectativa_de_vida >= 50, "Maior ou igual a 50 anos", "Menor que 50 anos")) + + ggplot(expectativa_vida_1952) + + geom_bar(aes(x = expectativa_1952, fill = expectativa_1952)) + + labs(x = "A expectativa de vida é maior que 50 anos?") + +{% include figure.html filename="pt-tr-manipulacao-transformacao-dados-r-04.png" alt="Imagem com a representação de um gráfico de barras com dados sobre o número de países onde a expectativa de vida é maior ou menor que 50 anos, em 1952" caption="Expectativa de vida nos países em 1952" %} + +## Conclusão + +Este tutorial deve encaminhar seus conhecimentos para pensar sobre como organizar e manipular dados usando R. Posteriormente, provavelmente vai querer visualizar esses dados de alguma forma, usando gráficos, como fizemos em partes desta lição. Recomendamos que comece a estudar o [ggplot2](https://ggplot2.tidyverse.org/) (em inglês), pacote com uma coleção de ferramentas que funcionam bem em conjunto com o dplyr. 
Além disso, você deve buscar conhecer as outras funções do pacote dplyr que não vimos aqui, para aprimorar suas habilidades de manipulação de dados. Por enquanto, esta lição deve proporcionar um bom ponto de partida, cobrindo muitos dos principais problemas que poderá encontrar. + + +### Notas + +[^1]: O pacote "dados" disponibiliza a tradução de conjuntos de dados originalmente em inglês encontrados em outros pacotes de R. Está disponível em https://github.com/cienciadedatos/dados diff --git a/pt/licoes/nocoes-basicas-R-dados-tabulares.md b/pt/licoes/nocoes-basicas-R-dados-tabulares.md index 9cbe0bb4a0..469a33307f 100644 --- a/pt/licoes/nocoes-basicas-R-dados-tabulares.md +++ b/pt/licoes/nocoes-basicas-R-dados-tabulares.md @@ -1,570 +1,570 @@ ---- -title: Noções básicas de R com dados tabulares -layout: lesson -slug: nocoes-basicas-R-dados-tabulares -date: 2016-09-05 -translation_date: 2021-08-28 -authors: -- Taryn Dewar -reviewers: -- James Baker -- John Russell -editors: -- Adam Crymble -translator: -- Diana Rebelo Rodriguez -translation-editor: -- Jimmy Medeiros -translation-reviewer: -- Ivo Veiga -- Romulo Predes -difficulty: 1 -activity: transforming -topics: [data-manipulation, r] -abstract: "Esta lição ensina uma maneira de analisar rapidamente grandes volumes de dados tabulares, tornando a pesquisa mais rápida e eficaz." -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/398 -original: r-basics-with-tabular-data -avatar_alt: Letra R ornamentada e ilustrada -doi: 10.46430/phpt0015 ---- - - - -{% include toc.html %} - -## Objetivos da lição - -À medida que mais e mais registros históricos são digitalizados, ter uma maneira de analisar rapidamente grandes volumes de dados tabulares torna a pesquisa mais rápida e eficaz. - -[R](https://pt.wikipedia.org/wiki/R_%28linguagem_de_programa%C3%A7%C3%A3o%29) é uma linguagem de programação com pontos fortes nas análises estatísticas. 
Como tal, ela pode ser usada para realizar análises quantitativas sobre fontes históricas, incluindo, mas não se limitando, a testes estatísticos. Como é possível executar repetidamente o mesmo código nas mesmas fontes, R permite analisar dados rapidamente e produz resultados que podem ser replicados. Além disso, como é possível salvar o código, R permite que se redirecionem ou revejam funções para projectos futuros, tornando-o uma parte flexível de sua caixa de ferramentas. - -Este tutorial não pressupõe nenhum conhecimento prévio do R. Ele percorrerá algumas das funções básicas do R e servirá como uma introdução à linguagem. Ele aborda o processo de instalação, explica algumas das ferramentas que se podem usar no R, bem como explica como trabalhar com conjuntos de dados enquanto se faz pesquisa. O tutorial fará isso através de uma série de mini-lições que mostrarão os tipos de fontes com as quais o R funciona bem e exemplos de como fazer cálculos para encontrar informações que possam ser relevantes à pesquisa histórica. A lição também abordará diferentes métodos de entrada de dados para R, tais como matrizes e o uso de ficheiros CSV. - -## Para quem isto é útil? - -R é ideal para analisar conjuntos de dados de grande dimensão que levariam muito tempo para serem processados manualmente. Depois de entendida a forma como se escrevem algumas funções básicas e como importar ficheiros de dados próprios, é possível analisar e visualizar os dados de forma rápida e eficiente. - -Embora R seja uma ótima ferramenta para dados tabulares, pode-se achar mais útil utilizar outras abordagens para analisar fontes não tabulares (tais como transcrições de jornais). Caso possua interesse em estudar estes tipos de fontes, dê uma olhada em algumas das outras grandes lições do [The Programming Historian](/pt/). - -## Instalar R - -R é uma linguagem de programação e um ambiente para trabalhar com dados. 
Ele pode ser executado utilizando o console de R, bem como no [command-line](/en/lessons/intro-to-bash) (linha de comandos) ou na interface [R Studio](https://www.rstudio.com/). Este tutorial irá focar no uso do console de R. Para começar com o R, baixe o programa do [The Comprehensive R Archive Network](https://cran.r-project.org/). R é compatível com Linux, Mac e Windows. - -Quando se abre o console de R pela primeira vez, a janela aberta se parece com essa: -![O console R no Mac.](/images/r-basics-with-tabular-data/Intro-to-R-1.png) - -## Usar o console de R - -O console R é um ótimo lugar para começar a trabalhar quando se é inexperiente em R, porque ele foi projetado especificamente para esta linguagem e tem funções específicas para o R. - -O console é onde se digitam os comandos. Para limpar a tela inicial, vá para 'Edit' (editar) na barra de menu e selecione 'Clean Console’ (limpar console). Isto iniciará R com uma nova página. Também é possível mudar a aparência do console clicando na roda colorida no topo do console em um Mac, ou selecionando 'GUI Preferences' (preferências da Interface Gráfica do Usuário) no menu 'Edit' em um PC. Além disso, também é possível ajustar a cor da tela de fundo e as cores da fonte para as funções. - -## Usar conjuntos de dados - -Antes de trabalhar com dados próprios, usar os conjuntos de dados já incorporados ajuda a ter uma noção de como R funciona. É possível pesquisar nos conjuntos de dados inserindo data() no console. Isto mostrará a lista de todos os conjuntos de dados disponíveis em uma janela separada. Essa lista inclui os títulos de todos os diferentes conjuntos de dados, bem como uma breve descrição sobre as informações em cada um deles. - -No exemplo abaixo iremos primeiro carregar o conjunto de dados AirPassengers na sua sessão R digitando data(AirPassengers) na próxima linha do console^[1] e pressionando Enter. 
Para visualizar o conjunto de dados, digite apenas AirPassengers na próxima linha e pressione Enter novamente. Isso imprimirá uma tabela mostrando o número de passageiros que voaram em companhias aéreas internacionais entre janeiro de 1949 e dezembro de 1960, em milhares. Deverá aparecer o seguinte: - -``` -> data(AirPassengers) -> AirPassengers - Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec -1949 112 118 132 129 121 135 148 148 136 119 104 118 -1950 115 126 141 135 125 149 170 170 158 133 114 140 -1951 145 150 178 163 172 178 199 199 184 162 146 166 -1952 171 180 193 181 183 218 230 242 209 191 172 194 -1953 196 196 236 235 229 243 264 272 237 211 180 201 -1954 204 188 235 227 234 264 302 293 259 229 203 229 -1955 242 233 267 269 270 315 364 347 312 274 237 278 -1956 284 277 317 313 318 374 413 405 355 306 271 306 -1957 315 301 356 348 355 422 465 467 404 347 305 336 -1958 340 318 362 348 363 435 491 505 404 359 310 337 -1959 360 342 406 396 420 472 548 559 463 407 362 405 -1960 417 391 419 461 472 535 622 606 508 461 390 432 -``` - -Agora, é possível usar R para responder a uma série de perguntas com base nestes dados. Por exemplo, quais foram os meses mais populares para voar? Houve um aumento nas viagens internacionais ao longo do tempo? Provavelmente poderíamos encontrar as respostas a tais perguntas simplesmente escaneando esta tabela, mas não tão rapidamente quanto o computador. E se houvesse muito mais dados? - -## Funções básicas - -R pode ser usado para calcular uma série de valores que podem ser úteis enquanto se faz pesquisa em um conjunto de dados. Por exemplo, é possível encontrar a [média](https://pt.wikipedia.org/wiki/M%C3%A9dia), a [mediana](https://pt.wikipedia.org/wiki/Mediana_%28estat%C3%ADstica%29) e os valores mínimos e máximos. Para encontrar a média e a mediana no conjunto de dados, insere-se, respectivamente, mean(AirPassengers) e median(AirPassengers) no console. E se quisermos calcular mais de um valor de cada vez? 
Para produzir um resumo dos dados, digite summary(AirPassengers) (resumo) no console. Isto dará os valores mínimo e máximo dos dados, assim como a média, a mediana e os valores do primeiro e terceiro quartil. - -``` -> summary(AirPassengers) - Min. 1st Qu. Median Mean 3rd Qu. Max. - 104.0 180.0 265.5 280.3 360.5 622.0 -``` - -Um resumo nos mostra que o número mínimo de passageiros entre janeiro de 1949 e dezembro de 1960 foi de 104.000 e que o número máximo de passageiros foi de 622.000. O valor médio nos mostra que aproximadamente 280.300 pessoas viajavam por mês durante o período de coleta dos dados. Estes valores podem ser úteis para ver o grau de variação no número de passageiros ao longo do tempo. - -Usar a função summary() é uma boa maneira de se obter uma visão geral de todo o conjunto de dados. No entanto, e se quisermos analisar um subconjunto de dados, como um determinado ano ou alguns meses? É possível selecionar diferentes pontos de dados (como um determinado mês) e intervalos (como um determinado ano) em R para calcular muitos valores diferentes. Por exemplo, pode-se adicionar o número de passageiros durante dois meses para determinar o número total de passageiros durante esse período de tempo. - -Tente adicionar os dois primeiros valores dos dados AirPassengers no console e, em seguida, pressione 'Enter'. Devem aparecer duas linhas assim: - -``` -> 112+118 -[1] 230 -``` - -Isto lhe daria o número total de passageiros (em centenas de milhares) que voaram em janeiro e fevereiro de 1949. - -R pode fazer muito mais do que simples aritmética. É possível criar objetos, ou [variáveis](https://pt.wikipedia.org/wiki/Vari%C3%A1vel_%28programa%C3%A7%C3%A3o%29), para representar números e [expressões](https://pt.wikipedia.org/wiki/Express%C3%A3o_%28computa%C3%A7%C3%A3o%29). Por exemplo, pode-se nomear o valor de janeiro de 1949 como variável Jan1949. DigiteJan1949 <- 112 no console e, em seguida, Jan1949 na linha seguinte. 
A notação <- atribui o valor 112 à variável Jan1949. O que deve aparecer é: - -``` -> Jan1949 <- 112 -> Jan1949 -[1] 112 -``` - -R é sensível a maiúsculas e minúsculas, portanto tenha cuidado para usar a mesma notação quando usar as variáveis que foram atribuídas (ou nomeadas) em outras ações. Veja o artigo de Rasmus Bååth, [The State of Naming Conventions in R](https://perma.cc/ME6W-JZJQ) (em inglês), para mais informações sobre como nomear melhor as variáveis. - -Para remover uma variável do console, digite rm() (*remove* ou apagar) com a variável da qual se deseja apagar dos parênteses, e pressione Enter. Para ver todas as variáveis atribuídas, digite ls() (*list objects* ou lista de objetos) no console e pressione Enter. Isto pode ajudar a evitar o uso do mesmo nome para múltiplas variáveis. Isto também é importante porque R armazena todos os objetos que são criados em sua memória, portanto, mesmo que não se consiga ver uma variável nomeada x no console, ela pode ter sido criada antes e acidentalmente poderia sobrescrevê-la ao atribuir outra variável. - -Aqui está a lista de variáveis que criamos até agora: - -``` -> ls() -[1] "AirPassengers" "Jan1949" -``` - -Temos as variáveis AirPassengers e Jan1949. Se removermos a variável Jan1949 e digitarmos novamente ls(), veremos: - -``` -> rm(Jan1949) -> ls() -[1] "AirPassengers" -``` - -Se a qualquer momento não conseguir corrigir um erro ou ficar preso a uma função, digite help() no console para abrir a página de ajuda. Também é possível encontrar ajuda geral usando o menu ‘Help’ na parte superior do console. Se quiser mudar algo no código que já escreveu, pode-se digitar novamente o código em uma nova linha. Para economizar tempo, também é possível usar as setas do teclado para rolar para cima e para baixo no console para encontrar a linha de código que se deseja mudar. 
- -É possível usar letras como variáveis, mas quando começar a trabalhar com seus próprios dados, pode ser mais fácil atribuir nomes que sejam mais representativos desses dados. Mesmo com os dados AirPassengers, atribuir variáveis que se correlacionam com meses ou anos específicos tornaria mais fácil saber exatamente com quais valores se está trabalhando. - -### Prática - -A. Atribuir os valores de janeiro de 1950 e janeiro de 1960 dos dados de AirPassengers() em dois objetos novos. Em seguida, somar os valores dos dois objetos criados em uma nova linha de código. - -B. Usar os dois objetos criadas para encontrar a diferença entre os viajantes aéreos de janeiro de 1960 e de 1950. - -### Soluções - -A. Atribuir variáveis para os pontos de janeiro de 1950 e janeiro de 1960 dos dados de AirPassengers(). Adicionar as duas variáveis juntas na linha seguinte. - -``` -> Jan1950<- 115 -> Jan1960<- 417 -> Jan1950+Jan1960 -[1] 532 -``` - -Isto significa que 532.000 pessoas viajaram em voos internacionais em janeiro de 1950 e janeiro de 1960. - -B. Usar as variáveis que foram criadas para encontrar a diferença entre os viajantes aéreos de 1960 e 1950. - -``` -> Jan1960-Jan1950 -[1] 302 -``` - -Isto significa que, em janeiro de 1960, mais 302.000 pessoas viajaram em voos internacionais do que em janeiro de 1950. - -Definir variáveis para pontos de dados individuais pode ser entediante, especialmente se os nomes atribuídos são bastante longos. Entretanto, o processo é semelhante para atribuir um intervalo de valores a uma variável, como todos os pontos de dados durante um ano. Fazemos isso criando listas chamadas ‘vetores’ usando o comando c. c significa ‘combinar’ e nos permite vincular números em uma variável comum. Por exemplo, pode-se criar um vetor para os dados AirPassengers() de 1949 nomeado Air49: - -``` -> Air49<- c(112,118,132,129,121,135,148,148,136,119,104,118) -``` - -Cada item é acessível usando o nome da variável e sua posição no índice (a partir de 1). 
Neste caso, Air49[2] contém o valor que corresponde a fevereiro de 1949 - 118. - -``` -> Air49[2] -[1] 118 -``` - -É possível criar uma lista de valores consecutivos usando dois pontos. Por exemplo: - -``` -> y <- 1:10 -> y -[1] 1 2 3 4 5 6 7 8 9 10 -``` - -Usando este conhecimento, podemos usar a seguinte expressão para definir uma variável para os dados AirPassengers de 1949. - -``` -> Air49 <- AirPassengers[1:12] -> Air49 - [1] 112 118 132 129 121 135 148 148 136 119 104 118 -``` - -A expressão AirPassengers[1:12] selecionou os primeiros doze termos no conjunto de dados AirPassengers. Isto dá o mesmo resultado que acima, mas leva menos tempo e também reduz a chance de que um valor seja transcrito incorretamente. - -Para obter o número total de passageiros para 1949, é possível somar todos os termos no vetor, usando a função sum() (somar). - -``` -> sum(Air49) -[1] 1520 -``` - -Portanto, o número total de passageiros em 1949 era de aproximadamente 1.520.000. - -Finalmente, a função length() (comprimento) torna possível saber o número de objetos em um vetor: - -``` -> length(Air49) -[1] 12 -``` - -### Prática - -1. Criar uma variável para os dados AirPassengers de 1950. -2. Imprimir ou apresentar o segundo objeto da série de 1950. -3. Qual é o tamanho (*length*) da sequência na pergunta 2? -4. Quantos passageiros voaram no total em 1950? - -### Soluções - -1. -``` -> Air50 <- AirPassengers[13:24] -Air50 -[1] 115 126 141 135 125 149 170 170 158 133 114 140 -``` - -2. -``` -> Air50[2] -[1] 126 -``` - -3. -``` -> length(Air50) -[1] 12 -``` - -4. -``` ->sum(Air50) -[1] 1676 -``` - -Caso se quisesse criar variáveis para todos os anos no conjunto de dados, seria possível então usar algumas das ferramentas que examinamos para determinar o número de pessoas que viajam de avião ao longo do tempo. 
Aqui está uma lista de variáveis para 1949 a 1960, seguida pelo número total de passageiros para cada ano: - -``` -> Air49 <- AirPassengers[1:12] -Air50 <- AirPassengers[13:24] -Air51 <- AirPassengers[25:36] -Air52 <- AirPassengers[37:48] -Air53 <- AirPassengers[49:60] -Air54 <- AirPassengers[61:72] -Air55 <- AirPassengers[73:84] -Air56 <- AirPassengers[85:96] -Air57 <- AirPassengers[97:108] -Air58 <- AirPassengers[109:120] -Air59 <- AirPassengers[121:132] -Air60 <- AirPassengers[133:144] -``` - -``` -> sum(Air49) -[1] 1520 -sum(Air50) -[1] 1676 -sum(Air51) -[1] 2042 -sum(Air52) -[1] 2364 -sum(Air53) -[1] 2700 -sum(Air54) -[1] 2867 -sum(Air55) -[1] 3408 -sum(Air56) -[1] 3939 -sum(Air57) -[1] 4421 -sum(Air58) -[1] 4572 -sum(Air59) -[1] 5140 -sum(Air60) -[1] 5714 -``` - -A partir destas informações, podemos ver que o número de passageiros aumenta a cada ano. É possível ir mais longe com estes dados para determinar se havia um interesse crescente em férias em certos períodos do ano, ou mesmo o aumento percentual de passageiros ao longo do tempo. - -## Trabalhar com bases de dados maiores - -Note que o exemplo acima não é bem adequado para conjuntos de dados de grande dimensão: contar pontos de dados para encontrar os corretos seria muito entediante. Pense no que aconteceria se procurássemos informações do ano 96 em um conjunto de dados com 150 anos de dados coletados. - -É possível selecionar linhas e colunas específicas de dados se o conjunto de dados estiver em um formato particular. 
Carregue os dados de mtcars em seu console: - -``` -> data(mtcars) -> mtcars - mpg cyl disp hp drat wt qsec vs am gear carb -Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 -Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 -Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 -Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 -Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 -Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 -Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 -Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 -Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 -Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 -Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 -Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3 -Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3 -Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3 -Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4 -Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4 -Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4 -Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 -Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 -Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 -Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1 -Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2 -AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2 -Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4 -Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2 -Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 -Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2 -Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2 -Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4 -Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 -Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 -Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2 -``` - -Este [conjunto de 
dados](https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/mtcars.html) fornece uma visão geral dos *Motor Trend Car Road Tests* de automóveis da revista Motor Trend de 1974[^2]. Ele contém informações sobre quantas milhas por galão ou quilômetros por litro um carro poderia percorrer[^3], o número de cilindros do motor em cada carro, potência, relação do eixo traseiro, peso, e outras características de cada modelo. Os dados poderão ser usados para descobrir qual destas características tornou cada tipo de carro mais ou menos seguro para os passageiros ao longo do tempo. - -É possível selecionar colunas inserindo o nome do conjunto de dados seguido por colchetes e o número da linha ou coluna de dados que lhe interessa. Para ordenar as linhas e colunas, pense no dataset[x,y], sendo dataset o conjunto de dados com o qual se está trabalhando, x a linha e y a coluna. - -Se estivesse interessado na primeira linha de informações no conjunto mtcars, deveria executar o seguinte em seu console: - -``` -> mtcars[1,] - mpg cyl disp hp drat wt qsec vs am gear carb -Mazda RX4 21 6 160 110 3.9 2.62 16.46 0 1 4 4 -``` - -Para ver uma coluna dos dados, podemos digitar: - -``` -> mtcars[,2] - [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4 -``` - -Isto mostra todos os valores sob a categoria cyl (cilindrada). A maioria dos modelos de carros tem motores de 4, 6 ou 8 cilindros. Também é possível selecionar pontos de dados individuais inserindo valores tanto para x (linha) quanto para y (coluna): - -``` - > mtcars[1,2] -[1] 6 -``` - -Isto retorna o valor na primeira linha, segunda coluna. A partir daqui, seria possível executar um resumo em uma linha ou coluna de dados sem ter que contar o número de termos no conjunto de dados. Por exemplo, digitar summary(mtcars[,1]) no console e pressionar 'Enter' daria o resumo para as milhas por galão que os diferentes carros no conjunto de dados mtcars usam: - -``` -> summary(mtcars[,1]) - Min. 1st Qu. 
Median Mean 3rd Qu. Max. - 10.40 15.42 19.20 20.09 22.80 33.90 -``` - -O resumo indica que a eficiência máxima de combustível foi de 33,9 milhas por galão ou 54.5 quilômetros por 3.78 litros, do Toyota Corolla e o menos eficiente foi o Lincoln Continental, que só conseguiu 10,4 milhas por galão, ou seja, 16.7 quilômetros por 3.78 litros. Podemos encontrar os carros que correspondem aos pontos de valor olhando de volta para a tabela. É muito mais fácil encontrar um valor específico do que tentar fazer as contas em sua cabeça ou pesquisar através de uma planilha. - -## Matrizes - -Agora que temos uma melhor compreensão de como algumas das funções básicas em R funcionam, podemos analisar maneiras de usar essas funções em nossos próprios dados. Isto inclui a construção de [matrizes](https://pt.wikipedia.org/wiki/Matriz_%28matem%C3%A1tica%29) usando pequenos conjuntos de dados. O benefício de saber como construir matrizes em R é que se tivermos apenas alguns pontos de dados para trabalhar, poderíamos simplesmente criar uma matriz em vez de um CSV que precisaria ser depois importado. Uma das maneiras mais simples de construir uma matriz é criar pelo menos duas variáveis ou vetores e depois ligá-los entre si. Por exemplo, vejamos alguns dados do [Old Bailey](https://pt.wikipedia.org/wiki/Old_Bailey) (o Tribunal Penal Central da Inglaterra e do País de Gales): - - -![Conjunto de dados criminais do [The Old Bailey](https://www.oldbaileyonline.org/) nas décadas entre 1670 e 1800.](/images/r-basics-with-tabular-data/Intro-to-R-2.png) - - -O Old Bailey contém estatísticas e informações sobre casos criminais entre 1674 e 1913 que foram mantidos pelo Tribunal Penal Central de Londres. Se quiséssemos analisar o número total de crimes de roubo e furto violento entre 1670 e 1710, poderíamos colocar esses números em uma matriz. 
- -Para isso, vamos criar as variáveis Roubos e RoubosViolentos usando os totais de cada década como pontos de dados: - -``` -> Roubos <- c(2,30,38,13) -RoubosViolentos <- c(7,20,36,3) -``` - -Para criar uma matriz podemos usar a função cbind() (*column bind* ou união de colunas). Isto une Roubos e RoubosViolentos em colunas, representadas como Crime aqui: - -``` -> Roubos <- c(2,30,38,13) -RoubosViolentos <- c(7,20,36,3) -Crime <- cbind(Roubos,RoubosViolentos) -Crime - Roubos RoubosViolentos -[1,] 2 7 -[2,] 30 20 -[3,] 38 36 -[4,] 13 3 -``` - -Também é possível estabelecer uma matriz usando rbind(). rbind() une os dados em fileiras (*row bind* ou união de fileiras). Observe a diferença entrenCrime e Crime2: - -``` -> Crime2 <- rbind(Roubos,RoubosViolentos) -> Crime2 - [,1] [,2] [,3] [,4] -Roubos 2 30 38 13 -RoubosViolentos 7 20 36 3 -``` - -A segunda matriz também pode ser criada usando a expressão t(Crime) (matriz transposta), que gera o inverso de Crime. - -Também é possível construir uma matriz utilizando matrix(). Isto permite transformar uma sequência de números, como o número de roubos e roubos violentos cometidos, em uma matriz se não tiver criado variáveis separadas para estes valores: - -``` -> matrix(c(2,30,3,4,7,20,36,3),nrow=2) - [,1] [,2] [,3] [,4] -[1,] 2 3 7 36 -[2,] 30 4 20 3 -``` - -``` -[2,] 30 4 20 3 -> matrix(c(2,30,3,4,7,20,36,3),ncol=2) - [,1] [,2] -[1,] 2 7 -[2,] 30 20 -[3,] 3 36 -[4,] 4 3 -``` - -A primeira parte da função é a lista de números. Depois disso, é possível determinar quantas linhas (nrow=) (número de linhas) ou colunas (ncol=) (número de colunas) a matriz terá. - -A função apply() permite executar a mesma função em cada linha ou coluna de uma matriz. 
Existem três partes da função apply(), nas quais é preciso selecionar: a matriz que está sendo utilizada, os termos que se deseja usar e a função que se deseja executar na matriz: - -``` -> Crime - Roubos RoubosViolentos -[1,] 2 7 -[2,] 30 20 -[3,] 38 36 -[4,] 13 3 -> apply(Crime,1,mean) -[1] 4.5 25.0 37.0 8.0 -``` - -Este exemplo mostra a função apply utilizada na matriz Crime para calcular a média (*mean*) de cada linha e, portanto, o número médio de roubos e assaltos combinados que foram cometidos em cada década. Se quiser saber a média de cada coluna, use um 2 em vez de um 1 dentro da função: - -``` -> apply(Crime,2,mean) - Roubos RoubosViolentos - 20.75 16.50 -``` - -Isto mostra o número médio de roubos e assaltos entre as décadas. - -### Prática - -1. Criar uma matriz de duas colunas usando os seguintes dados de Quebra da Paz (*Breaking Peace*) e Assassinatos (*Killing*) de 1710 a 1730 da tabela acima do Old Bailey: c(2,3,3,44,51,17) - -2. Usar a função cbind() para juntar QuebraPaz <- c(2,3,3) e Assassinatos <- c(44,51,17). - -3. Calcular a média de cada coluna para a matriz acima usando a função apply(). - -### Soluções - -1. -``` -> matrix(c(2,3,3,44,51,17),ncol=2) - [,1] [,2] -[1,] 2 44 -[2,] 3 51 -[3,] 3 17 -``` - -2. -``` -> QuebraPaz <- c(2,3,3) -> Assassinatos <- c(44,51,17) -> PazAssassinatos <- cbind(QuebraPaz,Assassinatos) -> PazAssassinatos - QuebraPaz Assassinatos -[1,] 2 44 -[2,] 3 51 -[3,] 3 17 -``` - -3. -``` -> apply(PazAssassinatos,2,mean) -> QuebraPaz Assassinatos -> 2.666667 37.333333 -``` - -Matrizes podem ser úteis quando se está trabalhando com pequenas quantidades de dados. No entanto, nem sempre é a melhor opção, porque uma matriz pode ser difícil de ler. 
Às vezes é mais fácil criar seu próprio ficheiro usando um programa de planilhas como [Excel](https://pt.wikipedia.org/wiki/Microsoft_Excel) ou [Open Office](https://www.openoffice.org/pt/) para garantir que todas as informações que deseja estudar estejam organizadas e importar esse ficheiro para o R. - -## Carregar seu próprio conjunto de dados em R - -Agora que já praticou com dados simples, pode trabalhar com seus próprios dados. Como trabalhar com esses dados em R? Há várias maneiras de se fazer isso. A primeira é carregar a planilha diretamente em R. Outra maneira é importar um ficheiro CSV (*comma-separated values* ou valores separados por vírgula) ou TXT (de texto) para R. - -Para carregar um ficheiro Excel diretamente no console R, é necessário primeiro instalar o pacote readxl (ler o ficheiro Excel). Para fazer isto, digite install.packages("readxl") no console e pressione Enter. Pode ser que seja necessário verificar se o pacote foi instalado no console clicando na guia “Packages&Data” (pacotes e dados) no menu, selecionando “Package Manager” (gerenciador de pacotes) e depois clicando na caixa ao lado do pacote readxl. A partir daqui, é possível selecionar um ficheiro e carregá-lo em R. Abaixo está um exemplo de como pode parecer carregar um simples ficheiro Excel: - -``` -> x <- read_excel("Workbook2.xlsx") -> x - a b -1 1 5 -2 2 6 -3 3 7 -4 4 8 -``` - -Após o comando read_excel insere-se o nome do ficheiro que está sendo selecionado. Os números embaixo correspondem aos dados da planilha de amostra que utilizei. Observe como as linhas estão numeradas e as colunas estão etiquetadas como eram na planilha original. - -Quando estiver carregando dados em R, certifique-se de que o ficheiro que está sendo acessado esteja dentro do diretório em seu computador de onde se está trabalhando. Para verificar isso, digite dir() (diretório) ou getwd() (mostrar o caminho do diretório de trabalho) no console. 
É possível mudar o diretório, se necessário, indo para a aba “Miscellaneous” (diversos) na barra de título em sua tela e, em seguida, selecionando o que se quer definir como diretório para R. Se não fizer isso, R não será capaz de encontrar o ficheiro corretamente. - -Outra maneira de carregar dados em R é usar um ficheiro CSV. Um ficheiro [CSV](https://pt.wikipedia.org/wiki/Comma-separated_values) exibe valores em filas e colunas, separados por vírgulas. É possível salvar qualquer documento criado no Excel como um ficheiro .csv e depois carregá-lo em R. Para usar um ficheiro CSV em R, nomeie o ficheiro usando o comando <- e depois digite read.csv(file="file-name.csv",header=TRUE,sep=",") no console. file-name indica ao R qual ficheiro selecionar, enquanto que definir o cabeçalho ou header= (o ficheiro equivale a), para TRUE (verdadeiro) diz que a primeira linha são cabeçalhos e não variáveis. sep significa que há uma vírgula entre cada número e linha. - -Normalmente, um CSV pode conter muitas informações. Entretanto, para começar, tente criar um ficheiro CSV em Excel usando os dados do *Old Bailey* que usamos para as matrizes. Defina as colunas para as datas entre 1710 e 1730, mais o número de violações de crimes de paz e assassinatos para aquelas décadas. Salve o ficheiro como "OldBailey.csv" e tente carregá-lo em R usando os passos acima. Veremos que: - -``` -> read.csv (file="OldBailey.csv", header=TRUE, sep=",") -Date QuebraPaz Assassinatos -1 1710 2 44 -2 1720 3 51 -3 1730 4 17 -``` - -Agora poderíamos acessar os dados em R e fazer quaisquer cálculos para ajudá-lo a estudar os dados. Os ficheiros CSV também podem ser muito mais complexos do que este exemplo, portanto, qualquer conjunto de dados com os quais trabalhamos em estudos próprios também poderia ser aberto em R. - -TXT (ou ficheiros de texto) podem ser importados para R de maneira semelhante. 
Usando o comando read.table(), é possível carregar ficheiros de texto em R, seguindo a mesma sintaxe que no exemplo acima. - -## Salvar dados en R - -Agora que carregamos dados em R e conhecemos algumas maneiras de trabalhar com os dados, o que acontece se quisermos salvá-los em outro formato? A função write.xlsx() permite que se faça exatamente isso - pegar os dados de R e salvá-los em um ficheiro Excel. Tente escrever o ficheiro do *Old Bailey* em um ficheiro Excel. Primeiro, será necessário carregar o pacote e depois será possível criar o ficheiro após criar uma variável para os dados do *Old Bailey*: - -``` -> library(xlsx) -> write.xlsx(x= OldBailey, file= "OldBailey.xlsx", sheetName= "OldBailey", row.names= TRUE) -``` - -Neste caso, e dentro do parêntese desta função [write.xlsx](https://www.rdocumentation.org/packages/xlsx/versions/0.6.5), estamos chamando para processar a variável "OldBailey" com o argumento x= . Ao mesmo tempo, indicamos que o ficheiro salvo deve ser chamado “OldBailey” com a extensão “.xlsx” com o argumento file= . Além disso, damos o nome "OldBailey" à planilha onde estarão os dados com sheetName= . Finalmente, estabelecemos que queremos (TRUE ou verdadeiro) que os nomes da linha em nossa variável sejam salvos no novo ficheiro. [N. da T.] - -## Resumo e passos seguintes - -Este tutorial explorou as bases do uso de R para trabalhar com dados de pesquisa tabular. O R pode ser uma ferramenta muito útil para a pesquisa em ciências humanas e sociais porque a análise de dados é reprodutível e permite analisar dados rapidamente sem ter que montar um sistema complicado. Agora que conhece alguns dos conceitos básicos do R, pode-se explorar algumas das outras funções do programa, incluindo cálculos estatísticos, produção de gráficos e criação de suas próprias funções. - -Para mais informações sobre o R, visite o [R Manual](https://cran.r-project.org/doc/manuals/r-release/R-intro.html) (em inglês). 
- -Há também uma série de outros tutoriais de R online, inclusive: - -* [R: A self-learn tutorial](https://web.archive.org/web/20191015004305/https://www.nceas.ucsb.edu/files/scicomp/Dloads/RProgramming/BestFirstRTutorial.pdf) (em inglês) - este tutorial passa por uma série de funções e fornece exercícios para praticar competências. - -* [DataCamp Introduction to R](https://www.datacamp.com/courses/free-introduction-to-r) - este é um curso online gratuito que lhe dá feedback sobre seu código para ajudar a identificar erros e aprender como escrever código de forma mais eficiente. - -Finalmente, um grande recurso para historiadores digitais é o [Lincoln Mullen’s Digital History Methods in R](http://dh-r.lincolnmullen.com/). É um rascunho de um livro escrito especificamente sobre como usar R para o trabalho de história digital. - -## Notas - -[^1]: Box, G. E. P., Jenkins, G. M. e Reinsel, G. C. (1976), Time Series Analysis, Forecasting and Control. Third Edition. Holden-Day. Series G. -[^2]: Henderson e Velleman (1981), Building multiple regression models interactively. Biometrics, 37, 391Ð411. -[^3]: Nota da tradutora: Um galão equivale a 3,78 litros e uma milha equivale a 1,6 quilômetros. - +--- +title: Noções básicas de R com dados tabulares +layout: lesson +slug: nocoes-basicas-R-dados-tabulares +date: 2016-09-05 +translation_date: 2021-08-28 +authors: +- Taryn Dewar +reviewers: +- James Baker +- John Russell +editors: +- Adam Crymble +translator: +- Diana Rebelo Rodriguez +translation-editor: +- Jimmy Medeiros +translation-reviewer: +- Ivo Veiga +- Romulo Predes +difficulty: 1 +activity: transforming +topics: [data-manipulation, r] +abstract: "Esta lição ensina uma maneira de analisar rapidamente grandes volumes de dados tabulares, tornando a pesquisa mais rápida e eficaz." 
+review-ticket: https://github.com/programminghistorian/ph-submissions/issues/398 +original: r-basics-with-tabular-data +avatar_alt: Letra R ornamentada e ilustrada +doi: 10.46430/phpt0015 +--- + + + +{% include toc.html %} + +## Objetivos da lição + +À medida que mais e mais registros históricos são digitalizados, ter uma maneira de analisar rapidamente grandes volumes de dados tabulares torna a pesquisa mais rápida e eficaz. + +[R](https://pt.wikipedia.org/wiki/R_%28linguagem_de_programa%C3%A7%C3%A3o%29) é uma linguagem de programação com pontos fortes nas análises estatísticas. Como tal, ela pode ser usada para realizar análises quantitativas sobre fontes históricas, incluindo, mas não se limitando, a testes estatísticos. Como é possível executar repetidamente o mesmo código nas mesmas fontes, R permite analisar dados rapidamente e produz resultados que podem ser replicados. Além disso, como é possível salvar o código, R permite que se redirecionem ou revejam funções para projectos futuros, tornando-o uma parte flexível de sua caixa de ferramentas. + +Este tutorial não pressupõe nenhum conhecimento prévio do R. Ele percorrerá algumas das funções básicas do R e servirá como uma introdução à linguagem. Ele aborda o processo de instalação, explica algumas das ferramentas que se podem usar no R, bem como explica como trabalhar com conjuntos de dados enquanto se faz pesquisa. O tutorial fará isso através de uma série de mini-lições que mostrarão os tipos de fontes com as quais o R funciona bem e exemplos de como fazer cálculos para encontrar informações que possam ser relevantes à pesquisa histórica. A lição também abordará diferentes métodos de entrada de dados para R, tais como matrizes e o uso de ficheiros CSV. + +## Para quem isto é útil? + +R é ideal para analisar conjuntos de dados de grande dimensão que levariam muito tempo para serem processados manualmente. 
Depois de entendida a forma como se escrevem algumas funções básicas e como importar ficheiros de dados próprios, é possível analisar e visualizar os dados de forma rápida e eficiente. + +Embora R seja uma ótima ferramenta para dados tabulares, pode-se achar mais útil utilizar outras abordagens para analisar fontes não tabulares (tais como transcrições de jornais). Caso possua interesse em estudar estes tipos de fontes, dê uma olhada em algumas das outras grandes lições do [The Programming Historian](/pt/). + +## Instalar R + +R é uma linguagem de programação e um ambiente para trabalhar com dados. Ele pode ser executado utilizando o console de R, bem como no [command-line](/en/lessons/intro-to-bash) (linha de comandos) ou na interface [R Studio](https://www.rstudio.com/). Este tutorial irá focar no uso do console de R. Para começar com o R, baixe o programa do [The Comprehensive R Archive Network](https://cran.r-project.org/). R é compatível com Linux, Mac e Windows. + +Quando se abre o console de R pela primeira vez, a janela aberta se parece com essa: +![O console R no Mac.](/images/r-basics-with-tabular-data/Intro-to-R-1.png) + +## Usar o console de R + +O console R é um ótimo lugar para começar a trabalhar quando se é inexperiente em R, porque ele foi projetado especificamente para esta linguagem e tem funções específicas para o R. + +O console é onde se digitam os comandos. Para limpar a tela inicial, vá para 'Edit' (editar) na barra de menu e selecione 'Clear Console' (limpar console). Isto iniciará R com uma nova página. Também é possível mudar a aparência do console clicando na roda colorida no topo do console em um Mac, ou selecionando 'GUI Preferences' (preferências da Interface Gráfica do Usuário) no menu 'Edit' em um PC. Além disso, também é possível ajustar a cor da tela de fundo e as cores da fonte para as funções.
+ +## Usar conjuntos de dados + +Antes de trabalhar com dados próprios, usar os conjuntos de dados já incorporados ajuda a ter uma noção de como R funciona. É possível pesquisar nos conjuntos de dados inserindo data() no console. Isto mostrará a lista de todos os conjuntos de dados disponíveis em uma janela separada. Essa lista inclui os títulos de todos os diferentes conjuntos de dados, bem como uma breve descrição sobre as informações em cada um deles. + +No exemplo abaixo iremos primeiro carregar o conjunto de dados AirPassengers na sua sessão R digitando data(AirPassengers) na próxima linha do console[^1] e pressionando Enter. Para visualizar o conjunto de dados, digite apenas AirPassengers na próxima linha e pressione Enter novamente. Isso imprimirá uma tabela mostrando o número de passageiros que voaram em companhias aéreas internacionais entre janeiro de 1949 e dezembro de 1960, em milhares. Deverá aparecer o seguinte: + +``` +> data(AirPassengers) +> AirPassengers + Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec +1949 112 118 132 129 121 135 148 148 136 119 104 118 +1950 115 126 141 135 125 149 170 170 158 133 114 140 +1951 145 150 178 163 172 178 199 199 184 162 146 166 +1952 171 180 193 181 183 218 230 242 209 191 172 194 +1953 196 196 236 235 229 243 264 272 237 211 180 201 +1954 204 188 235 227 234 264 302 293 259 229 203 229 +1955 242 233 267 269 270 315 364 347 312 274 237 278 +1956 284 277 317 313 318 374 413 405 355 306 271 306 +1957 315 301 356 348 355 422 465 467 404 347 305 336 +1958 340 318 362 348 363 435 491 505 404 359 310 337 +1959 360 342 406 396 420 472 548 559 463 407 362 405 +1960 417 391 419 461 472 535 622 606 508 461 390 432 +``` + +Agora, é possível usar R para responder a uma série de perguntas com base nestes dados. Por exemplo, quais foram os meses mais populares para voar? Houve um aumento nas viagens internacionais ao longo do tempo?
Provavelmente poderíamos encontrar as respostas a tais perguntas simplesmente escaneando esta tabela, mas não tão rapidamente quanto o computador. E se houvesse muito mais dados? + +## Funções básicas + +R pode ser usado para calcular uma série de valores que podem ser úteis enquanto se faz pesquisa em um conjunto de dados. Por exemplo, é possível encontrar a [média](https://pt.wikipedia.org/wiki/M%C3%A9dia), a [mediana](https://pt.wikipedia.org/wiki/Mediana_%28estat%C3%ADstica%29) e os valores mínimos e máximos. Para encontrar a média e a mediana no conjunto de dados, insere-se, respectivamente, mean(AirPassengers) e median(AirPassengers) no console. E se quisermos calcular mais de um valor de cada vez? Para produzir um resumo dos dados, digite summary(AirPassengers) (resumo) no console. Isto dará os valores mínimo e máximo dos dados, assim como a média, a mediana e os valores do primeiro e terceiro quartil. + +``` +> summary(AirPassengers) + Min. 1st Qu. Median Mean 3rd Qu. Max. + 104.0 180.0 265.5 280.3 360.5 622.0 +``` + +Um resumo nos mostra que o número mínimo de passageiros entre janeiro de 1949 e dezembro de 1960 foi de 104.000 e que o número máximo de passageiros foi de 622.000. O valor médio nos mostra que aproximadamente 280.300 pessoas viajavam por mês durante o período de coleta dos dados. Estes valores podem ser úteis para ver o grau de variação no número de passageiros ao longo do tempo. + +Usar a função summary() é uma boa maneira de se obter uma visão geral de todo o conjunto de dados. No entanto, e se quisermos analisar um subconjunto de dados, como um determinado ano ou alguns meses? É possível selecionar diferentes pontos de dados (como um determinado mês) e intervalos (como um determinado ano) em R para calcular muitos valores diferentes. Por exemplo, pode-se adicionar o número de passageiros durante dois meses para determinar o número total de passageiros durante esse período de tempo. 
+ +Tente adicionar os dois primeiros valores dos dados AirPassengers no console e, em seguida, pressione 'Enter'. Devem aparecer duas linhas assim: + +``` +> 112+118 +[1] 230 +``` + +Isto lhe daria o número total de passageiros (em milhares) que voaram em janeiro e fevereiro de 1949. + +R pode fazer muito mais do que simples aritmética. É possível criar objetos, ou [variáveis](https://pt.wikipedia.org/wiki/Vari%C3%A1vel_%28programa%C3%A7%C3%A3o%29), para representar números e [expressões](https://pt.wikipedia.org/wiki/Express%C3%A3o_%28computa%C3%A7%C3%A3o%29). Por exemplo, pode-se nomear o valor de janeiro de 1949 como variável Jan1949. Digite Jan1949 <- 112 no console e, em seguida, Jan1949 na linha seguinte. A notação <- atribui o valor 112 à variável Jan1949. O que deve aparecer é: + +``` +> Jan1949 <- 112 +> Jan1949 +[1] 112 +``` + +R é sensível a maiúsculas e minúsculas, portanto tenha cuidado para usar a mesma notação quando usar as variáveis que foram atribuídas (ou nomeadas) em outras ações. Veja o artigo de Rasmus Bååth, [The State of Naming Conventions in R](https://perma.cc/ME6W-JZJQ) (em inglês), para mais informações sobre como nomear melhor as variáveis. + +Para remover uma variável do console, digite rm() (*remove* ou apagar) com a variável que se deseja apagar dentro dos parênteses, e pressione Enter. Para ver todas as variáveis atribuídas, digite ls() (*list objects* ou lista de objetos) no console e pressione Enter. Isto pode ajudar a evitar o uso do mesmo nome para múltiplas variáveis. Isto também é importante porque R armazena todos os objetos que são criados em sua memória, portanto, mesmo que não se consiga ver uma variável nomeada x no console, ela pode ter sido criada antes e acidentalmente poderia sobrescrevê-la ao atribuir outra variável. + +Aqui está a lista de variáveis que criamos até agora: + +``` +> ls() +[1] "AirPassengers" "Jan1949" +``` + +Temos as variáveis AirPassengers e Jan1949.
Se removermos a variável Jan1949 e digitarmos novamente ls(), veremos: + +``` +> rm(Jan1949) +> ls() +[1] "AirPassengers" +``` + +Se a qualquer momento não conseguir corrigir um erro ou ficar preso a uma função, digite help() no console para abrir a página de ajuda. Também é possível encontrar ajuda geral usando o menu ‘Help’ na parte superior do console. Se quiser mudar algo no código que já escreveu, pode-se digitar novamente o código em uma nova linha. Para economizar tempo, também é possível usar as setas do teclado para rolar para cima e para baixo no console para encontrar a linha de código que se deseja mudar. + +É possível usar letras como variáveis, mas quando começar a trabalhar com seus próprios dados, pode ser mais fácil atribuir nomes que sejam mais representativos desses dados. Mesmo com os dados AirPassengers, atribuir variáveis que se correlacionam com meses ou anos específicos tornaria mais fácil saber exatamente com quais valores se está trabalhando. + +### Prática + +A. Atribuir os valores de janeiro de 1950 e janeiro de 1960 dos dados de AirPassengers() em dois objetos novos. Em seguida, somar os valores dos dois objetos criados em uma nova linha de código. + +B. Usar os dois objetos criados para encontrar a diferença entre os viajantes aéreos de janeiro de 1960 e de 1950. + +### Soluções + +A. Atribuir variáveis para os pontos de janeiro de 1950 e janeiro de 1960 dos dados de AirPassengers(). Adicionar as duas variáveis juntas na linha seguinte. + +``` +> Jan1950<- 115 +> Jan1960<- 417 +> Jan1950+Jan1960 +[1] 532 +``` + +Isto significa que 532.000 pessoas viajaram em voos internacionais em janeiro de 1950 e janeiro de 1960. + +B. Usar as variáveis que foram criadas para encontrar a diferença entre os viajantes aéreos de 1960 e 1950. + +``` +> Jan1960-Jan1950 +[1] 302 +``` + +Isto significa que, em janeiro de 1960, mais 302.000 pessoas viajaram em voos internacionais do que em janeiro de 1950.
+ +Definir variáveis para pontos de dados individuais pode ser entediante, especialmente se os nomes atribuídos são bastante longos. Entretanto, o processo é semelhante para atribuir um intervalo de valores a uma variável, como todos os pontos de dados durante um ano. Fazemos isso criando listas chamadas ‘vetores’ usando o comando c. c significa ‘combinar’ e nos permite vincular números em uma variável comum. Por exemplo, pode-se criar um vetor para os dados AirPassengers() de 1949 nomeado Air49: + +``` +> Air49<- c(112,118,132,129,121,135,148,148,136,119,104,118) +``` + +Cada item é acessível usando o nome da variável e sua posição no índice (a partir de 1). Neste caso, Air49[2] contém o valor que corresponde a fevereiro de 1949 - 118. + +``` +> Air49[2] +[1] 118 +``` + +É possível criar uma lista de valores consecutivos usando dois pontos. Por exemplo: + +``` +> y <- 1:10 +> y +[1] 1 2 3 4 5 6 7 8 9 10 +``` + +Usando este conhecimento, podemos usar a seguinte expressão para definir uma variável para os dados AirPassengers de 1949. + +``` +> Air49 <- AirPassengers[1:12] +> Air49 + [1] 112 118 132 129 121 135 148 148 136 119 104 118 +``` + +AirPassengers[1:12] selecionou os primeiros doze termos no conjunto de dados AirPassengers. Isto dá o mesmo resultado que acima, mas leva menos tempo e também reduz a chance de que um valor seja transcrito incorretamente. + +Para obter o número total de passageiros para 1949, é possível somar todos os termos no vetor, usando a função sum() (somar). + +``` +> sum(Air49) +[1] 1520 +``` + +Portanto, o número total de passageiros em 1949 era de aproximadamente 1.520.000. + +Finalmente, a função length() (comprimento) torna possível saber o número de objetos em um vetor: + +``` +> length(Air49) +[1] 12 +``` + +### Prática + +1. Criar uma variável para os dados AirPassengers de 1950. +2. Imprimir ou apresentar o segundo objeto da série de 1950. +3. Qual é o tamanho (*length*) da sequência na pergunta 2? +4.
Quantos passageiros voaram no total em 1950? + +### Soluções + +1. +``` +> Air50 <- AirPassengers[13:24] +Air50 +[1] 115 126 141 135 125 149 170 170 158 133 114 140 +``` + +2. +``` +> Air50[2] +[1] 126 +``` + +3. +``` +> length(Air50) +[1] 12 +``` + +4. +``` +>sum(Air50) +[1] 1676 +``` + +Caso se quisesse criar variáveis para todos os anos no conjunto de dados, seria possível então usar algumas das ferramentas que examinamos para determinar o número de pessoas que viajam de avião ao longo do tempo. Aqui está uma lista de variáveis para 1949 a 1960, seguida pelo número total de passageiros para cada ano: + +``` +> Air49 <- AirPassengers[1:12] +Air50 <- AirPassengers[13:24] +Air51 <- AirPassengers[25:36] +Air52 <- AirPassengers[37:48] +Air53 <- AirPassengers[49:60] +Air54 <- AirPassengers[61:72] +Air55 <- AirPassengers[73:84] +Air56 <- AirPassengers[85:96] +Air57 <- AirPassengers[97:108] +Air58 <- AirPassengers[109:120] +Air59 <- AirPassengers[121:132] +Air60 <- AirPassengers[133:144] +``` + +``` +> sum(Air49) +[1] 1520 +sum(Air50) +[1] 1676 +sum(Air51) +[1] 2042 +sum(Air52) +[1] 2364 +sum(Air53) +[1] 2700 +sum(Air54) +[1] 2867 +sum(Air55) +[1] 3408 +sum(Air56) +[1] 3939 +sum(Air57) +[1] 4421 +sum(Air58) +[1] 4572 +sum(Air59) +[1] 5140 +sum(Air60) +[1] 5714 +``` + +A partir destas informações, podemos ver que o número de passageiros aumenta a cada ano. É possível ir mais longe com estes dados para determinar se havia um interesse crescente em férias em certos períodos do ano, ou mesmo o aumento percentual de passageiros ao longo do tempo. + +## Trabalhar com bases de dados maiores + +Note que o exemplo acima não é bem adequado para conjuntos de dados de grande dimensão: contar pontos de dados para encontrar os corretos seria muito entediante. Pense no que aconteceria se procurássemos informações do ano 96 em um conjunto de dados com 150 anos de dados coletados. 
+ +É possível selecionar linhas e colunas específicas de dados se o conjunto de dados estiver em um formato particular. Carregue os dados de mtcars em seu console: + +``` +> data(mtcars) +> mtcars + mpg cyl disp hp drat wt qsec vs am gear carb +Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 +Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 +Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 +Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 +Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 +Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 +Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 +Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 +Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 +Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 +Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 +Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3 +Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3 +Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3 +Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4 +Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4 +Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4 +Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 +Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 +Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 +Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1 +Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2 +AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2 +Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4 +Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2 +Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 +Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2 +Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2 +Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4 +Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 +Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 +Volvo 142E 21.4 4 121.0 109 4.11 
2.780 18.60 1 1 4 2 +``` + +Este [conjunto de dados](https://stat.ethz.ch/R-manual/R-devel/library/datasets/html/mtcars.html) fornece uma visão geral dos *Motor Trend Car Road Tests* de automóveis da revista Motor Trend de 1974[^2]. Ele contém informações sobre quantas milhas por galão ou quilômetros por litro um carro poderia percorrer[^3], o número de cilindros do motor em cada carro, potência, relação do eixo traseiro, peso, e outras características de cada modelo. Os dados poderão ser usados para descobrir qual destas características tornou cada tipo de carro mais ou menos seguro para os passageiros ao longo do tempo. + +É possível selecionar colunas inserindo o nome do conjunto de dados seguido por colchetes e o número da linha ou coluna de dados que lhe interessa. Para ordenar as linhas e colunas, pense no dataset[x,y], sendo dataset o conjunto de dados com o qual se está trabalhando, x a linha e y a coluna. + +Se estivesse interessado na primeira linha de informações no conjunto mtcars, deveria executar o seguinte em seu console: + +``` +> mtcars[1,] + mpg cyl disp hp drat wt qsec vs am gear carb +Mazda RX4 21 6 160 110 3.9 2.62 16.46 0 1 4 4 +``` + +Para ver uma coluna dos dados, podemos digitar: + +``` +> mtcars[,2] + [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4 +``` + +Isto mostra todos os valores sob a categoria cyl (número de cilindros). A maioria dos modelos de carros tem motores de 4, 6 ou 8 cilindros. Também é possível selecionar pontos de dados individuais inserindo valores tanto para x (linha) quanto para y (coluna): + +``` + > mtcars[1,2] +[1] 6 +``` + +Isto retorna o valor na primeira linha, segunda coluna. A partir daqui, seria possível executar um resumo em uma linha ou coluna de dados sem ter que contar o número de termos no conjunto de dados.
Por exemplo, digitar summary(mtcars[,1]) no console e pressionar 'Enter' daria o resumo para as milhas por galão que os diferentes carros no conjunto de dados mtcars usam: + +``` +> summary(mtcars[,1]) + Min. 1st Qu. Median Mean 3rd Qu. Max. + 10.40 15.42 19.20 20.09 22.80 33.90 +``` + +O resumo indica que a eficiência máxima de combustível foi de 33,9 milhas por galão ou 54,5 quilômetros por 3,78 litros, do Toyota Corolla e o menos eficiente foi o Lincoln Continental, que só conseguiu 10,4 milhas por galão, ou seja, 16,7 quilômetros por 3,78 litros. Podemos encontrar os carros que correspondem aos pontos de valor olhando de volta para a tabela. É muito mais fácil encontrar um valor específico do que tentar fazer as contas em sua cabeça ou pesquisar através de uma planilha. + +## Matrizes + +Agora que temos uma melhor compreensão de como algumas das funções básicas em R funcionam, podemos analisar maneiras de usar essas funções em nossos próprios dados. Isto inclui a construção de [matrizes](https://pt.wikipedia.org/wiki/Matriz_%28matem%C3%A1tica%29) usando pequenos conjuntos de dados. O benefício de saber como construir matrizes em R é que se tivermos apenas alguns pontos de dados para trabalhar, poderíamos simplesmente criar uma matriz em vez de um CSV que precisaria ser depois importado. Uma das maneiras mais simples de construir uma matriz é criar pelo menos duas variáveis ou vetores e depois ligá-los entre si. Por exemplo, vejamos alguns dados do [Old Bailey](https://pt.wikipedia.org/wiki/Old_Bailey) (o Tribunal Penal Central da Inglaterra e do País de Gales): + + +![Conjunto de dados criminais do [The Old Bailey](https://www.oldbaileyonline.org/) nas décadas entre 1670 e 1800.](/images/r-basics-with-tabular-data/Intro-to-R-2.png) + + +O Old Bailey contém estatísticas e informações sobre casos criminais entre 1674 e 1913 que foram mantidos pelo Tribunal Penal Central de Londres.
Se quiséssemos analisar o número total de crimes de roubo e furto violento entre 1670 e 1710, poderíamos colocar esses números em uma matriz. + +Para isso, vamos criar as variáveis Roubos e RoubosViolentos usando os totais de cada década como pontos de dados: + +``` +> Roubos <- c(2,30,38,13) +RoubosViolentos <- c(7,20,36,3) +``` + +Para criar uma matriz podemos usar a função cbind() (*column bind* ou união de colunas). Isto une Roubos e RoubosViolentos em colunas, representadas como Crime aqui: + +``` +> Roubos <- c(2,30,38,13) +RoubosViolentos <- c(7,20,36,3) +Crime <- cbind(Roubos,RoubosViolentos) +Crime + Roubos RoubosViolentos +[1,] 2 7 +[2,] 30 20 +[3,] 38 36 +[4,] 13 3 +``` + +Também é possível estabelecer uma matriz usando rbind(). rbind() une os dados em fileiras (*row bind* ou união de fileiras). Observe a diferença entre Crime e Crime2: + +``` +> Crime2 <- rbind(Roubos,RoubosViolentos) +> Crime2 + [,1] [,2] [,3] [,4] +Roubos 2 30 38 13 +RoubosViolentos 7 20 36 3 +``` + +A segunda matriz também pode ser criada usando a expressão t(Crime) (matriz transposta), que gera o inverso de Crime. + +Também é possível construir uma matriz utilizando matrix(). Isto permite transformar uma sequência de números, como o número de roubos e roubos violentos cometidos, em uma matriz se não tiver criado variáveis separadas para estes valores: + +``` +> matrix(c(2,30,3,4,7,20,36,3),nrow=2) + [,1] [,2] [,3] [,4] +[1,] 2 3 7 36 +[2,] 30 4 20 3 +``` + +``` +[2,] 30 4 20 3 +> matrix(c(2,30,3,4,7,20,36,3),ncol=2) + [,1] [,2] +[1,] 2 7 +[2,] 30 20 +[3,] 3 36 +[4,] 4 3 +``` + +A primeira parte da função é a lista de números. Depois disso, é possível determinar quantas linhas (nrow=) (número de linhas) ou colunas (ncol=) (número de colunas) a matriz terá. + +A função apply() permite executar a mesma função em cada linha ou coluna de uma matriz.
Existem três partes da função apply(), nas quais é preciso selecionar: a matriz que está sendo utilizada, os termos que se deseja usar e a função que se deseja executar na matriz: + +``` +> Crime + Roubos RoubosViolentos +[1,] 2 7 +[2,] 30 20 +[3,] 38 36 +[4,] 13 3 +> apply(Crime,1,mean) +[1] 4.5 25.0 37.0 8.0 +``` + +Este exemplo mostra a função apply utilizada na matriz Crime para calcular a média (*mean*) de cada linha e, portanto, o número médio de roubos e assaltos combinados que foram cometidos em cada década. Se quiser saber a média de cada coluna, use um 2 em vez de um 1 dentro da função: + +``` +> apply(Crime,2,mean) + Roubos RoubosViolentos + 20.75 16.50 +``` + +Isto mostra o número médio de roubos e assaltos entre as décadas. + +### Prática + +1. Criar uma matriz de duas colunas usando os seguintes dados de Quebra da Paz (*Breaking Peace*) e Assassinatos (*Killing*) de 1710 a 1730 da tabela acima do Old Bailey: c(2,3,3,44,51,17) + +2. Usar a função cbind() para juntar QuebraPaz <- c(2,3,3) e Assassinatos <- c(44,51,17). + +3. Calcular a média de cada coluna para a matriz acima usando a função apply(). + +### Soluções + +1. +``` +> matrix(c(2,3,3,44,51,17),ncol=2) + [,1] [,2] +[1,] 2 44 +[2,] 3 51 +[3,] 3 17 +``` + +2. +``` +> QuebraPaz <- c(2,3,3) +> Assassinatos <- c(44,51,17) +> PazAssassinatos <- cbind(QuebraPaz,Assassinatos) +> PazAssassinatos + QuebraPaz Assassinatos +[1,] 2 44 +[2,] 3 51 +[3,] 3 17 +``` + +3. +``` +> apply(PazAssassinatos,2,mean) +> QuebraPaz Assassinatos +> 2.666667 37.333333 +``` + +Matrizes podem ser úteis quando se está trabalhando com pequenas quantidades de dados. No entanto, nem sempre é a melhor opção, porque uma matriz pode ser difícil de ler. 
Às vezes é mais fácil criar seu próprio ficheiro usando um programa de planilhas como [Excel](https://pt.wikipedia.org/wiki/Microsoft_Excel) ou [Open Office](https://www.openoffice.org/pt/) para garantir que todas as informações que deseja estudar estejam organizadas e importar esse ficheiro para o R. + +## Carregar seu próprio conjunto de dados em R + +Agora que já praticou com dados simples, pode trabalhar com seus próprios dados. Como trabalhar com esses dados em R? Há várias maneiras de se fazer isso. A primeira é carregar a planilha diretamente em R. Outra maneira é importar um ficheiro CSV (*comma-separated values* ou valores separados por vírgula) ou TXT (de texto) para R. + +Para carregar um ficheiro Excel diretamente no console R, é necessário primeiro instalar o pacote readxl (ler o ficheiro Excel). Para fazer isto, digite install.packages("readxl") no console e pressione Enter. Pode ser que seja necessário verificar se o pacote foi instalado no console clicando na guia “Packages&Data” (pacotes e dados) no menu, selecionando “Package Manager” (gerenciador de pacotes) e depois clicando na caixa ao lado do pacote readxl. A partir daqui, é possível selecionar um ficheiro e carregá-lo em R. Abaixo está um exemplo de como pode parecer carregar um simples ficheiro Excel: + +``` +> x <- read_excel("Workbook2.xlsx") +> x + a b +1 1 5 +2 2 6 +3 3 7 +4 4 8 +``` + +Após o comando read_excel insere-se o nome do ficheiro que está sendo selecionado. Os números embaixo correspondem aos dados da planilha de amostra que utilizei. Observe como as linhas estão numeradas e as colunas estão etiquetadas como eram na planilha original. + +Quando estiver carregando dados em R, certifique-se de que o ficheiro que está sendo acessado esteja dentro do diretório em seu computador de onde se está trabalhando. Para verificar isso, digite dir() (diretório) ou getwd() (mostrar o caminho do diretório de trabalho) no console. 
É possível mudar o diretório, se necessário, indo para a aba “Miscellaneous” (diversos) na barra de título em sua tela e, em seguida, selecionando o que se quer definir como diretório para R. Se não fizer isso, R não será capaz de encontrar o ficheiro corretamente. + +Outra maneira de carregar dados em R é usar um ficheiro CSV. Um ficheiro [CSV](https://pt.wikipedia.org/wiki/Comma-separated_values) exibe valores em filas e colunas, separados por vírgulas. É possível salvar qualquer documento criado no Excel como um ficheiro .csv e depois carregá-lo em R. Para usar um ficheiro CSV em R, nomeie o ficheiro usando o comando <- e depois digite read.csv(file="file-name.csv",header=TRUE,sep=",") no console. file-name indica ao R qual ficheiro selecionar, enquanto que definir o cabeçalho ou header= (o ficheiro equivale a), para TRUE (verdadeiro) diz que a primeira linha são cabeçalhos e não variáveis. sep significa que há uma vírgula entre cada número e linha. + +Normalmente, um CSV pode conter muitas informações. Entretanto, para começar, tente criar um ficheiro CSV em Excel usando os dados do *Old Bailey* que usamos para as matrizes. Defina as colunas para as datas entre 1710 e 1730, mais o número de violações de crimes de paz e assassinatos para aquelas décadas. Salve o ficheiro como "OldBailey.csv" e tente carregá-lo em R usando os passos acima. Veremos que: + +``` +> read.csv (file="OldBailey.csv", header=TRUE, sep=",") +Date QuebraPaz Assassinatos +1 1710 2 44 +2 1720 3 51 +3 1730 4 17 +``` + +Agora poderíamos acessar os dados em R e fazer quaisquer cálculos para ajudá-lo a estudar os dados. Os ficheiros CSV também podem ser muito mais complexos do que este exemplo, portanto, qualquer conjunto de dados com os quais trabalhamos em estudos próprios também poderia ser aberto em R. + +TXT (ou ficheiros de texto) podem ser importados para R de maneira semelhante. 
Usando o comando read.table(), é possível carregar ficheiros de texto em R, seguindo a mesma sintaxe que no exemplo acima. + +## Salvar dados em R + +Agora que carregamos dados em R e conhecemos algumas maneiras de trabalhar com os dados, o que acontece se quisermos salvá-los em outro formato? A função write.xlsx() permite que se faça exatamente isso - pegar os dados de R e salvá-los em um ficheiro Excel. Tente escrever o ficheiro do *Old Bailey* em um ficheiro Excel. Primeiro, será necessário carregar o pacote e depois será possível criar o ficheiro após criar uma variável para os dados do *Old Bailey*: + +``` +> library(xlsx) +> write.xlsx(x= OldBailey, file= "OldBailey.xlsx", sheetName= "OldBailey", row.names= TRUE) +``` + +Neste caso, e dentro do parêntese desta função [write.xlsx](https://www.rdocumentation.org/packages/xlsx/versions/0.6.5), estamos chamando para processar a variável "OldBailey" com o argumento x= . Ao mesmo tempo, indicamos que o ficheiro salvo deve ser chamado “OldBailey” com a extensão “.xlsx” com o argumento file= . Além disso, damos o nome "OldBailey" à planilha onde estarão os dados com sheetName= . Finalmente, estabelecemos que queremos (TRUE ou verdadeiro) que os nomes da linha em nossa variável sejam salvos no novo ficheiro. [N. da T.] + +## Resumo e passos seguintes + +Este tutorial explorou as bases do uso de R para trabalhar com dados de pesquisa tabular. O R pode ser uma ferramenta muito útil para a pesquisa em ciências humanas e sociais porque a análise de dados é reprodutível e permite analisar dados rapidamente sem ter que montar um sistema complicado. Agora que conhece alguns dos conceitos básicos do R, pode-se explorar algumas das outras funções do programa, incluindo cálculos estatísticos, produção de gráficos e criação de suas próprias funções. + +Para mais informações sobre o R, visite o [R Manual](https://cran.r-project.org/doc/manuals/r-release/R-intro.html) (em inglês).
+ +Há também uma série de outros tutoriais de R online, inclusive: + +* [R: A self-learn tutorial](https://web.archive.org/web/20191015004305/https://www.nceas.ucsb.edu/files/scicomp/Dloads/RProgramming/BestFirstRTutorial.pdf) (em inglês) - este tutorial passa por uma série de funções e fornece exercícios para praticar competências. + +* [DataCamp Introduction to R](https://www.datacamp.com/courses/free-introduction-to-r) - este é um curso online gratuito que lhe dá feedback sobre seu código para ajudar a identificar erros e aprender como escrever código de forma mais eficiente. + +Finalmente, um grande recurso para historiadores digitais é o [Lincoln Mullen’s Digital History Methods in R](https://dh-r.lincolnmullen.com/). É um rascunho de um livro escrito especificamente sobre como usar R para o trabalho de história digital. + +## Notas + +[^1]: Box, G. E. P., Jenkins, G. M. e Reinsel, G. C. (1976), Time Series Analysis, Forecasting and Control. Third Edition. Holden-Day. Series G. +[^2]: Henderson e Velleman (1981), Building multiple regression models interactively. Biometrics, 37, 391–411. +[^3]: Nota da tradutora: Um galão equivale a 3,78 litros e uma milha equivale a 1,6 quilômetros. + diff --git a/pt/licoes/nocoes-basicas-paginas-web-html.md b/pt/licoes/nocoes-basicas-paginas-web-html.md index e355644598..e6fbd227c6 100644 --- a/pt/licoes/nocoes-basicas-paginas-web-html.md +++ b/pt/licoes/nocoes-basicas-paginas-web-html.md @@ -1,121 +1,121 @@ ---- -title: Noções básicas de páginas web e HTML -layout: lesson -slug: nocoes-basicas-paginas-web-html -date: 2012-07-17 -translation_date: 2021-05-12 -authors: -- William J.
Turkel -- Adam Crymble -reviewers: -- Jim Clifford -- Amanda Morton -editors: -- Miriam Posner -translator: -- Aracele Torres -translation-editor: -- Danielle Sanches -translation-reviewer: -- Bruno Martins -- Rômulo Predes -difficulty: 2 -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/318 -activity: presenting -topics: [python] -abstract: "Esta lição é uma introdução ao HTML e às páginas da web que ele estrutura." -next: trabalhando-ficheiros-texto-python -previous: introducao-e-instalacao -original: viewing-html-files -avatar_alt: Uma mulher ouvindo um homem através de uma trombeta de ouvido -doi: 10.46430/phpt0002 ---- - -{% include toc.html %} - - - - -## Visualizando arquivos HTML - -Quando você está trabalhando com fontes online, na maior parte do tempo utiliza -ficheiros contendo anotações em HTML (Hyper Text Markup Language). O seu navegador web já -sabe como interpretar HTML, apresentando a informação de uma forma adequada para leitores humanos. -A maioria dos navegadores também permite que você veja o *código-fonte* HTML de qualquer página que você visitar. -As duas imagens abaixo mostram uma página web típica (do *Old Bailey Online*) e o código -HTML usado para gerar essa página, que você pode ver com a opção do menu do Firefox -`Abrir menu -> Desenvolvimento web -> Código-fonte da página`. - -Quando você está trabalhando no navegador, normalmente não precisa (ou quer) ver o código-fonte HTML de uma página da web. -No entanto, se você está criando uma página própria, pode ser muito útil ver como outras pessoas realizaram um -determinado efeito. Você também vai querer estudar o código HTML enquanto escreve -programas para manipular páginas da web ou extrair informação automaticamente delas. 
- -{% include figure.html filename="obo.png" caption="Captura de tela do Old Bailey Online" %} - -{% include figure.html filename="obo-page-source.png" caption="Código HTML da página Old Bailey Online" %} - -(Para aprender mais sobre HTML, você pode achar útil nesse momento usar o [W3 Schools HTML Tutorial][]. Um conhecimento detalhado de HTML não é necessário para continuar lendo, mas qualquer tempo que você passe aprendendo HTML será amplamente recompensado no seu trabalho como historiador digital ou humanista digital.) - -## "Olá mundo" em HTML - -A HTML é conhecida como uma linguagem de *marcação*. Em outras palavras, HTML é o texto que foi "marcado" (i.e., anotado), com *tags* que fornecem informações para o interpretador (que geralmente é um navegador web). Suponha que está formatando uma entrada bibliográfica e quer indicar o título de um trabalho, colocando-o em itálico. Em HTML, pode utilizar tags `em` ("em" significa ênfase) para este efeito. Portanto, parte do seu ficheiro HTML pode ter a seguinte aparência: - -``` xml -... em Digital History de Cohen e Rosenzweig, por exemplo ... -``` - -O ficheiro HTML mais simples consiste em *tags* que indicam o início e o fim de todo o documento, e *tags* que identificam um `head` e um `body` dentro desse documento. A informação descritiva (i.e., os "meta-dados") sobre o ficheiro geralmente vai para o cabeçalho, enquanto que a informação que será exibida ao leitor humano geralmente vai para o corpo. - -``` xml - - -Olá mundo! - -``` - -Você pode tentar criar algum código HTML. Com o seu editor de texto, crie um novo ficheiro. Copie o código abaixo no editor. A primeira linha diz ao navegador qual o tipo do ficheiro. A *tag* `html` tem a direção do texto definida como `ltr` (da esquerda para a direita), e ainda a propriedade `lang` (idioma) definida como português. 
A *tag* `title` no cabeçalho do documento HTML contém informação que geralmente é exibida na barra superior de uma janela quando a página está sendo visualizada, e nas abas do Firefox. - - -``` xml - - - - - <!-- Insira seu título aqui --> - - - - - - -``` - -Altere - -``` xml - -``` - -e - -``` xml - -``` - -para - -``` xml -Olá mundo! -``` - -Guarde o ficheiro num diretório `programming-historian` como `ola-mundo.html`. De seguida, vá para o Firefox e escolha `Abrir menu -> Abrir ficheiro...` e -então escolha `ola-mundo.html`. Dependendo do seu editor de texto, você pode ter a opção 'visualizar página no navegador' ou 'abrir no navegador'. Depois de abrir o ficheiro, a sua mensagem deve aparecer no navegador. Observe a diferença entre abrir um ficheiro HTML com um navegador como o Firefox (que o interpreta), ou abrir o mesmo ficheiro com seu editor de texto (que não faz o mesmo). - -## Leituras sugeridas para aprender HTML - -- [W3 Schools HTML Tutorial][] -- [W3 Schools HTML5 Tutorial][] - - [W3 Schools HTML tutorial]: http://www.w3schools.com/html/default.asp - [W3 Schools HTML5 Tutorial]: http://www.w3schools.com/html/html5_intro.asp +--- +title: Noções básicas de páginas web e HTML +layout: lesson +slug: nocoes-basicas-paginas-web-html +date: 2012-07-17 +translation_date: 2021-05-12 +authors: +- William J. Turkel +- Adam Crymble +reviewers: +- Jim Clifford +- Amanda Morton +editors: +- Miriam Posner +translator: +- Aracele Torres +translation-editor: +- Danielle Sanches +translation-reviewer: +- Bruno Martins +- Rômulo Predes +difficulty: 2 +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/318 +activity: presenting +topics: [python] +abstract: "Esta lição é uma introdução ao HTML e às páginas da web que ele estrutura." 
+next: /pt/licoes/trabalhando-ficheiros-texto-python +previous: /pt/licoes/introducao-e-instalacao +original: viewing-html-files +avatar_alt: Uma mulher ouvindo um homem através de uma trombeta de ouvido +doi: 10.46430/phpt0002 +--- + +{% include toc.html %} + + + + +## Visualizando arquivos HTML + +Quando você está trabalhando com fontes online, na maior parte do tempo utiliza +ficheiros contendo anotações em HTML (Hyper Text Markup Language). O seu navegador web já +sabe como interpretar HTML, apresentando a informação de uma forma adequada para leitores humanos. +A maioria dos navegadores também permite que você veja o *código-fonte* HTML de qualquer página que você visitar. +As duas imagens abaixo mostram uma página web típica (do *Old Bailey Online*) e o código +HTML usado para gerar essa página, que você pode ver com a opção do menu do Firefox +`Abrir menu -> Desenvolvimento web -> Código-fonte da página`. + +Quando você está trabalhando no navegador, normalmente não precisa (ou quer) ver o código-fonte HTML de uma página da web. +No entanto, se você está criando uma página própria, pode ser muito útil ver como outras pessoas realizaram um +determinado efeito. Você também vai querer estudar o código HTML enquanto escreve +programas para manipular páginas da web ou extrair informação automaticamente delas. + +{% include figure.html filename="obo.png" caption="Captura de tela do Old Bailey Online" %} + +{% include figure.html filename="obo-page-source.png" caption="Código HTML da página Old Bailey Online" %} + +(Para aprender mais sobre HTML, você pode achar útil nesse momento usar o [W3 Schools HTML Tutorial][]. Um conhecimento detalhado de HTML não é necessário para continuar lendo, mas qualquer tempo que você passe aprendendo HTML será amplamente recompensado no seu trabalho como historiador digital ou humanista digital.) + +## "Olá mundo" em HTML + +A HTML é conhecida como uma linguagem de *marcação*. 
Em outras palavras, HTML é o texto que foi "marcado" (i.e., anotado), com *tags* que fornecem informações para o interpretador (que geralmente é um navegador web). Suponha que está formatando uma entrada bibliográfica e quer indicar o título de um trabalho, colocando-o em itálico. Em HTML, pode utilizar tags `em` ("em" significa ênfase) para este efeito. Portanto, parte do seu ficheiro HTML pode ter a seguinte aparência: + +``` xml +... em Digital History de Cohen e Rosenzweig, por exemplo ... +``` + +O ficheiro HTML mais simples consiste em *tags* que indicam o início e o fim de todo o documento, e *tags* que identificam um `head` e um `body` dentro desse documento. A informação descritiva (i.e., os "meta-dados") sobre o ficheiro geralmente vai para o cabeçalho, enquanto que a informação que será exibida ao leitor humano geralmente vai para o corpo. + +``` xml + + +Olá mundo! + +``` + +Você pode tentar criar algum código HTML. Com o seu editor de texto, crie um novo ficheiro. Copie o código abaixo no editor. A primeira linha diz ao navegador qual o tipo do ficheiro. A *tag* `html` tem a direção do texto definida como `ltr` (da esquerda para a direita), e ainda a propriedade `lang` (idioma) definida como português. A *tag* `title` no cabeçalho do documento HTML contém informação que geralmente é exibida na barra superior de uma janela quando a página está sendo visualizada, e nas abas do Firefox. + + +``` xml + + + + + <!-- Insira seu título aqui --> + + + + + + +``` + +Altere + +``` xml + +``` + +e + +``` xml + +``` + +para + +``` xml +Olá mundo! +``` + +Guarde o ficheiro num diretório `programming-historian` como `ola-mundo.html`. De seguida, vá para o Firefox e escolha `Abrir menu -> Abrir ficheiro...` e +então escolha `ola-mundo.html`. Dependendo do seu editor de texto, você pode ter a opção 'visualizar página no navegador' ou 'abrir no navegador'. Depois de abrir o ficheiro, a sua mensagem deve aparecer no navegador. 
Observe a diferença entre abrir um ficheiro HTML com um navegador como o Firefox (que o interpreta), ou abrir o mesmo ficheiro com seu editor de texto (que não faz o mesmo). + +## Leituras sugeridas para aprender HTML + +- [W3 Schools HTML Tutorial][] +- [W3 Schools HTML5 Tutorial][] + + [W3 Schools HTML tutorial]: https://www.w3schools.com/html/default.asp + [W3 Schools HTML5 Tutorial]: https://www.w3schools.com/html/html5_intro.asp diff --git a/pt/licoes/normalizacao-dados-textuais-python.md b/pt/licoes/normalizacao-dados-textuais-python.md index b83e651b25..1782f21b0e 100644 --- a/pt/licoes/normalizacao-dados-textuais-python.md +++ b/pt/licoes/normalizacao-dados-textuais-python.md @@ -1,158 +1,158 @@ ---- -title: Normalização de Dados Textuais com Python -layout: lesson -collection: lessons -slug: normalizacao-dados-textuais-python -date: 2012-07-17 -translation_date: 2022-10-27 -authors: -- William J. Turkel -- Adam Crymble -reviewers: -- Jim Clifford -- Frederik Elwert -editors: -- Miriam Posner -translator: -- Felipe Lamarca -translation-editor: -- Jimmy Medeiros -translation-reviewer: -- André Salvo -- Gabriela Kucuruza -difficulty: 2 -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/460 -activity: transforming -topics: [python] -abstract: "Nesta lição tornará a lista criada na lição 'De HTML para Lista de Palavras' mais fácil de ser analisada através da normalização desses dados." -original: normalizing-data -avatar_alt: Mulher alta a arrastar um jovem baixo -doi: 10.46430/phpt0029 ---- - - -{% include toc.html %} - -
    -O site do Old Bailey Online foi recentemente atualizado. Infelizmente, devido às diversas mudanças, muitos (se não todos) os elementos do site de exemplo usado nesta lição não funcionarão conforme descrito. No entanto, as metodologias ensinadas por esta lição permanecem relevantes e podem ser adaptadas pelos leitores para um site de exemplo diferente. Estamos trabalhando na adaptação da lição para o novo site do Old Bailey Online, mas ainda não temos cronograma preciso de quando a lição será atualizada. [Abril de 2024] -
    - -## Objetivos da Lição - -A lista que criámos na lição [De HTML para Lista de Palavras (parte 2)](/pt/licoes/HTML-lista-palavras-2) precisa ser normalizada antes de poder ser utilizada. Faremos isso através da aplicação de alguns métodos de string adicionais, bem como utilizando expressões regulares. Uma vez normalizados, estaremos aptos a analisar os nossos dados mais facilmente. - -## Ficheiros Necessários para esta Lição - -- `html-to-list1.py` -- `obo.py` - -Caso não tenha esses ficheiros das lições anteriores, pode fazer o *download* de um [zip](/assets/python-lessons3.zip). - -## Limpando a Lista - -Na lição [De HTML para Lista de Palavras (parte 2)](/pt/licoes/HTML-lista-palavras-2), escrevemos um programa em Python chamado `html-to-list1.py` que fazia o *download* de uma [página web](https://perma.cc/8LM6-W39K), removia a formatação HTML e os metadados e retornava uma lista de "palavras" como a apresentada abaixo. Tecnicamente, essas entidades são chamadas de "*tokens*" ao invés de "palavras". Elas incluem alguns elementos que, estritamente falando, não são palavras (como a abreviação &c. para "etcetera"). Elas também incluem elementos que podem ser considerados composições de mais de uma palavra. O possessivo "Akerman's", por exemplo, é ocasionalmente analisado por linguistas como duas palavras: "Akerman" e um marcador de posse. "o'clock" é uma palavra ou duas? E assim por diante. 
- -Volte ao seu programa `html-to-list1.py` e certifique-se de que o seu resultado se assemelha ao seguinte: - - -``` python -['324.', '\xc2\xa0', 'BENJAMIN', 'BOWSEY', '(a' 'blackmoor', ')', 'was', 'indicted', 'for', 'that', 'he', 'together', 'with', 'five', 'hundred', 'other', 'persons', 'and', 'more,', 'did,', 'unlawfully,' 'riotously,', 'and', 'tumultuously', 'assemble', 'on', 'the', '6th', 'of', 'June', 'to', 'the', 'disturbance', 'of', 'the', 'public', 'peace', 'and', 'did', 'begin', 'to', 'demolish', 'and', 'pull', 'down', 'the', 'dwelling', 'house', 'of', '\xc2\xa0', 'Richard', 'Akerman', ',', 'against', 'the', 'form', 'of', 'the', 'statute,', '&c.', '\xc2\xa0', 'ROSE', 'JENNINGS', ',', 'Esq.', 'sworn.', 'Had', 'you', 'any', 'occasion', 'to', 'be', 'in', 'this', 'part', 'of', 'the', 'town,', 'on', 'the', '6th', 'of', 'June', 'in', 'the', 'evening?', '-', 'I', 'dined', 'with', 'my', 'brother', 'who', 'lives', 'opposite', 'Mr.', "Akerman's", 'house.', 'They', 'attacked', 'Mr.', "Akerman's", 'house', 'precisely', 'at', 'seven', "o'clock;", 'they', 'were', 'preceded', 'by', 'a', 'man', 'better', 'dressed', 'than', 'the', 'rest,', 'who'] -``` - -Por si só, a habilidade de separar um documento em palavras não é muito útil, já que somos capazes de ler. Podemos usar o texto, no entanto, para executar tarefas que não são sempre possíveis sem *softwares* especiais. Começaremos calculando as frequências dos *tokens* e outras unidades linguísticas, uma forma clássica de mensurar textos. - -Está claro que a nossa lista precisará de uma limpeza antes de conseguirmos utilizá-la para contar frequências. Em linha com as práticas estabelecidas em [De HTML para Lista de Palavras (parte 1)](/pt/licoes/HTML-lista-palavras-1), vamos tentar descrever o nosso algoritmo em português primeiro. Desejamos saber a frequência de cada palavra com sentido que aparece na transcrição do julgamento. 
Desse modo, as etapas envolvidas podem ser semelhantes a estas: - -- Converter todas as palavras para letras minúsculas de modo que "BENJAMIN" e "benjamin" sejam contabilizadas como a mesma palavra -- Remover quaisquer caracteres estranhos ou incomuns -- Contar o número de vezes que cada palavra aparece -- Remover palavras excessivamente comuns como "it", "the", "and", etc. - -## Converter para Minúsculas - -Tipicamente tokens são convertidos em letras minúsculas ao contar frequências, então faremos isso através do método de string `lower` que foi introduzido em [Manipular strings com Python](/pt/licoes/manipular-strings-python). Já que este é um método de string, devemos aplicá-lo à string `text` no programa `html-to-list1.py`. Ajuste `html-to-list1.py` adicionando a *string tag* `lower()` ao final da string `text`. - - -``` python -#html-to-list1.py -import urllib.request, urllib.error, urllib.parse, obo - -url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33' - -response = urllib.request.urlopen(url) -html = str(response.read().decode('UTF-8')) -text = obo.stripTags(html).lower() #adicione o método de string aqui -wordlist = text.split() - -print(wordlist) -``` - -Agora deve ver a mesma lista de palavras de antes, mas com todos os caracteres minúsculos. - -Ao chamar métodos em sequência como mostrado, torna-se possível manter o nosso código curto e fazer mudanças bastante significativas no nosso programa. - -Como afirmámos anteriormente, o Python torna fácil a execução de muitas tarefas com pouquíssimo código. - -Neste ponto, podemos examinar uma série de outras entradas do *Old Bailey Online* e uma ampla gama de outras fontes em potencial para termos certeza de que não há outros caracteres especiais que causarão problema posteriormente. 
Também podemos tentar antecipar situações nas quais não desejamos nos livrar de pontuação (por exemplo, para distinguir valores monetários como "$1629” ou “£1295” de datas, ou reconhecer que “1629-40” carrega um significado diferente de “1629 40”). Isso é o que programadores profissionais são pagos para fazer: tentar pensar em tudo que pode dar errado e tratar isso com antecedência. - -Vamos adotar uma abordagem diferente. O nosso objetivo principal é desenvolver técnicas que um historiador em exercício pode utilizar durante o processo de investigação. Isso significa que quase sempre preferiremos soluções aproximadamente corretas que possam ser desenvolvidas rapidamente. Então, ao invés de perder tempo neste momento para tornar o nosso programa robusto em face de exceções, vamos simplesmente nos livrar de tudo que não seja uma letra com ou sem acento ou um algarismo arábico. Programação é tipicamente um processo de "refinamento gradual". Começamos com um problema e parte de uma solução, e depois continuamos refinando a solução até obter um resultado que funcione melhor. - -## Expressões Regulares de Python - -Nós eliminamos as letras maiúsculas. Agora só precisamos nos livrar da pontuação. A pontuação prejudicará as nossas contagens de frequência se as mantivermos lá. Desejamos que "evening?" seja contabilizado como "evening" e "1780." como "1780", claro. - -É possível utilizar o método de string `replace` para remover cada tipo de pontuação: - -``` python -text = text.replace('[', '') -text = text.replace(']', '') -text = text.replace(',', '') -#etc... -``` - -No entanto, isso não é muito eficiente. Em linha com o nosso objetivo de criar programas curtos e poderosos, utilizaremos um mecanismo chamado *expressões regulares*. Expressões regulares são fornecidas por muitas linguagens de programação de várias maneiras distintas. - -Expressões regulares permitem que busque por padrões bem definidos e podem diminuir drasticamente o comprimento do código. 
Por exemplo, se desejasse saber se uma substring corresponde a uma letra do alfabeto, ao invés de usar uma condição `if/else` para verificar se ela representa a letra "a", depois "b", depois "c" e assim por diante, poderia usar uma expressão regular para verificar se a substring se assemelha a uma letra entre "a" e "z". Ou poderia verificar a presença de um dígito, uma letra maiúscula, ou qualquer caractere alfanumérico, ou um [retorno de carro](https://perma.cc/T7DA-RG2L), ou qualquer combinação dos itens acima e muito mais. - -Em Python, expressões regulares estão disponíveis como um módulo. Para acelerar o processamento, ele não é carregado automaticamente porque nem todos os programas o exigem. Então, precisará importar (`import`) o módulo (chamado `re`, abreviação de *regular expressions*) da mesma forma que importou o módulo `obo.py`. - -Como estamos interessados apenas em caracteres alfanuméricos, criaremos uma expressão regular que irá isolá-los e removerá o resto. Copie a função a seguir e cole-a ao final do módulo `obo.py`. Pode manter as outras funções do módulo, já que continuaremos a usá-las. - - -``` python -# Dada uma string de texto, remova todos os caracteres não-alfanuméricos (usando a definição Unicode de alfanumérico) - -def stripNonAlphaNum(text): - import re - return re.compile(r'\W+', re.UNICODE).split(text) -``` - -A expressão regular no código acima é o material dentro da string, em outras palavras: `W+`. `W` é uma abreviatura para a classe de caracteres não-alfanuméricos. Numa expressão regular de Python, o sinal de adição (+) encontra uma ou mais cópias de um determinado caractere. `re.UNICODE` informa ao interpretador que desejamos incluir caracteres de outros idiomas do mundo em nossa definição de alfanumérico, assim como de "A" a "Z", "a" a "z" e 0-9 do português. Expressões regulares devem ser *compiladas* antes de poderem ser utilizadas, que é o que o resto do comando faz. Não se preocupe em compreender a parte da compilação agora. 
- -Agora que refinamos o nosso programa `html-to-list1.py`, ele se parece com isto: - -``` python -#html-to-list1.py -import urllib.request, urllib.error, urllib.parse, obo - -url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33' - -response = urllib.request.urlopen(url) -html = response.read().decode('UTF-8') -text = obo.stripTags(html).lower() -wordlist = obo.stripNonAlphaNum(text) - -print(wordlist) -``` - -Ao executar o programa e verificar a saída no painel "Saída de Comando", verá que ele fez um bom trabalho. Esse código irá dividir formas hifenizadas como "coach-wheels" em duas palavras e irá transformar o possessivo "s" ou "o'clock" em palavras separadas ao perderem o apóstrofo. Ainda assim, o código faz uma aproximação boa o suficiente para os nossos objetivos e devemos agora passar para a contagem de frequências antes de tentar melhorá-lo. (Caso trabalhe com fontes em mais de um idioma, precisa aprender um pouco mais a respeito do padrão [Unicode](https://perma.cc/7ACH-KCDN) e sobre o [suporte de Python](https://web.archive.org/web/20180502053841/http://www.diveintopython.net/xml_processing/unicode.html) a ele.) - -## Leituras Sugeridas - -Para praticar mais as Expressões Regulares, o capítulo 7 de "[Dive into Python](https://web.archive.org/web/20180416143856/http://www.diveintopython.net/regular_expressions/index.html)" de Mark Pilgrim pode ser um tutorial útil. - -## Sincronização de Código - -Para acompanhar as lições futuras, é importante que tenha os ficheiros e programas corretos no seu diretório *programming historian*. Ao final de cada capítulo nesta série pode fazer o *download* do ficheiro zip do programming historian para garantir que possui o código correto. 
- -- python-lessons4.zip ([zip sync](/assets/python-lessons4.zip)) +--- +title: Normalização de Dados Textuais com Python +layout: lesson +collection: lessons +slug: normalizacao-dados-textuais-python +date: 2012-07-17 +translation_date: 2022-10-27 +authors: +- William J. Turkel +- Adam Crymble +reviewers: +- Jim Clifford +- Frederik Elwert +editors: +- Miriam Posner +translator: +- Felipe Lamarca +translation-editor: +- Jimmy Medeiros +translation-reviewer: +- André Salvo +- Gabriela Kucuruza +difficulty: 2 +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/460 +activity: transforming +topics: [python] +abstract: "Nesta lição tornará a lista criada na lição 'De HTML para Lista de Palavras' mais fácil de ser analisada através da normalização desses dados." +original: normalizing-data +avatar_alt: Mulher alta a arrastar um jovem baixo +doi: 10.46430/phpt0029 +--- + + +{% include toc.html %} + +
    +O site do Old Bailey Online foi recentemente atualizado. Infelizmente, devido às diversas mudanças, muitos (se não todos) os elementos do site de exemplo usado nesta lição não funcionarão conforme descrito. No entanto, as metodologias ensinadas por esta lição permanecem relevantes e podem ser adaptadas pelos leitores para um site de exemplo diferente. Estamos trabalhando na adaptação da lição para o novo site do Old Bailey Online, mas ainda não temos cronograma preciso de quando a lição será atualizada. [Abril de 2024] +
    + +## Objetivos da Lição + +A lista que criámos na lição [De HTML para Lista de Palavras (parte 2)](/pt/licoes/HTML-lista-palavras-2) precisa ser normalizada antes de poder ser utilizada. Faremos isso através da aplicação de alguns métodos de string adicionais, bem como utilizando expressões regulares. Uma vez normalizados, estaremos aptos a analisar os nossos dados mais facilmente. + +## Ficheiros Necessários para esta Lição + +- `html-to-list1.py` +- `obo.py` + +Caso não tenha esses ficheiros das lições anteriores, pode fazer o *download* de um [zip](/assets/python-lessons3.zip). + +## Limpando a Lista + +Na lição [De HTML para Lista de Palavras (parte 2)](/pt/licoes/HTML-lista-palavras-2), escrevemos um programa em Python chamado `html-to-list1.py` que fazia o *download* de uma [página web](https://perma.cc/8LM6-W39K), removia a formatação HTML e os metadados e retornava uma lista de "palavras" como a apresentada abaixo. Tecnicamente, essas entidades são chamadas de "*tokens*" ao invés de "palavras". Elas incluem alguns elementos que, estritamente falando, não são palavras (como a abreviação &c. para "etcetera"). Elas também incluem elementos que podem ser considerados composições de mais de uma palavra. O possessivo "Akerman's", por exemplo, é ocasionalmente analisado por linguistas como duas palavras: "Akerman" e um marcador de posse. "o'clock" é uma palavra ou duas? E assim por diante. 
+ +Volte ao seu programa `html-to-list1.py` e certifique-se de que o seu resultado se assemelha ao seguinte: + + +``` python +['324.', '\xc2\xa0', 'BENJAMIN', 'BOWSEY', '(a', 'blackmoor', ')', 'was', 'indicted', 'for', 'that', 'he', 'together', 'with', 'five', 'hundred', 'other', 'persons', 'and', 'more,', 'did,', 'unlawfully,', 'riotously,', 'and', 'tumultuously', 'assemble', 'on', 'the', '6th', 'of', 'June', 'to', 'the', 'disturbance', 'of', 'the', 'public', 'peace', 'and', 'did', 'begin', 'to', 'demolish', 'and', 'pull', 'down', 'the', 'dwelling', 'house', 'of', '\xc2\xa0', 'Richard', 'Akerman', ',', 'against', 'the', 'form', 'of', 'the', 'statute,', '&c.', '\xc2\xa0', 'ROSE', 'JENNINGS', ',', 'Esq.', 'sworn.', 'Had', 'you', 'any', 'occasion', 'to', 'be', 'in', 'this', 'part', 'of', 'the', 'town,', 'on', 'the', '6th', 'of', 'June', 'in', 'the', 'evening?', '-', 'I', 'dined', 'with', 'my', 'brother', 'who', 'lives', 'opposite', 'Mr.', "Akerman's", 'house.', 'They', 'attacked', 'Mr.', "Akerman's", 'house', 'precisely', 'at', 'seven', "o'clock;", 'they', 'were', 'preceded', 'by', 'a', 'man', 'better', 'dressed', 'than', 'the', 'rest,', 'who'] +``` + +Por si só, a habilidade de separar um documento em palavras não é muito útil, já que somos capazes de ler. Podemos usar o texto, no entanto, para executar tarefas que não são sempre possíveis sem *softwares* especiais. Começaremos calculando as frequências dos *tokens* e outras unidades linguísticas, uma forma clássica de mensurar textos. + +Está claro que a nossa lista precisará de uma limpeza antes de conseguirmos utilizá-la para contar frequências. Em linha com as práticas estabelecidas em [De HTML para Lista de Palavras (parte 1)](/pt/licoes/HTML-lista-palavras-1), vamos tentar descrever o nosso algoritmo em português primeiro. Desejamos saber a frequência de cada palavra com sentido que aparece na transcrição do julgamento.
Desse modo, as etapas envolvidas podem ser semelhantes a estas: + +- Converter todas as palavras para letras minúsculas de modo que "BENJAMIN" e "benjamin" sejam contabilizadas como a mesma palavra +- Remover quaisquer caracteres estranhos ou incomuns +- Contar o número de vezes que cada palavra aparece +- Remover palavras excessivamente comuns como "it", "the", "and", etc. + +## Converter para Minúsculas + +Tipicamente tokens são convertidos em letras minúsculas ao contar frequências, então faremos isso através do método de string `lower` que foi introduzido em [Manipular strings com Python](/pt/licoes/manipular-strings-python). Já que este é um método de string, devemos aplicá-lo à string `text` no programa `html-to-list1.py`. Ajuste `html-to-list1.py` adicionando a *string tag* `lower()` ao final da string `text`. + + +``` python +#html-to-list1.py +import urllib.request, urllib.error, urllib.parse, obo + +url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33' + +response = urllib.request.urlopen(url) +html = str(response.read().decode('UTF-8')) +text = obo.stripTags(html).lower() #adicione o método de string aqui +wordlist = text.split() + +print(wordlist) +``` + +Agora deve ver a mesma lista de palavras de antes, mas com todos os caracteres minúsculos. + +Ao chamar métodos em sequência como mostrado, torna-se possível manter o nosso código curto e fazer mudanças bastante significativas no nosso programa. + +Como afirmámos anteriormente, o Python torna fácil a execução de muitas tarefas com pouquíssimo código. + +Neste ponto, podemos examinar uma série de outras entradas do *Old Bailey Online* e uma ampla gama de outras fontes em potencial para termos certeza de que não há outros caracteres especiais que causarão problema posteriormente. 
Também podemos tentar antecipar situações nas quais não desejamos nos livrar de pontuação (por exemplo, para distinguir valores monetários como “$1629” ou “£1295” de datas, ou reconhecer que “1629-40” carrega um significado diferente de “1629 40”). Isso é o que programadores profissionais são pagos para fazer: tentar pensar em tudo que pode dar errado e tratar isso com antecedência. + +Vamos adotar uma abordagem diferente. O nosso objetivo principal é desenvolver técnicas que um historiador em exercício pode utilizar durante o processo de investigação. Isso significa que quase sempre preferiremos soluções aproximadamente corretas que possam ser desenvolvidas rapidamente. Então, ao invés de perder tempo neste momento para tornar o nosso programa robusto em face de exceções, vamos simplesmente nos livrar de tudo que não seja uma letra com ou sem acento ou um algarismo arábico. Programação é tipicamente um processo de "refinamento gradual". Começamos com um problema e parte de uma solução, e depois continuamos refinando a solução até obter um resultado que funcione melhor. + +## Expressões Regulares de Python + +Nós eliminamos as letras maiúsculas. Agora só precisamos nos livrar da pontuação. A pontuação prejudicará as nossas contagens de frequência se a mantivermos lá. Desejamos que "evening?" seja contabilizado como "evening" e "1780." como "1780", claro. + +É possível utilizar o método de string `replace` para remover cada tipo de pontuação: + +``` python +text = text.replace('[', '') +text = text.replace(']', '') +text = text.replace(',', '') +#etc... +``` + +No entanto, isso não é muito eficiente. Em linha com o nosso objetivo de criar programas curtos e poderosos, utilizaremos um mecanismo chamado *expressões regulares*. Expressões regulares são fornecidas por muitas linguagens de programação de várias maneiras distintas. + +Expressões regulares permitem que busque por padrões bem definidos e podem diminuir drasticamente o comprimento do código.
Por exemplo, se desejasse saber se uma substring corresponde a uma letra do alfabeto, ao invés de usar uma condição `if/else` para verificar se ela representa a letra "a", depois "b", depois "c" e assim por diante, poderia usar uma expressão regular para verificar se a substring se assemelha a uma letra entre "a" e "z". Ou poderia verificar a presença de um dígito, uma letra maiúscula, ou qualquer caractere alfanumérico, ou um [retorno de carro](https://perma.cc/T7DA-RG2L), ou qualquer combinação dos itens acima e muito mais. + +Em Python, expressões regulares estão disponíveis como um módulo. Para acelerar o processamento, ele não é carregado automaticamente porque nem todos os programas o exigem. Então, precisará importar (`import`) o módulo (chamado `re`, abreviação de *regular expressions*) da mesma forma que importou o módulo `obo.py`. + +Como estamos interessados apenas em caracteres alfanuméricos, criaremos uma expressão regular que irá isolá-los e removerá o resto. Copie a função a seguir e cole-a ao final do módulo `obo.py`. Pode manter as outras funções do módulo, já que continuaremos a usá-las. + + +``` python +# Dada uma string de texto, remova todos os caracteres não-alfanuméricos (usando a definição Unicode de alfanumérico) + +def stripNonAlphaNum(text): + import re + return re.compile(r'\W+', re.UNICODE).split(text) +``` + +A expressão regular no código acima é o material dentro da string, em outras palavras: `\W+`. `\W` é uma abreviatura para a classe de caracteres não-alfanuméricos. Numa expressão regular de Python, o sinal de adição (+) encontra uma ou mais cópias de um determinado caractere. `re.UNICODE` informa ao interpretador que desejamos incluir caracteres de outros idiomas do mundo em nossa definição de alfanumérico, assim como de "A" a "Z", "a" a "z" e 0-9 do português. Expressões regulares devem ser *compiladas* antes de poderem ser utilizadas, que é o que o resto do comando faz. Não se preocupe em compreender a parte da compilação agora.
+ +Agora que refinamos o nosso programa `html-to-list1.py`, ele se parece com isto: + +``` python +#html-to-list1.py +import urllib.request, urllib.error, urllib.parse, obo + +url = 'http://www.oldbaileyonline.org/browse.jsp?id=t17800628-33&div=t17800628-33' + +response = urllib.request.urlopen(url) +html = response.read().decode('UTF-8') +text = obo.stripTags(html).lower() +wordlist = obo.stripNonAlphaNum(text) + +print(wordlist) +``` + +Ao executar o programa e verificar a saída no painel "Saída de Comando", verá que ele fez um bom trabalho. Esse código irá dividir formas hifenizadas como "coach-wheels" em duas palavras e irá transformar o possessivo "s" ou "o'clock" em palavras separadas ao perderem o apóstrofo. Ainda assim, o código faz uma aproximação boa o suficiente para os nossos objetivos e devemos agora passar para a contagem de frequências antes de tentar melhorá-lo. (Caso trabalhe com fontes em mais de um idioma, precisa aprender um pouco mais a respeito do padrão [Unicode](https://perma.cc/7ACH-KCDN) e sobre o [suporte de Python](https://web.archive.org/web/20180502053841/https://www.diveintopython.net/xml_processing/unicode.html) a ele.) + +## Leituras Sugeridas + +Para praticar mais as Expressões Regulares, o capítulo 7 de "[Dive into Python](https://web.archive.org/web/20180416143856/https://www.diveintopython.net/regular_expressions/index.html)" de Mark Pilgrim pode ser um tutorial útil. + +## Sincronização de Código + +Para acompanhar as lições futuras, é importante que tenha os ficheiros e programas corretos no seu diretório *programming historian*. Ao final de cada capítulo nesta série pode fazer o *download* do ficheiro zip do programming historian para garantir que possui o código correto. 
+ +- python-lessons4.zip ([zip sync](/assets/python-lessons4.zip)) diff --git a/pt/licoes/preservar-os-seus-dados-de-investigacao.md b/pt/licoes/preservar-os-seus-dados-de-investigacao.md index 3eb104630f..0037587b61 100644 --- a/pt/licoes/preservar-os-seus-dados-de-investigacao.md +++ b/pt/licoes/preservar-os-seus-dados-de-investigacao.md @@ -173,7 +173,7 @@ documentada), embora esquemas existentes, como o [Markdown][] estejam disponíve (os ficheiros do Markdown são salvos como .md). Uma excelente página de dicas do Markdown está disponível no GitHub ) para aqueles que desejam seguir - ou adaptar - este esquema existente. O Notepad++ - é recomendado para usuários do Windows, embora + é recomendado para usuários do Windows, embora de modo algum seja essencial para trabalhar com ficheiros .md. Usuários de Mac ou Unix podem achar útil o [Komodo Edit][] ou o [Text Wrangler][]. @@ -236,7 +236,7 @@ frequentemente usadas. O site Homens e Navios do Bacalhau estrutura seu arquivo usando o formato: - *nome do site*/registo/*número de referência* -- +- E o Arquivo Histórico Ultramarino usa o formato: @@ -418,11 +418,11 @@ blog (17 outubro 2013) Hitchcock, Tim, 'Judging a book by its URLs', Historyonics blog (3 janeiro 2014) - + Howard, Sharon, 'Unclean, unclean! 
What historians can do about sharing our messy research data', Early Modern Notes blog (18 maio 2013) - + Noble, William Stafford, A Quick Guide to Organizing Computational Biology Projects.PLoSComputBiol 5(7): e1000424 (2009) @@ -435,7 +435,7 @@ Information Management: Organising Humanities Material' (2011) Pennock, Maureen, 'The Twelve Principles of Digital Preservation (and a cartridge in a repository…)', British Library Collection Care blog (3 setembro 2013) - + Pritchard, Adam, 'Markdown Cheatsheet' (2013) diff --git a/pt/licoes/processamento-basico-texto-r.md b/pt/licoes/processamento-basico-texto-r.md index 0c171d8e21..c868b1628c 100644 --- a/pt/licoes/processamento-basico-texto-r.md +++ b/pt/licoes/processamento-basico-texto-r.md @@ -1,1078 +1,1078 @@ ---- -title: Processamento Básico de Texto em R -slug: processamento-basico-texto-r -layout: lesson -date: 2017-03-27 -translation_date: 2021-07-13 -authors: -- Taylor Arnold -- Lauren Tilton -reviewers: -- Brandon Walsh -- John Russell -editors: -- Jeri Wieringa -translator: -- Diana Rebelo Rodriguez -translation-editor: -- Jimmy Medeiros -translation-reviewer: -- Rômulo Predes -- Maria Guedes -difficulty: 2 -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/381 -activity: analyzing -topics: [distant-reading, r, data-visualization] -abstract: "Aprenda a usar o R para analisar padrões de alto nível em textos, aplicar métodos estilométricos ao longo do tempo e entre autores, assim como a usar métodos para resumir informações para descrever um corpus" -original: basic-text-processing-in-r -avatar_alt: Crianças com livros junto a uma biblioteca itinerante -doi: 10.46430/phpt0013 ---- - -{% include toc.html %} - -## Objetivos - -Hoje em dia há uma quantidade substancial de dados históricos disponíveis em forma de texto simples e digitalizado. Alguns exemplos comuns são cartas, artigos de jornal, notas pessoais, diários, documentos legais e transcrições de discursos. 
Enquanto algumas aplicações de softwares independentes têm ferramentas para analisar dados textuais, o uso de linguagens de programação apresenta uma maior flexibilidade para analisar um corpus de documentos de texto. Neste tutorial, guiaremos os usuários no básico da análise de texto na linguagem de programação R. A nossa abordagem envolve usar apenas a tokenização que produz uma análise sintática do texto, com elementos como palavras, frases e orações. No final da presente lição, os usuários poderão: - -* utilizar análises exploratórias para verificar erros e detectar padrões gerais; -* aplicar métodos básicos de estilometria através do tempo e entre autores; -* conseguir resumir o conteúdo do documento para oferecer uma descrição geral do corpus. - -Para esta lição, será utilizado um conjunto de dados com os textos dos discursos presidenciais dos Estados Unidos da América sobre o [Estado da União](https://pt.wikipedia.org/wiki/Discurso_sobre_o_Estado_da_Uni%C3%A3o)[^1]. - -Assumimos que os usuários possuem um conhecimento básico da linguagem de programação R. A lição [Noções básicas de R com dados tabulares](/en/lessons/r-basics-with-tabular-data)[^2] (em inglês) é um excelente guia que contém todos os conhecimentos em R necessários aqui, tais como instalar e abrir R, instalar e carregar pacotes e importar e trabalhar com dados básicos de R. Os usuários podem fazer o download do R indicado para os seus sistemas operativos em [The Comprehensive R Archive Network](https://cran.r-project.org/). Ainda que não seja um pré-requisito, recomendamos que os novos usuários façam o download do [R Studio](https://www.rstudio.com/products/rstudio/#Desktop), um ambiente de desenvolvimento de código aberto para escrever e executar programas em R. - -Todo o código desta lição foi testado em R na versão 4.0.2, mas esperamos que ele rode adequadamente em qualquer versão futura do programa. 
- -# Um pequeno exemplo - -## Configuração de pacotes - -É necessário instalar dois pacotes de R antes de começar com o tutorial: o **tidyverse**[^3] e o **tokenizers**[^4]. O primeiro proporciona ferramentas convenientes para ler e trabalhar com grupos de dados e o segundo contém funções para dividir os dados do texto em palavras e orações. Para instalá-los, abra o R no seu computador e execute essas duas linhas de código no console: - -```{r} - -install.packages("tidyverse") - -install.packages("tokenizers") - -``` - -Dependendo da configuração do seu sistema, pode ser aberta uma caixa de diálogo solicitando a escolha de um lugar da internet para fazer o download. Caso apareça, escolha a opção mais perto de sua localização atual. O download e a instalação, provavelmente, irão ocorrer automaticamente. - -Agora que esses pacotes estão no seu computador, precisamos de avisar ao R que eles devem ser carregados para o uso. Isso é feito através do comando `library`. Pode ser que apareçam alguns avisos enquanto carregam outras dependências, mas eles podem ser ignorados sem nenhum problema. Execute essas duas linhas de código no console para habilitar o uso dos pacotes: - -```{r} - -library(tidyverse) - -library(tokenizers) - -``` - -O comando `install.packages` (instalar pacotes) só é necessário executar na primeira vez em que iniciar este tutorial, o comando `library` deverá ser executado todas as vezes que se inicia o R[^5]. - -## Segmentação de palavras - -Nesta seção, vamos trabalhar com um único parágrafo. Este exemplo pertence ao início do último discurso de Barack Obama sobre o Estado da União, em 2016. Para facilitar a compreensão do tutorial nesta primeira etapa, estudamos este parágrafo traduzido para português[^6]. - -Para carregar o texto, copie e cole o seguinte no console do R: - -``` - -texto <- paste("Também entendo que, pelo fato de estarmos em temporada eleitoral, as expectativas quanto ao que vamos realizar este ano são baixas. 
Mesmo assim, senhor presidente da Câmara, aprecio a atitude construtiva que o senhor e os outros líderes assumiram no final do ano passado para aprovar o orçamento e perpetuar a redução dos impostos sobre as famílias trabalhadoras. Desse modo, espero que possamos colaborar este ano sobre questões que são prioritárias para ambos os partidos, como a reforma da justiça criminal e a assistência às pessoas dependentes de drogas vendidas com receita médica. Quem sabe possamos surpreender os cínicos novamente.") - -``` - -Depois de executar o comando (clicando em “Enter”), escreva a palavra `texto` no console e pressione Enter. O R irá mostrar o conteúdo do objeto texto, uma vez que ele contém parte do discurso proferido por Obama. - -O primeiro passo do processamento de texto envolve utilizar a função `tokenize_words` (segmentar palavras) do pacote **tokenizers** para dividir o texto en palavras individuais. - -```{r} - -palavras <- tokenize_words(texto) - -``` - -Para apresentar os resultados na janela do console do R, mostrando tanto o resultado tokenizado como a posição de cada elemento na margem esquerda, execute palavras no console: - - -```{r} - -palavras - -``` - -Isso produz o seguinte resultado: - - -``` - -> [[1]] - -[1] "também" "entendo" "que" "pelo" "fato" - -[6] "de" "estarmos" "em" "temporada" "eleitoral" - -[11] "as" "expectativas" "quanto" "ao" "que" - -[16] "vamos" "realizar" "este" "ano" "são" - -[21] "baixas" "mesmo" "assim" "senhor" "presidente" - -[26] "da" "câmara" "aprecio" "a" "atitude" - -[31] "construtiva" "que" "o" "senhor" "e" - -[36] "os" "outros" "líderes" "assumiram" "no" - -[41] "final" "do" "ano" "passado" "para" - -[46] "aprovar" "o" "orçamento" "e" "perpetuar" - -[51] "a" "redução" "dos" "impostos" "sobre" - -[56] "as" "famílias" "trabalhadoras" "desse" "modo" - -[61] "espero" "que" "possamos" "colaborar" "este" - -[66] "ano" "sobre" "questões" "que" "são" - -[71] "prioritárias" "para" "ambos" "os" "partidos" - -[76] "como" "a" 
"reforma" "da" "justiça" - -[81] "criminal" "e" "a" "assistência" "às" - -[86] "pessoas" "dependentes" "de" "drogas" "vendidas" - -[91] "com" "receita" "médica" "quem" "sabe" - -[96] "possamos" "surpreender" "os" "cínicos" "novamente" - -``` - -Como o texto carregado mudou depois de se executar essa função de R? Ela removeu toda a pontuação, dividiu o texto em palavras individuais e converteu tudo para minúsculas. Em breve, veremos porque todas essas intervenções são úteis para a nossa análise. - -Quantas palavras existem neste fragmento de texto? Se usamos a função `length` (comprimento) diretamente no objeto `palavras`, o resultado não é muito útil. - - - -```{r} - -length(palavras) - -``` - - -O resultado é igual a: - - -```{r} - -[1] 1 - -``` - -O comprimento equivale a 1 porque a função `tokenize_words` retorna uma lista de objetos com uma entrada por documento carregado. O nosso carregamento possui apenas um documento, então a lista também possui apenas um elemento. Para ver as palavras dentro do primeiro documento, utilizamos o símbolo [], da seguinte forma: `[[1]]`. O objetivo é selecionar apenas o primeiro elemento da lista: - - -```{r} - -length(palavras[[1]]) - -``` - -O resultado é `100`, indicando que existem 100 palavras neste parágrafo. - -A separação do documento em palavras individuais torna possível calcular quantas vezes cada palavra foi utilizada durante o texto. Para fazer isso, primeiro aplicamos a função `table` (tabela) nas palavras do primeiro (e, neste caso, único) documento e depois separamos os nomes e os valores da tabela num novo objeto chamado _data frame_. O uso de um quadro de dados em R é semelhante ao uso de uma tabela numa base de dados. 
Esses passos, em conjunto com a impressão do resultado, são obtidos com as seguintes linhas de código: - - -```{r} - -tabela <- table(palavras[[1]]) - -tabela <- data_frame(palavra = names(tabela), contagem = as.numeric(tabela)) - -tabela - -``` - -O resultado deste comando deve aparecer assim no seu console (*tibble* é um tipo específico de _data frame_ criado no pacote [Tidy Data](https://en.wikipedia.org/wiki/Tidy_data)): - -``` - -# A tibble: 77 x 2 - -palavra contagem - - - -1 a 4. - -2 ambos 1. - -3 ano 3. - -4 ao 1. - -5 aprecio 1. - -6 aprovar 1. - -7 as 2. - -8 às 1. - -9 assim 1. - -10 assistência 1. - -# ... with 67 more rows - -``` - - -Há uma quantidade substancial de informação nesta amostra. Vemos que existem 77 palavras únicas, como indica a dimensão da tabela. As 10 primeiras fileiras do conjunto de dados são apresentadas, com a segunda coluna mostrando quantas vezes a palavra da primeira coluna foi utilizada. Por exemplo, “ano” foi usada três vezes, enquanto “aprovar”, apenas uma vez. - - - -Também podemos ordenar a tabela usando a função `arrange` (organizar). Esta função precisa do conjunto de dados a utilizar, aqui `tabela`, e depois o nome da coluna que serve de referência para ordená-lo. A função `desc` no segundo argumento indica que queremos ordenar em ordem decrescente. - - - -```{r} - -arrange(tabela, desc(contagem)) - -``` - - -E agora o resultado será: - - - -```{r} - -# A tibble: 77 x 2 - -palavra contagem - - - -1 que 5. - -2 a 4. - -3 ano 3. - -4 e 3. - -5 os 3. - -6 as 2. - -7 da 2. - -8 de 2. - -9 este 2. - -10 o 2. - -# … with 67 more rows - -``` - - - -As palavras mais comuns são pronomes e palavras funcionais tais como "que", "a", "e" e "os". Observe como a análise é facilitada pelo uso da versão em minúsculas de cada palavra. Qualquer contagem prevê que a palavra possa estar no início ou no meio da frase. - - - -Uma técnica popular é carregar uma lista de palavras frequentemente usadas e eliminá-las antes da análise formal. 
As palavras em tal lista são chamadas "*stopwords*" ou "palavras vazias" e são geralmente pronomes, conjugações dos verbos mais comuns e conjunções. Neste tutorial, temos uma variação sutil desta técnica. - - - -## Detectar frases - - - -O pacote **tokenizer** também contém a função `tokenize_sentences`, que detecta limites de frases, ao invés de palavras. Ele pode ser executado da seguinte maneira: - - - -```{r} - -frases <- tokenize_sentences(texto) - -frases - -``` - - - -Com o resultado: - - - -```{r} - -> frases - -[[1]] - -[1] "Também entendo que, pelo fato de estarmos em temporada eleitoral, as expectativas quanto ao que vamos realizar este ano são baixas." - -[2] "Mesmo assim, senhor presidente da Câmara, aprecio a atitude construtiva que o senhor e os outros líderes assumiram no final do ano passado para aprovar o orçamento e perpetuar a redução dos impostos sobre as famílias trabalhadoras." - -[3] "Desse modo, espero que possamos colaborar este ano sobre questões que são prioritárias para ambos os partidos, como a reforma da justiça criminal e a assistência às pessoas dependentes de drogas vendidas com receita médica." - -[4] "Quem sabe possamos surpreender os cínicos novamente." - -``` - - - -O resultado é um vetor de caracteres, um objeto unidimensional que consiste apenas em elementos representados como caracteres. Observe que o resultado marcou cada frase como um elemento separado. - - - -É possível conectar o resultado da divisão das frases com o resultado da divisão das palavras. Se executarmos a divisão de frases do parágrafo com a função `tokenize_words`, cada frase será tratada como um único documento. Execute isto usando a seguinte linha de código e veja se o resultado é o esperado, a segunda linha de comando serve para imprimir o resultado. 
- - - -```{r} - -frases_palavras <- tokenize_words(frases[[1]]) - -frases_palavras - -``` - - - -Se olharmos para o tamanho do resultado diretamente, podemos ver que existem quatro “documentos” no objeto `frases_palavras`: - - - -```{r} - -length(frases_palavras) - -``` - - - -Ao acessar cada uma delas diretamente, é possível saber quantas palavras há em cada frase do parágrafo: - - - -```{r} - -length(frases_palavras[[1]]) - -length(frases_palavras[[2]]) - -length(frases_palavras[[3]]) - -length(frases_palavras[[4]]) - -``` - - - -Isto pode demandar um pouco de esforço, mas felizmente existe uma maneira mais simples de o fazer. A função `sapply` executa a função no segundo argumento para cada elemento do primeiro argumento. Como resultado, podemos calcular a extensão de cada frase do primeiro parágrafo com uma única linha de código: - - - -```{r} - -sapply(frases_palavras, length) - -``` - - - -O resultado agora será assim: - - - -```{r} - -[1] 21 37 35 7 - -``` - - - -Podemos ver que existem quatro frases com um comprimento de 21, 37, 35 e 7 palavras. Utilizaremos esta função para trabalharmos com documentos maiores. - - - -# Analisar o discurso sobre o Estado da União de Barack Obama em 2016 - - - -## Análise exploratória - - - -Vamos aplicar as técnicas da seção anterior a um discurso sobre o Estado da União completo, desta vez, usando o original em inglês. Por uma questão de coerência, vamos usar o mesmo discurso de 2016 de Barack Obama. Agora, vamos carregar os dados de um ficheiro, uma vez que a cópia direta é difícil em grande escala. - - - -Para tal, vamos combinar a função `readLines` (ler linhas) para carregar o texto em R e a função `paste` (colar) para combinar todas as linhas num único objeto. Vamos criar a URL do arquivo de texto usando a função `sprintf`, uma vez que este formato permitirá que ele seja facilmente aproveitado para outros recursos online[^7],[^8]. 
- - - -```{r} - -base_url <- "https://raw.githubusercontent.com/programminghistorian/jekyll/gh-pages/assets/basic-text-processing-in-r/" - -url <- sprintf("%s/sotu_text/236.txt", base_url) - -texto <- paste(readLines(url), collapse = "\n") - -``` - - - -Como antes, vamos segmentar o texto e ver o número de palavras no documento. - - - -```{r} - -palavras <- tokenize_words(texto) - -length(palavras[[1]]) - -``` - - - -Vemos que este discurso contém um total de `6113` palavras. Ao combinar as funções `table` (tabela), `data_frame` e `arrange` (organizar), como fizemos no exemplo anterior, obtemos as palavras mais frequentes em todo o discurso. Ao fazer isso, observe como é fácil reutilizar o código anterior para repetir a análise num novo conjunto de dados. Este é um dos maiores benefícios de usar uma linguagem de programação para realizar uma análise baseada em dados [^9]. - - - -```{r} - -tabela <- table(palavras[[1]]) - -tabela <- data_frame(word = names(tabela), count = as.numeric(tabela)) - -tabela <- arrange(tabela, desc(count)) - -tabela - -``` - -O resultado deve ser: - - -```{r} - ->#A tibble: 1,590 x 2 - -word count - - - -1 the 281. - -2 to 209. - -3 and 189. - -4 of 148. - -5 that 125. - -6 we 124. - -7 a 120. - -8 in 105. - -9 our 96. - -10 is 72. - ->#... with 1,580 more rows - -``` - -Mais uma vez, palavras extremamente comuns como *the* ("o" ou "a"), *to* ("para") e *and* ("e") estão no topo da tabela. Estes termos não são particularmente esclarecedores se quisermos conhecer o assunto do discurso. Na realidade, queremos encontrar palavras que se destaquem mais neste texto do que num grande corpus externo em inglês. Para conseguir isso, precisamos de um conjunto de dados que forneça essas frequências. 
Aqui está o conjunto de dados de Peter Norviq usando o *Google Web Trillion Word Corpus* (Corpus de um trilhão de palavras da web do Google), coletado a partir dos dados compilados através do rastreamento de sites populares em inglês pelo Google [^10]: - - -```{r} - -palavras_frequentes <- read_csv(sprintf("%s/%s", base_url, "word_frequency.csv")) - -palavras_frequentes - -``` - - -A primeira coluna indica o idioma (sempre "en" para inglês neste caso), a segunda coluna - frequency - fornece a palavra em questão e a terceira coluna indica a percentagem com a qual ela aparece no *Corpus de um trilhão de palavras do Google*. Por exemplo, a palavra "for" aparece quase exatamente 1 vez a cada 100 palavras, pelo menos nos textos dos sites indexados pelo Google. - - - -Para combinar estas palavras frequentes com o conjunto de dados na `tabela` construída a partir do discurso do Estado da União, podemos usar a função `inner_join` (união interna). Esta função toma dois conjuntos de dados e combina-os em todas as colunas que têm o mesmo nome. Neste caso, a coluna comum é a chamada _word_ ("palavra"). - - - -```{r} - -tabela <- inner_join(tabela, palavras_frequentes) - -tabela - -``` - - - -Note que agora o nosso conjunto de dados tem duas colunas extras que fornecem o idioma (aqui relativamente pouco útil já que é sempre "en") e a frequência da palavra no corpus externo. Esta segunda nova coluna será muito útil, porque podemos filtrar linhas que têm uma frequência inferior a 0,1%, ou seja, que aparecem mais de uma vez em cada 1000 palavras: - - - -```{r} - -filter(tabela, frequency < 0.1) - -``` - - -Isto produz: - - -```{r} - ->#A tibble: 1,457 x 4 - -word count language frequency - - - -1 america 28. en 0.0232 - -2 people 27. en 0.0817 - -3 just 25. en 0.0787 - -4 world 23. en 0.0734 - -5 american 22. en 0.0387 - -6 work 22. en 0.0713 - -7 make 20. en 0.0689 - -8 want 19. en 0.0440 - -9 change 18. en 0.0358 - -10 years 18. en 0.0574 - ->#... 
with 1,447 more rows - -``` - - - -Esta lista está começando a se tornar mais interessante. Um termo como "america" aparece no topo da lista porque, podemos pensar, é muito usado nos discursos dos políticos e menos em outros campos. Ao estabelecer o limiar ainda mais baixo, em 0.002, obtemos um melhor resumo do discurso. Como seria útil ver mais do que as dez linhas padrão, vamos usar a função `print` (imprimir) junto com a opção `n` (de número) definida como 15 para que possamos ver mais linhas. - - - -```{r} - -print(filter(tabela, frequency < 0.002), n = 15) - -``` - - - -Isto agora nos mostra o seguinte resultado: - - - -```{r} - ->#A tibble: 463 x 4 - -word count language frequency - - - -1 laughter 11. en 0.000643 - -2 voices 8. en 0.00189 - -3 allies 4. en 0.000844 - -4 harder 4. en 0.00152 - -5 qaida 4. en 0.000183 - -6 terrorists 4. en 0.00122 - -7 bipartisan 3. en 0.000145 - -8 generations 3. en 0.00123 - -9 stamp 3. en 0.00166 - -10 strongest 3. en 0.000591 - -11 syria 3. en 0.00136 - -12 terrorist 3. en 0.00181 - -13 tougher 3. en 0.000247 - -14 weaken 3. en 0.000181 - -15 accelerate 2. en 0.000544 - ->#... with 448 more rows - -``` - -Os resultados parecem sugerir alguns dos temas principais deste discurso, como “syria” (Síria), “terrorist” (terrorista) e “qaida” (Qaeda) (o nome al-qaida foi dividido em “al” e “qaida” pelo tokenizador). - - - -## Sumarizar o documento - - - -Para fornecer informações contextuais para o conjunto de dados que estamos analisando, temos uma tabela com metadados sobre cada um dos discursos do Estado da União. 
Vamos carregá-la em R: - - ```{r} - -metadados <- read_csv(sprintf("%s/%s", base_url, "metadata.csv")) - -metadados - -``` - - -As primeiras dez linhas do grupo de dados aparecem assim: - - -```{r} - ->#A tibble: 236 x 4 - -president year party sotu_type - - - -1 George Washington 1790 Nonpartisan speech - -2 George Washington 1790 Nonpartisan speech - -3 George Washington 1791 Nonpartisan speech - -4 George Washington 1792 Nonpartisan speech - -5 George Washington 1793 Nonpartisan speech - -6 George Washington 1794 Nonpartisan speech - -7 George Washington 1795 Nonpartisan speech - -8 George Washington 1796 Nonpartisan speech - -9 John Adams 1797 Federalist speech - -10 John Adams 1798 Federalist speech - ->#... with 226 more rows - -``` - - -Temos o nome do presidente, o ano, o partido político do presidente e o formato de discurso do Estado da União (oral ou escrito) para cada discurso no conjunto. O discurso de 2016 está na linha 236 dos metadados que, por acaso, é a última linha. - - - -Na próxima seção, pode ser útil resumir os dados para um discurso numa única linha de texto. Podemos fazer isto extraindo as cinco palavras mais frequentes com uma frequência inferior a 0,002% no *Corpus de um trilhão de palavras do Google* e combinando isso com dados sobre o presidente e o ano. - - -```{r} - -tabela <- filter(tabela, frequency < 0.002) - -resultado <- c(metadados$president[236], metadados$year[236], tabela$word[1:5]) - -paste(resultado, collapse = "; ") - -``` - - - -Isto deveria dar-nos o seguinte resultado: - - - -```{r} - -[1] "Barack Obama; 2016; laughter; voices; allies; harder; qaida" - -[1] “Barack Obama; 2016; risadas; vozes; aliados; mais duro; qaeda” - -``` - -Esta linha capta tudo sobre o discurso? É evidente que não. O processamento de texto nunca substituirá a leitura atenta de um texto, mas ajuda a dar um resumo de alto nível das questões discutidas ("risadas" aparecem aqui porque as reações do público são anotadas no texto do discurso). 
Este resumo é útil de várias maneiras. Pode fornecer um título ad-hoc ou resumo para um documento que não tenha estas informações; pode servir para lembrar aos leitores que leram ou ouviram o discurso quais foram os principais temas discutidos; e compilar vários resumos com uma única ação pode mostrar padrões em grande escala que muitas vezes se perdem em grandes corpus. É a este último uso que recorremos agora ao aplicar as técnicas desta seção a um grupo maior de discursos do Estado da União. - - -# Análise dos discursos do Estado da União de 1790 a 2016 - -## Carregar o corpus - - -A primeira coisa a fazer para analisar o corpus de discursos do Estado da União é carregá-los em R. Isto envolve as mesmas funções `paste` (colar) e `readLines` (ler linhas) como antes, mas temos que gerar um loop `for` (para) que executa as funções nos 236 ficheiros de texto. Estas são combinadas com a função `c`. - - - -```{r} - -ficheiros <- sprintf("%s/sotu_text/%03d.txt", base_url, 1:236) - -texto <- c() - -for (f in ficheiros) { - -texto <- c(texto, paste(readLines(f), collapse = "\n")) - -} - -``` - -Esta técnica carrega todos os ficheiros um a um do Github. Opcionalmente, é possível baixar um arquivo zip (comprimido) com o corpus completo e carregar os ficheiros manualmente. Esta técnica é descrita na próxima seção. - - -## Forma alternativa de carregar o corpus (opcional) - -Pode fazer o download do corpus aqui: [sotu_text.zip](/assets/basic-text-processing-in-r/sotu_text.zip). Descompacte o repositório em algum lugar no seu computador e defina a variável `input_loc` (local de upload) para o caminho do diretório onde o arquivo foi descompactado. 
Por exemplo, se os ficheiros estão na área de trabalho de um computador macOS e o usuário é o stevejobs, `input_loc` deve ser: - - ```{r} - -input_loc <- "/Users/stevejobs/Desktop/sotu_text" - -``` - -Uma vez feito, pode usar o seguinte bloco de código para carregar todos os textos: - - ```{r} - -ficheiros <- dir(input_loc, full.names = TRUE) - -texto <- c() - -for (f in ficheiros) { - -texto <- c(texto, paste(readLines(f), collapse = "\n")) - -} - -``` - - -É possível usar esta mesma técnica para carregar seu próprio corpus de textos. - - -## Análise exploratória - - -Uma vez mais, com a função `tokenize_words`, podemos calcular o comprimento de cada discurso em número de palavras. - - ```{r} - -palavras <- tokenize_words(texto) - -sapply(palavras, length) - -``` - -Existe um padrão temporal na duração dos discursos? Como se compara a duração dos discursos de outros presidentes com os de Franklin D. Roosevelt, Abraham Lincoln e George Washington? - - -A melhor maneira de descobrir é criando um gráfico de dispersão. É possível construir um usando a função `qplot` (gráfico), com o ano (year) no eixo x ou horizontal e o número de palavras (lenght) no eixo y ou vertical. - -```{r} - -qplot(metadados$year, sapply(palavras, length)) + labs(x = "Ano", y = "Número de palavras") - -``` - -Isto cria um gráfico como este: - -![Number of words in each State of the Union Address plotted by year.](/images/basic-text-processing-in-r/sotu-number-of-words.jpg)Número de palavras em cada discurso do Estado da União por ano. - -Parece que a maioria dos discursos aumentaram de 1790 a 1850 e depois aumentaram novamente no final do século XIX. A duração diminuiu drasticamente em torno da Primeira Guerra Mundial, com alguns pontos discrepantes espalhados ao longo do século XX. - - -Existe alguma razão por trás dessas mudanças? Para explicar esta variação, podemos definir a cor dos pontos para denotar se são discursos que foram apresentados por escrito ou falados. 
O comando para fazer este gráfico envolve apenas uma pequena mudança no comando do gráfico: - - -```{r} - -qplot(metadados$year, sapply(palavras, length), color = metadados$sotu_type) + labs(x = "Ano", y = "Número de palavras", color = "Modalidade do discurso") - -``` - -Isto produz o seguinte gráfico: - -![Number of words in each State of the Union Address plotted by year, with color denoting whether it was a written or oral message.](/images/basic-text-processing-in-r/sotu-number-of-words-and-type.jpg)Número de palavras em cada discurso do Estado da União organizado por ano e com a cor denotando se se tratava de um discurso escrito ou oral. - - -Vemos que o aumento no século XIX foi quando os discursos se tornaram documentos escritos e que a queda drástica foi quando Woodrow Wilson (28º Presidente dos Estados Unidos, entre 1913 e 1921) rompeu com a tradição e deu o seu discurso sobre o Estado da União oralmente no Congresso. Os pontos discrepantes que vimos anteriormente eram discursos proferidos por escrito após a Segunda Guerra Mundial. - - - -## Análise estilométrica - - -A estilometria, o estudo linguístico do estilo, faz uso extensivo de métodos computacionais para descrever o estilo de escrita de um autor. Com o nosso corpus, é possível detectar mudanças no estilo de escrita ao longo dos séculos XIX e XX. Um estudo estilométrico mais formal, geralmente, envolve o uso de código de análise sintática ou de reduções dimensionais algorítmicas complexas, tais como a análise dos principais componentes a serem estudados ao longo do tempo e entre autores. Neste tutorial, continuaremos a nos concentrar no estudo do comprimento das frases. - -O corpus pode ser dividido em frases usando a função `tokenize_sentences`. Neste caso, o resultado é uma lista com 236 objetos, cada um representando um documento específico. - - - -```{r} - -frases <- tokenize_sentences(texto) - -``` - - - -Em seguida, queremos dividir cada frase em palavras. 
A função `tokenize_words` pode ser utilizada, mas não diretamente sobre a lista de objetos `frases`. Poderíamos fazer isso com um loop `for` de novo, mas há uma forma mais simples de o fazer. A função `sapply` oferece uma aproximação mais direta. Aqui, queremos aplicar a segmentação de palavras individualmente a cada documento e, para isso, esta função é perfeita. - - - -```{r} - -frases_palavras <- sapply(frases, tokenize_words) - -``` - -Agora, temos uma lista (com cada elemento representando um documento) de listas (com cada elemento representando as palavras de uma dada frase). O resultado que precisamos é uma lista de objetos que forneça o comprimento de cada frase num dado documento. Para isto, combinamos o loop `for` com a função `sapply`. - - - -```{r} - -comprimento_frases <- list() - -for (i in 1:nrow(metadados)) { - -comprimento_frases[[i]] <- sapply(frases_palavras[[i]], length) - -} - -``` - - -O resultado de `comprimento_frases` pode ser visualizado numa linha temporal. Primeiro, precisamos de resumir o comprimento de todas as frases de um documento a um único número. A função `median` (mediana), que encontra o 50º percentil dos dados inseridos, é uma boa opção para resumir as frases, porque não será muito afectada por possíveis erros de segmentação que podem ter criado uma frase artificialmente longa [^11]. - - - -```{r} - -mediana_comprimento_frases <- sapply(comprimento_frases, median) - -``` - - -Agora, criamos um diagrama com essa variável junto com os anos dos discursos utilizando, mais uma vez, a função `qplot`. - - - -```{r} - -qplot(metadados$year, mediana_comprimento_frases) + labs(x = "Ano", y = "Mediana do comprimento das frases") - -``` - - ![Median sentence length for each State of the Union Address.](/images/basic-text-processing-in-r/sotu-sentence-length.jpg)Duração mediana das frases por discurso do Estado da União. - -O gráfico mostra-nos uma forte tendência geral de frases mais curtas nos dois séculos do corpus. 
Lembre-se que alguns discursos no final da segunda metade do século XX eram longos e escritos, muito parecidos com os do século XIX. É particularmente interessante que estes não se destaquem em se tratando de mediana do comprimento das frases. - - -Para tornar esse padrão ainda mais explícito, é possível adicionar uma linha de tendência no gráfico com a função `geom_smooth` (geometrização suave). - - -```{r} - -qplot(metadados$year, mediana_comprimento_frases) + geom_smooth() + labs(x = "Ano", y = "Mediana do comprimento das frases") - -``` - ![Median sentence length for each State of the Union Address, with a smoothing line.](/images/basic-text-processing-in-r/sotu-sentence-length-smooth.jpg)Comprimento mediano de cada discurso do Estado da União com uma linha de tendência. - - -As linhas de tendência são um ótimo complemento aos gráficos. Elas possuem a função dupla de mostrar a tendência geral dos dados no tempo, enquanto destacam pontos atípicos ou periféricos. - - - -## Resumo do documento - - - -Como tarefa final, queremos aplicar a função de resumo simples que utilizamos na seção anterior a cada um dos documentos desse corpus mais amplo. Precisamos utilizar um loop outra vez, mas o código interno permanece quase o mesmo, com a exceção de que precisamos guardar os resultados como um elemento do vetor `description` (descrição). - - - -```{r} - -description <- c() - -``` - -```{r} - -for (i in 1:length(palavras)) { - -tabela <- table(palavras[[i]]) - -tabela <- data_frame(word = names(tabela), count = as.numeric(tabela)) - -tabela <- arrange(tabela, desc(count)) - -tabela <- inner_join(tabela, palavras_frequentes) - -tabela <- filter(tabela, frequency < 0.002) - -resultado <- c(metadados$president[i], metadados$year[i], tabela$word[1:5]) - -description <- c(description, paste(resultado, collapse = "; ")) - -} - -``` - - - -Enquanto se processa cada ficheiro como resultado da função `inner_join`, é possível ver uma linha que diz **Joining, by = “word”**. 
Como o loop pode demorar um ou mais minutos o processamento da função, esta linha serve para assegurar que o código está processando os ficheiros. Podemos ver o resultado do loop escrevendo `description` no console, mas, com a função `cat`, obtemos uma visão mais nítida dos resultados. - - ```{r} - -cat(description, sep = "\n") - -``` - - -Os resultados oferecem uma linha para cada discurso do Estado da União. Aqui, por exemplo, estão as linhas dos presidentes Bill Clinton, George W. Bush e Barack Obama: - - ``` - ->William J. Clinton; 1993; deficit; propose; incomes; invest; decade - -William J. Clinton; 1994; deficit; renew; ought; brady; cannot - -William J. Clinton; 1995; ought; covenant; deficit; bureaucracy; voted - -William J. Clinton; 1996; bipartisan; gangs; medicare; deficit; harder - -William J. Clinton; 1997; bipartisan; cannot; balanced; nato; immigrants - -William J. Clinton; 1998; bipartisan; deficit; propose; bosnia; millennium - -William J. Clinton; 1999; medicare; propose; surplus; balanced; bipartisan - -William J. Clinton; 2000; propose; laughter; medicare; bipartisan; prosperity - -George W. Bush; 2001; medicare; courage; surplus; josefina; laughter - -George W. Bush; 2002; terrorist; terrorists; allies; camps; homeland - -George W. Bush; 2003; hussein; saddam; inspectors; qaida; terrorists - -George W. Bush; 2004; terrorists; propose; medicare; seniors; killers - -George W. Bush; 2005; terrorists; iraqis; reforms; decades; generations - -George W. Bush; 2006; hopeful; offensive; retreat; terrorists; terrorist - -George W. Bush; 2007; terrorists; qaida; extremists; struggle; baghdad - -George W. 
Bush; 2008; terrorists; empower; qaida; extremists; deny - -Barack Obama; 2009; deficit; afford; cannot; lending; invest - -Barack Obama; 2010; deficit; laughter; afford; decade; decades - -Barack Obama; 2011; deficit; republicans; democrats; laughter; afghan - -Barack Obama; 2012; afford; deficit; tuition; cannot; doubling - -Barack Obama; 2013; deficit; deserve; stronger; bipartisan; medicare - -Barack Obama; 2014; cory; laughter; decades; diplomacy; invest - -Barack Obama; 2015; laughter; childcare; democrats; rebekah; republicans - -Barack Obama; 2016; laughter; voices; allies; harder; qaida - -``` - -Como já foi referido, estes resumos temáticos não são, de forma alguma, um substituto para uma leitura atenta de cada documento. Eles servem, no entanto, como um resumo geral e de alto nível de cada presidência. Vemos, por exemplo, o foco inicial no déficit durante os primeiros anos da presidência de Bill Clinton, sua mudança em direção ao bipartidarismo enquanto a Câmara e o Senado se inclinavam para os republicanos em meados dos anos 1990, e uma mudança em direção à reforma do Medicare no final de sua presidência. Os discursos de George W. Bush concentraram-se, principalmente, no terrorismo, com exceção do discurso de 2001 proferido antes dos ataques terroristas de 11 de setembro. Barack Obama voltou a preocupar-se com a economia sob a sombra da recessão de 2008. A palavra "riso" aparece frequentemente porque é adicionada às transcrições quando o riso do público faz com que o orador pare. - - - -# Próximos passos - - - -Neste pequeno tutorial exploramos algumas maneiras básicas de analisar dados textuais com a linguagem de programação R. Há várias direções que se pode tomar para se aprofundar nas novas técnicas de análise de texto. 
Aqui estão três exemplos particularmente interessantes: - - - -* conduzir uma análise completa com base em processamento de linguagem natural (NLP) num texto para extrair características tais como nomes de entidades, categorias gramaticais e relações de dependência. Estes estão disponíveis em vários pacotes R, incluindo o **cleanNLP**[^12], e para vários idiomas. - -* realizar uma modelagem por tópicos (*topic models*) para detectar discursos específicos no corpus usando pacotes como **mallet**[^13] e **topicmodels**[^14]. - -* aplicar técnicas de redução de dimensionalidade para traçar tendências estilísticas ao longo do tempo ou entre diferentes autores. Por exemplo, o pacote **tsne** [^15] realiza uma poderosa forma de redução de dimensionalidade particularmente favorável a gráficos detalhados. - - -Existem muitos tutoriais genéricos para estes três exemplos, assim como uma documentação detalhada dos pacotes[^16]. Esperamos oferecer tutoriais focados em aplicações históricas deles no futuro. - - - -# Notas - -[^1]: O nosso corpus contém 236 discursos sobre o Estado da União. Dependendo do que for contado, este número pode ser ligeiramente superior ou inferior. - -[^2]: Taryn Dewar, “R Basics with Tabular Data,” Programming Historian (05 September 2016), [/lessons/r-basics-with-tabular-data](/en/lessons/r-basics-with-tabular-data). - -[^3]: Hadley Wickham. “tidyverse: Easily Install and Load ‘Tidyverse’ Packages”. R Package, Version 1.1.1. https://cran.r-project.org/web/packages/tidyverse/index.html - -[^4]: Lincoln Mullen and Dmitriy Selivanov. “tokenizers: A Consistent Interface to Tokenize Natural Language Text Convert”. R Package, Version 0.1.4. https://cran.r-project.org/web/packages/tokenizers/index.html - -[^5]: Tenha em mente que os nomes das funções, como `library` e `install.packages`, sempre estarão em inglês. Apesar disso, colocamos uma tradução do significado para facilitar a compreensão e traduzimos os nomes das variáveis [N. de T.]. 
- -[^6]: Tradução publicada pela Folha em português (13 de janeiro de 2016) [https://www1.folha.uol.com.br/mundo/2016/01/1729011-leia-a-integra-do-ultimo-discurso-do-estado-da-uniao-de-obama.shtml](https://www1.folha.uol.com.br/mundo/2016/01/1729011-leia-a-integra-do-ultimo-discurso-do-estado-da-uniao-de-obama.shtml) [N. de T.] - -[^7]: Foi feito o download de todos os discursos presidenciais do The American Presidency Project da University of California Santa Barbara (acesso em 11 de novembro de 2016) [http://www.presidency.ucsb.edu/sou.php](http://www.presidency.ucsb.edu/sou.php) - -[^8]: Aqui, voltamos para a versão original do discurso, em inglês, para dar prosseguimento à análise e, particularmente, para observarmos a lista de palavras mais utilizadas em inglês. Continuaremos a traduzir os nomes das variáveis e das funções para facilitar a compreensão em português [N. de T.]. - -[^9]: Aqui, optamos por nomear as colunas da tabela em inglês, como *word* (palavra) e *count* (contagem), para facilitar a interação com o conjunto de dados que será introduzido depois com a função `inner_join` [N. de T.]. - -[^10]: Peter Norvig. “Google Web Trillion Word Corpus”. (Acesso em 11 de novembro de 2016) [http://norvig.com/ngrams/](https://web.archive.org/web/20260326183858/http://norvig.com/ngrams/). - -[^11]: Isto ocorre em alguns discursos escritos do Estado da União, quando uma lista com numeração é segmentada numa única frase longa. - -[^12]: Taylor Arnold. “cleanNLP: A Tidy Data Model for Natural Language Processing”. R Package, Version 0.24. https://cran.r-project.org/web/packages/cleanNLP/index.html - -[^13]: David Mimno. “mallet: A wrapper around the Java machine learning tool MALLET”. R Package, Version 1.0. https://cran.r-project.org/web/packages/mallet/index.html - -[^14]: Bettina Grün and Kurt Hornik. “topicmodels: Topic Models”. R Package, Version 0.2-4.
https://cran.r-project.org/web/packages/topicmodels/index.html - -[^15]: Ver o artigo" t-distributed stochastic neighbor embedding" na Wikipedia (em inglês). https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding [N. de T.] - -[^16]: Ver, por exemplo, o livro dos autores Taylor Arnold and Lauren Tilton. *Humanities Data in R: Exploring Networks, Geospatial Data, Images, and Text.* Springer, 2015. +--- +title: Processamento Básico de Texto em R +slug: processamento-basico-texto-r +layout: lesson +date: 2017-03-27 +translation_date: 2021-07-13 +authors: +- Taylor Arnold +- Lauren Tilton +reviewers: +- Brandon Walsh +- John Russell +editors: +- Jeri Wieringa +translator: +- Diana Rebelo Rodriguez +translation-editor: +- Jimmy Medeiros +translation-reviewer: +- Rômulo Predes +- Maria Guedes +difficulty: 2 +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/381 +activity: analyzing +topics: [distant-reading, r, data-visualization] +abstract: "Aprenda a usar o R para analisar padrões de alto nível em textos, aplicar métodos estilométricos ao longo do tempo e entre autores, assim como a usar métodos para resumir informações para descrever um corpus" +original: basic-text-processing-in-r +avatar_alt: Crianças com livros junto a uma biblioteca itinerante +doi: 10.46430/phpt0013 +--- + +{% include toc.html %} + +## Objetivos + +Hoje em dia há uma quantidade substancial de dados históricos disponíveis em forma de texto simples e digitalizado. Alguns exemplos comuns são cartas, artigos de jornal, notas pessoais, diários, documentos legais e transcrições de discursos. Enquanto algumas aplicações de softwares independentes têm ferramentas para analisar dados textuais, o uso de linguagens de programação apresenta uma maior flexibilidade para analisar um corpus de documentos de texto. Neste tutorial, guiaremos os usuários no básico da análise de texto na linguagem de programação R. 
A nossa abordagem envolve usar apenas a tokenização que produz uma análise sintática do texto, com elementos como palavras, frases e orações. No final da presente lição, os usuários poderão: + +* utilizar análises exploratórias para verificar erros e detectar padrões gerais; +* aplicar métodos básicos de estilometria através do tempo e entre autores; +* conseguir resumir o conteúdo do documento para oferecer uma descrição geral do corpus. + +Para esta lição, será utilizado um conjunto de dados com os textos dos discursos presidenciais dos Estados Unidos da América sobre o [Estado da União](https://pt.wikipedia.org/wiki/Discurso_sobre_o_Estado_da_Uni%C3%A3o)[^1]. + +Assumimos que os usuários possuem um conhecimento básico da linguagem de programação R. A lição [Noções básicas de R com dados tabulares](/en/lessons/r-basics-with-tabular-data)[^2] (em inglês) é um excelente guia que contém todos os conhecimentos em R necessários aqui, tais como instalar e abrir R, instalar e carregar pacotes e importar e trabalhar com dados básicos de R. Os usuários podem fazer o download do R indicado para os seus sistemas operativos em [The Comprehensive R Archive Network](https://cran.r-project.org/). Ainda que não seja um pré-requisito, recomendamos que os novos usuários façam o download do [R Studio](https://www.rstudio.com/products/rstudio/#Desktop), um ambiente de desenvolvimento de código aberto para escrever e executar programas em R. + +Todo o código desta lição foi testado em R na versão 4.0.2, mas esperamos que ele rode adequadamente em qualquer versão futura do programa. + +# Um pequeno exemplo + +## Configuração de pacotes + +É necessário instalar dois pacotes de R antes de começar com o tutorial: o **tidyverse**[^3] e o **tokenizers**[^4]. O primeiro proporciona ferramentas convenientes para ler e trabalhar com grupos de dados e o segundo contém funções para dividir os dados do texto em palavras e orações. 
Para instalá-los, abra o R no seu computador e execute essas duas linhas de código no console: + +```{r} + +install.packages("tidyverse") + +install.packages("tokenizers") + +``` + +Dependendo da configuração do seu sistema, pode ser aberta uma caixa de diálogo solicitando a escolha de um lugar da internet para fazer o download. Caso apareça, escolha a opção mais perto de sua localização atual. O download e a instalação, provavelmente, irão ocorrer automaticamente. + +Agora que esses pacotes estão no seu computador, precisamos de avisar ao R que eles devem ser carregados para o uso. Isso é feito através do comando `library`. Pode ser que apareçam alguns avisos enquanto carregam outras dependências, mas eles podem ser ignorados sem nenhum problema. Execute essas duas linhas de código no console para habilitar o uso dos pacotes: + +```{r} + +library(tidyverse) + +library(tokenizers) + +``` + +O comando `install.packages` (instalar pacotes) só é necessário executar na primeira vez em que iniciar este tutorial, o comando `library` deverá ser executado todas as vezes que se inicia o R[^5]. + +## Segmentação de palavras + +Nesta seção, vamos trabalhar com um único parágrafo. Este exemplo pertence ao início do último discurso de Barack Obama sobre o Estado da União, em 2016. Para facilitar a compreensão do tutorial nesta primeira etapa, estudamos este parágrafo traduzido para português[^6]. + +Para carregar o texto, copie e cole o seguinte no console do R: + +``` + +texto <- paste("Também entendo que, pelo fato de estarmos em temporada eleitoral, as expectativas quanto ao que vamos realizar este ano são baixas. Mesmo assim, senhor presidente da Câmara, aprecio a atitude construtiva que o senhor e os outros líderes assumiram no final do ano passado para aprovar o orçamento e perpetuar a redução dos impostos sobre as famílias trabalhadoras. 
Desse modo, espero que possamos colaborar este ano sobre questões que são prioritárias para ambos os partidos, como a reforma da justiça criminal e a assistência às pessoas dependentes de drogas vendidas com receita médica. Quem sabe possamos surpreender os cínicos novamente.") + +``` + +Depois de executar o comando (clicando em “Enter”), escreva a palavra `texto` no console e pressione Enter. O R irá mostrar o conteúdo do objeto texto, uma vez que ele contém parte do discurso proferido por Obama. + +O primeiro passo do processamento de texto envolve utilizar a função `tokenize_words` (segmentar palavras) do pacote **tokenizers** para dividir o texto em palavras individuais. + +```{r} + +palavras <- tokenize_words(texto) + +``` + +Para apresentar os resultados na janela do console do R, mostrando tanto o resultado tokenizado como a posição de cada elemento na margem esquerda, execute `palavras` no console: + + +```{r} + +palavras + +``` + +Isso produz o seguinte resultado: + + +``` + +> [[1]] + +[1] "também" "entendo" "que" "pelo" "fato" + +[6] "de" "estarmos" "em" "temporada" "eleitoral" + +[11] "as" "expectativas" "quanto" "ao" "que" + +[16] "vamos" "realizar" "este" "ano" "são" + +[21] "baixas" "mesmo" "assim" "senhor" "presidente" + +[26] "da" "câmara" "aprecio" "a" "atitude" + +[31] "construtiva" "que" "o" "senhor" "e" + +[36] "os" "outros" "líderes" "assumiram" "no" + +[41] "final" "do" "ano" "passado" "para" + +[46] "aprovar" "o" "orçamento" "e" "perpetuar" + +[51] "a" "redução" "dos" "impostos" "sobre" + +[56] "as" "famílias" "trabalhadoras" "desse" "modo" + +[61] "espero" "que" "possamos" "colaborar" "este" + +[66] "ano" "sobre" "questões" "que" "são" + +[71] "prioritárias" "para" "ambos" "os" "partidos" + +[76] "como" "a" "reforma" "da" "justiça" + +[81] "criminal" "e" "a" "assistência" "às" + +[86] "pessoas" "dependentes" "de" "drogas" "vendidas" + +[91] "com" "receita" "médica" "quem" "sabe" + +[96] "possamos" "surpreender" "os" "cínicos" "novamente" +
+``` + +Como o texto carregado mudou depois de se executar essa função de R? Ela removeu toda a pontuação, dividiu o texto em palavras individuais e converteu tudo para minúsculas. Em breve, veremos porque todas essas intervenções são úteis para a nossa análise. + +Quantas palavras existem neste fragmento de texto? Se usamos a função `length` (comprimento) diretamente no objeto `palavras`, o resultado não é muito útil. + + + +```{r} + +length(palavras) + +``` + + +O resultado é igual a: + + +```{r} + +[1] 1 + +``` + +O comprimento equivale a 1 porque a função `tokenize_words` retorna uma lista de objetos com uma entrada por documento carregado. O nosso carregamento possui apenas um documento, então a lista também possui apenas um elemento. Para ver as palavras dentro do primeiro documento, utilizamos o símbolo [], da seguinte forma: `[[1]]`. O objetivo é selecionar apenas o primeiro elemento da lista: + + +```{r} + +length(palavras[[1]]) + +``` + +O resultado é `100`, indicando que existem 100 palavras neste parágrafo. + +A separação do documento em palavras individuais torna possível calcular quantas vezes cada palavra foi utilizada durante o texto. Para fazer isso, primeiro aplicamos a função `table` (tabela) nas palavras do primeiro (e, neste caso, único) documento e depois separamos os nomes e os valores da tabela num novo objeto chamado _data frame_. O uso de um quadro de dados em R é semelhante ao uso de uma tabela numa base de dados. Esses passos, em conjunto com a impressão do resultado, são obtidos com as seguintes linhas de código: + + +```{r} + +tabela <- table(palavras[[1]]) + +tabela <- data_frame(palavra = names(tabela), contagem = as.numeric(tabela)) + +tabela + +``` + +O resultado deste comando deve aparecer assim no seu console (*tibble* é um tipo específico de _data frame_ criado no pacote [Tidy Data](https://en.wikipedia.org/wiki/Tidy_data)): + +``` + +# A tibble: 77 x 2 + +palavra contagem + + + +1 a 4. + +2 ambos 1. + +3 ano 3. + +4 ao 1. 
+ +5 aprecio 1. + +6 aprovar 1. + +7 as 2. + +8 às 1. + +9 assim 1. + +10 assistência 1. + +# ... with 67 more rows + +``` + + +Há uma quantidade substancial de informação nesta amostra. Vemos que existem 77 palavras únicas, como indica a dimensão da tabela. As 10 primeiras fileiras do conjunto de dados são apresentadas, com a segunda coluna mostrando quantas vezes a palavra da primeira coluna foi utilizada. Por exemplo, “ano” foi usada três vezes, enquanto “aprovar”, apenas uma vez. + + + +Também podemos ordenar a tabela usando a função `arrange` (organizar). Esta função precisa do conjunto de dados a utilizar, aqui `tabela`, e depois o nome da coluna que serve de referência para ordená-lo. A função `desc` no segundo argumento indica que queremos ordenar em ordem decrescente. + + + +```{r} + +arrange(tabela, desc(contagem)) + +``` + + +E agora o resultado será: + + + +```{r} + +# A tibble: 77 x 2 + +palavra contagem + + + +1 que 5. + +2 a 4. + +3 ano 3. + +4 e 3. + +5 os 3. + +6 as 2. + +7 da 2. + +8 de 2. + +9 este 2. + +10 o 2. + +# … with 67 more rows + +``` + + + +As palavras mais comuns são pronomes e palavras funcionais tais como "que", "a", "e" e "os". Observe como a análise é facilitada pelo uso da versão em minúsculas de cada palavra. Qualquer contagem prevê que a palavra possa estar no início ou no meio da frase. + + + +Uma técnica popular é carregar uma lista de palavras frequentemente usadas e eliminá-las antes da análise formal. As palavras em tal lista são chamadas "*stopwords*" ou "palavras vazias" e são geralmente pronomes, conjugações dos verbos mais comuns e conjunções. Neste tutorial, temos uma variação sutil desta técnica. + + + +## Detectar frases + + + +O pacote **tokenizers** também contém a função `tokenize_sentences`, que detecta limites de frases, ao invés de palavras.
Ele pode ser executado da seguinte maneira: + + + +```{r} + +frases <- tokenize_sentences(texto) + +frases + +``` + + + +Com o resultado: + + + +```{r} + +> frases + +[[1]] + +[1] "Também entendo que, pelo fato de estarmos em temporada eleitoral, as expectativas quanto ao que vamos realizar este ano são baixas." + +[2] "Mesmo assim, senhor presidente da Câmara, aprecio a atitude construtiva que o senhor e os outros líderes assumiram no final do ano passado para aprovar o orçamento e perpetuar a redução dos impostos sobre as famílias trabalhadoras." + +[3] "Desse modo, espero que possamos colaborar este ano sobre questões que são prioritárias para ambos os partidos, como a reforma da justiça criminal e a assistência às pessoas dependentes de drogas vendidas com receita médica." + +[4] "Quem sabe possamos surpreender os cínicos novamente." + +``` + + + +O resultado é um vetor de caracteres, um objeto unidimensional que consiste apenas em elementos representados como caracteres. Observe que o resultado marcou cada frase como um elemento separado. + + + +É possível conectar o resultado da divisão das frases com o resultado da divisão das palavras. Se executarmos a divisão de frases do parágrafo com a função `tokenize_words`, cada frase será tratada como um único documento. Execute isto usando a seguinte linha de código e veja se o resultado é o esperado, a segunda linha de comando serve para imprimir o resultado. 
+ + + +```{r} + +frases_palavras <- tokenize_words(frases[[1]]) + +frases_palavras + +``` + + + +Se olharmos para o tamanho do resultado diretamente, podemos ver que existem quatro “documentos” no objeto `frases_palavras`: + + + +```{r} + +length(frases_palavras) + +``` + + + +Ao acessar cada uma delas diretamente, é possível saber quantas palavras há em cada frase do parágrafo: + + + +```{r} + +length(frases_palavras[[1]]) + +length(frases_palavras[[2]]) + +length(frases_palavras[[3]]) + +length(frases_palavras[[4]]) + +``` + + + +Isto pode demandar um pouco de esforço, mas felizmente existe uma maneira mais simples de o fazer. A função `sapply` executa a função no segundo argumento para cada elemento do primeiro argumento. Como resultado, podemos calcular a extensão de cada frase do primeiro parágrafo com uma única linha de código: + + + +```{r} + +sapply(frases_palavras, length) + +``` + + + +O resultado agora será assim: + + + +```{r} + +[1] 21 37 35 7 + +``` + + + +Podemos ver que existem quatro frases com um comprimento de 21, 37, 35 e 7 palavras. Utilizaremos esta função para trabalharmos com documentos maiores. + + + +# Analisar o discurso sobre o Estado da União de Barack Obama em 2016 + + + +## Análise exploratória + + + +Vamos aplicar as técnicas da seção anterior a um discurso sobre o Estado da União completo, desta vez, usando o original em inglês. Por uma questão de coerência, vamos usar o mesmo discurso de 2016 de Barack Obama. Agora, vamos carregar os dados de um ficheiro, uma vez que a cópia direta é difícil em grande escala. + + + +Para tal, vamos combinar a função `readLines` (ler linhas) para carregar o texto em R e a função `paste` (colar) para combinar todas as linhas num único objeto. Vamos criar a URL do arquivo de texto usando a função `sprintf`, uma vez que este formato permitirá que ele seja facilmente aproveitado para outros recursos online[^7],[^8]. 
+ + + +```{r} + +base_url <- "https://raw.githubusercontent.com/programminghistorian/jekyll/gh-pages/assets/basic-text-processing-in-r/" + +url <- sprintf("%s/sotu_text/236.txt", base_url) + +texto <- paste(readLines(url), collapse = "\n") + +``` + + + +Como antes, vamos segmentar o texto e ver o número de palavras no documento. + + + +```{r} + +palavras <- tokenize_words(texto) + +length(palavras[[1]]) + +``` + + + +Vemos que este discurso contém um total de `6113` palavras. Ao combinar as funções `table` (tabela), `data_frame` e `arrange` (organizar), como fizemos no exemplo anterior, obtemos as palavras mais frequentes em todo o discurso. Ao fazer isso, observe como é fácil reutilizar o código anterior para repetir a análise num novo conjunto de dados. Este é um dos maiores benefícios de usar uma linguagem de programação para realizar uma análise baseada em dados [^9]. + + + +```{r} + +tabela <- table(palavras[[1]]) + +tabela <- data_frame(word = names(tabela), count = as.numeric(tabela)) + +tabela <- arrange(tabela, desc(count)) + +tabela + +``` + +O resultado deve ser: + + +```{r} + +>#A tibble: 1,590 x 2 + +word count + + + +1 the 281. + +2 to 209. + +3 and 189. + +4 of 148. + +5 that 125. + +6 we 124. + +7 a 120. + +8 in 105. + +9 our 96. + +10 is 72. + +>#... with 1,580 more rows + +``` + +Mais uma vez, palavras extremamente comuns como *the* ("o" ou "a"), *to* ("para") e *and* ("e") estão no topo da tabela. Estes termos não são particularmente esclarecedores se quisermos conhecer o assunto do discurso. Na realidade, queremos encontrar palavras que se destaquem mais neste texto do que num grande corpus externo em inglês. Para conseguir isso, precisamos de um conjunto de dados que forneça essas frequências. 
Aqui está o conjunto de dados de Peter Norvig usando o *Google Web Trillion Word Corpus* (Corpus de um trilhão de palavras da web do Google), coletado a partir dos dados compilados através do rastreamento de sites populares em inglês pelo Google [^10]: + +```{r} + +palavras_frequentes <- read_csv(sprintf("%s/%s", base_url, "word_frequency.csv")) + +palavras_frequentes + +``` + + +A primeira coluna indica o idioma (sempre "en" para inglês neste caso), a segunda coluna fornece a palavra em questão e a terceira coluna - frequency - indica a percentagem com a qual ela aparece no *Corpus de um trilhão de palavras do Google*. Por exemplo, a palavra "for" aparece quase exatamente 1 vez a cada 100 palavras, pelo menos nos textos dos sites indexados pelo Google. + + + +Para combinar estas palavras frequentes com o conjunto de dados na `tabela` construída a partir do discurso do Estado da União, podemos usar a função `inner_join` (união interna). Esta função toma dois conjuntos de dados e combina-os em todas as colunas que têm o mesmo nome. Neste caso, a coluna comum é a chamada _word_ ("palavra"). + + + +```{r} + +tabela <- inner_join(tabela, palavras_frequentes) + +tabela + +``` + + + +Note que agora o nosso conjunto de dados tem duas colunas extras que fornecem o idioma (aqui relativamente pouco útil já que é sempre "en") e a frequência da palavra no corpus externo. Esta segunda nova coluna será muito útil, porque podemos filtrar linhas que têm uma frequência inferior a 0,1%, ou seja, que aparecem menos de uma vez em cada 1000 palavras: + + + +```{r} + +filter(tabela, frequency < 0.1) + +``` + + +Isto produz: + + +```{r} + +>#A tibble: 1,457 x 4 + +word count language frequency + + + +1 america 28. en 0.0232 + +2 people 27. en 0.0817 + +3 just 25. en 0.0787 + +4 world 23. en 0.0734 + +5 american 22. en 0.0387 + +6 work 22. en 0.0713 + +7 make 20. en 0.0689 + +8 want 19. en 0.0440 + +9 change 18. en 0.0358 + +10 years 18. en 0.0574 + +>#... 
with 1,447 more rows + +``` + + + +Esta lista está começando a se tornar mais interessante. Um termo como "america" aparece no topo da lista porque, podemos pensar, é muito usado nos discursos dos políticos e menos em outros campos. Ao estabelecer o limiar ainda mais baixo, em 0.002, obtemos um melhor resumo do discurso. Como seria útil ver mais do que as dez linhas padrão, vamos usar a função `print` (imprimir) junto com a opção `n` (de número) definida como 15 para que possamos ver mais linhas. + + + +```{r} + +print(filter(tabela, frequency < 0.002), n = 15) + +``` + + + +Isto agora nos mostra o seguinte resultado: + + + +```{r} + +>#A tibble: 463 x 4 + +word count language frequency + + + +1 laughter 11. en 0.000643 + +2 voices 8. en 0.00189 + +3 allies 4. en 0.000844 + +4 harder 4. en 0.00152 + +5 qaida 4. en 0.000183 + +6 terrorists 4. en 0.00122 + +7 bipartisan 3. en 0.000145 + +8 generations 3. en 0.00123 + +9 stamp 3. en 0.00166 + +10 strongest 3. en 0.000591 + +11 syria 3. en 0.00136 + +12 terrorist 3. en 0.00181 + +13 tougher 3. en 0.000247 + +14 weaken 3. en 0.000181 + +15 accelerate 2. en 0.000544 + +>#... with 448 more rows + +``` + +Os resultados parecem sugerir alguns dos temas principais deste discurso, como “syria” (Síria), “terrorist” (terrorista) e “qaida” (Qaeda) (o nome al-qaida foi dividido em “al” e “qaida” pelo tokenizador). + + + +## Sumarizar o documento + + + +Para fornecer informações contextuais para o conjunto de dados que estamos analisando, temos uma tabela com metadados sobre cada um dos discursos do Estado da União. 
Vamos carregá-la em R: + + ```{r} + +metadados <- read_csv(sprintf("%s/%s", base_url, "metadata.csv")) + +metadados + +``` + + +As primeiras dez linhas do grupo de dados aparecem assim: + + +```{r} + +>#A tibble: 236 x 4 + +president year party sotu_type + + + +1 George Washington 1790 Nonpartisan speech + +2 George Washington 1790 Nonpartisan speech + +3 George Washington 1791 Nonpartisan speech + +4 George Washington 1792 Nonpartisan speech + +5 George Washington 1793 Nonpartisan speech + +6 George Washington 1794 Nonpartisan speech + +7 George Washington 1795 Nonpartisan speech + +8 George Washington 1796 Nonpartisan speech + +9 John Adams 1797 Federalist speech + +10 John Adams 1798 Federalist speech + +>#... with 226 more rows + +``` + + +Temos o nome do presidente, o ano, o partido político do presidente e o formato de discurso do Estado da União (oral ou escrito) para cada discurso no conjunto. O discurso de 2016 está na linha 236 dos metadados que, por acaso, é a última linha. + + + +Na próxima seção, pode ser útil resumir os dados para um discurso numa única linha de texto. Podemos fazer isto extraindo as cinco palavras mais frequentes com uma frequência inferior a 0,002% no *Corpus de um trilhão de palavras do Google* e combinando isso com dados sobre o presidente e o ano. + + +```{r} + +tabela <- filter(tabela, frequency < 0.002) + +resultado <- c(metadados$president[236], metadados$year[236], tabela$word[1:5]) + +paste(resultado, collapse = "; ") + +``` + + + +Isto deveria dar-nos o seguinte resultado: + + + +```{r} + +[1] "Barack Obama; 2016; laughter; voices; allies; harder; qaida" + +[1] “Barack Obama; 2016; risadas; vozes; aliados; mais duro; qaeda” + +``` + +Esta linha capta tudo sobre o discurso? É evidente que não. O processamento de texto nunca substituirá a leitura atenta de um texto, mas ajuda a dar um resumo de alto nível das questões discutidas ("risadas" aparecem aqui porque as reações do público são anotadas no texto do discurso). 
Este resumo é útil de várias maneiras. Pode fornecer um título ad-hoc ou resumo para um documento que não tenha estas informações; pode servir para lembrar aos leitores que leram ou ouviram o discurso quais foram os principais temas discutidos; e compilar vários resumos com uma única ação pode mostrar padrões em grande escala que muitas vezes se perdem em grandes corpus. É a este último uso que recorremos agora ao aplicar as técnicas desta seção a um grupo maior de discursos do Estado da União. + + +# Análise dos discursos do Estado da União de 1790 a 2016 + +## Carregar o corpus + + +A primeira coisa a fazer para analisar o corpus de discursos do Estado da União é carregá-los em R. Isto envolve as mesmas funções `paste` (colar) e `readLines` (ler linhas) como antes, mas temos que gerar um loop `for` (para) que executa as funções nos 236 ficheiros de texto. Estas são combinadas com a função `c`. + + + +```{r} + +ficheiros <- sprintf("%s/sotu_text/%03d.txt", base_url, 1:236) + +texto <- c() + +for (f in ficheiros) { + +texto <- c(texto, paste(readLines(f), collapse = "\n")) + +} + +``` + +Esta técnica carrega todos os ficheiros um a um do Github. Opcionalmente, é possível baixar um arquivo zip (comprimido) com o corpus completo e carregar os ficheiros manualmente. Esta técnica é descrita na próxima seção. + + +## Forma alternativa de carregar o corpus (opcional) + +Pode fazer o download do corpus aqui: [sotu_text.zip](/assets/basic-text-processing-in-r/sotu_text.zip). Descompacte o repositório em algum lugar no seu computador e defina a variável `input_loc` (local de upload) para o caminho do diretório onde o arquivo foi descompactado. 
Por exemplo, se os ficheiros estão na área de trabalho de um computador macOS e o usuário é o stevejobs, `input_loc` deve ser: + + ```{r} + +input_loc <- "/Users/stevejobs/Desktop/sotu_text" + +``` + +Uma vez feito, pode usar o seguinte bloco de código para carregar todos os textos: + + ```{r} + +ficheiros <- dir(input_loc, full.names = TRUE) + +texto <- c() + +for (f in ficheiros) { + +texto <- c(texto, paste(readLines(f), collapse = "\n")) + +} + +``` + + +É possível usar esta mesma técnica para carregar seu próprio corpus de textos. + + +## Análise exploratória + + +Uma vez mais, com a função `tokenize_words`, podemos calcular o comprimento de cada discurso em número de palavras. + + ```{r} + +palavras <- tokenize_words(texto) + +sapply(palavras, length) + +``` + +Existe um padrão temporal na duração dos discursos? Como se compara a duração dos discursos de outros presidentes com os de Franklin D. Roosevelt, Abraham Lincoln e George Washington? + + +A melhor maneira de descobrir é criando um gráfico de dispersão. É possível construir um usando a função `qplot` (gráfico), com o ano (year) no eixo x ou horizontal e o número de palavras (length) no eixo y ou vertical. + +```{r} + +qplot(metadados$year, sapply(palavras, length)) + labs(x = "Ano", y = "Número de palavras") + +``` + +Isto cria um gráfico como este: + +![Number of words in each State of the Union Address plotted by year.](/images/basic-text-processing-in-r/sotu-number-of-words.jpg)Número de palavras em cada discurso do Estado da União por ano. + +Parece que a maioria dos discursos aumentaram de 1790 a 1850 e depois aumentaram novamente no final do século XIX. A duração diminuiu drasticamente em torno da Primeira Guerra Mundial, com alguns pontos discrepantes espalhados ao longo do século XX. + + +Existe alguma razão por trás dessas mudanças? Para explicar esta variação, podemos definir a cor dos pontos para denotar se são discursos que foram apresentados por escrito ou falados. 
O comando para fazer este gráfico envolve apenas uma pequena mudança no comando do gráfico: + + +```{r} + +qplot(metadados$year, sapply(palavras, length), color = metadados$sotu_type) + labs(x = "Ano", y = "Número de palavras", color = "Modalidade do discurso") + +``` + +Isto produz o seguinte gráfico: + +![Number of words in each State of the Union Address plotted by year, with color denoting whether it was a written or oral message.](/images/basic-text-processing-in-r/sotu-number-of-words-and-type.jpg)Número de palavras em cada discurso do Estado da União organizado por ano e com a cor denotando se se tratava de um discurso escrito ou oral. + + +Vemos que o aumento no século XIX foi quando os discursos se tornaram documentos escritos e que a queda drástica foi quando Woodrow Wilson (28º Presidente dos Estados Unidos, entre 1913 e 1921) rompeu com a tradição e deu o seu discurso sobre o Estado da União oralmente no Congresso. Os pontos discrepantes que vimos anteriormente eram discursos proferidos por escrito após a Segunda Guerra Mundial. + + + +## Análise estilométrica + + +A estilometria, o estudo linguístico do estilo, faz uso extensivo de métodos computacionais para descrever o estilo de escrita de um autor. Com o nosso corpus, é possível detectar mudanças no estilo de escrita ao longo dos séculos XIX e XX. Um estudo estilométrico mais formal, geralmente, envolve o uso de código de análise sintática ou de reduções dimensionais algorítmicas complexas, tais como a análise dos principais componentes a serem estudados ao longo do tempo e entre autores. Neste tutorial, continuaremos a nos concentrar no estudo do comprimento das frases. + +O corpus pode ser dividido em frases usando a função `tokenize_sentences`. Neste caso, o resultado é uma lista com 236 objetos, cada um representando um documento específico. + + + +```{r} + +frases <- tokenize_sentences(texto) + +``` + + + +Em seguida, queremos dividir cada frase em palavras. 
A função `tokenize_words` pode ser utilizada, mas não diretamente sobre a lista de objetos `frases`. Poderíamos fazer isso com um loop `for` de novo, mas há uma forma mais simples de o fazer. A função `sapply` oferece uma aproximação mais direta. Aqui, queremos aplicar a segmentação de palavras individualmente a cada documento e, para isso, esta função é perfeita. + + + +```{r} + +frases_palavras <- sapply(frases, tokenize_words) + +``` + +Agora, temos uma lista (com cada elemento representando um documento) de listas (com cada elemento representando as palavras de uma dada frase). O resultado que precisamos é uma lista de objetos que forneça o comprimento de cada frase num dado documento. Para isto, combinamos o loop `for` com a função `sapply`. + + + +```{r} + +comprimento_frases <- list() + +for (i in 1:nrow(metadados)) { + +comprimento_frases[[i]] <- sapply(frases_palavras[[i]], length) + +} + +``` + + +O resultado de `comprimento_frases` pode ser visualizado numa linha temporal. Primeiro, precisamos de resumir o comprimento de todas as frases de um documento a um único número. A função `median` (mediana), que encontra o 50º percentil dos dados inseridos, é uma boa opção para resumir as frases, porque não será muito afectada por possíveis erros de segmentação que podem ter criado uma frase artificialmente longa [^11]. + + + +```{r} + +mediana_comprimento_frases <- sapply(comprimento_frases, median) + +``` + + +Agora, criamos um diagrama com essa variável junto com os anos dos discursos utilizando, mais uma vez, a função `qplot`. + + + +```{r} + +qplot(metadados$year, mediana_comprimento_frases) + labs(x = "Ano", y = "Mediana do comprimento das frases") + +``` + + ![Median sentence length for each State of the Union Address.](/images/basic-text-processing-in-r/sotu-sentence-length.jpg)Duração mediana das frases por discurso do Estado da União. + +O gráfico mostra-nos uma forte tendência geral de frases mais curtas nos dois séculos do corpus. 
Lembre-se que alguns discursos no final da segunda metade do século XX eram longos e escritos, muito parecidos com os do século XIX. É particularmente interessante que estes não se destaquem em se tratando de mediana do comprimento das frases. + + +Para tornar esse padrão ainda mais explícito, é possível adicionar uma linha de tendência no gráfico com a função `geom_smooth` (geometrização suave). + + +```{r} + +qplot(metadados$year, mediana_comprimento_frases) + geom_smooth() + labs(x = "Ano", y = "Mediana do comprimento das frases") + +``` + ![Median sentence length for each State of the Union Address, with a smoothing line.](/images/basic-text-processing-in-r/sotu-sentence-length-smooth.jpg)Comprimento mediano de cada discurso do Estado da União com uma linha de tendência. + + +As linhas de tendência são um ótimo complemento aos gráficos. Elas possuem a função dupla de mostrar a tendência geral dos dados no tempo, enquanto destacam pontos atípicos ou periféricos. + + + +## Resumo do documento + + + +Como tarefa final, queremos aplicar a função de resumo simples que utilizamos na seção anterior a cada um dos documentos desse corpus mais amplo. Precisamos utilizar um loop outra vez, mas o código interno permanece quase o mesmo, com a exceção de que precisamos guardar os resultados como um elemento do vetor `description` (descrição). + + + +```{r} + +description <- c() + +``` + +```{r} + +for (i in 1:length(palavras)) { + +tabela <- table(palavras[[i]]) + +tabela <- data_frame(word = names(tabela), count = as.numeric(tabela)) + +tabela <- arrange(tabela, desc(count)) + +tabela <- inner_join(tabela, palavras_frequentes) + +tabela <- filter(tabela, frequency < 0.002) + +resultado <- c(metadados$president[i], metadados$year[i], tabela$word[1:5]) + +description <- c(description, paste(resultado, collapse = "; ")) + +} + +``` + + + +Enquanto se processa cada ficheiro como resultado da função `inner_join`, é possível ver uma linha que diz **Joining, by = “word”**. 
Como o loop pode demorar um ou mais minutos o processamento da função, esta linha serve para assegurar que o código está processando os ficheiros. Podemos ver o resultado do loop escrevendo `description` no console, mas, com a função `cat`, obtemos uma visão mais nítida dos resultados. + + ```{r} + +cat(description, sep = "\n") + +``` + + +Os resultados oferecem uma linha para cada discurso do Estado da União. Aqui, por exemplo, estão as linhas dos presidentes Bill Clinton, George W. Bush e Barack Obama: + + ``` + +>William J. Clinton; 1993; deficit; propose; incomes; invest; decade + +William J. Clinton; 1994; deficit; renew; ought; brady; cannot + +William J. Clinton; 1995; ought; covenant; deficit; bureaucracy; voted + +William J. Clinton; 1996; bipartisan; gangs; medicare; deficit; harder + +William J. Clinton; 1997; bipartisan; cannot; balanced; nato; immigrants + +William J. Clinton; 1998; bipartisan; deficit; propose; bosnia; millennium + +William J. Clinton; 1999; medicare; propose; surplus; balanced; bipartisan + +William J. Clinton; 2000; propose; laughter; medicare; bipartisan; prosperity + +George W. Bush; 2001; medicare; courage; surplus; josefina; laughter + +George W. Bush; 2002; terrorist; terrorists; allies; camps; homeland + +George W. Bush; 2003; hussein; saddam; inspectors; qaida; terrorists + +George W. Bush; 2004; terrorists; propose; medicare; seniors; killers + +George W. Bush; 2005; terrorists; iraqis; reforms; decades; generations + +George W. Bush; 2006; hopeful; offensive; retreat; terrorists; terrorist + +George W. Bush; 2007; terrorists; qaida; extremists; struggle; baghdad + +George W. 
Bush; 2008; terrorists; empower; qaida; extremists; deny + +Barack Obama; 2009; deficit; afford; cannot; lending; invest + +Barack Obama; 2010; deficit; laughter; afford; decade; decades + +Barack Obama; 2011; deficit; republicans; democrats; laughter; afghan + +Barack Obama; 2012; afford; deficit; tuition; cannot; doubling + +Barack Obama; 2013; deficit; deserve; stronger; bipartisan; medicare + +Barack Obama; 2014; cory; laughter; decades; diplomacy; invest + +Barack Obama; 2015; laughter; childcare; democrats; rebekah; republicans + +Barack Obama; 2016; laughter; voices; allies; harder; qaida + +``` + +Como já foi referido, estes resumos temáticos não são, de forma alguma, um substituto para uma leitura atenta de cada documento. Eles servem, no entanto, como um resumo geral e de alto nível de cada presidência. Vemos, por exemplo, o foco inicial no déficit durante os primeiros anos da presidência de Bill Clinton, sua mudança em direção ao bipartidarismo enquanto a Câmara e o Senado se inclinavam para os republicanos em meados dos anos 1990, e uma mudança em direção à reforma do Medicare no final de sua presidência. Os discursos de George W. Bush concentraram-se, principalmente, no terrorismo, com exceção do discurso de 2001 proferido antes dos ataques terroristas de 11 de setembro. Barack Obama voltou a preocupar-se com a economia sob a sombra da recessão de 2008. A palavra "riso" aparece frequentemente porque é adicionada às transcrições quando o riso do público faz com que o orador pare. + + + +# Próximos passos + + + +Neste pequeno tutorial exploramos algumas maneiras básicas de analisar dados textuais com a linguagem de programação R. Há várias direções que se pode tomar para se aprofundar nas novas técnicas de análise de texto. 
Aqui estão três exemplos particularmente interessantes: + + + +* conduzir uma análise completa com base em processamento de linguagem natural (NLP) num texto para extrair características tais como nomes de entidades, categorias gramaticais e relações de dependência. Estes estão disponíveis em vários pacotes R, incluindo o **cleanNLP**[^12], e para vários idiomas. + +* realizar uma modelagem por tópicos (*topic models*) para detectar discursos específicos no corpus usando pacotes como **mallet**[^13] e **topicmodels**[^14]. + +* aplicar técnicas de redução de dimensionalidade para traçar tendências estilísticas ao longo do tempo ou entre diferentes autores. Por exemplo, o pacote **tsne** [^15] realiza uma poderosa forma de redução de dimensionalidade particularmente favorável a gráficos detalhados. + + +Existem muitos tutoriais genéricos para estes três exemplos, assim como uma documentação detalhada dos pacotes[^16]. Esperamos oferecer tutoriais focados em aplicações históricas deles no futuro. + + + +# Notas + +[^1]: O nosso corpus contém 236 discursos sobre o Estado da União. Dependendo do que for contado, este número pode ser ligeiramente superior ou inferior. + +[^2]: Taryn Dewar, “R Basics with Tabular Data,” Programming Historian (05 September 2016), [/lessons/r-basics-with-tabular-data](/en/lessons/r-basics-with-tabular-data). + +[^3]: Hadley Wickham. “tidyverse: Easily Install and Load ‘Tidyverse’ Packages”. R Package, Version 1.1.1. https://cran.r-project.org/web/packages/tidyverse/index.html + +[^4]: Lincoln Mullen and Dmitriy Selivanov. “tokenizers: A Consistent Interface to Tokenize Natural Language Text Convert”. R Package, Version 0.1.4. https://cran.r-project.org/web/packages/tokenizers/index.html + +[^5]: Tenha em mente que os nomes das funções, como `library` e `install.packages`, sempre estarão em inglês. Apesar disso, colocamos uma tradução do significado para facilitar a compreensão e traduzimos os nomes das variáveis [N. de T.]. 
+ +[^6]: Tradução publicada pela Folha em português (13 de janeiro de 2016) [https://www1.folha.uol.com.br/mundo/2016/01/1729011-leia-a-integra-do-ultimo-discurso-do-estado-da-uniao-de-obama.shtml](https://www1.folha.uol.com.br/mundo/2016/01/1729011-leia-a-integra-do-ultimo-discurso-do-estado-da-uniao-de-obama.shtml) [N. de T.] + +[^7]: Foi feito o download de todos os discursos presidenciais do The American Presidency Project da University of California Santa Barbara (acesso em 11 de novembro de 2016) [https://www.presidency.ucsb.edu/sou.php](https://www.presidency.ucsb.edu/sou.php) + +[^8]: Aqui, voltamos para a versão original do discurso, em inglês, para dar prosseguimento à análise e, particularmente, para observarmos a lista de palavras mais utilizadas em inglês. Continuaremos a traduzir os nomes das variáveis e das funções para facilitar a compreensão em português [N. de T.]. + +[^9]: Aqui, optamos por nomear as colunas da tabela em inglês, como *word* (palavra) e *count* (contagem), para facilitar a interação com o conjunto de dados que será introduzido depois com a função `inner_join` [N. de T.]. + +[^10]: Peter Norvig. “Google Web Trillion Word Corpus”. (acesso em 11 de novembro de 2016) [http://norvig.com/ngrams/](https://web.archive.org/web/20260326183858/http://norvig.com/ngrams/). + +[^11]: Isto ocorre em alguns discursos escritos do Estado da União, quando uma lista com numeração é segmentada numa única frase longa. + +[^12]: Taylor Arnold. “cleanNLP: A Tidy Data Model for Natural Language Processing”. R Package, Version 0.24. https://cran.r-project.org/web/packages/cleanNLP/index.html + +[^13]: David Mimno. “mallet: A wrapper around the Java machine learning tool MALLET”. R Package, Version 1.0. https://cran.r-project.org/web/packages/mallet/index.html + +[^14]: Bettina Grün and Kurt Hornik. “topicmodels: Topic Models”. R Package, Version 0.2-4. 
https://cran.r-project.org/web/packages/topicmodels/index.html + +[^15]: Ver o artigo" t-distributed stochastic neighbor embedding" na Wikipedia (em inglês). https://en.wikipedia.org/wiki/T-distributed_stochastic_neighbor_embedding [N. de T.] + +[^16]: Ver, por exemplo, o livro dos autores Taylor Arnold and Lauren Tilton. *Humanities Data in R: Exploring Networks, Geospatial Data, Images, and Text.* Springer, 2015. diff --git a/pt/licoes/qgis-camadas.md b/pt/licoes/qgis-camadas.md index ca8ba3b05f..e9f186a006 100644 --- a/pt/licoes/qgis-camadas.md +++ b/pt/licoes/qgis-camadas.md @@ -71,7 +71,7 @@ Descarregue os seguintes ficheiros *shapefiles*: 1. [coastline.SHP.zip](/assets/qgis-layers/coastline.SHP.zip) 2. [lot_town.SHP.zip](/assets/qgis-layers/lot_town.SHP.zip) 3. [hydronetwork.SHP.zip](/assets/qgis-layers/hydronetwork.SHP.zip) -4. +4. 5. [nat_parks.SHP.zip](/assets/qgis-layers/nat_parks.SHP.zip) 6. [PEI Highways](/assets/qgis-layers/PEI_highway.zip) 7. [PEI Places](/assets/qgis-layers/PEI_placenames.zip) @@ -103,7 +103,7 @@ Selecione Propriedades do Projeto. - Perceba que a projeção mudou no canto inferior direito da janela do QGIS. Próximo a ela, verá a localização geográfica do seu cursor em metros. - Na janela **Projeto**, selecione _Salvar Projeto_ (é recomendado salvar seu projeto após cada etapa). -Agora está pronto para trabalhar no projeto de tutorial, mas pode ser que tenha algumas perguntas sobre qual SRC utilizar para o seu próprio projeto. O WGS83 pode funcionar a curto prazo, principalmente se estiver trabalhando em uma escala consideravelmente maior, mas apresentará dificuldades em trabalhar com precisão em mapas locais. Uma dica é saber quais SRC ou Projeções são utilizados para os mapas em papel da região. Caso digitalize um mapa físico de alta qualidade para utilizar como camada base, pode ser uma boa ideia utilizar a mesma projeção. Pode-se também tentar buscar na internet quais os SRC mais comuns para determinada região. 
Para aqueles trabalhando em projetos norte americanos, identificar o NAD83 correto par a sua região vai ser, geralmente, o melhor SRC. Aqui estão alguns links para outros recursos que lhe ajudarão a escolher um SRC para o seu próprio projeto: [Tutorial: Trabalhando com Projeções no QGIS](http://web.archive.org/web/20180715071501/http://www.qgistutorials.com/pt_BR/docs/working_with_projections.html) (em inglês). +Agora está pronto para trabalhar no projeto de tutorial, mas pode ser que tenha algumas perguntas sobre qual SRC utilizar para o seu próprio projeto. O WGS83 pode funcionar a curto prazo, principalmente se estiver trabalhando em uma escala consideravelmente maior, mas apresentará dificuldades em trabalhar com precisão em mapas locais. Uma dica é saber quais SRC ou Projeções são utilizados para os mapas em papel da região. Caso digitalize um mapa físico de alta qualidade para utilizar como camada base, pode ser uma boa ideia utilizar a mesma projeção. Pode-se também tentar buscar na internet quais os SRC mais comuns para determinada região. Para aqueles trabalhando em projetos norte americanos, identificar o NAD83 correto par a sua região vai ser, geralmente, o melhor SRC. Aqui estão alguns links para outros recursos que lhe ajudarão a escolher um SRC para o seu próprio projeto: [Tutorial: Trabalhando com Projeções no QGIS](https://web.archive.org/web/20180715071501/https://www.qgistutorials.com/pt_BR/docs/working_with_projections.html) (em inglês). ### Construindo um mapa base @@ -275,4 +275,4 @@ Após criar um mapa utilizando camadas vetoriais, agora nós iremos adicionar ou Aprendeu a instalar o QGIS e a adicionar camadas. Certifique-se de salvar o seu trabalho! 
-*Essa lição é parte do [Geospatial Historian](http://geospatialhistorian.wordpress.com/).* +*Essa lição é parte do [Geospatial Historian](https://geospatialhistorian.wordpress.com/).* diff --git a/pt/licoes/som-dados-sonificacao-historiadores.md b/pt/licoes/som-dados-sonificacao-historiadores.md index 5934d15324..6770e55ae2 100644 --- a/pt/licoes/som-dados-sonificacao-historiadores.md +++ b/pt/licoes/som-dados-sonificacao-historiadores.md @@ -1,488 +1,506 @@ ---- -title: Sonificação de dados (uma introdução à sonificação para historiadores) -layout: lesson -slug: som-dados-sonificacao-historiadores -date: 2016-06-07 -translation_date: 2021-03-26 -authors: -- Shawn Graham -reviewers: -- Jeff Veitch -- Tim Compeau -editors: -- Ian Milligan -translator: -- Gabriela Kucuruza -translation-editor: -- Jimmy Medeiros -translation-reviewer: -- Samuel Van Ransbeeck -- Juliana Marques da Silva -difficulty: 2 -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/429 -activity: transforming -topics: [distant-reading] -abstract: "Existem inúmeras lições que o ajudarão a visualizar o passado, mas esta lição o ajudará a ouvir o passado." -original: sonification -avatar_alt: Um violino -doi: 10.46430/phpt0020 ---- - -{% include toc.html %} - -# Introdução - -ποίησις - fabricação, criação, produção - -Eu estou muito cansado de ver o passado. Existem diversos guias que irão ajudar a _visualizar_ o passado que não podemos ver, mas muitas vezes nós esquecemos que a visualização é um ato de criatividade. Nós talvez estejamos muito ligados às nossas telas, muito focados em "ver". Ao invés disso, deixe-me ouvir algo do passado. - -Enquanto existe uma história e uma literatura profundas sobre arqueoacústica e paisagens sonoras que tentam capturar o som de um lugar _como ele era_ ([veja por exemplo a Virtual St. 
Paul's](https://www.digitalstudies.org/articles/10.16995/dscn.58) ou o trabalho de [Jeff Veitch em Ostia antiga](https://jeffdveitch.wordpress.com/)), eu tenho interesse em 'sonificar' o que eu tenho _agora_, os dados eles mesmos. Eu quero descobrir uma gramática para representar dados em som que seja apropriada para História. [Drucker](#Drucker) [notoriamente nos lembra](http://www.digitalhumanities.org/dhq/vol/5/1/000091/000091.html) que ‘dados’ não são coisas dadas, mas ao invés disso, coisas capturadas, coisas transformadas. Na sonificação de dados, eu literalmente realizo o passado no presente, e então as suposições e as transformações que faço estão em primeiro plano. A experiência auditiva resultante é uma "deformação" literal que nos faz ouvir as camadas modernas do passado de uma nova maneira. - -Eu quero ouvir os significados do passado, mas eu sei que não posso. No entanto, quando ouço um instrumento, posso imaginar a materialidade do músico tocando; posso discernir o espaço físico em seus ecos e ressonâncias. Eu posso sentir o som, eu posso me mover no ritmo. A música engaja o meu corpo inteiro, minha imaginação inteira. As suas associações com sons, música e tons que eu ouvi antes criam uma experiência temporal profunda, um sistema de relações incorporadas entre eu e o passado. Visual? Nós temos representações visuais do passado há tanto tempo, que nós quase nos esquecemos dos aspectos artístico e performativo dessas gramáticas de expressão. - -Nesse tutorial, você aprenderá a fazer um pouco de barulho a partir dos seus dados sobre o passado. O _significado_ desse barulho, bem... isso depende de você. Parte do objetivo desse tutorial é te fazer estranhar os seus dados. Traduzindo-o, transcodificando-o, [remediando-o](http://blog.taracopplestone.co.uk/making-things-photobashing-as-archaeological-remediation/) (em inglês), nós começaremos a ver elementos dos dados que a nossa familiaridade com modelos visuais nos impediu de enxergar. 
Essa deformação está de acordo com os argumentos apresentados por, por exemplo, Mark Sample sobre [quebrar coisas](http://www.samplereality.com/2012/05/02/notes-towards-a-deformed-humanities/) (em inglês), ou Bethany Nowviskie sobre a '[resistência nos materiais](http://nowviskie.org/2013/resistance-in-the-materials/)' (em inglês). Sonificação nos move através do continuum de dados para captação, ciências sociais para arte, [falha para estética](http://nooart.org/post/73353953758/temkin-glitchhumancomputerinteraction) (em inglês). Então vamos ver como isso tudo soa. - -## Objetivos - -Nesse tutorial, apresentarei três maneiras diferentes de gerar som ou música a partir de seus dados. - -Na primeira, usaremos um sistema desenvolvido por Jonathan Middleton, disponível gratuitamente para uso, chamado _Musicalgorithms_ (Algoritmos Musicais) a fim de introduzir algumas das questões e termos-chave envolvidos. Na segunda, usaremos uma pequena biblioteca do Python para 'mapear por parâmetro' os nossos dados contra o teclado de 88 teclas e introduzir um pouco de arte em nosso trabalho. Finalmente, aprenderemos como carregar nossos dados no ambiente de codificação ao vivo de código aberto para som e música, _Sonic Pi_, momento em que te deixarei para que explore os abundantes tutoriais e recursos desse projeto. - -Você verá que "sonificação" nos movimenta através do espectro partindo de simples 'visualização/auralização' para performance real.
- -### Ferramentas -+ Musicalgorithms [http://musicalgorithms.org/](http://musicalgorithms.org/) -+ MIDITime [https://github.com/cirlabs/miditime](https://github.com/cirlabs/miditime) (Eu bifurquei uma cópia no GitHub [aqui](https://github.com/shawngraham/miditime)) -+ Sonic Pi [http://sonic-pi.net/](http://sonic-pi.net/) - -### Dados de Exemplo - -+ [Dados sobre artefatos romanos](/assets/sonification/sonification-roman-data.csv) -+ [Excerto do modelo de tópicos do diário de John Adams](/assets/sonification/sonification-diary.csv) -+ [Excerto do modelo de tópicos das relações jesuíticas](/assets/sonification/sonification-jesuittopics.csv) - -# Um pouco de contexto sobre sonificação - -Sonificação é a prática de mapear aspectos dos dados para produzir sinais sonoros. Em geral, uma técnica pode ser chamada de "sonificação" se cumprir certas condições. Elas incluem reprodutibilidade (os mesmos dados podem ser transformados da mesma maneira por outros pesquisadores de forma que produzam os mesmos resultados) e o que pode ser chamado de inteligibilidade - que os elementos "objetivos" dos dados originais sejam sistematicamente refletidos no som resultante (veja [Hermann (2008)](http://www.icad.org/Proceedings/2008/Hermann2008.pdf) (em inglês) para uma taxonomia da sonificação). [Last e Usyskin (2015)](https://www.researchgate.net/publication/282504359_Listen_to_the_Sound_of_Data) (em inglês) realizaram uma série de experimentos para determinar quais tarefas de análise de dados poderiam ser performadas quando os dados eram sonificados. Os seus resultados experimentais mostraram que mesmo um grupo de ouvintes não-treinados (sem treinamento formal em música) podem fazer distinções úteis nos dados. Eles encontraram ouvintes que conseguiam distinguir tarefas comuns de exploração de dados nos dados sonificados, como classificação e agrupamento. Os seus resultados sonificados mapearam os dados fundamentais da escala musical ocidental. 
- -Last e Usyskin focaram em dados de séries temporais. Eles argumentam que dados de séries temporais são particularmente bons para sonificação, pois há paralelos naturais com sons musicais. Música é sequencial, ela tem duração e ela se desenvolve ao longo do tempo, assim como dados de séries temporais. [(Last e Usyskin 2015, p. 424)](https://www.researchgate.net/publication/282504359_Listen_to_the_Sound_of_Data). Torna-se um problema combinar os dados com as saídas sônicas apropriadas. Em muitas aplicações de sonificação, uma técnica chamada "mapeamento de parâmetros" é usada para combinar aspectos dos dados ao longo de várias dimensões da audição, como [tom](#tom), variação, brilho e início. O problema com esta abordagem é que onde não há relação temporal (ou melhor, nenhuma relação não linear) entre os pontos de dados originais, o som resultante pode ser "confuso" (2015, p. 422). - -## Escutando as lacunas -Há também o modo que preenchemos as lacunas do som com as nossas expectativas. Considere esse vídeo em que [mp3](#mp3) foi convertido para [MIDI](#midi) e de volta para mp3; a música foi 'achatada' para que todas as informações sonoras sejam tocadas por apenas um instrumento. (Gerar esse efeito é como salvar uma página da web como .txt, abri-la no Word e, então, salvá-la novamente como .html). Todos os sons (inclusive vocais) foram traduzidos para os seus valores de nota correspondentes e, em seguida, transformados de volta em mp3. - -É barulhento, entretanto percebemos o significado. Considere o vídeo abaixo: - - - -O que está acontecendo aqui? Se já conhecia essa música, provavelmente ouviu as 'palavras'. No entanto, nenhuma palavra está presente na música! Se você não conhecia essa música, deve ter soado como um absurdo inaudível (veja mais exemplos no website de [Andy Baio](http://waxy.org/2015/12/if_drake_was_born_a_piano/)). Esse efeito é, às vezes, chamado de 'alucinação auditiva' (cf. [Koebler, 2015](#Koebler)).
Esses exemplos mostram como qualquer representação de dados que podemos ouvir/ver não está lá, estritamente falando. Nós preenchemos as lacunas com as nossas próprias expectativas. - -Considere as implicações para a História. Se sonificarmos nossos dados e começarmos a ouvir padrões no som, ou pontos fora da curva, nossas expectativas culturais sobre como a música funciona (nossas memórias de fragmentos musicais semelhantes, ouvidos em contextos específicos) irão colorir nossa interpretação. Isso, eu argumentaria, é verdadeiro para todas as representações do passado, mas sonificar é apenas estranho o suficiente em relação aos nossos métodos regulares, de forma que essa autoconsciência nos ajudará a identificar ou comunicar os padrões críticos nos dados do passado. - -Iremos progredir por meio de três ferramentas diferentes para sonificação de dados, observando como as escolhas em uma ferramenta afetam o resultado e podem ser atenuadas imaginando novamente os dados por meio de outra ferramenta. No fim das contas, não há nada mais objetivo em 'sonificação' do que há em 'visualização', então quem pesquisa deve estar preparado para justificar as suas escolhas, e fazer escolhas transparentes e reprodutíveis para outros. E para que não pensemos que a sonificação e a música gerada por algoritmos são de alguma forma algo "novo", indico ao leitor interessado [Hedges, (1978)](http://www.icad.org/Proceedings/2008/Hermann2008.pdf). - -Em cada seção, irei dar uma introdução conceitual, seguida por um passo a passo usando dados arqueológicos ou históricos de amostra. - -# Musicalgorithms - -Há uma grande variedade de ferramentas para sonificar dados. Algumas, por exemplo, são pacotes amplamente usadas do [ambiente de estatística R](https://cran.r-project.org/), como ‘[playitbyR](https://cran.r-project.org/web/packages/playitbyr/index.html)’ e ‘[AudiolyzR](https://cran.r-project.org/web/packages/audiolyzR/index.html)’. 
O primeiro desses pacotes, entretanto, não tem sido mantido ou atualizado para as versões atuais do R (sua última atualização foi muitos anos atrás) e o segundo precisa de um número considerável de configurações adicionais de software para que funcione adequadamente. - -Por outro lado, o site [Musicalgorithms](http://musicalgorithms.org/) é bem fácil de usar. O site Musicalgorithms está online há mais de uma década. Embora não seja código aberto, ele é um projeto de pesquisa de longa-duração em música computacional do seu criador, Jonathan Middleton. Ele está atualmente em sua terceira grande iteração (iterações anteriores permanecem disponíveis para uso online). Começaremos com o Musicalgorithms porque ele nos permite entrar e ajustar os nossos dados para produzir um ficheiro de representação MIDI. Tenha atenção e seleccione a '[Versão 3](http://musicalgorithms.org/3.0/index.html)'. - -{% include figure.html filename="sonification-musicalgorithms-main-site-1.png" caption="O site Musicalgorithms como aparecia em 2 de agosto de 2016" %} - -> Nota da tradução: há novas versões disponíveis para uso, mas de forma a seguir o tutorial, seguimos a versão 3 do Musicalgorithms, usada em 2016, e ainda disponível no site para uso. - -O Musicalgorithms efetua uma série de transformações nos dados. Nos dados de amostra abaixo (o padrão do próprio site), há apenas uma linha de dados, mesmo que pareça várias linhas. Os dados de amostra são compostos de campos separados por vírgula que são delimitados por espaço.
- -``` -# Of Voices, Text Area Name, Text Area Data -1,morphBox, -,areaPitch1,2 7 1 8 2 8 1 8 2 8 4 5 9 0 4 5 2 3 5 3 6 0 2 8 -,dAreaMap1,2 7 1 8 2 8 1 8 2 8 4 5 9 0 4 5 2 3 5 3 6 0 2 8 -,mapArea1,20 69 11 78 20 78 11 78 20 78 40 49 88 1 40 49 20 30 49 30 59 1 20 78 -,dMapArea1,1 5 1 5 1 5 1 5 1 5 3 3 6 0 3 3 1 2 3 2 4 0 1 5 -,so_text_area1,20 69 11 78 20 78 11 78 20 78 40 49 88 1 40 49 20 30 49 30 59 1 20 78 -``` - -Esses dados representam os dados de origem e as suas transformações; compartilhar esses dados permitiria a outro pesquisador replicar ou estender a sonificação usando outras ferramentas. No entanto, quando se começa, apenas os dados básicos abaixo são necessários (uma lista de pontos de dados): - -``` -# Of Voices, Text Area Name, Text Area Data -1,morphBox, -,areaPitch1,24 72 12 84 21 81 14 81 24 81 44 51 94 01 44 51 24 31 5 43 61 04 21 81 -``` - -O campo-chave para nós é ‘areaPitch1’, que contém os dados de entrada delimitados por espaço. Os outros campos serão preenchidos à medida que avançamos pelas várias configurações de Musicalgorithms. Nos dados acima (por exemplo, 24 72 12 84 etc.), os valores são contagens brutas de inscrições de uma série de locais ao longo de uma estrada romana na Grã-Bretanha. (Vamos praticar com outros dados em breve, abaixo). - -{% include figure.html filename="sonification-musicalgorithms-pitch-mapping-2.png" caption="Depois de carregar seus dados, é possível selecionar as diferentes operações na barra de menu superior do site. Na captura de tela, o mouseover de informações está explicando o que acontece com o dimensionamento de seus dados se você selecionar a operação de divisão para dimensionar os seus dados para o intervalo de notas selecionado." %} - -Agora, conforme se percorre as várias guias da interface ‘duration input’ (entrada de duração) , ‘pitch mapping' (mapeamento de tom), ‘duration mapping’ (mapeamento de duração), ‘scale options’ (opções de escala musical) é possível realizar várias transformações. 
Em ‘pitch mapping’ (mapeamento de tom), há uma série de opções matemáticas para mapear os dados contra as 88 teclas/tons completos de um teclado de piano (em um mapeamento linear, a _média_ dos dados de alguém seria mapeado para dó médio, ou 40). Também é possível escolher o tipo de escala, se é um tom maior ou menor. Nesse ponto, uma vez que se tenha selecionado várias transformações, salve o ficheiro de texto. No menu 'play' é possível realizar o download de um ficheiro MIDI. O seu programa de áudio padrão pode tocar ficheiros MIDI (geralmente padronizando para um tom de piano). Uma instrumentação mais complicada pode ser atribuída abrindo o ficheiro MIDI em programas de mixagem de música, como GarageBand (Mac) ou [LMMS](https://lmms.io/) (Windows, Mac, Linux). (O uso do Garageband ou LMMS está fora do escopo desse tutorial. Um tutorial em vídeo sobre LMMS está disponível [aqui](https://youtu.be/4dYxV3tqTUc), enquanto há muitos tutoriais do Garageband online. Lynda.com tem [um tutorial excelente](http://www.lynda.com/GarageBand-tutorials/Importing-audio-tracks/156620/164050-4.html)). - -Se tivesse várias colunas de dados para os mesmos pontos - digamos, em nosso exemplo da Grã-Bretanha romana, também queríamos sonificar contagens de um tipo de cerâmica para essas mesmas cidades - é possível recarregar sua próxima série de dados, efetuar as transformações e mapeamentos, e gerar outro ficheiro MIDI. Como o Garageband e o LMMS permitem a sobreposição de vozes, você pode começar a criar sequências musicais complicadas. - -{% include figure.html filename="sonification-garageband-john-adams-3.png" caption="Captura de tela do Garageband, onde os ficheiros MIDI são tópicos sonorizados do Diário de John Adams. Na interface do Garageband (o LMMS é semelhante), cada ficheiro MIDI é arrastado e solto no lugar. A instrumentação para cada ficheiro MIDI (ou seja, trilha) pode ser selecionada nos menus do Garageband. 
Os rótulos de cada faixa foram alterados aqui para refletir as palavras-chave em cada tópico. A área verde à direita representa uma visualização das notas em cada faixa. Você pode ver esta interface em ação e ouvir a música [aqui](https://youtu.be/ikqRXtI3JeA) (em inglês)" %} - -Quais transformações devem ser usadas? Se tiver duas colunas de dados, terá duas vozes. Pode fazer sentido, em nossos dados hipotéticos, tocar a primeira voz bem alto, em uma tonalidade maior: as inscrições 'falam' conosco, afinal de contas. (As inscrições romanas de fato se dirigem ao leitor, o transeunte, literalmente: 'Ó tu que passas ...'). Então, se acaso as cerâmicas de interesse forem mercadorias mais despretensiosas, talvez elas possam ser mapeadas em relação à extremidade inferior da escala ou receberem notas de duração mais longas para representar sua onipresença nas classes nessa região. - -_Não há forma 'certa' de representar os seus dados como som, ao menos não por enquanto_, mas mesmo com essa amostra de exemplo, começamos a ver como sombras de significado e interpretação podem ser atribuídas aos nossos dados e à nossa experiência dos dados. - -Mas e o tempo? Dados históricos usualmente têm um ponto de inflexão, um distinto "tempo quando" algo aconteceu. Então, a quantidade de tempo entre dois pontos de dados precisa ser considerada. É nesse ponto que a nossa próxima ferramenta se torna bem útil, para quando nossos pontos de dados tiverem uma relação com outro espaço temporal. Começamos a nos mover de sonificação (pontos de dados) para música (relações entre pontos). - -### Prática -O [conjunto de dados de amostra](/assets/sonification/sonification-roman-data.csv) apresentado contém a contagem de moedas romanas na sua primeira coluna e a contagem de materiais romanos dos mesmos locais, conforme contido no banco de dados do Portable Antiquities Scheme (Esquema de Antiguidades Portáveis) do British Museum. 
A sonificação desses dados pode revelar ou acentuar aspectos da situação econômica ao longo da rua Watling, uma grande rota através da Britânia Romana. Esses pontos de dados estão organizados geograficamente do Noroeste ao Sudeste; então, na medida em que o som toca, nós estamos escutando movimento através do espaço. Cada nota representa outro passo no caminho. - -1. Abra o [dados-sonificação-romana.csv](/assets/sonification/sonification-roman-data.csv) em uma tabela. Copie a primeira coluna em um editor de texto. Delete os finais das linhas de forma que os dados fiquem todos em uma linha única. -2. Adicione a seguinte informação de coluna assim: -``` -# Of Voices, Text Area Name, Text Area Data -1,morphBox, -,areaPitch1, -``` -...para que os seus dados sigam imediatamente depois da última vírgula (como [esse exemplo](/assets/sonification/sonification-romancoin-data-music.csv)). Salve o ficheiro com um nome útil como `sonsdasmoedas1.csv`. - -3. Acesse o site do [Musicalgorithms](http://musicalgorithms.org/3.0/index.html) (versão 3) e clique no botão "load" (carregar). No pop-up, clique no botão azul "load" (carregar) e selecione o ficheiro salvo no passo 2. O site carregará os seus materiais e exibirá uma marca de seleção verde se tiver sido carregado com êxito. Caso contrário, certifique-se de que os seus valores estejam separados por espaços e que sigam imediatamente a última vírgula no bloco de código na etapa 2. Também é possível tentar carregar o [ficheiro de demonstração desse tutorial](/assets/sonification/sonification-romancoin-data-music.csv) ao invés. - -{% include figure.html filename="sonification-musicalgorithms-upload-4.png" caption="Clique em 'load' na tela principal para acessar essa caixa de diálogo. Então 'load csv'. (carregue o csv) Selecione o ficheiro; ele aparecerá na caixa. Então clique no botão 'load' (carregar)." %} - -4. Clique em 'Pitch Input'. Os valores dos seus dados serão exibidos. 
Por enquanto, **não selecione** nenhuma outra opção nesse página (consequentemente, usaremos os valores padrão do site). - -5. Clique em 'Duration Input'. **Não selecione nenhuma opção aqui por enquanto**. As opções aqui irão mapear várias transformações em relação aos dados que alterarão a duração para cada nota. Não se preocupe com as opções por enquanto: siga adiante. -6. Clique em 'Pitch Mapping'. Essa é a escolha mais crucial, pois irá transformar (isso é, escalar) os seus dados brutos para um mapeamento em relação às teclas do teclado. Deixe a configuração de `mapping` em 'division'. (As outras opções são módulo e logarítmico). A opção `Range` 1 a 88 usa todas as 88 teclas do teclado, assim, seu valor mais baixo estaria de acordo com a nota mais profunda do piano e seu valor mais alto com a nota mais alta. Em vez disso, você pode restringir sua música em torno de dó médio, então insira 25 a 60 como seu intervalo. O resultado deveria mudar para: `31,34,34,34,25,28,30,60,28,25,26,26,25,25,60,25,25,38,33,26,25,25,25` Essas não são mais suas contagens; são as notas do teclado. - -{% include figure.html filename="sonification-musicalgorithms-settings-for-pitch-mapping-5.png" caption="Clique na caixa 'range' e defina-o para 25. Os valores abaixo serão alterados automaticamente. Clique na caixa 'to' e defina-o para 60. Clique novamente na outra caixa; os valores serão atualizados." %} - -8. Clique em 'Duration Mapping'. Como Pitch Mapping, isso pega o intervalo de tempo especificado e usa várias opções matemáticas para mapear o intervalo de possibilidade contra as suas notas. Se passar o seu cursor por cima de `i` verá como os números correspondem com notas inteiras, semínimas, colcheias e assim por diante. Deixe os valores padrão por enquanto. -9. Clique em 'Scale Options'. Aqui nós podemos começar a selecionar o que pode ser chamado de aspecto 'emocional' do som. 
Nós geralmente pensamos que escalas maiores são 'alegres' enquanto escalas menores são 'tristes'; para uma discussão acessível acesse esse [post de blog](http://www.ethanhein.com/wp/2010/scales-and-emotions/) (em inglês). Por enquanto, escolha 'scale by: major' (escala maior). Deixe a 'scale' (escala) como `C`. - -Agora sonificamos uma coluna de dados! Clique no botão 'save' (salvar), então 'save csv' (salvar csv). - -{% include figure.html filename="sonification-musicalgorithms-save-6.png" caption="A caixa de diálogo salvar dados." %} -Haverá um ficheiro que se parecerá com isso: - -``` -# Of Voices, Text Area Name, Text Area Data -1,morphBox, -,areaPitch1,80 128 128 128 1 40 77 495 48 2 21 19 1 1 500 1 3 190 115 13 5 1 3 -,dAreaMap1,2 7 1 8 2 8 1 8 2 8 4 5 9 0 4 5 2 3 5 3 6 0 2 -,mapArea1,31 34 34 34 25 28 30 60 28 25 26 26 25 25 60 25 25 38 33 26 25 25 25 -,dMapArea1,1 5 1 5 1 5 1 5 1 5 3 3 6 0 3 3 1 2 3 2 4 0 1 -,so_text_area1,32 35 35 35 25 28 30 59 28 25 27 27 25 25 59 25 25 39 33 27 25 25 25 -``` - -É possível ver os dados originais no campo 'areaPitch1' e os subsequentes mapeamentos. O site permite que sejam geradas até quatro vozes por vez em um ficheiro MIDI; dependendo de como se quer adicionar instrumentação depois, pode-se querer gerar um ficheiro MIDI por vez. Vamos tocar a música - clique em 'Play'. É possível selecionar o tempo aqui, e um instrumento. É possível ouvir os seus dados no navegador, ou salvá-los como um ficheiro MIDI clicando no botão azul 'Save MIDI file'. - -Retorne ao começo e carregue as duas colunas de dados nesse modelo: -``` -# Of Voices, Text Area Name, Text Area Data -2,morphBox, -,areaPitch1, -,areaPitch2, -``` - -{% include figure.html filename="sonification-2voices-7.png" caption="Coloque 2 na caixa de vozes no topo da interface. Quando você for para qualquer uma das páginas de opção - aqui, nós estamos em 'pitch input' - dois monitores abrem para mostrar os dados das duas vozes. 
Carregue os seus dados do csv como antes, mas formate o seu csv para ter o 'areaPitch1' e o 'areaPitch2' como descrito no texto principal. Os dados para a primeira voz irão aparecer na esquerda, e a segunda voz na direita." %} - -Quando se tem dados com várias vozes, o que se destaca? Observe que, nessa abordagem, a distância entre os pontos no mundo real não é considerada em nossa sonificação. Essa distância, se fosse considerada, poderia ser crucial. A distância, é claro, não precisa ser geográfica - pode ser temporal. A próxima ferramenta que exploraremos nos permite abordar isso em nossa sonificação explicitamente. - -# Algumas palavras sobre configurar o Python - -A próxima seção desse tutorial precisa de Python. Se não usou Python ainda, será preciso passar algum tempo [se familiarizando com a linha de comando (PC) ou terminal (OS)](/en/lessons/intro-to-bash) (em inglês). Você pode achar esse rápido [guia de instalação dos módulos do python](/pt/licoes/instalacao-modulos-python-pip) útil (mas retorne para ele depois de ler o resto da seção). - -Usuários do Mac já possuirão o Python instalado na máquina deles. É possível testar isso apertando o botão COMMAND e a barra de espaço; na janela de pesquisa, digite `terminal` e clique na aplicação do terminal. No prompt de comando, por exemplo, no cursor piscando em `$` digite `python --version` e o computador responderá com a versão do python existente no seu computador. _A próxima seção desse tutorial usa a versão Python 2.7; ela não foi testada em Python 3_. 
- -Para usuários do Windows, Python não é instalado por padrão na sua máquina então [essa página](http://docs.python-guide.org/en/latest/starting/install/win/) te ajudará a iniciar, apesar das coisas serem um pouco mais complicadas do que parece de acordo com a página (nota de tradução: pode usar também a [lição de instalação do Python](/pt/licoes/introducao-instalacao-python) do _Programming Historian em português_, mas tenha em atenção que nessa lição é instalada a versão 3 do Python). Primeiro, realize o download do ficheiro `.msi` que a página recomenda (Python 2.7). Clique duas vezes no ficheiro e ele deve se instalar em um novo diretório, por exemplo, `C:\Python27\`. Então, nós temos de dizer para o Windows a localização para onde buscar pelo Python sempre que um programa em python for executado; ou seja, colocaremos a localização do diretório no seu 'path', ou a variável do ambiente que o Windows sempre apresenta quando confrontado com um novo comando. Existem algumas formas de fazer isso, mas talvez a mais fácil seja buscar no seu computador pelo programa `Powershell` (digite 'powershell' na janela de pesquisa do seu computador). Abra o Powershell e, no `>` prompt, copie essa linha inteira: - -`[Environment]::SetEnvironmentVariable("Path", "$env:Path;C:\Python27\;C:\Python27\Scripts\", "User")` - -Feche o powershell quando terminar. Você saberá que funcionou se nada acontecer quando clicar em 'enter'. Para testar se tudo está funcionando, abra o prompt de comando (aqui há [10 formas de fazer isso](http://www.howtogeek.com/235101/10-ways-to-open-the-command-prompt-in-windows-10/)) (em inglês) e digite no prompt `>`, `python --version`. Ele deve retornar `Python 2.7.10` ou algo similar. - -A última peça do quebra-cabeça que todos os usuários precisarão é um programa chamado `Pip`. Os usuários de Mac podem instalá-lo digitando no terminal: `sudo easy_install pip`.
Usuários do Windows terão um pouco mais de dificuldade (nota de tradução: pode usar também a [lição de instalação de módulos Python com pip](/pt/licoes/instalacao-modulos-python-pip) do _Programming Historian em português_, mas tenha em atenção que nessa lição é usada a versão 3 do Python). Primeiro, clique no botão direito do seu cursor e salve esse link: [https://bootstrap.pypa.io/get-pip.py](https://bootstrap.pypa.io/get-pip.py) (Se apenas clicar no link, ele irá te mostrar o código no seu navegador). Salve em algum lugar útil. Abra o prompt de comando no diretório em que salvou `get-pip.py`. Então, digite no prompt de comando, `python get-pip.py`. Convencionalmente, nos tutoriais, verá `>` ou `$` em lugares em que é preciso digitar algo no prompt de comando ou no terminal. Nunca é necessário digitar esses dois caracteres. - -Finalmente, quando você tem um código python que deseja executar, pode inseri-lo em seu editor de texto e salvá-lo com a extensão `.py` (nota de tradução: pode também seguir as indicações das lições “Configurar um ambiente de desenvolvimento integrado para Python”, do _Programming Historian em português_, nas suas versões [Windows](/pt/licoes/instalacao-windows) ou [Mac](/pt/licoes/instalacao-mac), mas tenha em atenção que nessas lições é usada a versão 3 do Python). O seu ficheiro é um ficheiro de texto, mas a **extensão** do ficheiro diz para o seu computador para usar o Python para interpretá-lo; mas lembre, digite `python` no prompt primeiro, por exemplo: `$ python meu-script-legal.py`. - -# MIDITime - -MIDITime é um pacote do python desenvolvido por [Reveal News (antes, Centro de Reportagens Investigativas)](https://www.revealnews.org/). O seu [repositório no Github está aqui](https://github.com/cirlabs/miditime). Miditime foi construído explicitamente para dados de séries temporais (ou seja, uma sequencia de observações coletadas ao longo do tempo). 
- -Enquanto a ferramenta Musicalgorithms tem uma interface mais ou menos intuitiva, quem pesquisa sacrifica a possibilidade de saber o que, exatamente, está acontecendo internamente. -Em princípio, alguém poderia examinar o código subjacente para o pacote MIDITime para saber o que está acontecendo. Mais importante ainda, na ferramenta anterior não há nenhuma habilidade de contabilizar os dados em que os pontos estão distantes uns dos outros no tempo do relógio. MIDITime nos permite considerar que os nossos dados podem ser agrupados pelo tempo. - -Vamos supor que você tenha um diário histórico no qual você encaixou um [modelo de tópicos](/en/lessons/topic-modeling-and-mallet). A saída resultante pode ter entradas de diário como linhas, e a composição percentual de cada tópico contribui para essa entrada como colunas. Nesse caso, _ouvir_ esses valores pode te ajudar a entender os padrões de pensamento no diário de uma forma que a visualização como um gráfico pode não permitir. Outliers ou padrões musicais recorrentes poderiam se destacar ao serem ouvidos de um modo que a gramática dos gráficos obscurece. - -### Instalando o MIDITime -Instalar MIDItime é simples com o [pip](/pt/licoes/instalacao-modulos-python-pip): - -`$ pip install miditime` ou `$ sudo pip install miditime` para uma máquina Mac ou Linux ; -`> pip install miditime` em uma máquina Windows. (Usuários Windows, se as instruções acima não funcionaram muito bem, talvez queira tentar [esse programa de ajuda](https://pydatalog.readthedocs.io/en/latest/installation/#using-pip) para fazer o Pip funcionar adequadamente na sua máquina ou então seguir as instruções da [lição sobre pip](/pt/licoes/instalacao-modulos-python-pip) do _Programming Historian em português_). - -### Prática -Vamos olhar para o exemplo de script providenciado. 
Abra o seu editor de texto, e copie e cole o script de exemplo em: - -```python -#!/usr/bin/python - -from miditime.miditime import MIDITime - -# Instancie a classe com uma frequência (120bpm é o padrão) e o destino do ficheiro resultante. -mymidi = MIDITime(120, 'meuficheiro.mid') - -# Crie uma lista de notas. Cada nota é uma lista: [tempo, tom, ataque, duração] -midinotes = [ - [0, 60, 200, 3], #Na batida 0 (o começo), C Médio com ataque 200, para 3 batidas - [10, 61, 200, 4] #Em 10 batidas (5 segundos a partir do começo, a 120bpm), C#5 com ataque 200, para 4 batidas -] - -# Adicione uma faixa com essas notas -mymidi.add_track(midinotes) - -# Resultado do ficheiro .mid -mymidi.save_midi() -``` - -Salve o script como `musica1.py`. No seu terminal ou prompt de comando, execute o script: - -`$ python musica1.py` - -O novo ficheiro, `meuficheiro.mid` será registrado no seu diretório. Para ouvir esse ficheiro, é possível abri-lo com Quicktime ou Windows Media Player. (É possível adicionar instrumentação abrindo o ficheiro no Garageband ou [LMMS](https://lmms.io/)). - -`musica1.py` importa miditime (lembre, é preciso realizar o `pip install miditime` antes de executar o script). Então, ele cria um ficheiro resultante de destinação e configura o tempo. Todas as notas são listadas individualmente, onde o primeiro número é o tempo em que a nota deve ser tocada, o tom da nota (ou seja, a nota de fato!), o quão forte ou ritmicamente a nota é atingida (o ataque), e a duração da nota. As notas musicais são então registradas na faixa e a faixa é registrada no `meuficheiro.mid`. - -Agora, execute o script e adicione mais notas. As notas para a 'A barata diz que tem' são: - -``` -C7, F, Gm, Am, Bb, C, F, Dm, Gm, C, F -A ... Barata diz que tem sete saias de filó -``` - -Você consegue fazer o seu computador tocar essa música? (Esse [material](http://www.electronics.dit.ie/staff/tscarff/Music_technology/midi/midi_note_numbers_for_octaves.html) (em inglês) irá ajudar).
- -**A propósito**, há uma especificação de ficheiro de texto para descrever música chamado [Notação ABC](https://pt.wikipedia.org/wiki/ABC_(nota%C3%A7%C3%A3o_musical)). Por enquanto, está além de nossa compreensão, mas alguém poderia escrever um script de sonificação em, por exemplo, uma planilha, mapeando valores para nomes de notas na especificação ABC (se você já usou um IF - THEN no Excel para converter notas percentuais em notas alfabéticas, terá uma noção de como isso pode ser feito) e então usando um site como [esse](http://trillian.mit.edu/~jc/music/abc/ABCcontrib.html) (em inglês) para converter a notação ABC em um ficheiro .mid. - -### Inserindo os seus próprios dados -[Esse ficheiro](/assets/sonification/sonification-diary.csv) é uma seleção do modelo de tópicos dos Diários de John Adams do [The Macroscope](http://themacroscope.org) (Explorando Grandes Dados Históricos: O Macroscópico do Historiador). Apenas os sinais mais fortes foram preservados através do arredondamento dos valores nas colunas para duas casas decimais (lembrando que 0.25, por exemplo, indica que aquele tópico está contribuindo para um quarto da composição daquela entrada do diário). Para obter esses dados em seu script de Python, eles devem ser formatados de uma maneira específica. A parte complicada é acertar o campo de data. - -_Para os propósitos desse tutorial, nós iremos deixar os nomes das variáveis sem alterações em relação ao script de amostra. 
O script de amostra foi desenvolvido com dados de um terremoto em mente; então onde diz 'magnitude' podemos pensar como '% composição do tópico.'_ - -``` -meus_dados = [ - {'data_evento': , 'magnitude': 3.4}, - {'data_evento': , 'magnitude': 3.2}, - {'data_evento': , 'magnitude': 3.6}, - {'data_evento': , 'magnitude': 3.0}, - {'data_evento': , 'magnitude': 5.6}, - {'data_evento': , 'magnitude': 4.0} -] -``` - -Alguém poderia abordar o problema de obter os nossos dados no formato usando expressões regulares; pode ser mais fácil abrir o modelo de tópicos em uma tabela. Copie os tópicos de dados em uma nova planilha, e deixe as colunas na esquerda e na direita dos dados. No exemplo abaixo, eu coloquei na coluna D e, então, preenchi o resto dos dados ao redor dela, assim: - -| | A | B | C | D | E | -|---|---|---|---|---|---| -|1 | {'data_evento': datetime |(1753,6,8) |, 'magnitude': |0.0024499630 |}, | -|2 | | | | | | -|3 | | | | | | - -Então copie e cole os elementos que não mudaram para preencher a coluna inteira. O elemento de data tem de ser (ano, mês, dia). Uma vez que preencheu a tabela, copie e cole no seu editor de texto de forma que se torne parte do arranjo `meus_dados`, como: - -Nota da tradução: note que a ordem do _datetime_ segue o padrão em inglês estadunidense. -``` -meus_dados = [ -{'data_evento': datetime(1753,6,8), 'magnitude':0.0024499630}, -{'data_evento': datetime(1753,6,9), 'magnitude':0.0035766320}, -{'data_evento': datetime(1753,6,10), 'magnitude':0.0022171550}, -{'data_evento': datetime(1753,6,11), 'magnitude':0.0033220150}, -{'data_evento': datetime(1753,6,12), 'magnitude':0.0046445900}, -{'data_evento': datetime(1753,6,13), 'magnitude':0.0035766320}, -{'data_evento': datetime(1753,6,14), 'magnitude':0.0042241550} -] -``` - -Note que a última linha não tem uma vírgula no seu fim. 
- -O seu script final será similar a essa, usando o exemplo da página do Miditime (as seções de código abaixo foram interrompidas pelos comentários, mas devem ser coladas no seu editor de texto como um ficheiro único): - -```python -from miditime.miditime import MIDITime -from datetime import datetime -import random - -meumidi = MIDITime(108, 'johnadams1.mid', 3, 4, 1) -``` - -Os valores após MIDITime, `MIDITime(108, 'johnadams1.mid', 3, 4, 1)` configuram -+ as batidas por minuto (108), -+ o ficheiro resultante ('johnadams1.mid'), -+ o número de segundos para representar o ano na música (3 segundos no calendário anual, então todas as notas para as entradas desse diário de 1753 serão escaladas contra 3 segundos; há 50 anos nos dados, então a música final terá duração de 50 x 3, ou um pouco mais de dois minutos), -+ a oitava base para a música (C médio é convencionalmente representado como C5, então aqui 4 representa uma oitava abaixo do C médio), -+ o nº de oitavas em que os tons são mapeados. 
- -Agora passamos os seus dados para o script inserindo-o no arranjo `meus_dados` (isso será colado em seguida): - -```python -meus_dados = [ -{'data_evento': datetime(1753,6,8), 'magnitude':0.0024499630}, -{'data_evento': datetime(1753,6,9), 'magnitude':0.0035766320}, -``` - -...tenha os seus dados aqui, lembrando-se de terminar a linha final data_evento **sem** uma vírgula, e finalizando os dados com um `]` na sua própria linha, por exemplo - -```python -{'data_evento': datetime(1753,6,14), 'magnitude':0.0042241550} -] -``` - -e então copie: - -```python -meus_dados_epoca = [{'dias_desde_epoca': meumidi.days_since_epoch(d['data_evento']), 'magnitude': d['magnitude']} for d in meus_dados] - -meus_dados_tempo = [{'beat': meumidi.beat(d['dias_desde_epoca']), 'magnitude': d['magnitude']} for d in meus_dados_epoca] - -tempo_inicio = meus_dados_tempo[0]['beat'] -``` - -Esta parte calcula o tempo entre as diferentes entradas do diário; diários que estão próximos no tempo terão, portanto, suas notas soando mais próximas. Finalmente, nós definimos como os dados serão mapeados em relação ao tom. Lembre-se que os nossos dados são porcentagens variando de 0.01 (ou seja, 1%) a 0.99 (99%), em `escala_pct` entre 0 e 1. Se não estiver lidando com porcentagens, seria usado o menor valor e o maior valor (se, por exemplo, os seus dados fossem contagens de algum elemento de interesse, como nos dados arqueológicos usados anteriormente). Então, nós colamos: - -```python -def sintonia_mag_para_tom(magnitude): - escala_pct = meumidi.linear_scale_pct(0, 1, magnitude) - # Pick a range of notes. This allows you to play in a key. 
- c_major = ['C', 'C#', 'D', 'D#', 'E', 'E#', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B', 'B#'] - - #Encontre as notas que correspondem com os pontos dos seus dados - nota = meumidi.scale_to_note(escala_pct, c_major) - - #Traduza essa nota em um tom MIDI - midi_tom = meumidi.note_to_midi_pitch(nota) - - return midi_tom - -lista_notas = [] - -for d in meus_dados_tempo: - lista_notas.append([ - d['beat'] - tempo_inicio, - sintonia_mag_para_tom(d['magnitude']), - random.randint(0,200), # ataque - random.randint(1,4) # duration, in beats - ]) -``` - -e então cole nessa parte final do código para escrever os seus valores de som no ficheiro: - -``` -# Adicione uma faixa com essas notas -meumidi.add_track(lista_notas) - -# Ficheiro .mid resultante -meumidi.save_midi() -``` - -Salve esse ficheiro com um novo nome e a extensão de ficheiro `.py`. - -Para cada coluna de dados nos seus dados originais, **tenha um script único e lembre-se de mudar o nome do ficheiro de saída**, pois, caso contrário, você irá sobrescrever seus dados. Então, você pode carregar os ficheiros individuais midi no Garageband ou LMMS para instrumentação. Aqui está a íntegra do [Diário de John Adams](https://www.youtube.com/watch?v=ikqRXtI3JeA). - -# Sonic Pi - -Harmonizar ficheiros MIDI únicos (no Garageband ou em algum outro programa de composição musical) nos leva de sonificação para composição e arte sonora. Nessa seção final, não será oferecido um tutorial completo sobre como usar o [Sonic Pi](http://sonic-pi.net), mas um direcionamento para um ambiente que permite a performance da codificação dos seus dados ao vivo (veja [esse vídeo](https://www.youtube.com/watch?v=oW-3HVOeUQA) para uma performance ao vivo real de codificação). Os tutoriais do próprio Sonic Pi's mostrarão o potencial do uso do computador como um instrumento musical (em que você digita código em Ruby no editor interno enquanto o interpretador toca o que está sendo codificado). - -Por que alguém iria querer fazer isso? 
Como progressivamente ficou evidente no tutorial, quando os seus dados são sonificados, escolhas passam a ser feitas sobre como mapear os dados em som, e essas escolhas refletem implícita ou explicitamente decisões sobre quais dados importam. Existe um _continuum_ de 'objetividade', se quiser. Em uma extremidade, uma sonificação que apoia uma discussão sobre o passado; do outro, uma apresentação sobre o passado tão fascinante e pessoal quanto qualquer palestra pública bem-feita. A sonificação tira nossos dados das páginas e os leva aos ouvidos de nossos ouvintes: é uma espécie de história pública. Apresentando nossos dados ... imagine só! - -Aqui, eu ofereço simplesmente um trecho de código que possibilitará a importação dos seus dados, que aqui são simplesmente uma lista de valores salvos como csv. Estou em dívida com a bibliotecária da George Washington University, Laura Wrubel, que postou em [gist.github.com](https://gist.github.com/lwrubel) os experimentos dela de sonificação das transações de circulação de sua biblioteca. - -Nesse [ficheiro de amostra](/assets/sonification/sonification-jesuittopics.csv) (um modelo de tópicos gerado do [Jesuit Relations](http://puffin.creighton.edu/jesuit/relations/), (Relações Jesuítas)), há dois tópicos. A primeira linha contem os cabeçalhos: topic1 (em PT-BR, tópico1), topic2 (em PT-BR, tópico2). - -### Prática - -Siga os tutoriais iniciais que o Sonic Pi oferece até se sentir confortável com a interface e algumas das suas possibilidades. (Esses tutoriais também estão agrupados [aqui](https://gist.github.com/jwinder/e59be201082cca694df9); também é possível escutar uma entrevista com Sam Aaron, o criador do Sonic Pi, [aqui](https://web.archive.org/web/20250710140900/https://devchat.cachefly.net/rubyrogues/RR215SonicPi.mp3?rss=true)). 
Então, em uma nova janela de edição, copie o seguinte (novamente, o trecho de código a seguir eventualmente será agrupado em um script único na sua janela do Sonic Pi): - -``` -require 'csv' -dados = CSV.parse(File.read("/path/to/your/directory/dados.csv"), headers: true, header_converters: :symbol) -use_bpm 100 -``` - -Lembre, `path/to/your/directory/` é a localização real dos seus dados na sua máquina. Tenha certeza de que eles estão nomeados como `dados.csv` ou altere a linha acima de forma que o seu ficheiro seja carregado! - -Agora, vamos carregar esses dados na nossa música: - -``` -#esse pedaço de código será executado apenas uma vez, a menos que você tire o comentário da linha com -#'live_loop', e também retirar o comentário do 'end' final na parte inferior -# desse blóco de código -#'retirar o comentário' signfica remover o sinal #. - -# live_loop :jesuit do -dados.each do |line| - topic1 = line[:topic1].to_f - topic2 = line[:topic2].to_f - - use_synth :piano - play topic1*100, attack: rand(0.5), decay: rand(1), amp: rand(0.25) - use_synth :piano - play topic2*100, attack: rand(0.5), decay: rand(1), amp: rand(0.25) - sleep (0.5) - - end -# end -``` - -As primeiras linhas carregam as colunas de dados; então dizemos qual amostra de som que desejamos usar (piano) e, em seguida, dizemos ao Sonic Pi para tocar o tópico 1 de acordo com os seguintes critérios (um valor aleatório menor que 0,5 para o ataque; um decaimento usando um valor aleatório menor que 1; e uma [amplitude](#amplitude) com um valor aleatório menor que 0.25). Vê o x 100 na linha? Isso pega os valores dos nossos dados (que são um decimal, lembre) e torna-os em um número inteiro. Nessa parte do código, (do modo que eu escrevi), aquele número equivale diretamente a nota. Se 88 é a menor nota e 1 é a maior, é possível ver que essa abordagem é um pouco problemática: nós não fizemos nenhum mapeamento de tom aqui! 
Nesse caso, é possível usar o Musicalgorithms para fazer o seu mapeamento de tom, e então inserir esses valores no Sonic Pi. Alternativamente, uma vez que esse código é praticamente em Ruby, é possível buscar como normalizar os dados e então realizar um mapeamento linear dos valores entre 1 - 88. Um bom lugar para começar seria estudar [essa tabela do Steve Lloyd](https://github.com/stevelloyd/Learn-sonification-with-Sonic-Pi) sobre sonificação de dados de clima com Sonic Pi. Finalmente, outra coisa a se notar é que o valor 'rand' (random, aleatório) permite que se adiciona um pouco de 'humanidade' na música em termos de dinâmicas. Então nós faremos a mesma coisa novamente para o topic2 (tópico2). - -É possível adicionar batidas, loops, amostras, e toda a parafernália que o Sonic Pi permite. Onde você coloca os seus pedaços de código afeta a reprodução, se os loops forem colocados antes dos dados acima, ele será reproduzido primeiro. Por exemplo, se o trecho a seguir for inserido depois da linha `use_bpm 100`, - -``` -#pedaço de intro -sleep 2 -sample :ambi_choir, attack: 2, sustain: 4, rate: 0.25, release: 1 -sleep 6 -``` - -Haverá um pouco de uma introdução na sua obra. Há uma pausa de 2 segundos, a amostra 'ambi_choir' é reproduzida, então há uma pausa de mais 6 segundos antes dos seus dados serem tocados. Se quiser adicionar um pouco de um som de bateria sinistro ao longo da sua obra, insira esse trecho a seguir (e antes de seus próprios dados): - -``` -#trecho que continua tocando ao longo da música -live_loop :boom do - with_fx :reverb, room: 0.5 do - sample :bd_boom, rate: 1, amp: 1 - end - sleep 2 -end -``` - -O código é bem simples: realize um loop da amostra 'bd_boom' com o efeito de som de ressonância, em um ritmo particular. Pause por 2 segundos entre os loops. - -A propósito, 'codificação ao vivo'? 
O que torna esse ambiente um espaço de 'codificação ao vivo' é a possibilidade de se fazer alterações no código _enquanto o Sonic Pi o transforma em música_. Não gosta do que está ouvindo? Altere o código na hora! - -Para mais sobre o Sonic Pi, [esse site de workshop](https://web.archive.org/web/20150907155822/https://www.miskatonic.org/music/access2015/) (em inglês) é um bom lugar para começar. Veja também o [relatório de Laura Wrubel sobre participar desse worksop, e o trabalho dela e de seus colegas na área](http://library.gwu.edu/scholarly-technology-group/posts/sound-library-work) (em inglês). - -# Nihil Novi Sub Sole - -Mais uma vez, para que não pensemos que estamos na vanguarda através da nossa geração algorítmica de música, um lembrete foi publicado em 1978 sobre 'jogos de música de dados' no século XVIII, em que o lançamento de dados determinava a recombinação de trechos pré-escritos de música. [Alguns desses jogos foram explorados e recodificados para o Sonic-Pi por Robin Newman](https://rbnrpi.wordpress.com/project-list/mozart-dice-generated-waltz-revisited-with-sonic-pi/). Newman também usa uma ferramenta que poderia ser descrita como um Markdown+Pandoc da notação musical, [Lilypond](http://www.lilypond.org/) para pontuar essas composições. Os antecedentes para tudo que pode ser encontrado no _The Programming Historian_ são mais profundos do que se pode suspeitar! - -# Conclusão - -Sonificar os nossos dados nos faz confrontar os modos como os nossos dados são, muitas vezes, não sobre o passado, mas sobre o que construímos dele. Isso ocorre em parte em virtude de sua novidade, e da arte e do artifício necessários para mapear os dados para o som. Mas isso também acontece pelo contraste com as nossas noções pré-concebidas sobre visualização de dados. 
Pode ser que os sons gerados por alguém nunca cheguem ao nível da 'música'; mas se ajudar a transformar como nós encontramos o passado, e como outros engajam com o passado, então o esforço terá sido frutífero. Como Trevor Owens pode ter colocado, 'Sonificação é sobre [descoberta, não justificação](http://www.trevorowens.org/2012/11/discovery-and-justification-are-different-notes-on-sciencing-the-humanities/)'. - -## Termos - -+ **MIDI**, interface digital de instrumento musical. É uma descrição do valor e do tempo de uma nota, não de sua dinâmica ou de como alguém pode tocá-la (esta é uma distinção importante). Ele permite que computadores e instrumentos conversem entre si; pode-se aplicar instrumentação diferente a um ficheiro MIDI da mesma forma que se mudaria a fonte em um pedaço de texto (ou executar um ficheiro Markdown por meio do Pandoc). -+ **MP3**, formato de compressão que remove dados como parte de sua rotina de compactação. -+ **Tom**, a nota em si (C médio, etc) -+ **Ataque**, como a nota é tocada ou atingida -+ **Duração**, quanto tempo a nota dura (notas inteiras, semínimas, colcheias etc) -+ **Mapeamento do Tom e Mapeamento da Duração**, dimensionamento de valores de dados em relação a um intervalo de notas ou a duração da nota -+ **Amplitude**, em resumo, o volume da nota - -# Referências -Baio, Andy. 2015. 'If Drake Was Born A Piano'. Waxy. [http://waxy.org/2015/12/if_drake_was_born_a_piano/](https://waxy.org/2015/12/if_drake_was_born_a_piano/) - -Drucker, Johanna. 2011. Humanities Approaches to Graphical Display. DHQ 5.1 [http://www.digitalhumanities.org/dhq/vol/5/1/000091/000091.html](http://www.digitalhumanities.org/dhq/vol/5/1/000091/000091.html) - -Hedges, Stephen A. 1978. “Dice Music in the Eighteenth Century”. Music & Letters 59 (2). Oxford University Press: 180–87. [http://www.jstor.org/stable/734136](http://www.jstor.org/stable/734136). - -Hermann, T. 2008. "Taxonomy and definitions for sonification and auditory display". In P. 
Susini and O. Warusfel (eds.) Proceedings of the 14th international conference on auditory display (ICAD 2008). IRCAM, Paris. [http://www.icad.org/Proceedings/2008/Hermann2008.pdf](http://www.icad.org/Proceedings/2008/Hermann2008.pdf) - -Koebler, Jason. 2015. "The Strange Acoustic Phenomenon Behind These Wacked-Out Versions of Pop Songs" Motherboard, Dec 18. [http://motherboard.vice.com/read/the-strange-acoustic-phenomenon-behind-these-wacked-out-versions-of-pop-songs](http://motherboard.vice.com/read/the-strange-acoustic-phenomenon-behind-these-wacked-out-versions-of-pop-songs) - -Last and Usyskin, 2015. "Listen to the Sound of Data". In Aaron K. Baughman et al. (eds.) Multimedia Data Mining and Analytics. Springer: Heidelberg. Pp. 419-446 [https://www.researchgate.net/publication/282504359_Listen_to_the_Sound_of_Data](https://www.researchgate.net/publication/282504359_Listen_to_the_Sound_of_Data) - +--- +title: Sonificação de dados (uma introdução à sonificação para historiadores) +layout: lesson +slug: som-dados-sonificacao-historiadores +date: 2016-06-07 +translation_date: 2021-03-26 +authors: +- Shawn Graham +reviewers: +- Jeff Veitch +- Tim Compeau +editors: +- Ian Milligan +translator: +- Gabriela Kucuruza +translation-editor: +- Jimmy Medeiros +translation-reviewer: +- Samuel Van Ransbeeck +- Juliana Marques da Silva +difficulty: 2 +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/429 +activity: transforming +topics: [distant-reading] +abstract: "Existem inúmeras lições que o ajudarão a visualizar o passado, mas esta lição o ajudará a ouvir o passado." +original: sonification +avatar_alt: Um violino +doi: 10.46430/phpt0020 +--- + +{% include toc.html %} + +# Introdução + +ποίησις - fabricação, criação, produção + +Eu estou muito cansado de ver o passado. Existem diversos guias que irão ajudar a _visualizar_ o passado que não podemos ver, mas muitas vezes nós esquecemos que a visualização é um ato de criatividade. 
Nós talvez estejamos muito ligados às nossas telas, muito focados em "ver". Ao invés disso, deixe-me ouvir algo do passado. + +Enquanto existe uma história e uma literatura profundas sobre arqueoacústica e paisagens sonoras que tentam capturar o som de um lugar _como ele era_ ([veja por exemplo a Virtual St. Paul's](https://www.digitalstudies.org/articles/10.16995/dscn.58) ou o trabalho de [Jeff Veitch em Ostia antiga](https://jeffdveitch.wordpress.com/)), eu tenho interesse em 'sonificar' o que eu tenho _agora_, os dados eles mesmos. Eu quero descobrir uma gramática para representar dados em som que seja apropriada para História. Drucker [notoriamente nos lembra](https://www.digitalhumanities.org/dhq/vol/5/1/000091/000091.html) que ‘dados’ não são coisas dadas, mas ao invés disso, coisas capturadas, coisas transformadas. Na sonificação de dados, eu literalmente realizo o passado no presente, e então as suposições e as transformações que faço estão em primeiro plano[^1]. A experiência auditiva resultante é uma "deformação" literal que nos faz ouvir as camadas modernas do passado de uma nova maneira. + +Eu quero ouvir os significados do passado, mas eu sei que não posso. No entanto, quando ouço um instrumento, posso imaginar a materialidade do músico tocando; posso discernir o espaço físico em seus ecos e ressonâncias. Eu posso sentir o som, eu posso me mover no ritmo. A música engaja o meu corpo inteiro, minha imaginação inteira. As suas associações com sons, música e tons que eu ouvi antes criam uma experiência temporal profunda, um sistema de relações incorporadas entre eu e o passado. Visual? Nós temos representações visuais do passado há tanto tempo, que nós quase nos esquecemos dos aspectos artístico e performativo dessas gramáticas de expressão. + +Nesse tutorial, você aprenderá a fazer um pouco de barulho a partir dos seus dados sobre o passado. O _significado_ desse barulho, bem... isso depende de você. 
Parte do objetivo desse tutorial é te fazer estranhar os seus dados. Traduzindo-o, transcodificando-o, [remediando-o](https://blog.taracopplestone.co.uk/making-things-photobashing-as-archaeological-remediation/) (em inglês), nós começaremos a ver elementos dos dados que a nossa familiaridade com modelos visuais nos impediu de enxergar. Essa deformação está de acordo com os argumentos apresentados por, por exemplo, Mark Sample sobre [quebrar coisas](https://www.samplereality.com/2012/05/02/notes-towards-a-deformed-humanities/) (em inglês), ou Bethany Nowviskie sobre a '[resistência nos materiais](https://nowviskie.org/2013/resistance-in-the-materials/)' (em inglês). Sonificação nos move através do continuum de dados para captação, ciências sociais para arte, [falha para estética](https://nooart.org/post/73353953758/temkin-glitchhumancomputerinteraction) (em inglês). Então vamos ver como isso tudo soa. + +## Objetivos + +Nesse tutorial, apresentarei três maneiras diferentes de gerar som ou música a partir de seus dados. + +Na primeira, usaremos um sistema desenvolvido por Jonathan Middleton, disponível gratuitamente para uso, chamado _Musicalgorithms_ (Algoritmos Musicais) a fim de introduzir algumas das questões e termos-chaves envolvidos. Na segunda, usaremos uma pequena biblioteca do Python para 'mapear por parâmetro' os nossos dados contra o teclado de 88 teclas e introduzir um pouco de arte em nosso trabalho. Finalmente, aprenderemos como carregar nossos dados no ambiente de codificação ao vivo de código aberto para som e música, _Sonic Pi_, momento em que te deixarei para que explore os abundantes tutoriais e recursos desse projeto. + +Você verá que "sonificação" nos movimenta através do espectro partindo de simples 'visualização/auralização' para performance real. 
+ +### Ferramentas ++ Musicalgorithms [https://musicalgorithms.org/](https://musicalgorithms.org/) ++ MIDITime [https://github.com/cirlabs/miditime](https://github.com/cirlabs/miditime) (Eu bifurquei uma cópia no GitHub [aqui](https://github.com/shawngraham/miditime)) ++ Sonic Pi [https://sonic-pi.net/](https://sonic-pi.net/) + +### Dados de Exemplo + ++ [Dados sobre artefatos romanos](/assets/sonification/sonification-roman-data.csv) ++ [Excerto do modelo de tópicos do diário de John Adams](/assets/sonification/sonification-diary.csv) ++ [Excerto do modelo de tópicos das relações jesuíticas](/assets/sonification/sonification-jesuittopics.csv) + +# Um pouco de contexto sobre sonificação + +Sonificação é a prática de mapear aspectos dos dados para produzir sinais sonoros. Em geral, uma técnica pode ser chamada de "sonificação" se cumprir certas condições. Elas incluem reprodutibilidade (os mesmos dados podem ser transformados da mesma maneira por outros pesquisadores de forma que produzam os mesmos resultados) e o que pode ser chamado de inteligibilidade - que os elementos "objetivos" dos dados originais sejam sistematicamente refletidos no som resultante (veja [Hermann (2008)](https://www.icad.org/Proceedings/2008/Hermann2008.pdf) (em inglês) para uma taxonomia da sonificação)[^2]. [Last e Usyskin (2015)](https://www.researchgate.net/publication/282504359_Listen_to_the_Sound_of_Data) (em inglês) realizaram uma série de experimentos para determinar quais tarefas de análise de dados poderiam ser performadas quando os dados eram sonificados. Os seus resultados experimentais mostraram que mesmo um grupo de ouvintes não-treinados (sem treinamento formal em música) podem fazer distinções úteis nos dados. Eles encontraram ouvintes que conseguiam distinguir tarefas comuns de exploração de dados nos dados sonificados, como classificação e agrupamento[^3]. Os seus resultados sonificados mapearam os dados fundamentais da escala musical ocidental. 
+ +Last e Usyskin focaram em dados de séries temporais. Eles argumentam que dados de séries temporais são particularmente bons para sonificação, pois há paralelos naturais com sons musicais. Música é sequencial, ela tem duração e ela se desenvolve ao longo do tempo, assim como dados de séries temporais. [(Last e Usyskin 2015, p. 424)](https://www.researchgate.net/publication/282504359_Listen_to_the_Sound_of_Data). Torna-se um problema combinar os dados com as saídas sônicas apropriadas. Em muitas aplicações de sonificação, uma técnica chamada "mapeamento de parâmetros" é usada para combinar aspectos dos dados ao longo de várias dimensões da audição, como [tom](#termos), variação, brilho e início. O problema com esta abordagem é que onde não há relação temporal (ou melhor, nenhuma relação não linear) entre os pontos de dados originais, o som resultante pode ser "confuso" (2015, p. 422). + +## Escutando as lacunas +Há também o modo que preenchemos as lacunas do som com as nossas expectativas. Considere esse vídeo em que [mp3](#termos) foi convertido para [MIDI](#termos) e de volta para mp3; a música foi 'achatada' para que todas as informações sonoras sejam tocadas por apenas um instrumento. (Gerar esse efeito é como salvar uma página da web como .txt, abri-la no Word e, então, salvá-la novamente como .html). Todos os sons (inclusive vocais) foram traduzidos para os seus valores de nota correspondentes e, em seguida, transformados de volta em mp3. + +É barulhento, entretanto percebemos o significado. Considere o vídeo abaixo: + + + +O que está acontecendo aqui? Se já conhecia essa música, provavelmente ouviu as 'palavras'. No entanto, nenhuma palavra está presente na música! Se você não conhecia essa música, deve ter soado como um absurdo inaudível (veja mais exemplos no website de [Andy Baio](https://waxy.org/2015/12/if_drake_was_born_a_piano/)[^4]). Esse efeito é, às vezes, chamado de 'alucinação auditiva' (cf. Koebler, 2015[^5]). 
Esses exemplos mostram como qualquer representação de dados que podemos ouvir/ver não está lá, estritamente falando. Nós preenchemos as lacunas com as nossas próprias expectativas. + +Considere as implicações para a História. Se sonificarmos nossos dados e começarmos a ouvir padrões no som, ou pontos fora da curva, nossas expectativas culturais sobre como a música funciona (nossas memórias de fragmentos musicais semelhantes, ouvidos em contextos específicos) irão colorir nossa interpretação. Isso, eu argumentaria, é verdadeiro para todas as representações do passado, mas sonificar é apenas estranho o suficiente em relação aos nossos métodos regulares, de forma que essa autoconsciência nos ajudará a identificar ou comunicar os padrões críticos nos dados do passado. + +Iremos progredir por meio de três ferramentas diferentes para sonificação de dados, observando como as escolhas em uma ferramenta afetam o resultado e podem ser atenuadas imaginando novamente os dados por meio de outra ferramenta. No fim das contas, não há nada mais objetivo em 'sonificação' do que há em 'visualização', então quem pesquisa deve estar preparado para justificar as suas escolhas, e fazer escolhas transparentes e reprodutíveis para outros. E para que não pensemos que a sonificação e a música gerada por algoritmos são de alguma forma algo "novo", indico ao leitor interessado Hedges (1978)[^6]. + +Em cada seção, irei dar uma introdução conceitual, seguida por um passo a passo usando dados arqueológicos ou históricos de amostra. + +# Musicalgorithms + +Há uma grande variedade de ferramentas para sonificar dados. Algumas, por exemplo, são pacotes amplamente usados do [ambiente de estatística R](https://cran.r-project.org/), como ‘[playitbyR](https://cran.r-project.org/web/packages/playitbyr/index.html)’ e ‘[AudiolyzR](https://cran.r-project.org/web/packages/audiolyzR/index.html)’. 
O primeiro desses pacotes, entretanto, não tem sido mantido ou atualizado para as versões atuais do R (sua última atualização foi muitos anos atrás) e o segundo precisa de um número considerável de configurações adicionais de software para que funcione adequadamente. + +Por outro lado, o site [Musicalgorithms](https://musicalgorithms.org/) é bem fácil de usar. O site Musicalgorithms está online há mais de uma década. Embora não seja código aberto, ele é um projeto de pesquisa de longa-duração em música computacional do seu criador, Jonathan Middleton. Ele está atualmente em sua terceira maior iteração (iterações anteriores permanecem disponíveis para uso online). Começaremos com o Musicalgorithms porque ele nos permite entrar e ajustar os nossos dados para produzir um ficheiro de representação MIDI. Tenha atenção e seleccione a '[Versão 3](https://musicalgorithms.org/3.0/index.html)'. + +{% include figure.html filename="sonification-musicalgorithms-main-site-1.png" caption="O site Musicalgorithms como aparecia em 2 de agosto de 2016" %} + +> Nota da tradução: há novas versões disponíveis para uso, mas de forma a seguir o tutorial, seguimos a versão 3 do Musicalgorithms, usada em 2016, e ainda disponível no site para uso. + +O Musicalgorithms efetua uma série de transformações nos dados. Nos dados de amostra abaixo (o padrão do próprio site), há apenas uma linha de dados, mesmo que pareça várias linhas. Os dados de amostra são compostos de campos separados por vírgula que são delimitados por espaço. 
+ +``` +# Of Voices, Text Area Name, Text Area Data +1,morphBox, +,areaPitch1,2 7 1 8 2 8 1 8 2 8 4 5 9 0 4 5 2 3 5 3 6 0 2 8 +,dAreaMap1,2 7 1 8 2 8 1 8 2 8 4 5 9 0 4 5 2 3 5 3 6 0 2 8 +,mapArea1,20 69 11 78 20 78 11 78 20 78 40 49 88 1 40 49 20 30 49 30 59 1 20 78 +,dMapArea1,1 5 1 5 1 5 1 5 1 5 3 3 6 0 3 3 1 2 3 2 4 0 1 5 +,so_text_area1,20 69 11 78 20 78 11 78 20 78 40 49 88 1 40 49 20 30 49 30 59 1 20 78 +``` + +Esses dados representam os dados de origem e as suas transformações; compartilhar esses dados permitiria a outro pesquisador replicar ou estender a sonificação usando outras ferramentas. No entanto, quando se começa, apenas os dados básicos abaixo são necessários (uma lista de pontos de dados): + +``` +# Of Voices, Text Area Name, Text Area Data +1,morphBox, +,areaPitch1,24 72 12 84 21 81 14 81 24 81 44 51 94 01 44 51 24 31 5 43 61 04 21 81 +``` + +O campo-chave para nós é ‘areaPitch1’, que contém os dados de entrada delimitados por espaço. Os outros campos serão preenchidos à medida que avançamos pelas várias configurações de Musicalgorithms. Nos dados acima (por exemplo, 24 72 12 84 etc.), os valores são contagens brutas de inscrições de uma série de locais ao longo de uma estrada romana na Grã-Bretanha. (Vamos praticar com outros dados em breve, abaixo). + +{% include figure.html filename="sonification-musicalgorithms-pitch-mapping-2.png" caption="Depois de carregar seus dados, é possível selecionar as diferentes operações na barra de menu superior do site. Na captura de tela, o mouseover de informações está explicando o que acontece com o dimensionamento de seus dados se você selecionar a operação de divisão para dimensionar os seus dados para o intervalo de notas selecionado." %} + +Agora, conforme se percorre as várias guias da interface ‘duration input’ (entrada de duração) , ‘pitch mapping' (mapeamento de tom), ‘duration mapping’ (mapeamento de duração), ‘scale options’ (opções de escala musical) é possível realizar várias transformações. 
Em ‘pitch mapping’ (mapeamento de tom), há uma série de opções matemáticas para mapear os dados contra as 88 teclas/tons completos de um teclado de piano (em um mapeamento linear, a _média_ dos dados de alguém seria mapeado para dó médio, ou 40). Também é possível escolher o tipo de escala, se é um tom maior ou menor. Nesse ponto, uma vez que se tenha selecionado várias transformações, salve o ficheiro de texto. No menu 'play' é possível realizar o download de um ficheiro MIDI. O seu programa de áudio padrão pode tocar ficheiros MIDI (geralmente padronizando para um tom de piano). Uma instrumentação mais complicada pode ser atribuída abrindo o ficheiro MIDI em programas de mixagem de música, como GarageBand (Mac) ou [LMMS](https://lmms.io/) (Windows, Mac, Linux). (O uso do Garageband ou LMMS está fora do escopo desse tutorial. Um tutorial em vídeo sobre LMMS está disponível [aqui](https://youtu.be/4dYxV3tqTUc), enquanto há muitos tutoriais do Garageband online. Lynda.com tem [um tutorial excelente](https://www.lynda.com/GarageBand-tutorials/Importing-audio-tracks/156620/164050-4.html)). + +Se tivesse várias colunas de dados para os mesmos pontos - digamos, em nosso exemplo da Grã-Bretanha romana, também queríamos sonificar contagens de um tipo de cerâmica para essas mesmas cidades - é possível recarregar sua próxima série de dados, efetuar as transformações e mapeamentos, e gerar outro ficheiro MIDI. Como o Garageband e o LMMS permitem a sobreposição de vozes, você pode começar a criar sequências musicais complicadas. + +{% include figure.html filename="sonification-garageband-john-adams-3.png" caption="Captura de tela do Garageband, onde os ficheiros MIDI são tópicos sonorizados do Diário de John Adams. Na interface do Garageband (o LMMS é semelhante), cada ficheiro MIDI é arrastado e solto no lugar. A instrumentação para cada ficheiro MIDI (ou seja, trilha) pode ser selecionada nos menus do Garageband. 
Os rótulos de cada faixa foram alterados aqui para refletir as palavras-chave em cada tópico. A área verde à direita representa uma visualização das notas em cada faixa. Você pode ver esta interface em ação e ouvir a música [aqui](https://youtu.be/ikqRXtI3JeA) (em inglês)" %} + +Quais transformações devem ser usadas? Se tiver duas colunas de dados, terá duas vozes. Pode fazer sentido, em nossos dados hipotéticos, tocar a primeira voz bem alto, em uma tonalidade maior: as inscrições 'falam' conosco, afinal de contas. (As inscrições romanas de fato se dirigem ao leitor, o transeunte, literalmente: 'Ó tu que passas ...'). Então, se acaso as cerâmicas de interesse forem mercadorias mais despretensiosas, talvez elas possam ser mapeadas em relação à extremidade inferior da escala ou receberem notas de duração mais longas para representar sua onipresença nas classes nessa região. + +_Não há forma 'certa' de representar os seus dados como som, ao menos não por enquanto_, mas mesmo com essa amostra de exemplo, começamos a ver como sombras de significado e interpretação podem ser atribuídas aos nossos dados e à nossa experiência dos dados. + +Mas e o tempo? Dados históricos usualmente têm um ponto de inflexão, um distinto "tempo quando" algo aconteceu. Então, a quantidade de tempo entre dois pontos de dados precisa ser considerada. É nesse ponto que a nossa próxima ferramenta se torna bem útil, para quando nossos pontos de dados tiverem uma relação com outro espaço temporal. Começamos a nos mover de sonificação (pontos de dados) para música (relações entre pontos). + +### Prática +O [conjunto de dados de amostra](/assets/sonification/sonification-roman-data.csv) apresentado contém a contagem de moedas romanas na sua primeira coluna e a contagem de materiais romanos dos mesmos locais, conforme contido no banco de dados do Portable Antiquities Scheme (Esquema de Antiguidades Portáveis) do British Museum. 
A sonificação desses dados pode revelar ou acentuar aspectos da situação econômica ao longo da rua Watling, uma grande rota através da Britânia Romana. Esses pontos de dados estão organizados geograficamente do Noroeste ao Sudeste; então, na medida em que o som toca, nós estamos escutando movimento através do espaço. Cada nota representa outro passo no caminho. + +1. Abra o [dados-sonificação-romana.csv](/assets/sonification/sonification-roman-data.csv) em uma tabela. Copie a primeira coluna em um editor de texto. Delete os finais das linhas de forma que os dados fiquem todos em uma linha única. +2. Adicione a seguinte informação de coluna assim: +``` +# Of Voices, Text Area Name, Text Area Data +1,morphBox, +,areaPitch1, +``` +...para que os seus dados sigam imediatamente depois da última vírgula (como [esse exemplo](/assets/sonification/sonification-romancoin-data-music.csv)). Salve o ficheiro com um nome útil como `sonsdasmoedas1.csv`. + +3. Acesse o site do [Musicalgorithms](https://musicalgorithms.org/3.0/index.html) (versão 3) e clique no botão "load" (carregar). No pop-up, clique no botão azul "load" (carregar) e selecione o ficheiro salvo no passo 2. O site carregará os seus materiais e exibirá uma marca de seleção verde se tiver sido carregado com êxito. Caso contrário, certifique-se de que os seus valores estejam separados por espaços e que sigam imediatamente a última vírgula no bloco de código na etapa 2. Também é possível tentar carregar o [ficheiro de demonstração desse tutorial](/assets/sonification/sonification-romancoin-data-music.csv) ao invés. + +{% include figure.html filename="sonification-musicalgorithms-upload-4.png" caption="Clique em 'load' na tela principal para acessar essa caixa de diálogo. Então 'load csv'. (carregue o csv) Selecione o ficheiro; ele aparecerá na caixa. Então clique no botão 'load' (carregar)." %} + +4. Clique em 'Pitch Input'. Os valores dos seus dados serão exibidos. 
Por enquanto, **não selecione** nenhuma outra opção nesse página (consequentemente, usaremos os valores padrão do site). + +5. Clique em 'Duration Input'. **Não selecione nenhuma opção aqui por enquanto**. As opções aqui irão mapear várias transformações em relação aos dados que alterarão a duração para cada nota. Não se preocupe com as opções por enquanto: siga adiante. +6. Clique em 'Pitch Mapping'. Essa é a escolha mais crucial, pois irá transformar (isso é, escalar) os seus dados brutos para um mapeamento em relação às teclas do teclado. Deixe a configuração de `mapping` em 'division'. (As outras opções são módulo e logarítmico). A opção `Range` 1 a 88 usa todas as 88 teclas do teclado, assim, seu valor mais baixo estaria de acordo com a nota mais profunda do piano e seu valor mais alto com a nota mais alta. Em vez disso, você pode restringir sua música em torno de dó médio, então insira 25 a 60 como seu intervalo. O resultado deveria mudar para: `31,34,34,34,25,28,30,60,28,25,26,26,25,25,60,25,25,38,33,26,25,25,25` Essas não são mais suas contagens; são as notas do teclado. + +{% include figure.html filename="sonification-musicalgorithms-settings-for-pitch-mapping-5.png" caption="Clique na caixa 'range' e defina-o para 25. Os valores abaixo serão alterados automaticamente. Clique na caixa 'to' e defina-o para 60. Clique novamente na outra caixa; os valores serão atualizados." %} + +8. Clique em 'Duration Mapping'. Como Pitch Mapping, isso pega o intervalo de tempo especificado e usa várias opções matemáticas para mapear o intervalo de possibilidade contra as suas notas. Se passar o seu cursor por cima de `i` verá como os números correspondem com notas inteiras, semínimas, colcheias e assim por diante. Deixe os valores padrão por enquanto. +9. Clique em 'Scale Options'. Aqui nós podemos começar a selecionar o que pode ser chamado de aspecto 'emocional' do som. 
Nós geralmente pensamos que escalas maiores são 'alegres' enquanto escalas menores são 'tristes'; para uma discussão acessível acesse esse [post de blog](https://www.ethanhein.com/wp/2010/scales-and-emotions/) (em inglês). Por enquanto, escolha 'scale by: major' (escala maior). Deixe a 'scale' (escala) como `C`. + +Agora sonificamos uma coluna de dados! Clique no botão 'save' (salvar), então 'save csv' (salvar csv). + +{% include figure.html filename="sonification-musicalgorithms-save-6.png" caption="A caixa de diálogo salvar dados." %} +Haverá um ficheiro que se parecerá com isso: + +``` +# Of Voices, Text Area Name, Text Area Data +1,morphBox, +,areaPitch1,80 128 128 128 1 40 77 495 48 2 21 19 1 1 500 1 3 190 115 13 5 1 3 +,dAreaMap1,2 7 1 8 2 8 1 8 2 8 4 5 9 0 4 5 2 3 5 3 6 0 2 +,mapArea1,31 34 34 34 25 28 30 60 28 25 26 26 25 25 60 25 25 38 33 26 25 25 25 +,dMapArea1,1 5 1 5 1 5 1 5 1 5 3 3 6 0 3 3 1 2 3 2 4 0 1 +,so_text_area1,32 35 35 35 25 28 30 59 28 25 27 27 25 25 59 25 25 39 33 27 25 25 25 +``` + +É possível ver os dados originais no campo 'areaPitch1' e os subsequentes mapeamentos. O site permite que sejam geradas até quatro vozes por vez em um ficheiro MIDI; dependendo de como se quer adicionar instrumentação depois, pode-se querer gerar um ficheiro MIDI por vez. Vamos tocar a música - clique em 'Play'. É possível selecionar o tempo aqui, e um instrumento. É possível ouvir os seus dados no navegador, ou salvá-los como um ficheiro MIDI clicando no botão azul 'Save MIDI file'. + +Retorne ao começo e carregue as duas colunas de dados nesse modelo: +``` +# Of Voices, Text Area Name, Text Area Data +2,morphBox, +,areaPitch1, +,areaPitch2, +``` + +{% include figure.html filename="sonification-2voices-7.png" caption="Coloque 2 na caixa de vozes no topo da interface. Quando você for para qualquer uma das páginas de opção - aqui, nós estamos em 'pitch input' - dois monitores abrem para mostrar os dados das duas vozes. 
Carregue os seus dados do csv como antes, mas formate o seu csv para ter o 'areaPitch1' e o 'areaPitch2' como descrito no texto principal. Os dados para a primeira voz irão aparecer na esquerda, e a segunda voz na direita." %} + +Quando se tem dados com várias vozes, o que se destaca? Observe que, nessa abordagem, a distância entre os pontos no mundo real não é considerada em nossa sonificação. Essa distância, se fosse considerada, poderia ser crucial. A distância, é claro, não precisa ser geográfica - pode ser temporal. A próxima ferramenta que exploraremos nos permite abordar isso em nossa sonificação explicitamente. + +# Algumas palavras sobre configurar o Python + +A próxima seção desse tutorial precisa de Python. Se não usou Python ainda, será preciso passar algum tempo [se familiarizando com a linha de comando (PC) ou terminal (OS)](/en/lessons/intro-to-bash) (em inglês). Você pode achar esse rápido [guia de instalação dos módulos do python](/pt/licoes/instalacao-modulos-python-pip) útil (mas retorne para ele depois de ler o resto da seção). + +Usuários do Mac já possuirão o Python instalado na máquina deles. É possível testar isso apertando o botão COMMAND e a barra de espaço; na janela de pesquisa, digite `terminal` e clique na aplicação do terminal. No prompt de comando, por exemplo, no cursor piscando em `$` digite `python --version` e o computador responderá com a versão do python existente no seu computador. _A próxima seção desse tutorial usa a versão Python 2.7; ela não foi testada em Python 3_. 
+ +Para usuários do Windows, Python não é instalado por padrão na sua máquina então [essa página](https://docs.python-guide.org/en/latest/starting/install/win/) te ajudará a iniciar, apesar das coisas serem um pouco mais complicadas do que parece de acordo com a página (nota de tradução: pode usar também a [lição de instalação do Python](/pt/licoes/introducao-instalacao-python) do _Programming Historian em português_, mas tenha em atenção que nessa lição é instalada a versão 3 do Python). Primeiro, realize o download do ficheiro `.msi` que a página recomenda (Python 2.7). Clique duas vezes no ficheiro e ele deve se instalar em um novo diretório, por exemplo, `C:\Python27\`. Então, nós temos de dizer para o Windows a localização para onde buscar pelo Python sempre que um programa em python for executado; ou seja, colocaremos a localização do diretório no seu 'path', ou a variável do ambiente que o Windows sempre apresenta quando confrontado com um novo comando. Existem algumas formas de fazer isso, mas talvez a mais fácil seja buscar no seu computador pelo programa `Powershell` (digite 'powershell' na janela de pesquisa do seu computador). Abra o Powershell e, no `>` prompt, copie essa linha inteira: + +`[Environment]::SetEnvironmentVariable("Path", "$env:Path;C:\Python27\;C:\Python27\Scripts\", "User")` + +Feche o powershell quando terminar. Você saberá que funcionou se nada acontecer quando clicar em 'enter'. Para testar se tudo está funcionando, abra o prompt de comando (aqui há [10 formas de fazer isso](https://www.howtogeek.com/235101/10-ways-to-open-the-command-prompt-in-windows-10/)) (em inglês) e digite no prompt `>`, `python --version`. Ele deve retornar `Python 2.7.10` ou algo similar. + +A última peça do quebra-cabeça que todos os usuários precisarão é um programa chamado `Pip`. Os usuários de Mac podem instalá-lo digitando no terminal: `sudo easy_install pip`.
Usuários do Windows terão um pouco mais de dificuldade (nota de tradução: pode usar também a [lição de instalação de módulos Python com pip](/pt/licoes/instalacao-modulos-python-pip) do _Programming Historian em português_, mas tenha em atenção que nessa lição é usada a versão 3 do Python). Primeiro, clique no botão direito do seu cursor e salve esse link: [https://bootstrap.pypa.io/get-pip.py](https://bootstrap.pypa.io/get-pip.py) (Se apenas clicar no link, ele irá te mostrar o código no seu navegador). Salve em algum lugar útil. Abra o prompt de comando no diretório em que salvou `get-pip.py`. Então, digite no prompt de comando, `python get-pip.py`. Convencionalmente, nos tutoriais, verá `>` ou `$` em lugares em que é preciso digitar algo no prompt de comando ou no terminal. Nunca é necessário digitar esses dois caracteres. + +Finalmente, quando você tem um código python que deseja executar, pode inseri-lo em seu editor de texto e salvá-lo com a extensão `.py` (nota de tradução: pode também seguir as indicações das lições “Configurar um ambiente de desenvolvimento integrado para Python”, do _Programming Historian em português_, nas suas versões [Windows](/pt/licoes/instalacao-windows) ou [Mac](/pt/licoes/instalacao-mac), mas tenha em atenção que nessas lições é usada a versão 3 do Python). O seu ficheiro é um ficheiro de texto, mas a **extensão** do ficheiro diz para o seu computador para usar o Python para interpretá-lo; mas lembre, digite `python` no prompt primeiro, por exemplo: `$ python meu-script-legal.py`. + +# MIDITime + +MIDITime é um pacote do python desenvolvido por [Reveal News (antes, Centro de Reportagens Investigativas)](https://www.revealnews.org/). O seu [repositório no Github está aqui](https://github.com/cirlabs/miditime). Miditime foi construído explicitamente para dados de séries temporais (ou seja, uma sequência de observações coletadas ao longo do tempo).
+ +Enquanto a ferramenta Musicalgorithms tem uma interface mais ou menos intuitiva, quem pesquisa sacrifica a possibilidade de saber o que, exatamente, está acontecendo internamente. +Em princípio, alguém poderia examinar o código subjacente para o pacote MIDITime para saber o que está acontecendo. Mais importante ainda, na ferramenta anterior não há nenhuma habilidade de contabilizar os dados em que os pontos estão distantes uns dos outros no tempo do relógio. MIDITime nos permite considerar que os nossos dados podem ser agrupados pelo tempo. + +Vamos supor que você tenha um diário histórico no qual você encaixou um [modelo de tópicos](/en/lessons/topic-modeling-and-mallet). A saída resultante pode ter entradas de diário como linhas, e a composição percentual de cada tópico contribui para essa entrada como colunas. Nesse caso, _ouvir_ esses valores pode te ajudar a entender os padrões de pensamento no diário de uma forma que a visualização como um gráfico pode não permitir. Outliers ou padrões musicais recorrentes poderiam se destacar ao serem ouvidos de um modo que a gramática dos gráficos obscurece. + +### Instalando o MIDITime +Instalar MIDItime é simples com o [pip](/pt/licoes/instalacao-modulos-python-pip): + +`$ pip install miditime` ou `$ sudo pip install miditime` para uma máquina Mac ou Linux ; +`> pip install miditime` em uma máquina Windows. (Usuários Windows, se as instruções acima não funcionaram muito bem, talvez queira tentar [esse programa de ajuda](https://pydatalog.readthedocs.io/en/latest/installation/#using-pip) para fazer o Pip funcionar adequadamente na sua máquina ou então seguir as instruções da [lição sobre pip](/pt/licoes/instalacao-modulos-python-pip) do _Programming Historian em português_). + +### Prática +Vamos olhar para o exemplo de script providenciado. 
Abra o seu editor de texto, e copie e cole o script de exemplo em: + +```python +#!/usr/bin/python + +from miditime.miditime import MIDITime + +# Instancie a classe com uma frequência (120bpm é o padrão) e o destino do ficheiro resultante. +mymidi = MIDITime(120, 'meuficheiro.mid') + +# Crie uma lista de notas. Cada nota é uma lista: [tempo, tom, ataque, duração] +midinotes = [ + [0, 60, 200, 3], #Na batida 0 (o começo), C Médio com ataque 200, para 3 batidas + [10, 61, 200, 4] #Em 10 batidas (12 segundos a partir do começo), C#5 com ataque 200, para quatro batidas 4 +] + +# Adicione uma faixa com essas notas +mymidi.add_track(midinotes) + +# Resultado do ficheiro .mid +mymidi.save_midi() +``` + +Salve o script como `musica1.py`. No seu terminal ou prompt de comando, execute o script: + +`$ python musica1.py` + +O novo ficheiro, `meuficheiro.mid` será registrado no seu diretório. Para ouvir esse ficheiro, é possível abri-lo com Quicktime ou Windows Media Player. (É possível adicionar instrumentação abrindo o ficheiro no Garageband ou [LMMS](https://lmms.io/)). + +`Musica1.py` importa miditime (lembre, é preciso realizar o `pip install miditime` antes de executar o script). Então, ele cria um ficheiro resultante de destinação e configura o tempo. Todas as notas são listadas individualmente, onde o primeiro número é o tempo em que a nota deve ser tocada, o tom da nota (ou seja, a nota de fato!), o quão forte ou ritmicamente a nota é atingida (o ataque), e a duração da nota. As notas musicais são então registradas na faixa e a faixa é registrada no `meuficheiro.mid`. + +Agora, execute o script e adicione mais notas. As notas para a 'A barata diz que tem' são: + +``` +C7, F, Gm, Am, Bb, C, F, Dm, Gm, C, F +A ... Barata diz que tem sete saias de filó +``` + +Você consegue fazer o seu computador tocar essa música? (Esse [material](https://www.electronics.dit.ie/staff/tscarff/Music_technology/midi/midi_note_numbers_for_octaves.html) (em inglês) irá ajudar).
+ +**A propósito**, há uma especificação de ficheiro de texto para descrever música chamado [Notação ABC](https://pt.wikipedia.org/wiki/ABC_(nota%C3%A7%C3%A3o_musical)). Por enquanto, está além de nossa compreensão, mas alguém poderia escrever um script de sonificação em, por exemplo, uma planilha, mapeando valores para nomes de notas na especificação ABC (se você já usou um IF - THEN no Excel para converter notas percentuais em notas alfabéticas, terá uma noção de como isso pode ser feito) e então usando um site como [esse](https://trillian.mit.edu/~jc/music/abc/ABCcontrib.html) (em inglês) para converter a notação ABC em um ficheiro .mid. + +### Inserindo os seus próprios dados +[Esse ficheiro](/assets/sonification/sonification-diary.csv) é uma seleção do modelo de tópicos dos Diários de John Adams do [The Macroscope](https://themacroscope.org) (Explorando Grandes Dados Históricos: O Macroscópico do Historiador). Apenas os sinais mais fortes foram preservados através do arredondamento dos valores nas colunas para duas casas decimais (lembrando que 0.25, por exemplo, indica que aquele tópico está contribuindo para um quarto da composição daquela entrada do diário). Para obter esses dados em seu script de Python, eles devem ser formatados de uma maneira específica. A parte complicada é acertar o campo de data. + +_Para os propósitos desse tutorial, nós iremos deixar os nomes das variáveis sem alterações em relação ao script de amostra. 
O script de amostra foi desenvolvido com dados de um terremoto em mente; então onde diz 'magnitude' podemos pensar como '% composição do tópico.'_ + +``` +meus_dados = [ + {'data_evento': , 'magnitude': 3.4}, + {'data_evento': , 'magnitude': 3.2}, + {'data_evento': , 'magnitude': 3.6}, + {'data_evento': , 'magnitude': 3.0}, + {'data_evento': , 'magnitude': 5.6}, + {'data_evento': , 'magnitude': 4.0} +] +``` + +Alguém poderia abordar o problema de obter os nossos dados no formato usando expressões regulares; pode ser mais fácil abrir o modelo de tópicos em uma tabela. Copie os tópicos de dados em uma nova planilha, e deixe as colunas na esquerda e na direita dos dados. No exemplo abaixo, eu coloquei na coluna D e, então, preenchi o resto dos dados ao redor dela, assim: + +| | A | B | C | D | E | +|---|---|---|---|---|---| +|1 | {'data_evento': datetime |(1753,6,8) |, 'magnitude': |0.0024499630 |}, | +|2 | | | | | | +|3 | | | | | | + +Então copie e cole os elementos que não mudaram para preencher a coluna inteira. O elemento de data tem de ser (ano, mês, dia). Uma vez que preencheu a tabela, copie e cole no seu editor de texto de forma que se torne parte do arranjo `meus_dados`, como: + +Nota da tradução: note que a ordem do _datetime_ segue o padrão em inglês estadunidense. +``` +meus_dados = [ +{'data_evento': datetime(1753,6,8), 'magnitude':0.0024499630}, +{'data_evento': datetime(1753,6,9), 'magnitude':0.0035766320}, +{'data_evento': datetime(1753,6,10), 'magnitude':0.0022171550}, +{'data_evento': datetime(1753,6,11), 'magnitude':0.0033220150}, +{'data_evento': datetime(1753,6,12), 'magnitude':0.0046445900}, +{'data_evento': datetime(1753,6,13), 'magnitude':0.0035766320}, +{'data_evento': datetime(1753,6,14), 'magnitude':0.0042241550} +] +``` + +Note que a última linha não tem uma vírgula no seu fim. 
+ +O seu script final será similar a essa, usando o exemplo da página do Miditime (as seções de código abaixo foram interrompidas pelos comentários, mas devem ser coladas no seu editor de texto como um ficheiro único): + +```python +from miditime.miditime import MIDITime +from datetime import datetime +import random + +meumidi = MIDITime(108, 'johnadams1.mid', 3, 4, 1) +``` + +Os valores após MIDITime, `MIDITime(108, 'johnadams1.mid', 3, 4, 1)` configuram ++ as batidas por minuto (108), ++ o ficheiro resultante ('johnadams1.mid'), ++ o número de segundos para representar o ano na música (3 segundos no calendário anual, então todas as notas para as entradas desse diário de 1753 serão escaladas contra 3 segundos; há 50 anos nos dados, então a música final terá duração de 50 x 3, ou um pouco mais de dois minutos), ++ a oitava base para a música (C médio é convencionalmente representado como C5, então aqui 4 representa uma oitava abaixo do C médio), ++ o nº de oitavas em que os tons são mapeados. 
+ +Agora passamos os seus dados para o script inserindo-o no arranjo `meus_dados` (isso será colado em seguida): + +```python +meus_dados = [ +{'data_evento': datetime(1753,6,8), 'magnitude':0.0024499630}, +{'data_evento': datetime(1753,6,9), 'magnitude':0.0035766320}, +``` + +...tenha os seus dados aqui, lembrando-se de terminar a linha final data_evento **sem** uma vírgula, e finalizando os dados com um `]` na sua própria linha, por exemplo + +```python +{'data_evento': datetime(1753,6,14), 'magnitude':0.0042241550} +] +``` + +e então copie: + +```python +meus_dados_epoca = [{'dias_desde_epoca': meumidi.days_since_epoch(d['data_evento']), 'magnitude': d['magnitude']} for d in meus_dados] + +meus_dados_tempo = [{'beat': meumidi.beat(d['dias_desde_epoca']), 'magnitude': d['magnitude']} for d in meus_dados_epoca] + +tempo_inicio = meus_dados_tempo[0]['beat'] +``` + +Esta parte calcula o tempo entre as diferentes entradas do diário; diários que estão próximos no tempo terão, portanto, suas notas soando mais próximas. Finalmente, nós definimos como os dados serão mapeados em relação ao tom. Lembre-se que os nossos dados são porcentagens variando de 0.01 (ou seja, 1%) a 0.99 (99%), em `escala_pct` entre 0 e 1. Se não estiver lidando com porcentagens, seria usado o menor valor e o maior valor (se, por exemplo, os seus dados fossem contagens de algum elemento de interesse, como nos dados arqueológicos usados anteriormente). Então, nós colamos: + +```python +def sintonia_mag_para_tom(magnitude): + escala_pct = meumidi.linear_scale_pct(0, 1, magnitude) + # Pick a range of notes. This allows you to play in a key. 
+ c_major = ['C', 'C#', 'D', 'D#', 'E', 'E#', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B', 'B#'] + + #Encontre as notas que correspondem com os pontos dos seus dados + nota = meumidi.scale_to_note(escala_pct, c_major) + + #Traduza essa nota em um tom MIDI + midi_tom = meumidi.note_to_midi_pitch(nota) + + return midi_tom + +lista_notas = [] + +for d in meus_dados_tempo: + lista_notas.append([ + d['beat'] - tempo_inicio, + sintonia_mag_para_tom(d['magnitude']), + random.randint(0,200), # ataque + random.randint(1,4) # duration, in beats + ]) +``` + +e então cole nessa parte final do código para escrever os seus valores de som no ficheiro: + +``` +# Adicione uma faixa com essas notas +meumidi.add_track(lista_notas) + +# Ficheiro .mid resultante +meumidi.save_midi() +``` + +Salve esse ficheiro com um novo nome e a extensão de ficheiro `.py`. + +Para cada coluna de dados nos seus dados originais, **tenha um script único e lembre-se de mudar o nome do ficheiro de saída**, pois, caso contrário, você irá sobrescrever seus dados. Então, você pode carregar os ficheiros individuais midi no Garageband ou LMMS para instrumentação. Aqui está a íntegra do [Diário de John Adams](https://www.youtube.com/watch?v=ikqRXtI3JeA). + +# Sonic Pi + +Harmonizar ficheiros MIDI únicos (no Garageband ou em algum outro programa de composição musical) nos leva de sonificação para composição e arte sonora. Nessa seção final, não será oferecido um tutorial completo sobre como usar o [Sonic Pi](https://sonic-pi.net), mas um direcionamento para um ambiente que permite a performance da codificação dos seus dados ao vivo (veja [esse vídeo](https://www.youtube.com/watch?v=oW-3HVOeUQA) para uma performance ao vivo real de codificação). Os tutoriais do próprio Sonic Pi's mostrarão o potencial do uso do computador como um instrumento musical (em que você digita código em Ruby no editor interno enquanto o interpretador toca o que está sendo codificado). + +Por que alguém iria querer fazer isso? 
Como progressivamente ficou evidente no tutorial, quando os seus dados são sonificados, escolhas passam a ser feitas sobre como mapear os dados em som, e essas escolhas refletem implícita ou explicitamente decisões sobre quais dados importam. Existe um _continuum_ de 'objetividade', se quiser. Em uma extremidade, uma sonificação que apoia uma discussão sobre o passado; do outro, uma apresentação sobre o passado tão fascinante e pessoal quanto qualquer palestra pública bem-feita. A sonificação tira nossos dados das páginas e os leva aos ouvidos de nossos ouvintes: é uma espécie de história pública. Apresentando nossos dados ... imagine só! + +Aqui, eu ofereço simplesmente um trecho de código que possibilitará a importação dos seus dados, que aqui são simplesmente uma lista de valores salvos como csv. Estou em dívida com a bibliotecária da George Washington University, Laura Wrubel, que postou em [gist.github.com](https://gist.github.com/lwrubel) os experimentos dela de sonificação das transações de circulação de sua biblioteca. + +Nesse [ficheiro de amostra](/assets/sonification/sonification-jesuittopics.csv) (um modelo de tópicos gerado do [Jesuit Relations](https://puffin.creighton.edu/jesuit/relations/), (Relações Jesuítas)), há dois tópicos. A primeira linha contém os cabeçalhos: topic1 (em PT-BR, tópico1), topic2 (em PT-BR, tópico2). + +### Prática + +Siga os tutoriais iniciais que o Sonic Pi oferece até se sentir confortável com a interface e algumas das suas possibilidades. (Esses tutoriais também estão agrupados [aqui](https://gist.github.com/jwinder/e59be201082cca694df9); também é possível escutar uma entrevista com Sam Aaron, o criador do Sonic Pi, [aqui](https://web.archive.org/web/20250710140900/https://devchat.cachefly.net/rubyrogues/RR215SonicPi.mp3?rss=true)).
Então, em uma nova janela de edição, copie o seguinte (novamente, o trecho de código a seguir eventualmente será agrupado em um script único na sua janela do Sonic Pi): + +``` +require 'csv' +dados = CSV.parse(File.read("/path/to/your/directory/dados.csv"), headers: true, header_converters: :symbol) +use_bpm 100 +``` + +Lembre, `path/to/your/directory/` é a localização real dos seus dados na sua máquina. Tenha certeza de que eles estão nomeados como `dados.csv` ou altere a linha acima de forma que o seu ficheiro seja carregado! + +Agora, vamos carregar esses dados na nossa música: + +``` +#esse pedaço de código será executado apenas uma vez, a menos que você tire o comentário da linha com +#'live_loop', e também retirar o comentário do 'end' final na parte inferior +# desse bloco de código +#'retirar o comentário' significa remover o sinal #. + +# live_loop :jesuit do +dados.each do |line| + topic1 = line[:topic1].to_f + topic2 = line[:topic2].to_f + + use_synth :piano + play topic1*100, attack: rand(0.5), decay: rand(1), amp: rand(0.25) + use_synth :piano + play topic2*100, attack: rand(0.5), decay: rand(1), amp: rand(0.25) + sleep (0.5) + + end +# end +``` + +As primeiras linhas carregam as colunas de dados; então dizemos qual amostra de som que desejamos usar (piano) e, em seguida, dizemos ao Sonic Pi para tocar o tópico 1 de acordo com os seguintes critérios (um valor aleatório menor que 0,5 para o ataque; um decaimento usando um valor aleatório menor que 1; e uma [amplitude](#termos) com um valor aleatório menor que 0.25). Vê o x 100 na linha? Isso pega os valores dos nossos dados (que são um decimal, lembre) e torna-os em um número inteiro. Nessa parte do código, (do modo que eu escrevi), aquele número equivale diretamente a nota. Se 88 é a menor nota e 1 é a maior, é possível ver que essa abordagem é um pouco problemática: nós não fizemos nenhum mapeamento de tom aqui!
Nesse caso, é possível usar o Musicalgorithms para fazer o seu mapeamento de tom, e então inserir esses valores no Sonic Pi. Alternativamente, uma vez que esse código é praticamente em Ruby, é possível buscar como normalizar os dados e então realizar um mapeamento linear dos valores entre 1 - 88. Um bom lugar para começar seria estudar [essa tabela do Steve Lloyd](https://github.com/stevelloyd/Learn-sonification-with-Sonic-Pi) sobre sonificação de dados de clima com Sonic Pi. Finalmente, outra coisa a se notar é que o valor 'rand' (random, aleatório) permite que se adicione um pouco de 'humanidade' na música em termos de dinâmicas. Então nós faremos a mesma coisa novamente para o topic2 (tópico2). + +É possível adicionar batidas, loops, amostras, e toda a parafernália que o Sonic Pi permite. Onde você coloca os seus pedaços de código afeta a reprodução, se os loops forem colocados antes dos dados acima, ele será reproduzido primeiro. Por exemplo, se o trecho a seguir for inserido depois da linha `use_bpm 100`, + +``` +#pedaço de intro +sleep 2 +sample :ambi_choir, attack: 2, sustain: 4, rate: 0.25, release: 1 +sleep 6 +``` + +Haverá um pouco de uma introdução na sua obra. Há uma pausa de 2 segundos, a amostra 'ambi_choir' é reproduzida, então há uma pausa de mais 6 segundos antes dos seus dados serem tocados. Se quiser adicionar um pouco de um som de bateria sinistro ao longo da sua obra, insira esse trecho a seguir (e antes de seus próprios dados): + +``` +#trecho que continua tocando ao longo da música +live_loop :boom do + with_fx :reverb, room: 0.5 do + sample :bd_boom, rate: 1, amp: 1 + end + sleep 2 +end +``` + +O código é bem simples: realize um loop da amostra 'bd_boom' com o efeito de som de ressonância, em um ritmo particular. Pause por 2 segundos entre os loops. + +A propósito, 'codificação ao vivo'?
O que torna esse ambiente um espaço de 'codificação ao vivo' é a possibilidade de se fazer alterações no código _enquanto o Sonic Pi o transforma em música_. Não gosta do que está ouvindo? Altere o código na hora! + +Para mais sobre o Sonic Pi, [esse site de workshop](https://web.archive.org/web/20150907155822/https://www.miskatonic.org/music/access2015/) (em inglês) é um bom lugar para começar. Veja também o [relatório de Laura Wrubel sobre participar desse workshop, e o trabalho dela e de seus colegas na área](https://library.gwu.edu/scholarly-technology-group/posts/sound-library-work) (em inglês). + +# Nihil Novi Sub Sole + +Mais uma vez, para que não pensemos que estamos na vanguarda através da nossa geração algorítmica de música, um lembrete foi publicado em 1978 sobre 'jogos de música de dados' no século XVIII, em que o lançamento de dados determinava a recombinação de trechos pré-escritos de música. [Alguns desses jogos foram explorados e recodificados para o Sonic-Pi por Robin Newman](https://rbnrpi.wordpress.com/project-list/mozart-dice-generated-waltz-revisited-with-sonic-pi/). Newman também usa uma ferramenta que poderia ser descrita como um Markdown+Pandoc da notação musical, [Lilypond](https://www.lilypond.org/) para pontuar essas composições. Os antecedentes para tudo que pode ser encontrado no _The Programming Historian_ são mais profundos do que se pode suspeitar! + +# Conclusão + +Sonificar os nossos dados nos faz confrontar os modos como os nossos dados são, muitas vezes, não sobre o passado, mas sobre o que construímos dele. Isso ocorre em parte em virtude de sua novidade, e da arte e do artifício necessários para mapear os dados para o som. Mas isso também acontece pelo contraste com as nossas noções pré-concebidas sobre visualização de dados.
Pode ser que os sons gerados por alguém nunca cheguem ao nível da 'música'; mas se ajudar a transformar como nós encontramos o passado, e como outros engajam com o passado, então o esforço terá sido frutífero. Como Trevor Owens pode ter colocado, 'Sonificação é sobre [descoberta, não justificação](https://www.trevorowens.org/2012/11/discovery-and-justification-are-different-notes-on-sciencing-the-humanities/)'. + +## Termos + +### MIDI {#midi} + +Interface digital de instrumento musical. É uma descrição do valor e do tempo de uma nota, não de sua dinâmica ou de como alguém pode tocá-la (esta é uma distinção importante). Ele permite que computadores e instrumentos conversem entre si; pode-se aplicar instrumentação diferente a um ficheiro MIDI da mesma forma que se mudaria a fonte em um pedaço de texto (ou executar um ficheiro Markdown por meio do Pandoc). + +### MP3 {#mp3} + +Formato de compressão que remove dados como parte de sua rotina de compactação. + +### Tom {#tom} + +A nota em si (C médio, etc) + +### Ataque {#ataque} + +Como a nota é tocada ou atingida + +### Duração {#duracao} + +Quanto tempo a nota dura (notas inteiras, semínimas, colcheias etc) + +### Mapeamento do Tom {#mapeamento-tom} + +Dimensionamento de valores de dados em relação a um intervalo de notas ou a duração da nota + +### Amplitude {#amplitude} + +Em resumo, o volume da nota + +# Notas de fim + +[^1]: Drucker, Johanna. 2011. Humanities Approaches to Graphical Display. DHQ 5.1 [https://www.digitalhumanities.org/dhq/vol/5/1/000091/000091.html](https://www.digitalhumanities.org/dhq/vol/5/1/000091/000091.html) + +[^2]: Hermann, T. 2008. "Taxonomy and definitions for sonification and auditory display". In P. Susini and O. Warusfel (eds.) Proceedings of the 14th international conference on auditory display (ICAD 2008). IRCAM, Paris. [https://www.icad.org/Proceedings/2008/Hermann2008.pdf](https://www.icad.org/Proceedings/2008/Hermann2008.pdf) + +[^3]: Last and Usyskin, 2015. 
"Listen to the Sound of Data". In Aaron K. Baughman et al. (eds.) Multimedia Data Mining and Analytics. Springer: Heidelberg. Pp. 419-446 [https://www.researchgate.net/publication/282504359_Listen_to_the_Sound_of_Data](https://www.researchgate.net/publication/282504359_Listen_to_the_Sound_of_Data) + +[^4]: Baio, Andy. 2015. 'If Drake Was Born A Piano'. Waxy. [https://waxy.org/2015/12/if_drake_was_born_a_piano/](https://waxy.org/2015/12/if_drake_was_born_a_piano/) + +[^5]: Koebler, Jason. 2015. "The Strange Acoustic Phenomenon Behind These Wacked-Out Versions of Pop Songs" Motherboard, Dec 18. [https://motherboard.vice.com/read/the-strange-acoustic-phenomenon-behind-these-wacked-out-versions-of-pop-songs](https://motherboard.vice.com/read/the-strange-acoustic-phenomenon-behind-these-wacked-out-versions-of-pop-songs) diff --git a/pt/licoes/sumarizacao-narrativas-web-python.md b/pt/licoes/sumarizacao-narrativas-web-python.md index a30543f785..cd7e64aa99 100644 --- a/pt/licoes/sumarizacao-narrativas-web-python.md +++ b/pt/licoes/sumarizacao-narrativas-web-python.md @@ -1,447 +1,447 @@ ---- -title: "Sumarização de narrativas acerca de eventos do passado documentados na web utilizando Python: o caso do Arquivo.pt" -slug: sumarizacao-narrativas-web-python -collection: lessons -layout: lesson -date: 2023-04-29 -authors: -- Ricardo Campos -- Daniel Gomes -reviewers: -- Daniela Major -- Salete Farias -editors: -- Josir Cardoso Gomes -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/420 -difficulty: 2 -activity: transforming -topics: [api, python, data-manipulation, web-archiving] -avatar_alt: Homem sentado ensinando várias crianças -abstract: Nesta lição aprenderá a criar automaticamente resumos de eventos do passado a partir de conteúdos históricos arquivados da web. 
Em particular, demonstraremos como obter resultados relevantes ao combinar o uso da API do Arquivo.pt com a utilização do *Conta-me Histórias* permitindo, desta forma, processar um elevado volume de dados num curto espaço de tempo. -lesson-partners: [Jisc, The National Archives] -partnership-url: /pt/jisc-tna-parceria -doi: 10.46430/phpt0037 ---- - -{% include toc.html %} - -# Introdução - -Ao longo dos séculos a comunicação evoluiu paralelamente à evolução do homem. Esta, que antes se fazia a partir de meios físicos, é hoje digital e tem presença online. A "culpa" é da web, que desde o final dos anos 90 do século passado, se tornou na principal fonte de informação e comunicação do século XXI. Porém, cerca de [80% da informação disponível na web desaparece ou é alterada no prazo de apenas 1 ano](https://dl.acm.org/doi/10.1145/1145581.1145623) (em inglês). Este facto origina a perda de informação fundamental para documentar os eventos da era digital. - -A mudança para um paradigma de comunicação baseado na internet obrigou a uma alteração profunda na forma como as informações publicadas são preservadas. Os arquivos da web assumem especial relevância, ao preservarem as informações publicadas online desde a década de 1990. - -Apesar dos avanços recentes na preservação de informações arquivadas a partir da web, o problema de explorar de forma eficiente o património histórico preservado por estes arquivos permanece por resolver devido às enormes quantidades de dados históricos arquivados ao longo do tempo e à inexistência de ferramentas que possam processar automaticamente esse volume de dados. Neste contexto, as *timelines* (sistemas automáticos de sumarização temporal) surgem como a solução ideal para a produção automática de resumos de eventos ao longo do tempo e para a análise das informações publicadas online que os documentam, como é o caso das notícias. 
- -Neste tutorial, pretendemos mostrar como explorar o [Arquivo.pt](http://arquivo.pt), o arquivo da web portuguesa, e como criar automaticamente resumos de eventos do passado a partir de conteúdos históricos arquivados da web. Mais concretamente, demonstraremos como obter resultados relevantes ao combinar o uso da [API (Interface de Programação de Aplicações)](https://perma.cc/6ASS-KZFW) do Arquivo.pt com a utilização do [*Conta-me Histórias*](https://contamehistorias.pt), um sistema que permite criar automaticamente narrativas temporais sobre qualquer tema objeto de notícia. Para a concretização desse objetivo disponibilizamos um Jupyter Notebook que os usuários poderão usar para interagir com ambas as ferramentas. - -Na primeira parte do tutorial, iremos apresentar sumariamente as funções de pesquisa e acesso disponibilizadas pelo Arquivo.pt. Demonstraremos como podem ser utilizadas de forma automática através da invocação dos métodos disponibilizados pela API do Arquivo.pt, recorrendo a exemplos simples e práticos. A pesquisa automática de palavras em páginas arquivadas ao longo do tempo é o serviço base para desenvolver rapidamente aplicações informáticas inovadoras, que permitem explorar e tirar maior partido da informação histórica preservada pelo Arquivo.pt, como é caso do projeto *Conta-me Histórias*. - -Na segunda parte, recorremos ao *Conta-me Histórias* para exemplificar o processo de sumarização temporal de um evento. Nesse sentido, demonstraremos a forma como os usuários podem obter informações históricas resumidas sobre um determinado tópico (por exemplo, sobre [Jorge Sampaio](https://perma.cc/AWX8-9CA3), presidente da República Portuguesa entre 1996 e 2006), que envolva notícias do passado preservadas pelo Arquivo.pt. Uma tal infraestrutura permite aos usuários ter acesso a um conjunto de informações históricas a partir de páginas web que, muito provavelmente, já não existirão naquela que convencionalmente designamos como a web atual. 
- -# Pré-requisitos - -A participação neste tutorial pressupõe conhecimentos básicos de programação (nomeadamente Python) bem como familiarização com a instalação de pacotes python (via [git](https://perma.cc/6BK8-XZKR) (em inglês)), com o [formato JSON](https://www.w3schools.com/js/js_json_intro.asp) (em inglês) e com o consumo de APIs. A execução do código pressupõe o recurso ao Jupyter Notebook. Para a instalação deste *software* recomendamos o tutorial [Introduction to Jupyter Notebooks](/en/lessons/jupyter-notebooks#installing-jupyter-notebooks) (em inglês) ou, em alternativa, o recurso ao [Google Colab](https://colab.research.google.com/). Este tutorial foi testado com a versão 3.6.5 do Python. - -# Objetivos de Aprendizagem - -No final deste tutorial os participantes devem estar aptos a: -- Extrair informação relevante a partir do Arquivo.pt fazendo uso da [Arquivo.pt API (Full-text & URL search)](https://github.com/arquivo/pwa-technologies/wiki/Arquivo.pt-API) (em inglês) -- Saber usar a biblioteca Python do [*Conta-me Histórias*](https://github.com/LIAAD/TemporalSummarizationFramework) (em inglês) no contexto da sumarização temporal automática de eventos a partir de elevados volumes de dados preservados no arquivo da web portuguesa - -# Arquivo.pt - -O [Arquivo.pt](https://www.arquivo.pt) é um serviço público e gratuito disponibilizado pela [Fundação para a Ciência e a Tecnologia I.P.](https://perma.cc/D3XA-5J78), que permite a qualquer pessoa pesquisar e aceder a informação histórica preservada da web desde os anos 90. Embora se foque na preservação de informação de interesse para a comunidade portuguesa, contém também páginas escritas em várias línguas de interesse para a comunidade internacional e cerca de metade dos seus usuários são oriundos de fora de Portugal. - -[Este vídeo](https://www.youtube.com/embed/EnSys0HDnCc) introduz brevemente o Arquivo.pt. 
- -## Contributos - -O Arquivo.pt contém milhares de milhões de ficheiros recolhidos ao longo do tempo a partir de websites em várias línguas que documentam eventos nacionais e internacionais. Os serviços de pesquisa que fornece incluem a pesquisa de texto integral, a pesquisa de imagens, a listagem do histórico de versões, a pesquisa avançada e [APIs](https://arquivo.pt/api), que facilitam o desenvolvimento por terceiros de aplicações de valor acrescentado. - -Ao longo dos anos, o Arquivo.pt tem sido utilizado como recurso para suportar trabalhos de pesquisa em áreas como as Humanidades ou as Ciências Sociais. Desde 2018, o [Prémio Arquivo.pt](https://perma.cc/8F6F-KZFP) distingue anualmente trabalhos inovadores baseados na informação histórica preservada pelo Arquivo.pt. Os pesquisadores e cidadãos têm vindo a ser sensibilizados para a importância da preservação da informação publicada na web através da realização de sessões de formação gratuitas, por exemplo, sobre a [utilização das APIs disponibilizadas pelo Arquivo.pt](https://sobre.arquivo.pt/pt/ajuda/formacao/modulo-c/). - -Todo o *software* desenvolvido está disponível como [projetos de código-aberto gratuitos](https://github.com/arquivo/) (em inglês) e, desde 2008, tem sido documentado através de [artigos técnicos e científicos](https://arquivo.pt/publica). No decorrer das suas atividades, o Arquivo.pt gera dados que podem ser úteis para suportar novos trabalhos de pesquisa, como por exemplo a lista de Páginas do Governo de Portugal nas redes sociais ou de websites de partidos políticos. Estes [dados estão disponíveis em acesso aberto](https://arquivo.pt/dadosabertos). - -[Este vídeo](https://www.youtube.com/embed/CZ6R4Zydg0Q) detalha os serviços públicos disponibilizados pelo Arquivo.pt. Pode também aceder diretamente aos [slides da apresentação](https://perma.cc/854E-9XEV). 
Para saber mais detalhes acerca dos serviços disponibilizados pelo Arquivo.pt consulte: -* [Módulo A: Arquivo.pt: uma nova ferramenta para pesquisar o passado (módulo A)](https://sobre.arquivo.pt/pt/ajuda/formacao/modulo-a/) do programa de "Formação acerca de preservação da Web" do Arquivo.pt. - -## Onde posso encontrar o Arquivo.pt? - -O serviço Arquivo.pt encontra-se disponível a partir dos seguintes apontadores: -* [Interfaces de usuário em português e inglês para aceder aos serviços de pesquisa de páginas, imagens e histórico de versões](https://www.arquivo.pt) -* [Website informativo acerca do Arquivo.pt](https://sobre.arquivo.pt) -* [Documentação acerca das APIs do Arquivo.pt](https://perma.cc/FV3U-ZEL9) (em inglês) - -## Como funciona a pesquisa automática via API? - -Periodicamente, o Arquivo.pt recolhe e armazena automaticamente a informação publicada na web. A infraestrutura de *hardware* do Arquivo.pt está alojada no seu próprio centro de dados e é gerida por pessoal a ela dedicado a tempo inteiro. - -O fluxo de trabalho de preservação é realizado através de um [sistema de informação distribuído de grande escala](https://perma.cc/A3Z7-E358). A informação web armazenada é processada automaticamente para realizar atividades de pesquisa sobre [grandes volumes de dados](https://perma.cc/9FMH-DUY8) (em inglês, *big data*), através de uma plataforma de processamento distribuído para dados não estruturados ([Hadoop](https://perma.cc/B5PH-9B4V)). Tal permite, por exemplo, a deteção automática de *spam* na web ou avaliar a acessibilidade web para pessoas com deficiências. - -Os serviços de pesquisa e acesso via APIs permitem que os pesquisadores tirem partido desta infraestrutura de processamento e dos dados históricos preservados sem terem de endereçar a complexidade do sistema que suporta o Arquivo.pt. [Este vídeo](https://www.youtube.com/embed/PPuauEwIwPE) apresenta a [Arquivo.pt API (Full-text & URL search)](https://perma.cc/6ADS-LPLC) (em inglês). 
Pode também aceder diretamente aos [slides da apresentação](https://perma.cc/RMS4-UD76). - -Neste tutorial iremos abordar apenas a utilização da API Full-text & URL Search do Arquivo.pt. Porém, este disponibiliza também outras APIs: -* [Image Search API v1.1 (beta version)](https://perma.cc/U682-VNKD) (em inglês) -* [CDX-server API (URL search): international standard](https://perma.cc/9M6Y-A4BW) (em inglês) -* [Memento API (URL search): international standard](https://perma.cc/BF5E-32LR) (em inglês) - -Para saber detalhes acerca de [todas as APIs disponibilizadas pelo Arquivo.pt](https://perma.cc/FV3U-ZEL9) (em inglês) consulte os conteúdos de formação disponíveis em: -* [Módulo C: Acesso e processamento automático de informação preservada da Web através de APIs](https://sobre.arquivo.pt/pt/ajuda/formacao/modulo-c/) do programa de "Formação acerca de preservação da Web" do Arquivo.pt. - -## Utilização - -Em seguida, apresentaremos exemplos de como utilizar a [Arquivo.pt API (Full-text & URL search)](https://github.com/arquivo/pwa-technologies/wiki/Arquivo.pt-API) (em inglês) para pesquisar, de forma automática, páginas da web arquivadas entre determinados intervalos de tempo. Como exemplo, executaremos pesquisas acerca de "[Jorge Sampaio](https://pt.wikipedia.org/wiki/Jorge_Sampaio)"(1939-2021), antigo Presidente da Câmara Municipal de Lisboa (1990-1995) e antigo Presidente da República Portuguesa (1996-2006). - -### Definição dos parâmetros de pesquisa - -O parâmetro *query* define a(s) palavra(s) a pesquisar: `Jorge Sampaio`. - -Para facilitar a leitura dos resultados de pesquisa obtidos iremos limitá-los a um máximo de 5 através do parâmetro `maxItems`. - -A totalidade dos parâmetros de pesquisa disponíveis estão definidos na secção [*Request Parameters* da documentação da API do Arquivo.pt](https://perma.cc/2DMP-3XQC) (link em inglês. Em português, parâmetros requeridos). 
- -```python -import requests -query = "jorge sampaio" -maxItems = 5 -payload = {'q': query,'maxItems': maxItems} -r = requests.get('http://arquivo.pt/textsearch', params=payload) -print("GET",r.url) -``` - -### Percorrer os resultados obtidos no Arquivo.pt - -O seguinte código mostra os resultados de pesquisa obtidos no seu formato original (JSON): - -```python -import pprint -contentsJSon = r.json() -pprint.pprint(contentsJSon) -``` - -### Sumário dos resultados obtidos - -É possível extrair, para cada resultado, a seguinte informação: -* Título (campo `title`) -* Endereço para o conteúdo arquivado (campo `linkToArchive`) -* Data de arquivo (campo `tstamp`) -* Texto extraído da página (campo `linkToExtractedText`) - -Todos os campos obtidos como resposta a pesquisas disponíveis estão definidos na secção [*Response fields* da documentação da API do Arquivo.pt](https://perma.cc/VK9Z-EC83) (link em inglês. Em português, campos de resposta). - -```python -for item in contentsJSon["response_items"]: - title = item["title"] - url = item["linkToArchive"] - time = item["tstamp"] - - print(title) - print(url) - print(time) - - page = requests.get(item["linkToExtractedText"]) - - # Note a existencia de decode, para garantirmos que o conteudo devolvido pelo Arquivo.pt (no formato ISO-8859-1) e impresso no formato (UTF-8) - content = page.content.decode('utf-8') - print(content) - print("\n") -``` - -### Definir o intervalo temporal da pesquisa - -Uma das mais-valias do Arquivo.pt é fornecer o acesso a informação histórica publicada na web ao longo do tempo. - -No processo de acesso à informação os usuários podem definir o intervalo temporal das datas de arquivo das páginas a serem pesquisadas, através da especificação das datas pretendidas nos parâmetros de pesquisa da API `from` e `to`. Estas devem seguir o formato: ano, mês, dia, hora, minuto e segundo (aaaammddhhmmss). 
Por exemplo, a data 9 de março de 1996 seria representada por: -* 19960309000000 - -O seguinte código executa uma pesquisa por "Jorge Sampaio" de páginas arquivadas entre março de 1996 e março de 2006, período durante o qual este foi Presidente da República Portuguesa. - -```python -query = "jorge sampaio" -maxItems = 5 -fromDate = 19960309000000 -toDate = 20060309000000 -payload = {'q': query,'maxItems': maxItems, 'from': fromDate, 'to': toDate} -r = requests.get('http://arquivo.pt/textsearch', params=payload) -print("GET",r.url) -print("\n") - -contentsJSon = r.json() -for item in contentsJSon["response_items"]: - title = item["title"] - url = item["linkToArchive"] - time = item["tstamp"] - - print(title) - print(url) - print(time) - - page = requests.get(item["linkToExtractedText"]) - - # Note a existencia de decode, para garantirmos que o conteudo devolvido pelo Arquivo.pt (no formato ISO-8859-1) e impresso no formato (UTF-8) - content = page.content.decode('utf-8') - print(content) - print("\n") -``` - -### Restringir a pesquisa a um determinado website - -Se os usuários apenas tiverem interesse na informação histórica publicada por um determinado website, podem restringir a pesquisa através da especificação no parâmetro de pesquisa da API `siteSearch`. O seguinte código executa uma pesquisa por "Jorge Sampaio" de páginas arquivadas apenas a partir do website com o domínio "www.presidenciarepublica.pt", compreendidas entre março de 1996 e março de 2006, e apresenta os resultados obtidos. 
- - -```python -query = "jorge sampaio" -maxItems = 5 -fromDate = 19960309000000 -toDate = 20060309000000 -siteSearch = "www.presidenciarepublica.pt" -payload = {'q': query,'maxItems': maxItems, 'from': fromDate, 'to': toDate, 'siteSearch': siteSearch} -r = requests.get('http://arquivo.pt/textsearch', params=payload) -print("GET",r.url) -print("\n") - -contentsJSon = r.json() -for item in contentsJSon["response_items"]: - title = item["title"] - url = item["linkToArchive"] - time = item["tstamp"] - - print(title) - print(url) - print(time) - - page = requests.get(item["linkToExtractedText"]) - - # Note a existencia de decode, para garantirmos que o conteudo devolvido pelo Arquivo.pt (no formato ISO-8859-1) e impresso no formato (UTF-8) - content = page.content.decode('utf-8') - print(content) - print("\n") -``` - -### Restringir a pesquisa a um determinado tipo de ficheiro - -Além de páginas da web, o Arquivo.pt também preserva outros formatos de ficheiro vulgarmente publicados online, como por exemplo documentos do tipo PDF. Os usuários podem definir o tipo de ficheiro sobre o qual a pesquisa deverá incidir através da especificação no parâmetro de pesquisa `type` da API. - -O seguinte código executa uma pesquisa por "Jorge Sampaio": -* Sobre ficheiros do tipo PDF -* Arquivados apenas a partir do website com o domínio "www.presidenciarepublica.pt" -* Entre março de 1996 e março de 2006 - -E apresenta os resultados obtidos. Quando o usuário abrir o endereço do conteúdo arquivado fornecido pelo campo de resposta `linkToArchive` terá acesso ao ficheiro PDF. 
- -```python -query = "jorge sampaio" -maxItems = 5 -fromDate = 19960309000000 -toDate = 20060309000000 -siteSearch = "www.presidenciarepublica.pt" -fileType = "PDF" -payload = {'q': query,'maxItems': maxItems, 'from': fromDate, 'to': toDate, 'siteSearch': siteSearch, 'type': fileType} -r = requests.get('http://arquivo.pt/textsearch', params=payload) -print("GET",r.url) -print("\n") - -contentsJSon = r.json() -for item in contentsJSon["response_items"]: - title = item["title"] - url = item["linkToArchive"] - time = item["tstamp"] - - print(title) - print(url) - print(time) -``` - -# *Conta-me Histórias* - -O projeto *Conta-me Histórias* é desenvolvido por pesquisadores do Laboratório de Inteligência Artificial e Apoio a Decisão ([LIAAD](https://perma.cc/B5U2-R74J)) — [INESCTEC](https://perma.cc/4XN7-A6TR)) e afiliados às instituições [Instituto Politécnico de Tomar](https://perma.cc/7PDB-NRAL) — [Centro de Investigação em Cidades Inteligentes (CI2)](https://perma.cc/M3CE-HQ6U), [Universidade do Porto](https://perma.cc/MGZ3-S9AQ) e [Universidade de Innsbruck](https://perma.cc/THE2-KA3L) (em inglês). O projeto visa oferecer aos usuários a possibilidade de revisitarem tópicos do passado através de uma interface semelhante ao Google que, dada uma pesquisa, devolve uma sumarização temporal das notícias mais relevantes preservadas pelo Arquivo.pt acerca desse tópico. Um vídeo promocional do projeto pode ser visualizado [aqui](https://www.youtube.com/watch?v=fcPOsBCwyu8). - -## Contributos - -Nos últimos anos, o crescente aumento na disponibilização de conteúdos online tem colocado novos desafios àqueles que pretendem entender a estória de um dado evento. Mais recentemente, fenómenos como o [media bias](https://perma.cc/MH2W-5WL4) (em português, viés mediático), as [fake news](https://perma.cc/945E-WVDK) (em português, notícias falsas) e as [filter bubbles](https://perma.cc/7M7E-S5CD) (link em inglês. 
Em português, filtro de bolha), vieram adensar ainda mais as dificuldades já existentes no acesso transparente à informação. O *Conta-me Histórias* surge, neste contexto, como um importante contributo para todos aqueles que pretendem ter acesso rápido a uma visão histórica de um dado evento, criando automaticamente narrativas resumidas a partir de um elevado volume de dados coletados no passado. A sua disponibilização em 2018, é um importante contributo para que estudantes, jornalistas, políticos, pesquisadores, etc, possam gerar conhecimento e verificar factos de uma forma rápida, a partir da consulta de *timelines* automaticamente geradas, mas também pelo recurso à consulta de páginas web tipicamente inexistentes na web mais convencional, a web do presente. - -## Onde posso encontrar o *Conta-me Histórias*? - -O projeto *Conta-me Histórias* encontra-se disponível, desde 2018, a partir dos seguintes endereços: -- Página web (versão PT): [https://contamehistorias.pt](https://contamehistorias.pt) -- Biblioteca Python: [https://github.com/LIAAD/TemporalSummarizationFramework](https://perma.cc/J7BB-28YX) (em inglês) - -Outros endereços de relevância: -- *Conta-me Histórias front-end*: [https://github.com/LIAAD/contamehistorias-ui](https://perma.cc/J7BB-28YX) (em inglês) -- *Conta-me Histórias back-end*: [https://github.com/LIAAD/contamehistorias-api](https://perma.cc/Q3MH-3T4J) (em inglês) - -Mais recentemente, em setembro de 2021, o Arquivo.pt passou a disponibilizar a funcionalidade "Narrativa", através de um botão adicional na sua interface que redireciona os usuários para o website do *Conta-me Histórias*, para que a partir deste possam criar automaticamente narrativas temporais sobre qualquer tema. A funcionalidade "Narrativa" resulta da colaboração entre a equipa do *Conta-me Histórias*, vencedora do [Prémio Arquivo.pt 2018](https://perma.cc/8F6F-KZFP), e a equipa do Arquivo.pt. - -## Como Funciona? 
- -Quando um usuário insere um conjunto de palavras acerca de um tema na caixa de pesquisa do Arquivo.pt e clica no botão "Narrativa", é direcionado para o serviço *Conta-me Histórias* que, por sua vez, analisa automaticamente as notícias de 26 websites arquivados pelo Arquivo.pt ao longo do tempo e apresenta-lhe uma cronologia de notícias relacionadas com o tema pesquisado. - -Por exemplo, se pesquisarmos por "Jorge Sampaio" e carregarmos no botão "Narrativa", - -{% include figure.html filename="sumarizacao-narrativas-web-python-1.jpeg" alt="Pesquisa por Jorge Sampaio através do componente narrativa do Arquivo.pt" caption="Figura 1: Pesquisa por 'Jorge Sampaio' através da componente narrativa do Arquivo.pt." %} - -seremos direcionados para o *Conta-me Histórias*, onde obteremos, automaticamente, uma narrativa de notícias arquivadas. Na figura seguinte é possível observar a linha de tempo e o conjunto de notícias relevantes no período compreendido entre 2016-04-07 e 2016-11-17. O último período temporal é referente ao ano de 2019. - -{% include figure.html filename="sumarizacao-narrativas-web-python-2.jpeg" alt="Resultados da pesquisa por Jorge Sampaio no Conta-me Histórias para o periodo compreendido entre 07/04/2016 e 17/11/2016" caption="Figura 2: Resultados da pesquisa por 'Jorge Sampaio' no *Conta-me Histórias* para o periodo compreendido entre 2016-04-07 e 2016-11-17." %} - -Para a seleção das notícias mais relevantes recorremos ao [YAKE!](http://yake.inesctec.pt) (em inglês), um extrator de palavras relevantes (desenvolvido pela nossa equipa de pesquisa) e que, neste contexto, é utilizado para selecionar os excertos mais importantes de uma notícia (mais concretamente os seus títulos) ao longo do tempo. - -Um aspeto interessante da aplicação é o facto desta facilitar o acesso à página web arquivada que dá nome ao título selecionado como relevante. 
Por exemplo, ao clicar em cima do título "Jorge Sampaio formaliza apoio a Sampaio da Nóvoa" o usuário poderá visualizar a seguinte página web: - -{% include figure.html filename="sumarizacao-narrativas-web-python-3.jpeg" alt="Jorge Sampaio formaliza apoio a Sampaio da Nóvoa" caption="Figura 3: Jorge Sampaio formaliza apoio a Sampaio da Nóvoa." %} - -Paralelamente, poderá ter acesso a um conjunto de "termos relacionados" com o tópico de pesquisa. Na figura abaixo é possível observar, entre outros, a referência aos antigos presidentes da República Mário Soares e Cavaco Silva, bem como aos ex-primeiro-ministros Santana Lopes e Durão Barroso. - -{% include figure.html filename="sumarizacao-narrativas-web-python-4.jpeg" alt="Nuvem de palavras com os termos relacionados com a pesquisa Jorge Sampaio ao longo de 10 anos" caption="Figura 4: Nuvem de palavras com os termos relacionados com a pesquisa por 'Jorge Sampaio' ao longo de 10 anos." %} - -O *Conta-me Histórias* pesquisa, analisa e agrega milhares de resultados para gerar cada narrativa acerca de um tema. Recomenda-se a escolha de palavras descritivas sobre temas bem definidos, personalidades ou eventos para obter boas narrativas. No seção seguinte descrevemos a forma como, através da biblioteca Python, os usuários podem interagir e fazer uso dos dados do *Conta-me Histórias*. - -## Instalação - -Para a instalação da [biblioteca Conta-me Histórias](https://perma.cc/4ZXT-9FB5) (em inglês) necessita de ter o [git](https://perma.cc/6BK8-XZKR) (em inglês) instalado. Após a sua instalação proceda à execução do seguinte código: - -```python -!pip install -U git+https://github.com/LIAAD/TemporalSummarizationFramework -``` - -## Utilização - -### Definição dos parâmetros de pesquisa - -No próximo código o usuário é convidado a definir o conjunto de parâmetros de pesquisa. A variável `domains` lista o conjunto de 24 websites objeto de pesquisa. 
Um aspeto interessante desta variável é a possibilidade do usuário definir a sua própria lista de fontes noticiosas. Um exercício interessante passa por definir um conjunto de meios de comunicação de âmbito mais regional, por oposição aos meios de comunicação nacionais ali listados. - -Os parâmetros `from` e `to` permitem estabelecer o espectro temporal da pesquisa. Finalmente, na variável `query` o usuário é convidado a definir o tema da pesquisa (e.g., "Jorge Sampaio") para o qual pretende construir uma narrativa temporal. Uma vez executado o código o sistema inicia o processo de pesquisa junto do Arquivo.pt. Para tal, recorre à utilização da [Arquivo.pt API (Full-text & URL search)](https://perma.cc/6ADS-LPLC) (em inglês). - -```python -from contamehistorias.datasources.webarchive import ArquivoPT -from datetime import datetime - -# Especifica o website e o ambito temporal para restringir a pesquisa -domains = [ 'http://publico.pt/', 'http://www.dn.pt/', 'http://dnoticias.pt/', 'http://www.rtp.pt/', 'http://www.cmjornal.pt/', 'http://www.iol.pt/', 'http://www.tvi24.iol.pt/', 'http://noticias.sapo.pt/', 'http://www.sapo.pt/', 'http://expresso.sapo.pt/', 'http://sol.sapo.pt/', 'http://www.jornaldenegocios.pt/', 'http://abola.pt/', 'http://www.jn.pt/', 'http://sicnoticias.sapo.pt/', 'http://www.lux.iol.pt/', 'http://www.ionline.pt/', 'http://news.google.pt/', 'http://www.dinheirovivo.pt/', 'http://www.aeiou.pt/', 'http://www.tsf.pt/', 'http://meiosepublicidade.pt/', 'http://www.sabado.pt/', 'http://economico.sapo.pt/'] - -params = { 'domains':domains, 'from':datetime(year=2011, month=1, day=1), 'to':datetime(year=2021, month=12, day=31) } - -query = 'Jorge Sampaio' - -apt = ArquivoPT() -search_result = apt.getResult(query=query, **params) -``` - -### Percorrer os resultados obtidos no Arquivo.pt - -O objeto `search_result` devolve o número total de resultados obtidos a partir da chamada à API do Arquivo.pt. 
O número total de resultados excede facilmente as 10.000 entradas, um volume de dados praticamente impossível de processar por qualquer usuário que, a partir dele, queira retirar conhecimento em tempo útil. - -```python -len(search_result) -``` -Para lá do número total de resultados o objeto `search_result` reúne informação extremamente útil para o passo seguinte do algoritmo, i.e., a seleção das notícias mais relevantes ao longo do tempo. Em concreto, este objeto permite ter acesso a: -* `datatime`: data de coleta do recurso -* `domain`: fonte noticiosa -* `headline`: título da notícia -* `url`: url original da notícia - -bastando para tal executar o seguinte código: - -```python -for x in search_result: - print(x.datetime) - print(x.domain) - print(x.headline) - print(x.url) - print() -``` - -### Determinação de datas importantes e seleção das *keywords*/títulos relevantes - -No próximo passo o sistema recorre ao algoritmo do *Conta-me Histórias* para criar um resumo das notícias mais importantes a partir do conjunto de documentos obtidos no Arquivo.pt. Cada bloco temporal determinado como relevante pelo sistema reúne um total de 20 notícias. Os vários blocos temporais determinados automaticamente pelo sistema oferecem ao usuário uma narrativa ao longo do tempo. - -```python -from contamehistorias import engine -language = "pt" - -cont = engine.TemporalSummarizationEngine() -summ_result = cont.build_intervals(search_result, language, query) - -cont.pprint(summ_result) -``` - -#### Estatísticas da pesquisa - -O código seguinte permite ter acesso a um conjunto de estatísticas globais, nomeadamente, ao número total de documentos, de domínios, bem como ao tempo total de execução do algoritmo. 
- -```python -print(f"Número total de documentos: {summ_result['stats']['n_docs']}") -print(f"Número total de domínios: {summ_result['stats']['n_domains']}") -print(f"Tempo total de execução: {summ_result['stats']['time']}") -``` - -### Obter a lista dos domínios dos resultados da pesquisa - -Para listar todos os domínios execute o seguinte código: - -```python -for domain in summ_result["domains"]: - print(domain) -``` - -### Resultados da pesquisa para a "Narrativa" - -Finalmente, o código seguinte recorre à variável `summ_result ["results"]` para apresentar os resultados gerados com a informação necessária à produção de uma *timeline*, nomeadamente, o período temporal de cada bloco de notícias, as notícias propriamente ditas (um conjunto de 20 notícias relevantes por bloco temporal), a data de coleta, a fonte noticiosa, o url (ligação à página web original) e o título completo da notícia. - -```python -for period in summ_result["results"]: - - print("--------------------------------") - print(period["from"],"until",period["to"]) - - # Cabecalhos selecionados - keyphrases = period["keyphrases"] - - for keyphrase in keyphrases: - print("headline = " + keyphrase.kw) - - # Fontes - for headline in keyphrase.headlines: - print("Date", headline.info.datetime) - print("Source", headline.info.domain) - print("Url", headline.info.url) - print("Headline completa = ", headline.info.headline) - - print() -``` - -# Conclusões - -A web é hoje considerada uma ferramenta essencial de comunicação. Neste contexto, os arquivos web surgem como um importante recurso de preservação dos conteúdos aí publicados. Embora o seu uso seja dominado por pesquisadores, historiadores ou jornalistas, o elevado volume de dados aí disponíveis sobre o nosso passado faz deste tipo de infraestrutura uma fonte de recursos de elevado valor e extrema utilidade para os usuários mais comuns. 
O acesso generalizado a este tipo de infraestrutura obriga, no entanto, à existência de outro tipo de ferramentas capazes de satisfazer as necessidades de informação do usuário, diminuindo, ao mesmo tempo, os constrangimentos associados à exploração de elevados volumes de dados por parte de usuários não especialistas. - -Neste tutorial, procurámos mostrar como criar automaticamente sumários temporais a partir de eventos coletados no passado, fazendo uso dos dados obtidos no Arquivo.pt e da aplicação da biblioteca de sumarização temporal *Conta-me Histórias*. O tutorial aqui apresentado é um primeiro passo na tentativa de mostrarmos aos interessados na temática uma forma simples de como qualquer usuário pode, utilizando conceitos minímos de programação, fazer uso de APIs e bibliotecas existentes para extrair conhecimento a partir de um elevado volume de dados num curto espaço de tempo. - -# Prémios - -O projeto *Conta-me Histórias* foi o vencedor do [Prémio Arquivo.pt 2018](https://sobre.arquivo.pt/pt/vencedores-premios-arquivo-pt/) e o vencedor da [Best Demo Presentation](https://ecir2019.org/workshops/) na [41st European Conference on Information Retrieval (ECIR-19)](http://ecir2019.org/) (em inglês). - -# Financiamento - -Ricardo Campos foi financiado por fundos nacionais através do Fundação para a Ciência e Tecnologia (FCT) e pela Fundação Portuguesa para Ciência e Tecnologia (I.P.) com o projeto StorySense (2022.09312.PTDC). - -# Bibliografia - -* Campos, R., Pasquali, A., Jatowt, A., Mangaravite, V., and Jorge, A.. "Automatic Generation of Timelines for Past-Web Events" In *The Past Web: Exploring Web Archives*, edited by D. Gomes, E. Demidova, J. Winters, and T. Risse, 225-242. Springer: 2021. [https://link.springer.com/chapter/10.1007/978-3-030-63291-5_18](https://perma.cc/F3SZ-5MVL) - -* Campos, R., Mangaravite, V., Pasquali, A., Jorge, A., Nunes, C., and Jatowt, A.. "YAKE! Keyword Extraction from Single Documents using Multiple Local Features". 
*Information Sciences Journal*, vol. 509 (2020): 257-289. [https://doi.org/10.1016/j.ins.2019.09.013](https://doi.org/10.1016/j.ins.2019.09.013) - -* Campos, R., Mangaravite, V., Pasquali, A., Jorge, A., Nunes, C., and Jatowt, A.. "A Text Feature Based Automatic Keyword Extraction Method for Single Documents" In *Advances in Information Retrieval. ECIR 2018 (Grenoble, France. March 26 ? 29). Lecture Notes in Computer Science*, edited by G. Pasi, B. Piwowarski, L. Azzopardi, and A. Hanbury, vol. 10772, 684-691. Springer: 2018. [https://link.springer.com/chapter/10.1007/978-3-319-76941-7_63](https://perma.cc/3V3W-X6MZ) - -* Pasquali, A., Mangaravite, V., Campos, R., Jorge, A., and Jatowt, A.."Interactive System for Automatically Generating Temporal Narratives" In -*Advances in Information Retrieval. ECIR'19 (Cologne, Germany. April 14-18). Lecture Notes in Computer Science*, edited by L. Azzopardi, B. Stein, N. Fuhr, P. Mayr, C. Hauff, and D. Hiemstra, vol. 11438, 251 - 255. Springer: 2019. [https://link.springer.com/chapter/10.1007/978-3-030-15719-7_34](https://perma.cc/MH6W-QQFD) - -* Gomes, D., Demidova, E., Winters, J., and Risse, T. (eds.), *The Past Web: Exploring Web Archives*. Springer, 2021. [https://arquivo.pt/book](https://arquivo.pt/book) [Pre-print](https://perma.cc/Q693-DLPA) - -* Gomes, D., and Costa M.. "The Importance of Web Archives for Humanities". *International Journal of Humanities and Arts Computing*, (April 2014). [http://sobre.arquivo.pt/wp-content/uploads/the-importance-of-web-archives-for-humanities.pdf](https://perma.cc/4WHP-Q534). - -* Alam,Sawood, Weigle, Michele C., Nelson, Michael L., Melo, Fernando, Bicho, Daniel, Gomes, Daniel. "MementoMap Framework for Flexible and Adaptive Web Archive Profiling" In *Proceedings of Joint Conference on Digital Libraries 2019*. Urbana-Champaign, Illinois, US: June 2019. [https://www.cs.odu.edu/~salam/drafts/mementomap-jcdl19-cameraready.pdf](https://perma.cc/7ES7-A7H7). - -* Costa, M.. 
"Information Search in Web Archives" PhD thesis, Universidade de Lisboa, December 2014. [http://sobre.arquivo.pt/wp-content/uploads/phd-thesis-information-search-in-web-archives.pdf](https://perma.cc/HU5S-M2XE) - -* Mourão, A., Gomes, D.. *The Anatomy of a Web Archive Image Search Engine. Technical Report*. Lisboa, Portugal: Arquivo.pt, dezembro 2021. [https://sobre.arquivo.pt/wp-content/uploads/The_Anatomy_of_a_Web_Archive_Image_Search_Engine_tech_report.pdf](https://perma.cc/2JF4-EF4T) +--- +title: "Sumarização de narrativas acerca de eventos do passado documentados na web utilizando Python: o caso do Arquivo.pt" +slug: sumarizacao-narrativas-web-python +collection: lessons +layout: lesson +date: 2023-04-29 +authors: +- Ricardo Campos +- Daniel Gomes +reviewers: +- Daniela Major +- Salete Farias +editors: +- Josir Cardoso Gomes +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/420 +difficulty: 2 +activity: transforming +topics: [api, python, data-manipulation, web-archiving] +avatar_alt: Homem sentado ensinando várias crianças +abstract: Nesta lição aprenderá a criar automaticamente resumos de eventos do passado a partir de conteúdos históricos arquivados da web. Em particular, demonstraremos como obter resultados relevantes ao combinar o uso da API do Arquivo.pt com a utilização do *Conta-me Histórias* permitindo, desta forma, processar um elevado volume de dados num curto espaço de tempo. +lesson-partners: [Jisc, The National Archives] +partnership-url: /pt/jisc-tna-parceria +doi: 10.46430/phpt0037 +--- + +{% include toc.html %} + +# Introdução + +Ao longo dos séculos a comunicação evoluiu paralelamente à evolução do homem. Esta, que antes se fazia a partir de meios físicos, é hoje digital e tem presença online. A "culpa" é da web, que desde o final dos anos 90 do século passado, se tornou na principal fonte de informação e comunicação do século XXI. 
Porém, cerca de [80% da informação disponível na web desaparece ou é alterada no prazo de apenas 1 ano](https://dl.acm.org/doi/10.1145/1145581.1145623) (em inglês). Este facto origina a perda de informação fundamental para documentar os eventos da era digital. + +A mudança para um paradigma de comunicação baseado na internet obrigou a uma alteração profunda na forma como as informações publicadas são preservadas. Os arquivos da web assumem especial relevância, ao preservarem as informações publicadas online desde a década de 1990. + +Apesar dos avanços recentes na preservação de informações arquivadas a partir da web, o problema de explorar de forma eficiente o património histórico preservado por estes arquivos permanece por resolver devido às enormes quantidades de dados históricos arquivados ao longo do tempo e à inexistência de ferramentas que possam processar automaticamente esse volume de dados. Neste contexto, as *timelines* (sistemas automáticos de sumarização temporal) surgem como a solução ideal para a produção automática de resumos de eventos ao longo do tempo e para a análise das informações publicadas online que os documentam, como é o caso das notícias. + +Neste tutorial, pretendemos mostrar como explorar o [Arquivo.pt](https://arquivo.pt), o arquivo da web portuguesa, e como criar automaticamente resumos de eventos do passado a partir de conteúdos históricos arquivados da web. Mais concretamente, demonstraremos como obter resultados relevantes ao combinar o uso da [API (Interface de Programação de Aplicações)](https://perma.cc/6ASS-KZFW) do Arquivo.pt com a utilização do [*Conta-me Histórias*](https://contamehistorias.pt), um sistema que permite criar automaticamente narrativas temporais sobre qualquer tema objeto de notícia. Para a concretização desse objetivo disponibilizamos um Jupyter Notebook que os usuários poderão usar para interagir com ambas as ferramentas. 
+ +Na primeira parte do tutorial, iremos apresentar sumariamente as funções de pesquisa e acesso disponibilizadas pelo Arquivo.pt. Demonstraremos como podem ser utilizadas de forma automática através da invocação dos métodos disponibilizados pela API do Arquivo.pt, recorrendo a exemplos simples e práticos. A pesquisa automática de palavras em páginas arquivadas ao longo do tempo é o serviço base para desenvolver rapidamente aplicações informáticas inovadoras, que permitem explorar e tirar maior partido da informação histórica preservada pelo Arquivo.pt, como é o caso do projeto *Conta-me Histórias*. + +Na segunda parte, recorremos ao *Conta-me Histórias* para exemplificar o processo de sumarização temporal de um evento. Nesse sentido, demonstraremos a forma como os usuários podem obter informações históricas resumidas sobre um determinado tópico (por exemplo, sobre [Jorge Sampaio](https://perma.cc/AWX8-9CA3), presidente da República Portuguesa entre 1996 e 2006), que envolva notícias do passado preservadas pelo Arquivo.pt. Uma tal infraestrutura permite aos usuários ter acesso a um conjunto de informações históricas a partir de páginas web que, muito provavelmente, já não existirão naquela que convencionalmente designamos como a web atual. + +# Pré-requisitos + +A participação neste tutorial pressupõe conhecimentos básicos de programação (nomeadamente Python) bem como familiarização com a instalação de pacotes python (via [git](https://perma.cc/6BK8-XZKR) (em inglês)), com o [formato JSON](https://www.w3schools.com/js/js_json_intro.asp) (em inglês) e com o consumo de APIs. A execução do código pressupõe o recurso ao Jupyter Notebook. Para a instalação deste *software* recomendamos o tutorial [Introduction to Jupyter Notebooks](/en/lessons/jupyter-notebooks#installing-jupyter-notebooks) (em inglês) ou, em alternativa, o recurso ao [Google Colab](https://colab.research.google.com/). Este tutorial foi testado com a versão 3.6.5 do Python. 
+ +# Objetivos de Aprendizagem + +No final deste tutorial os participantes devem estar aptos a: +- Extrair informação relevante a partir do Arquivo.pt fazendo uso da [Arquivo.pt API (Full-text & URL search)](https://github.com/arquivo/pwa-technologies/wiki/Arquivo.pt-API) (em inglês) +- Saber usar a biblioteca Python do [*Conta-me Histórias*](https://github.com/LIAAD/TemporalSummarizationFramework) (em inglês) no contexto da sumarização temporal automática de eventos a partir de elevados volumes de dados preservados no arquivo da web portuguesa + +# Arquivo.pt + +O [Arquivo.pt](https://www.arquivo.pt) é um serviço público e gratuito disponibilizado pela [Fundação para a Ciência e a Tecnologia I.P.](https://perma.cc/D3XA-5J78), que permite a qualquer pessoa pesquisar e aceder a informação histórica preservada da web desde os anos 90. Embora se foque na preservação de informação de interesse para a comunidade portuguesa, contém também páginas escritas em várias línguas de interesse para a comunidade internacional e cerca de metade dos seus usuários são oriundos de fora de Portugal. + +[Este vídeo](https://www.youtube.com/embed/EnSys0HDnCc) introduz brevemente o Arquivo.pt. + +## Contributos + +O Arquivo.pt contém milhares de milhões de ficheiros recolhidos ao longo do tempo a partir de websites em várias línguas que documentam eventos nacionais e internacionais. Os serviços de pesquisa que fornece incluem a pesquisa de texto integral, a pesquisa de imagens, a listagem do histórico de versões, a pesquisa avançada e [APIs](https://arquivo.pt/api), que facilitam o desenvolvimento por terceiros de aplicações de valor acrescentado. + +Ao longo dos anos, o Arquivo.pt tem sido utilizado como recurso para suportar trabalhos de pesquisa em áreas como as Humanidades ou as Ciências Sociais. Desde 2018, o [Prémio Arquivo.pt](https://perma.cc/8F6F-KZFP) distingue anualmente trabalhos inovadores baseados na informação histórica preservada pelo Arquivo.pt. 
Os pesquisadores e cidadãos têm vindo a ser sensibilizados para a importância da preservação da informação publicada na web através da realização de sessões de formação gratuitas, por exemplo, sobre a [utilização das APIs disponibilizadas pelo Arquivo.pt](https://sobre.arquivo.pt/pt/ajuda/formacao/modulo-c/). + +Todo o *software* desenvolvido está disponível como [projetos de código-aberto gratuitos](https://github.com/arquivo/) (em inglês) e, desde 2008, tem sido documentado através de [artigos técnicos e científicos](https://arquivo.pt/publica). No decorrer das suas atividades, o Arquivo.pt gera dados que podem ser úteis para suportar novos trabalhos de pesquisa, como por exemplo a lista de Páginas do Governo de Portugal nas redes sociais ou de websites de partidos políticos. Estes [dados estão disponíveis em acesso aberto](https://arquivo.pt/dadosabertos). + +[Este vídeo](https://www.youtube.com/embed/CZ6R4Zydg0Q) detalha os serviços públicos disponibilizados pelo Arquivo.pt. Pode também aceder diretamente aos [slides da apresentação](https://perma.cc/854E-9XEV). Para saber mais detalhes acerca dos serviços disponibilizados pelo Arquivo.pt consulte: +* [Módulo A: Arquivo.pt: uma nova ferramenta para pesquisar o passado (módulo A)](https://sobre.arquivo.pt/pt/ajuda/formacao/modulo-a/) do programa de "Formação acerca de preservação da Web" do Arquivo.pt. + +## Onde posso encontrar o Arquivo.pt? + +O serviço Arquivo.pt encontra-se disponível a partir dos seguintes apontadores: +* [Interfaces de usuário em português e inglês para aceder aos serviços de pesquisa de páginas, imagens e histórico de versões](https://www.arquivo.pt) +* [Website informativo acerca do Arquivo.pt](https://sobre.arquivo.pt) +* [Documentação acerca das APIs do Arquivo.pt](https://perma.cc/FV3U-ZEL9) (em inglês) + +## Como funciona a pesquisa automática via API? + +Periodicamente, o Arquivo.pt recolhe e armazena automaticamente a informação publicada na web. 
A infraestrutura de *hardware* do Arquivo.pt está alojada no seu próprio centro de dados e é gerida por pessoal a ela dedicado a tempo inteiro. + +O fluxo de trabalho de preservação é realizado através de um [sistema de informação distribuído de grande escala](https://perma.cc/A3Z7-E358). A informação web armazenada é processada automaticamente para realizar atividades de pesquisa sobre [grandes volumes de dados](https://perma.cc/9FMH-DUY8) (em inglês, *big data*), através de uma plataforma de processamento distribuído para dados não estruturados ([Hadoop](https://perma.cc/B5PH-9B4V)). Tal permite, por exemplo, a deteção automática de *spam* na web ou avaliar a acessibilidade web para pessoas com deficiências. + +Os serviços de pesquisa e acesso via APIs permitem que os pesquisadores tirem partido desta infraestrutura de processamento e dos dados históricos preservados sem terem de endereçar a complexidade do sistema que suporta o Arquivo.pt. [Este vídeo](https://www.youtube.com/embed/PPuauEwIwPE) apresenta a [Arquivo.pt API (Full-text & URL search)](https://perma.cc/6ADS-LPLC) (em inglês). Pode também aceder diretamente aos [slides da apresentação](https://perma.cc/RMS4-UD76). + +Neste tutorial iremos abordar apenas a utilização da API Full-text & URL Search do Arquivo.pt. 
Porém, este disponibiliza também outras APIs: +* [Image Search API v1.1 (beta version)](https://perma.cc/U682-VNKD) (em inglês) +* [CDX-server API (URL search): international standard](https://perma.cc/9M6Y-A4BW) (em inglês) +* [Memento API (URL search): international standard](https://perma.cc/BF5E-32LR) (em inglês) + +Para saber detalhes acerca de [todas as APIs disponibilizadas pelo Arquivo.pt](https://perma.cc/FV3U-ZEL9) (em inglês) consulte os conteúdos de formação disponíveis em: +* [Módulo C: Acesso e processamento automático de informação preservada da Web através de APIs](https://sobre.arquivo.pt/pt/ajuda/formacao/modulo-c/) do programa de "Formação acerca de preservação da Web" do Arquivo.pt. + +## Utilização + +Em seguida, apresentaremos exemplos de como utilizar a [Arquivo.pt API (Full-text & URL search)](https://github.com/arquivo/pwa-technologies/wiki/Arquivo.pt-API) (em inglês) para pesquisar, de forma automática, páginas da web arquivadas entre determinados intervalos de tempo. Como exemplo, executaremos pesquisas acerca de "[Jorge Sampaio](https://pt.wikipedia.org/wiki/Jorge_Sampaio)"(1939-2021), antigo Presidente da Câmara Municipal de Lisboa (1990-1995) e antigo Presidente da República Portuguesa (1996-2006). + +### Definição dos parâmetros de pesquisa + +O parâmetro *query* define a(s) palavra(s) a pesquisar: `Jorge Sampaio`. + +Para facilitar a leitura dos resultados de pesquisa obtidos iremos limitá-los a um máximo de 5 através do parâmetro `maxItems`. + +A totalidade dos parâmetros de pesquisa disponíveis estão definidos na secção [*Request Parameters* da documentação da API do Arquivo.pt](https://perma.cc/2DMP-3XQC) (link em inglês. Em português, parâmetros requeridos). 
+ +```python +import requests +query = "jorge sampaio" +maxItems = 5 +payload = {'q': query,'maxItems': maxItems} +r = requests.get('http://arquivo.pt/textsearch', params=payload) +print("GET",r.url) +``` + +### Percorrer os resultados obtidos no Arquivo.pt + +O seguinte código mostra os resultados de pesquisa obtidos no seu formato original (JSON): + +```python +import pprint +contentsJSon = r.json() +pprint.pprint(contentsJSon) +``` + +### Sumário dos resultados obtidos + +É possível extrair, para cada resultado, a seguinte informação: +* Título (campo `title`) +* Endereço para o conteúdo arquivado (campo `linkToArchive`) +* Data de arquivo (campo `tstamp`) +* Texto extraído da página (campo `linkToExtractedText`) + +Todos os campos obtidos como resposta a pesquisas disponíveis estão definidos na secção [*Response fields* da documentação da API do Arquivo.pt](https://perma.cc/VK9Z-EC83) (link em inglês. Em português, campos de resposta). + +```python +for item in contentsJSon["response_items"]: + title = item["title"] + url = item["linkToArchive"] + time = item["tstamp"] + + print(title) + print(url) + print(time) + + page = requests.get(item["linkToExtractedText"]) + + # Note a existencia de decode, para garantirmos que o conteudo devolvido pelo Arquivo.pt (no formato ISO-8859-1) e impresso no formato (UTF-8) + content = page.content.decode('utf-8') + print(content) + print("\n") +``` + +### Definir o intervalo temporal da pesquisa + +Uma das mais-valias do Arquivo.pt é fornecer o acesso a informação histórica publicada na web ao longo do tempo. + +No processo de acesso à informação os usuários podem definir o intervalo temporal das datas de arquivo das páginas a serem pesquisadas, através da especificação das datas pretendidas nos parâmetros de pesquisa da API `from` e `to`. Estas devem seguir o formato: ano, mês, dia, hora, minuto e segundo (aaaammddhhmmss). 
Por exemplo, a data 9 de março de 1996 seria representada por: +* 19960309000000 + +O seguinte código executa uma pesquisa por "Jorge Sampaio" de páginas arquivadas entre março de 1996 e março de 2006, período durante o qual este foi Presidente da República Portuguesa. + +```python +query = "jorge sampaio" +maxItems = 5 +fromDate = 19960309000000 +toDate = 20060309000000 +payload = {'q': query,'maxItems': maxItems, 'from': fromDate, 'to': toDate} +r = requests.get('http://arquivo.pt/textsearch', params=payload) +print("GET",r.url) +print("\n") + +contentsJSon = r.json() +for item in contentsJSon["response_items"]: + title = item["title"] + url = item["linkToArchive"] + time = item["tstamp"] + + print(title) + print(url) + print(time) + + page = requests.get(item["linkToExtractedText"]) + + # Note a existencia de decode, para garantirmos que o conteudo devolvido pelo Arquivo.pt (no formato ISO-8859-1) e impresso no formato (UTF-8) + content = page.content.decode('utf-8') + print(content) + print("\n") +``` + +### Restringir a pesquisa a um determinado website + +Se os usuários apenas tiverem interesse na informação histórica publicada por um determinado website, podem restringir a pesquisa através da especificação no parâmetro de pesquisa da API `siteSearch`. O seguinte código executa uma pesquisa por "Jorge Sampaio" de páginas arquivadas apenas a partir do website com o domínio "www.presidenciarepublica.pt", compreendidas entre março de 1996 e março de 2006, e apresenta os resultados obtidos. 
+ + +```python +query = "jorge sampaio" +maxItems = 5 +fromDate = 19960309000000 +toDate = 20060309000000 +siteSearch = "www.presidenciarepublica.pt" +payload = {'q': query,'maxItems': maxItems, 'from': fromDate, 'to': toDate, 'siteSearch': siteSearch} +r = requests.get('http://arquivo.pt/textsearch', params=payload) +print("GET",r.url) +print("\n") + +contentsJSon = r.json() +for item in contentsJSon["response_items"]: + title = item["title"] + url = item["linkToArchive"] + time = item["tstamp"] + + print(title) + print(url) + print(time) + + page = requests.get(item["linkToExtractedText"]) + + # Note a existencia de decode, para garantirmos que o conteudo devolvido pelo Arquivo.pt (no formato ISO-8859-1) e impresso no formato (UTF-8) + content = page.content.decode('utf-8') + print(content) + print("\n") +``` + +### Restringir a pesquisa a um determinado tipo de ficheiro + +Além de páginas da web, o Arquivo.pt também preserva outros formatos de ficheiro vulgarmente publicados online, como por exemplo documentos do tipo PDF. Os usuários podem definir o tipo de ficheiro sobre o qual a pesquisa deverá incidir através da especificação no parâmetro de pesquisa `type` da API. + +O seguinte código executa uma pesquisa por "Jorge Sampaio": +* Sobre ficheiros do tipo PDF +* Arquivados apenas a partir do website com o domínio "www.presidenciarepublica.pt" +* Entre março de 1996 e março de 2006 + +E apresenta os resultados obtidos. Quando o usuário abrir o endereço do conteúdo arquivado fornecido pelo campo de resposta `linkToArchive` terá acesso ao ficheiro PDF. 
+ +```python +query = "jorge sampaio" +maxItems = 5 +fromDate = 19960309000000 +toDate = 20060309000000 +siteSearch = "www.presidenciarepublica.pt" +fileType = "PDF" +payload = {'q': query,'maxItems': maxItems, 'from': fromDate, 'to': toDate, 'siteSearch': siteSearch, 'type': fileType} +r = requests.get('http://arquivo.pt/textsearch', params=payload) +print("GET",r.url) +print("\n") + +contentsJSon = r.json() +for item in contentsJSon["response_items"]: + title = item["title"] + url = item["linkToArchive"] + time = item["tstamp"] + + print(title) + print(url) + print(time) +``` + +# *Conta-me Histórias* + +O projeto *Conta-me Histórias* é desenvolvido por pesquisadores do Laboratório de Inteligência Artificial e Apoio à Decisão ([LIAAD](https://perma.cc/B5U2-R74J) — [INESCTEC](https://perma.cc/4XN7-A6TR)) e afiliados às instituições [Instituto Politécnico de Tomar](https://perma.cc/7PDB-NRAL) — [Centro de Investigação em Cidades Inteligentes (CI2)](https://perma.cc/M3CE-HQ6U), [Universidade do Porto](https://perma.cc/MGZ3-S9AQ) e [Universidade de Innsbruck](https://perma.cc/THE2-KA3L) (em inglês). O projeto visa oferecer aos usuários a possibilidade de revisitarem tópicos do passado através de uma interface semelhante ao Google que, dada uma pesquisa, devolve uma sumarização temporal das notícias mais relevantes preservadas pelo Arquivo.pt acerca desse tópico. Um vídeo promocional do projeto pode ser visualizado [aqui](https://www.youtube.com/watch?v=fcPOsBCwyu8). + +## Contributos + +Nos últimos anos, o crescente aumento na disponibilização de conteúdos online tem colocado novos desafios àqueles que pretendem entender a estória de um dado evento. Mais recentemente, fenómenos como o [media bias](https://perma.cc/MH2W-5WL4) (em português, viés mediático), as [fake news](https://perma.cc/945E-WVDK) (em português, notícias falsas) e as [filter bubbles](https://perma.cc/7M7E-S5CD) (link em inglês. 
Em português, filtro de bolha), vieram adensar ainda mais as dificuldades já existentes no acesso transparente à informação. O *Conta-me Histórias* surge, neste contexto, como um importante contributo para todos aqueles que pretendem ter acesso rápido a uma visão histórica de um dado evento, criando automaticamente narrativas resumidas a partir de um elevado volume de dados coletados no passado. A sua disponibilização em 2018, é um importante contributo para que estudantes, jornalistas, políticos, pesquisadores, etc, possam gerar conhecimento e verificar factos de uma forma rápida, a partir da consulta de *timelines* automaticamente geradas, mas também pelo recurso à consulta de páginas web tipicamente inexistentes na web mais convencional, a web do presente. + +## Onde posso encontrar o *Conta-me Histórias*? + +O projeto *Conta-me Histórias* encontra-se disponível, desde 2018, a partir dos seguintes endereços: +- Página web (versão PT): [https://contamehistorias.pt](https://contamehistorias.pt) +- Biblioteca Python: [https://github.com/LIAAD/TemporalSummarizationFramework](https://perma.cc/J7BB-28YX) (em inglês) + +Outros endereços de relevância: +- *Conta-me Histórias front-end*: [https://github.com/LIAAD/contamehistorias-ui](https://perma.cc/J7BB-28YX) (em inglês) +- *Conta-me Histórias back-end*: [https://github.com/LIAAD/contamehistorias-api](https://perma.cc/Q3MH-3T4J) (em inglês) + +Mais recentemente, em setembro de 2021, o Arquivo.pt passou a disponibilizar a funcionalidade "Narrativa", através de um botão adicional na sua interface que redireciona os usuários para o website do *Conta-me Histórias*, para que a partir deste possam criar automaticamente narrativas temporais sobre qualquer tema. A funcionalidade "Narrativa" resulta da colaboração entre a equipa do *Conta-me Histórias*, vencedora do [Prémio Arquivo.pt 2018](https://perma.cc/8F6F-KZFP), e a equipa do Arquivo.pt. + +## Como Funciona? 
+ +Quando um usuário insere um conjunto de palavras acerca de um tema na caixa de pesquisa do Arquivo.pt e clica no botão "Narrativa", é direcionado para o serviço *Conta-me Histórias* que, por sua vez, analisa automaticamente as notícias de 26 websites arquivados pelo Arquivo.pt ao longo do tempo e apresenta-lhe uma cronologia de notícias relacionadas com o tema pesquisado. + +Por exemplo, se pesquisarmos por "Jorge Sampaio" e carregarmos no botão "Narrativa", + +{% include figure.html filename="sumarizacao-narrativas-web-python-1.jpeg" alt="Pesquisa por Jorge Sampaio através do componente narrativa do Arquivo.pt" caption="Figura 1: Pesquisa por 'Jorge Sampaio' através da componente narrativa do Arquivo.pt." %} + +seremos direcionados para o *Conta-me Histórias*, onde obteremos, automaticamente, uma narrativa de notícias arquivadas. Na figura seguinte é possível observar a linha de tempo e o conjunto de notícias relevantes no período compreendido entre 2016-04-07 e 2016-11-17. O último período temporal é referente ao ano de 2019. + +{% include figure.html filename="sumarizacao-narrativas-web-python-2.jpeg" alt="Resultados da pesquisa por Jorge Sampaio no Conta-me Histórias para o período compreendido entre 07/04/2016 e 17/11/2016" caption="Figura 2: Resultados da pesquisa por 'Jorge Sampaio' no *Conta-me Histórias* para o período compreendido entre 2016-04-07 e 2016-11-17." %} + +Para a seleção das notícias mais relevantes recorremos ao [YAKE!](https://yake.inesctec.pt) (em inglês), um extrator de palavras relevantes (desenvolvido pela nossa equipa de pesquisa) e que, neste contexto, é utilizado para selecionar os excertos mais importantes de uma notícia (mais concretamente os seus títulos) ao longo do tempo. + +Um aspeto interessante da aplicação é o facto desta facilitar o acesso à página web arquivada que dá nome ao título selecionado como relevante. 
Por exemplo, ao clicar em cima do título "Jorge Sampaio formaliza apoio a Sampaio da Nóvoa" o usuário poderá visualizar a seguinte página web: + +{% include figure.html filename="sumarizacao-narrativas-web-python-3.jpeg" alt="Jorge Sampaio formaliza apoio a Sampaio da Nóvoa" caption="Figura 3: Jorge Sampaio formaliza apoio a Sampaio da Nóvoa." %} + +Paralelamente, poderá ter acesso a um conjunto de "termos relacionados" com o tópico de pesquisa. Na figura abaixo é possível observar, entre outros, a referência aos antigos presidentes da República Mário Soares e Cavaco Silva, bem como aos ex-primeiro-ministros Santana Lopes e Durão Barroso. + +{% include figure.html filename="sumarizacao-narrativas-web-python-4.jpeg" alt="Nuvem de palavras com os termos relacionados com a pesquisa Jorge Sampaio ao longo de 10 anos" caption="Figura 4: Nuvem de palavras com os termos relacionados com a pesquisa por 'Jorge Sampaio' ao longo de 10 anos." %} + +O *Conta-me Histórias* pesquisa, analisa e agrega milhares de resultados para gerar cada narrativa acerca de um tema. Recomenda-se a escolha de palavras descritivas sobre temas bem definidos, personalidades ou eventos para obter boas narrativas. Na seção seguinte descrevemos a forma como, através da biblioteca Python, os usuários podem interagir e fazer uso dos dados do *Conta-me Histórias*. + +## Instalação + +Para a instalação da [biblioteca Conta-me Histórias](https://perma.cc/4ZXT-9FB5) (em inglês) necessita de ter o [git](https://perma.cc/6BK8-XZKR) (em inglês) instalado. Após a sua instalação proceda à execução do seguinte código: + +```python +!pip install -U git+https://github.com/LIAAD/TemporalSummarizationFramework +``` + +## Utilização + +### Definição dos parâmetros de pesquisa + +No próximo código o usuário é convidado a definir o conjunto de parâmetros de pesquisa. A variável `domains` lista o conjunto de 24 websites objeto de pesquisa. 
Um aspeto interessante desta variável é a possibilidade do usuário definir a sua própria lista de fontes noticiosas. Um exercício interessante passa por definir um conjunto de meios de comunicação de âmbito mais regional, por oposição aos meios de comunicação nacionais ali listados. + +Os parâmetros `from` e `to` permitem estabelecer o espectro temporal da pesquisa. Finalmente, na variável `query` o usuário é convidado a definir o tema da pesquisa (e.g., "Jorge Sampaio") para o qual pretende construir uma narrativa temporal. Uma vez executado o código o sistema inicia o processo de pesquisa junto do Arquivo.pt. Para tal, recorre à utilização da [Arquivo.pt API (Full-text & URL search)](https://perma.cc/6ADS-LPLC) (em inglês). + +```python +from contamehistorias.datasources.webarchive import ArquivoPT +from datetime import datetime + +# Especifica o website e o ambito temporal para restringir a pesquisa +domains = [ 'http://publico.pt/', 'http://www.dn.pt/', 'http://dnoticias.pt/', 'http://www.rtp.pt/', 'http://www.cmjornal.pt/', 'http://www.iol.pt/', 'http://www.tvi24.iol.pt/', 'http://noticias.sapo.pt/', 'http://www.sapo.pt/', 'http://expresso.sapo.pt/', 'http://sol.sapo.pt/', 'http://www.jornaldenegocios.pt/', 'http://abola.pt/', 'http://www.jn.pt/', 'http://sicnoticias.sapo.pt/', 'http://www.lux.iol.pt/', 'http://www.ionline.pt/', 'http://news.google.pt/', 'http://www.dinheirovivo.pt/', 'http://www.aeiou.pt/', 'http://www.tsf.pt/', 'http://meiosepublicidade.pt/', 'http://www.sabado.pt/', 'http://economico.sapo.pt/'] + +params = { 'domains':domains, 'from':datetime(year=2011, month=1, day=1), 'to':datetime(year=2021, month=12, day=31) } + +query = 'Jorge Sampaio' + +apt = ArquivoPT() +search_result = apt.getResult(query=query, **params) +``` + +### Percorrer os resultados obtidos no Arquivo.pt + +O objeto `search_result` devolve o número total de resultados obtidos a partir da chamada à API do Arquivo.pt. 
O número total de resultados excede facilmente as 10.000 entradas, um volume de dados praticamente impossível de processar por qualquer usuário que, a partir dele, queira retirar conhecimento em tempo útil. + +```python +len(search_result) +``` +Para lá do número total de resultados o objeto `search_result` reúne informação extremamente útil para o passo seguinte do algoritmo, i.e., a seleção das notícias mais relevantes ao longo do tempo. Em concreto, este objeto permite ter acesso a: +* `datetime`: data de coleta do recurso +* `domain`: fonte noticiosa +* `headline`: título da notícia +* `url`: url original da notícia + +bastando para tal executar o seguinte código: + +```python +for x in search_result: + print(x.datetime) + print(x.domain) + print(x.headline) + print(x.url) + print() +``` + +### Determinação de datas importantes e seleção das *keywords*/títulos relevantes + +No próximo passo o sistema recorre ao algoritmo do *Conta-me Histórias* para criar um resumo das notícias mais importantes a partir do conjunto de documentos obtidos no Arquivo.pt. Cada bloco temporal determinado como relevante pelo sistema reúne um total de 20 notícias. Os vários blocos temporais determinados automaticamente pelo sistema oferecem ao usuário uma narrativa ao longo do tempo. + +```python +from contamehistorias import engine +language = "pt" + +cont = engine.TemporalSummarizationEngine() +summ_result = cont.build_intervals(search_result, language, query) + +cont.pprint(summ_result) +``` + +#### Estatísticas da pesquisa + +O código seguinte permite ter acesso a um conjunto de estatísticas globais, nomeadamente, ao número total de documentos, de domínios, bem como ao tempo total de execução do algoritmo. 
+ +```python +print(f"Número total de documentos: {summ_result['stats']['n_docs']}") +print(f"Número total de domínios: {summ_result['stats']['n_domains']}") +print(f"Tempo total de execução: {summ_result['stats']['time']}") +``` + +### Obter a lista dos domínios dos resultados da pesquisa + +Para listar todos os domínios execute o seguinte código: + +```python +for domain in summ_result["domains"]: + print(domain) +``` + +### Resultados da pesquisa para a "Narrativa" + +Finalmente, o código seguinte recorre à variável `summ_result ["results"]` para apresentar os resultados gerados com a informação necessária à produção de uma *timeline*, nomeadamente, o período temporal de cada bloco de notícias, as notícias propriamente ditas (um conjunto de 20 notícias relevantes por bloco temporal), a data de coleta, a fonte noticiosa, o url (ligação à página web original) e o título completo da notícia. + +```python +for period in summ_result["results"]: + + print("--------------------------------") + print(period["from"],"until",period["to"]) + + # Cabecalhos selecionados + keyphrases = period["keyphrases"] + + for keyphrase in keyphrases: + print("headline = " + keyphrase.kw) + + # Fontes + for headline in keyphrase.headlines: + print("Date", headline.info.datetime) + print("Source", headline.info.domain) + print("Url", headline.info.url) + print("Headline completa = ", headline.info.headline) + + print() +``` + +# Conclusões + +A web é hoje considerada uma ferramenta essencial de comunicação. Neste contexto, os arquivos web surgem como um importante recurso de preservação dos conteúdos aí publicados. Embora o seu uso seja dominado por pesquisadores, historiadores ou jornalistas, o elevado volume de dados aí disponíveis sobre o nosso passado faz deste tipo de infraestrutura uma fonte de recursos de elevado valor e extrema utilidade para os usuários mais comuns. 
O acesso generalizado a este tipo de infraestrutura obriga, no entanto, à existência de outro tipo de ferramentas capazes de satisfazer as necessidades de informação do usuário, diminuindo, ao mesmo tempo, os constrangimentos associados à exploração de elevados volumes de dados por parte de usuários não especialistas. + +Neste tutorial, procurámos mostrar como criar automaticamente sumários temporais a partir de eventos coletados no passado, fazendo uso dos dados obtidos no Arquivo.pt e da aplicação da biblioteca de sumarização temporal *Conta-me Histórias*. O tutorial aqui apresentado é um primeiro passo na tentativa de mostrarmos aos interessados na temática uma forma simples de como qualquer usuário pode, utilizando conceitos mínimos de programação, fazer uso de APIs e bibliotecas existentes para extrair conhecimento a partir de um elevado volume de dados num curto espaço de tempo. + +# Prémios + +O projeto *Conta-me Histórias* foi o vencedor do [Prémio Arquivo.pt 2018](https://sobre.arquivo.pt/pt/vencedores-premios-arquivo-pt/) e o vencedor da [Best Demo Presentation](https://ecir2019.org/workshops/) na [41st European Conference on Information Retrieval (ECIR-19)](https://ecir2019.org/) (em inglês). + +# Financiamento + +Ricardo Campos foi financiado por fundos nacionais através da Fundação para a Ciência e Tecnologia (FCT) e pela Fundação Portuguesa para Ciência e Tecnologia (I.P.) com o projeto StorySense (2022.09312.PTDC). + +# Bibliografia + +* Campos, R., Pasquali, A., Jatowt, A., Mangaravite, V., and Jorge, A.. "Automatic Generation of Timelines for Past-Web Events" In *The Past Web: Exploring Web Archives*, edited by D. Gomes, E. Demidova, J. Winters, and T. Risse, 225-242. Springer: 2021. [https://link.springer.com/chapter/10.1007/978-3-030-63291-5_18](https://perma.cc/F3SZ-5MVL) + +* Campos, R., Mangaravite, V., Pasquali, A., Jorge, A., Nunes, C., and Jatowt, A.. "YAKE! Keyword Extraction from Single Documents using Multiple Local Features". 
*Information Sciences Journal*, vol. 509 (2020): 257-289. [https://doi.org/10.1016/j.ins.2019.09.013](https://doi.org/10.1016/j.ins.2019.09.013) + +* Campos, R., Mangaravite, V., Pasquali, A., Jorge, A., Nunes, C., and Jatowt, A.. "A Text Feature Based Automatic Keyword Extraction Method for Single Documents" In *Advances in Information Retrieval. ECIR 2018 (Grenoble, France. March 26-29). Lecture Notes in Computer Science*, edited by G. Pasi, B. Piwowarski, L. Azzopardi, and A. Hanbury, vol. 10772, 684-691. Springer: 2018. [https://link.springer.com/chapter/10.1007/978-3-319-76941-7_63](https://perma.cc/3V3W-X6MZ) + +* Pasquali, A., Mangaravite, V., Campos, R., Jorge, A., and Jatowt, A.. "Interactive System for Automatically Generating Temporal Narratives" In +*Advances in Information Retrieval. ECIR'19 (Cologne, Germany. April 14-18). Lecture Notes in Computer Science*, edited by L. Azzopardi, B. Stein, N. Fuhr, P. Mayr, C. Hauff, and D. Hiemstra, vol. 11438, 251-255. Springer: 2019. [https://link.springer.com/chapter/10.1007/978-3-030-15719-7_34](https://perma.cc/MH6W-QQFD) + +* Gomes, D., Demidova, E., Winters, J., and Risse, T. (eds.), *The Past Web: Exploring Web Archives*. Springer, 2021. [https://arquivo.pt/book](https://arquivo.pt/book) [Pre-print](https://perma.cc/Q693-DLPA) + +* Gomes, D., and Costa M.. "The Importance of Web Archives for Humanities". *International Journal of Humanities and Arts Computing*, (April 2014). [https://sobre.arquivo.pt/wp-content/uploads/the-importance-of-web-archives-for-humanities.pdf](https://perma.cc/4WHP-Q534). + +* Alam, Sawood, Weigle, Michele C., Nelson, Michael L., Melo, Fernando, Bicho, Daniel, Gomes, Daniel. "MementoMap Framework for Flexible and Adaptive Web Archive Profiling" In *Proceedings of Joint Conference on Digital Libraries 2019*. Urbana-Champaign, Illinois, US: June 2019. [https://www.cs.odu.edu/~salam/drafts/mementomap-jcdl19-cameraready.pdf](https://perma.cc/7ES7-A7H7). + +* Costa, M.. 
"Information Search in Web Archives" PhD thesis, Universidade de Lisboa, December 2014. [https://sobre.arquivo.pt/wp-content/uploads/phd-thesis-information-search-in-web-archives.pdf](https://perma.cc/HU5S-M2XE) + +* Mourão, A., Gomes, D.. *The Anatomy of a Web Archive Image Search Engine. Technical Report*. Lisboa, Portugal: Arquivo.pt, dezembro 2021. [https://sobre.arquivo.pt/wp-content/uploads/The_Anatomy_of_a_Web_Archive_Image_Search_Engine_tech_report.pdf](https://perma.cc/2JF4-EF4T) diff --git a/pt/licoes/trabalhando-ficheiros-texto-python.md b/pt/licoes/trabalhando-ficheiros-texto-python.md index 8cb76db22d..d80d0dfb9c 100644 --- a/pt/licoes/trabalhando-ficheiros-texto-python.md +++ b/pt/licoes/trabalhando-ficheiros-texto-python.md @@ -1,191 +1,191 @@ ---- -title: Trabalhando com ficheiros de texto em Python -slug: trabalhando-ficheiros-texto-python -layout: lesson -date: 2012-07-17 -translation_date: 2021-05-13 -authors: -- William J. Turkel -- Adam Crymble -reviewers: -- Jim Clifford -editors: -- Miriam Posner -translator: -- Aracele Torres -translation-editor: -- Danielle Sanches -translation-reviewer: -- Bruno Martins -- Renato Rocha Souza -difficulty: 2 -review-ticket: https://github.com/programminghistorian/ph-submissions/issues/317 -activity: transforming -topics: [python] -abstract: "Nesta lição, você aprenderá a manipular ficheiros de texto usando Python." -next: code-reuse-and-modularity -previous: nocoes-basicas-paginas-web-html -python_warning: false -original: working-with-text-files -avatar_alt: Homem de óculos lendo um livro de alfabeto -doi: 10.46430/phpt0003 ---- - -{% include toc.html %} - - - - - -## Objetivos da lição - -Nesta lição, você aprenderá a manipular ficheiros de texto usando Python. -Isto inclui abrir, fechar, ler e gravar ficheiros no formato `.txt` usando instruções nesta linguagem de programação. 
- -As próximas lições desta série envolverão o download de uma página da web e a reorganização do seu conteúdo em blocos de informação úteis. Você fará a maior parte do trabalho usando código Python escrito e executado no ambiente Komodo Edit. - -## Trabalhando com ficheiros de texto - -A linguagem Python facilita o trabalho com ficheiros e texto. Vamos começar com ficheiros. - -## Criando e gravando um ficheiro de texto - -Vamos começar com uma breve discussão da terminologia. Numa lição anterior (dependendo do seu sistema operativo: [Instalação em Mac][], [Instalação em Windows][], ou [Instalação em Linux][]), você viu como enviar informação para a janela de "Saída de Comando" do seu editor de texto, usando o comando [print][] do Python. - -``` python[´p -print('olá mundo') -``` - -A linguagem de programação Python é *orientada a objetos*. Isso quer dizer que a mesma é construída em torno de um tipo especial de entidade, um *objeto*, que -contém *dados* e vários *métodos* para aceder e alterar esses dados. Depois de um objeto ser criado, ele pode interagir com outros objetos. - -No exemplo acima, vemos um tipo de objeto, a *string* "olá mundo". A *string* é a sequência de caracteres entre aspas. Você pode escrever uma *string* de três maneiras: - -``` -message1 = 'olá mundo' -message2 = "olá mundo" -message3 = """olá -olá -olá mundo""" -``` - -O importante a notar é que nos primeiros dois exemplos você pode usar aspas simples ou duplas / vírgulas invertidas, mas não pode misturar as duas dentro de uma *string*. -No terceiro exemplo, as aspas triplas significam uma *string* que abrange mais de uma linha. - -Por exemplo, as seguintes declarações estão todas erradas: - -``` -message1 = "olá mundo' -message2 = 'olá mundo" -message3 = 'O meu nome é John O'Brian' -``` - -Conte o número de aspas simples na *message3*. 
Para funcionar você -teria que *libertar* o apóstrofo: - -``` python -message3 = 'O meu nome é John O\'Brian' -``` - -Alternativamente, poderia reescrever a declaração como: - -``` python -message3 = "O meu nome é John O'Brian" -``` - -`Print` é um comando que imprime objetos na forma textual. O comando *print*, quando combinado com a *string*, produz uma *instrução*. - -Você usará `print` como indicado anteriormente nos casos em que deseja apresentar a informação imediatamente. Às vezes, no entanto, você criará informação que deseja guardar, enviar a outra pessoa, ou usar como entrada para processamento posterior por um outro programa ou conjunto de programas. Nestes casos, você desejará enviar a informação para ficheiros no seu disco rígido, em vez de para a janela de "saída de comando". Insira o seguinte programa no seu editor de texto e salve-o como `ficheiro-saida.py`. - -``` python -# ficheiro-saida.py -f = open('olamundo.txt','w') -f.write('olá mundo') -f.close() -``` - -Em Python, qualquer linha que comece com uma marca de hash (\#) é conhecida como um *comentário* e é ignorada pelo interpretador Python. Os comentários têm como objetivo permitir que os programadores comuniquem uns com os outros (ou para se lembrarem do que seu código faz quando o voltam a analisar alguns meses depois). Num sentido mais amplo, os próprios programas são tipicamente escritos e formatados de modo que seja mais fácil para os programadores comunicarem uns com os outros. Quando o código é mais próximo dos requisitos da máquina é conhecido como *baixo nível*, enquanto o que está mais próximo da linguagem natural é de *alto nível*. Um dos benefícios de usar uma linguagem como Python é que ela é de nível muito alto, tornando mais fácil a comunicação (com algum custo em termos de eficiência computacional). - -No programa anterior, *f* é um *objeto ficheiro* (*file object*), e `open` (abrir), `write` (gravar) e `close` (fechar) são *métodos de ficheiro* (*file -methods*). 
Em outras palavras, abrir, gravar, e fechar fazem algo com o objeto *f* que, neste caso, é definido como um ficheiro `.txt`. Este é provavelmente um uso diferente do termo "método" do que aquele que você poderia esperar e, de vez em quando, você descobrirá que as palavras usadas no contexto de programação têm significados ligeiramente (ou completamente) diferentes do que na fala do dia a dia. Neste caso, lembre-se de que os métodos são código que executa ações. Eles fazem algo a outra coisa e retornam um resultado. Você pode tentar pensar nisto usando um exemplo do mundo real, como dar comandos ao cão da família. O cão (o objeto) entende comandos (ou seja, tem "métodos") como "latir", "sentar", "fingir de morto" e assim por diante. Discutiremos e aprenderemos como usar muitos outros métodos à medida que avançarmos. - -*f* é um nome de variável escolhido por nós; você poderia chamá-lo de qualquer coisa que quisesse. No Python, os nomes das variáveis podem ser constituídos por letras maiúsculas e minúsculas, números, e o símbolo *underline*... mas você não pode usar os nomes dos comandos Python como variáveis. Se você tentasse nomear a sua variável de ficheiro como, por exemplo, "print", o seu programa não funcionaria porque esta é uma [palavra reservada][] que faz parte da linguagem de programação. - -Os nomes das variáveis Python também são *case-sensitive*, ou seja, diferenciam letras maiúsculas de minúsculas, o que significa que *foobar*, *Foobar* e *FOOBAR* seriam todas variáveis diferentes. - -Quando você executa o programa, o método `open` (abrir) vai dizer ao seu computador para criar um novo ficheiro de texto `olamundo.txt` na mesma pasta que você salvou o programa `ficheiro-saida.py`. O parâmetro *w* diz que você pretende gravar conteúdo neste novo ficheiro usando Python. 
- -Observe que, como o nome do ficheiro e o parâmetro estão entre aspas simples, você sabe que ambos estão armazenados como *strings*; esquecer de incluir as aspas fará com que o seu programa falhe. - -Na próxima linha, o seu programa grava a mensagem "olá mundo" (outra string) no ficheiro e o fecha. (Para obter mais informações sobre estas instruções, consulte a seção [File Objects][] na Referência da biblioteca Python.) - -Clique duas vezes no botão "Executar Python" no Komodo Edit para executar o programa (ou o equivalente em qualquer outro editor de texto que você tenha decidido usar: por exemplo, clique em "\#!" E "Executar" no TextWrangler). Embora nada seja impresso no painel "Saída de Comando", você verá uma mensagem de status que diz algo como - -``` python -`/usr/bin/python ficheiro-saida.py` returned 0. -``` - -em Mac ou Linux, ou - -``` python -'C:\Python27\Python.exe ficheiro-saida.py' returned 0. -``` - -no Windows. - -Isso significa que o seu programa foi executado com sucesso. Se você usar *Arquivo -> Abrir -> Arquivo* no Komodo Edit, você pode abrir o ficheiro `olamundo.txt`. Ele deve conter a sua mensagem numa linha: - -``` python -olá mundo -``` - -Como os ficheiros de texto incluem uma quantidade mínima de informação de formatação, eles tendem a ser pequenos, fáceis de trocar entre plataformas diferentes -(ou seja, do Windows para Linux ou Mac, ou vice-versa) e fáceis de enviar de um programa de computador para outro. Eles geralmente também podem ser lidos por pessoas que usam um editor de texto como o Komodo Edit. - -### Lendo de um ficheiro de texto - -A linguagem Python também possui métodos que permitem obter informação desde ficheiros. Digite o seguinte programa no seu editor de texto e salve-o como -`ficheiro-entrada.py`. Ao clicar em "Executar" para executá-lo, será aberto o ficheiro de texto que você acabou de criar, lida a mensagem numa linha do ficheiro, e -impressa a mensagem no painel "Saída de Comando". 
- -``` python -# ficheiro-entrada.py -f = open('olamundo.txt','r') -message = f.read() -print(message) -f.close() -``` - -Nesse caso, o parâmetro *r* é usado para indicar que você está abrindo um ficheiro para ler (`read`) a partir dele. Os parâmetros permitem que você escolha entre as diferentes opções que um método específico permite. Voltando ao exemplo do cão da família, o cão pode ser treinado a latir uma vez quando faz um lanche com sabor de carne e duas vezes quando recebe um com sabor de frango. O sabor do lanche é um parâmetro. Cada método é diferente em termos de quais parâmetros aceitará. Você não pode, por exemplo, pedir a um cão que cante uma ópera italiana - a menos que o seu cão seja particularmente talentoso. Você pode pesquisar os parâmetros possíveis para um método específico no site do Python ou, frequentemente, pode encontrá-los digitando o nome do método num motor de busca, junto com o termo "Python". - -`Read` é um outro método de ficheiro. Os conteúdos do ficheiro (a mensagem de uma linha) são copiados para a variável *message*, que é como decidimos chamar esta *string*, e então o comando `print` é usado para enviar os conteúdos de *message* para o painel "Saída do Comando". - -### Anexando conteúdo a um ficheiro de texto pré-existente - -Uma terceira opção é abrir um ficheiro pré-existente e adicionar mais conteúdo a ele. Note que se você abrir (`open`) um ficheiro e usar o método `write` (gravar), *o programa sobrescreverá tudo o que possa estar contido no ficheiro*. Isso não é um problema quando você está criando um novo ficheiro, ou quando deseja sobrescrever os conteúdos de um ficheiro existente, mas pode ser indesejável quando você está criando um registro de eventos ou compilando um grande conjunto de dados em um ficheiro. Neste caso, ao invés de `write`, você vai querer usar o método acrescentar (`append`), designado por `a`. - -Digite o seguinte programa no seu editor de texto e salve-o como`ficheiro-acrescentar.py`. 
Quando você executar este programa, ele abrirá o mesmo ficheiro `olamundo.txt` criado anteriormente e anexará uma segunda mensagem “olá mundo” ao ficheiro. A sequência '\\n' significa o início de uma nova linha. - -``` python -# ficheiro-acrescentar.py -f = open('olamundo.txt','a') -f.write('\n' + 'olá mundo') -f.close() -``` - -Depois de executar o programa, abra o ficheiro `olamundo.txt` e veja o que aconteceu. Feche o ficheiro de texto e execute mais algumas vezes o programa `ficheiro-acrescentar.py`. Quando você abrir `olamundo.txt` novamente, notará algumas mensagens 'olá mundo' extra esperando por você. - -Na próxima seção, discutiremos a modularidade e a reutilização de código. - -Leituras sugeridas ------------------- - -- [Non-Programmer's Tutorial for Python 3/Hello, World][] - - [Instalação em Mac]: https://programminghistorian.org/lessons/mac-installation - [Instalação em Windows]: https://programminghistorian.org/lessons/windows-installation - [Instalação em Linux]: https://programminghistorian.org/lessons/linux-installation - [print]: https://docs.python.org/2/reference/simple_stmts.html#the-print-statement - [palavra reservada]: http://docs.python.org/release/2.5.4/ref/keywords.html - [File Objects]: https://docs.python.org/2/library/stdtypes.html#bltin-file-objects - [Non-Programmer's Tutorial for Python 3/Hello, World]: https://en.wikibooks.org/wiki/Non-Programmer%27s_Tutorial_for_Python_3/Hello,_World +--- +title: Trabalhando com ficheiros de texto em Python +slug: trabalhando-ficheiros-texto-python +layout: lesson +date: 2012-07-17 +translation_date: 2021-05-13 +authors: +- William J. 
Turkel +- Adam Crymble +reviewers: +- Jim Clifford +editors: +- Miriam Posner +translator: +- Aracele Torres +translation-editor: +- Danielle Sanches +translation-reviewer: +- Bruno Martins +- Renato Rocha Souza +difficulty: 2 +review-ticket: https://github.com/programminghistorian/ph-submissions/issues/317 +activity: transforming +topics: [python] +abstract: "Nesta lição, você aprenderá a manipular ficheiros de texto usando Python." +next: /pt/licoes/code-reuse-and-modularity +previous: /pt/licoes/nocoes-basicas-paginas-web-html +python_warning: false +original: working-with-text-files +avatar_alt: Homem de óculos lendo um livro de alfabeto +doi: 10.46430/phpt0003 +--- + +{% include toc.html %} + + + + + +## Objetivos da lição + +Nesta lição, você aprenderá a manipular ficheiros de texto usando Python. +Isto inclui abrir, fechar, ler e gravar ficheiros no formato `.txt` usando instruções nesta linguagem de programação. + +As próximas lições desta série envolverão o download de uma página da web e a reorganização do seu conteúdo em blocos de informação úteis. Você fará a maior parte do trabalho usando código Python escrito e executado no ambiente Komodo Edit. + +## Trabalhando com ficheiros de texto + +A linguagem Python facilita o trabalho com ficheiros e texto. Vamos começar com ficheiros. + +## Criando e gravando um ficheiro de texto + +Vamos começar com uma breve discussão da terminologia. Numa lição anterior (dependendo do seu sistema operativo: [Instalação em Mac][], [Instalação em Windows][], ou [Instalação em Linux][]), você viu como enviar informação para a janela de "Saída de Comando" do seu editor de texto, usando o comando [print][] do Python. + +``` python +print('olá mundo') +``` + +A linguagem de programação Python é *orientada a objetos*. Isso quer dizer que a mesma é construída em torno de um tipo especial de entidade, um *objeto*, que +contém *dados* e vários *métodos* para aceder e alterar esses dados. 
Depois de um objeto ser criado, ele pode interagir com outros objetos. + +No exemplo acima, vemos um tipo de objeto, a *string* "olá mundo". A *string* é a sequência de caracteres entre aspas. Você pode escrever uma *string* de três maneiras: + +``` +message1 = 'olá mundo' +message2 = "olá mundo" +message3 = """olá +olá +olá mundo""" +``` + +O importante a notar é que nos primeiros dois exemplos você pode usar aspas simples ou duplas / vírgulas invertidas, mas não pode misturar as duas dentro de uma *string*. +No terceiro exemplo, as aspas triplas significam uma *string* que abrange mais de uma linha. + +Por exemplo, as seguintes declarações estão todas erradas: + +``` +message1 = "olá mundo' +message2 = 'olá mundo" +message3 = 'O meu nome é John O'Brian' +``` + +Conte o número de aspas simples na *message3*. Para funcionar você +teria que *libertar* o apóstrofo: + +``` python +message3 = 'O meu nome é John O\'Brian' +``` + +Alternativamente, poderia reescrever a declaração como: + +``` python +message3 = "O meu nome é John O'Brian" +``` + +`Print` é um comando que imprime objetos na forma textual. O comando *print*, quando combinado com a *string*, produz uma *instrução*. + +Você usará `print` como indicado anteriormente nos casos em que deseja apresentar a informação imediatamente. Às vezes, no entanto, você criará informação que deseja guardar, enviar a outra pessoa, ou usar como entrada para processamento posterior por um outro programa ou conjunto de programas. Nestes casos, você desejará enviar a informação para ficheiros no seu disco rígido, em vez de para a janela de "saída de comando". Insira o seguinte programa no seu editor de texto e salve-o como `ficheiro-saida.py`. + +``` python +# ficheiro-saida.py +f = open('olamundo.txt','w') +f.write('olá mundo') +f.close() +``` + +Em Python, qualquer linha que comece com uma marca de hash (\#) é conhecida como um *comentário* e é ignorada pelo interpretador Python. 
Os comentários têm como objetivo permitir que os programadores comuniquem uns com os outros (ou para se lembrarem do que seu código faz quando o voltam a analisar alguns meses depois). Num sentido mais amplo, os próprios programas são tipicamente escritos e formatados de modo que seja mais fácil para os programadores comunicarem uns com os outros. Quando o código é mais próximo dos requisitos da máquina é conhecido como *baixo nível*, enquanto o que está mais próximo da linguagem natural é de *alto nível*. Um dos benefícios de usar uma linguagem como Python é que ela é de nível muito alto, tornando mais fácil a comunicação (com algum custo em termos de eficiência computacional). + +No programa anterior, *f* é um *objeto ficheiro* (*file object*), e `open` (abrir), `write` (gravar) e `close` (fechar) são *métodos de ficheiro* (*file +methods*). Em outras palavras, abrir, gravar, e fechar fazem algo com o objeto *f* que, neste caso, é definido como um ficheiro `.txt`. Este é provavelmente um uso diferente do termo "método" do que aquele que você poderia esperar e, de vez em quando, você descobrirá que as palavras usadas no contexto de programação têm significados ligeiramente (ou completamente) diferentes do que na fala do dia a dia. Neste caso, lembre-se de que os métodos são código que executa ações. Eles fazem algo a outra coisa e retornam um resultado. Você pode tentar pensar nisto usando um exemplo do mundo real, como dar comandos ao cão da família. O cão (o objeto) entende comandos (ou seja, tem "métodos") como "latir", "sentar", "fingir de morto" e assim por diante. Discutiremos e aprenderemos como usar muitos outros métodos à medida que avançarmos. + +*f* é um nome de variável escolhido por nós; você poderia chamá-lo de qualquer coisa que quisesse. No Python, os nomes das variáveis podem ser constituídos por letras maiúsculas e minúsculas, números, e o símbolo *underline*... mas você não pode usar os nomes dos comandos Python como variáveis. 
Se você tentasse nomear a sua variável de ficheiro como, por exemplo, "print", o seu programa não funcionaria porque esta é uma [palavra reservada][] que faz parte da linguagem de programação. + +Os nomes das variáveis Python também são *case-sensitive*, ou seja, diferenciam letras maiúsculas de minúsculas, o que significa que *foobar*, *Foobar* e *FOOBAR* seriam todas variáveis diferentes. + +Quando você executa o programa, o método `open` (abrir) vai dizer ao seu computador para criar um novo ficheiro de texto `olamundo.txt` na mesma pasta que você salvou o programa `ficheiro-saida.py`. O parâmetro *w* diz que você pretende gravar conteúdo neste novo ficheiro usando Python. + +Observe que, como o nome do ficheiro e o parâmetro estão entre aspas simples, você sabe que ambos estão armazenados como *strings*; esquecer de incluir as aspas fará com que o seu programa falhe. + +Na próxima linha, o seu programa grava a mensagem "olá mundo" (outra string) no ficheiro e o fecha. (Para obter mais informações sobre estas instruções, consulte a seção [File Objects][] na Referência da biblioteca Python.) + +Clique duas vezes no botão "Executar Python" no Komodo Edit para executar o programa (ou o equivalente em qualquer outro editor de texto que você tenha decidido usar: por exemplo, clique em "\#!" E "Executar" no TextWrangler). Embora nada seja impresso no painel "Saída de Comando", você verá uma mensagem de status que diz algo como + +``` python +`/usr/bin/python ficheiro-saida.py` returned 0. +``` + +em Mac ou Linux, ou + +``` python +'C:\Python27\Python.exe ficheiro-saida.py' returned 0. +``` + +no Windows. + +Isso significa que o seu programa foi executado com sucesso. Se você usar *Arquivo -> Abrir -> Arquivo* no Komodo Edit, você pode abrir o ficheiro `olamundo.txt`. 
Ele deve conter a sua mensagem numa linha: + +``` python +olá mundo +``` + +Como os ficheiros de texto incluem uma quantidade mínima de informação de formatação, eles tendem a ser pequenos, fáceis de trocar entre plataformas diferentes +(ou seja, do Windows para Linux ou Mac, ou vice-versa) e fáceis de enviar de um programa de computador para outro. Eles geralmente também podem ser lidos por pessoas que usam um editor de texto como o Komodo Edit. + +### Lendo de um ficheiro de texto + +A linguagem Python também possui métodos que permitem obter informação desde ficheiros. Digite o seguinte programa no seu editor de texto e salve-o como +`ficheiro-entrada.py`. Ao clicar em "Executar" para executá-lo, será aberto o ficheiro de texto que você acabou de criar, lida a mensagem numa linha do ficheiro, e +impressa a mensagem no painel "Saída de Comando". + +``` python +# ficheiro-entrada.py +f = open('olamundo.txt','r') +message = f.read() +print(message) +f.close() +``` + +Nesse caso, o parâmetro *r* é usado para indicar que você está abrindo um ficheiro para ler (`read`) a partir dele. Os parâmetros permitem que você escolha entre as diferentes opções que um método específico permite. Voltando ao exemplo do cão da família, o cão pode ser treinado a latir uma vez quando faz um lanche com sabor de carne e duas vezes quando recebe um com sabor de frango. O sabor do lanche é um parâmetro. Cada método é diferente em termos de quais parâmetros aceitará. Você não pode, por exemplo, pedir a um cão que cante uma ópera italiana - a menos que o seu cão seja particularmente talentoso. Você pode pesquisar os parâmetros possíveis para um método específico no site do Python ou, frequentemente, pode encontrá-los digitando o nome do método num motor de busca, junto com o termo "Python". + +`Read` é um outro método de ficheiro. 
Os conteúdos do ficheiro (a mensagem de uma linha) são copiados para a variável *message*, que é como decidimos chamar esta *string*, e então o comando `print` é usado para enviar os conteúdos de *message* para o painel "Saída de Comando". + +### Anexando conteúdo a um ficheiro de texto pré-existente + +Uma terceira opção é abrir um ficheiro pré-existente e adicionar mais conteúdo a ele. Note que se você abrir (`open`) um ficheiro e usar o método `write` (gravar), *o programa sobrescreverá tudo o que possa estar contido no ficheiro*. Isso não é um problema quando você está criando um novo ficheiro, ou quando deseja sobrescrever os conteúdos de um ficheiro existente, mas pode ser indesejável quando você está criando um registro de eventos ou compilando um grande conjunto de dados em um ficheiro. Neste caso, ao invés de `write`, você vai querer usar o método acrescentar (`append`), designado por `a`. + +Digite o seguinte programa no seu editor de texto e salve-o como `ficheiro-acrescentar.py`. Quando você executar este programa, ele abrirá o mesmo ficheiro `olamundo.txt` criado anteriormente e anexará uma segunda mensagem “olá mundo” ao ficheiro. A sequência '\\n' significa o início de uma nova linha. + +``` python +# ficheiro-acrescentar.py +f = open('olamundo.txt','a') +f.write('\n' + 'olá mundo') +f.close() +``` + +Depois de executar o programa, abra o ficheiro `olamundo.txt` e veja o que aconteceu. Feche o ficheiro de texto e execute mais algumas vezes o programa `ficheiro-acrescentar.py`. Quando você abrir `olamundo.txt` novamente, notará algumas mensagens 'olá mundo' extra esperando por você. + +Na próxima seção, discutiremos a modularidade e a reutilização de código. 
+ +Leituras sugeridas +------------------ + +- [Non-Programmer's Tutorial for Python 3/Hello, World][] + + [Instalação em Mac]: https://programminghistorian.org/lessons/mac-installation + [Instalação em Windows]: https://programminghistorian.org/lessons/windows-installation + [Instalação em Linux]: https://programminghistorian.org/lessons/linux-installation + [print]: https://docs.python.org/2/reference/simple_stmts.html#the-print-statement + [palavra reservada]: https://docs.python.org/release/2.5.4/ref/keywords.html + [File Objects]: https://docs.python.org/2/library/stdtypes.html#bltin-file-objects + [Non-Programmer's Tutorial for Python 3/Hello, World]: https://en.wikibooks.org/wiki/Non-Programmer%27s_Tutorial_for_Python_3/Hello,_World diff --git a/pt/licoes/transcricao-automatica-grafias-nao-latinas.md b/pt/licoes/transcricao-automatica-grafias-nao-latinas.md index bc8c75f177..6471e1a822 100644 --- a/pt/licoes/transcricao-automatica-grafias-nao-latinas.md +++ b/pt/licoes/transcricao-automatica-grafias-nao-latinas.md @@ -78,7 +78,7 @@ No estado da arte existe uma grande variedade de arquiteturas e abordagens utili {% include figure.html filename="pt-tr-transcricao-automatica-grafias-nao-latinas-01.png" alt="Esquema das etapas clássicas para o treinamento de um modelo OCR (da anotação dos dados à aplicação do modelo)" caption="Figura 1: Detalhe das etapas clássicas para treinamento de um modelo OCR ou HTR." %}
    -Na prática, o reconhecimento de caracteres consiste apenas num problema simples de classificação em visão computacional. Qualquer que seja a etapa (deteção de conteúdos e reconhecimento do texto propriamente dito), os modelos terão de classificar as informações encontradas e reparti-las por classes conhecidas (por exemplo, considerar uma zona do texto como título ou uma outra forma de transcrever a letra A). Esta abordagem, completamente supervisionada, está muito dependente das escolhas e das necessidades identificadas, e que abordamos na secção Definição de necessidades. +Na prática, o reconhecimento de caracteres consiste apenas num problema simples de classificação em visão computacional. Qualquer que seja a etapa (deteção de conteúdos e reconhecimento do texto propriamente dito), os modelos terão de classificar as informações encontradas e reparti-las por classes conhecidas (por exemplo, considerar uma zona do texto como título ou uma outra forma de transcrever a letra A). Esta abordagem, completamente supervisionada, está muito dependente das escolhas e das necessidades identificadas, e que abordamos na secção Definição de necessidades.
    ## O caso das línguas e grafias não latinas @@ -443,7 +443,7 @@ Justifica-se uma abordagem por "baselines" (a encarnado na figura 10 encontra-se ```xml - + Calfa 2022-08-23T14:48:18+00:00 @@ -763,7 +763,7 @@ Os dados gerados neste artigo e no âmbito do projeto CGPG estão disponíveis n ## Notas de fim -[^1]: Os volumes da PG estão disponíveis em formato PDF, por exemplo, nos links [http://patristica.net/graeca](http://patristica.net/graeca) e [https://www.roger-pearse.com/weblog/patrologia-graeca-pg-pdfs](https://www.roger-pearse.com/weblog/patrologia-graeca-pg-pdfs) (em inglês). Mas apenas parte da PG está codificada em formato de "texto", por exemplo, no corpus do [Thesaurus Linguae Graecae](http://stephanus.tlg.uci.edu) (em inglês). +[^1]: Os volumes da PG estão disponíveis em formato PDF, por exemplo, nos links [https://patristica.net/graeca](https://patristica.net/graeca) e [https://www.roger-pearse.com/weblog/patrologia-graeca-pg-pdfs](https://www.roger-pearse.com/weblog/patrologia-graeca-pg-pdfs) (em inglês). Mas apenas parte da PG está codificada em formato de "texto", por exemplo, no corpus do [Thesaurus Linguae Graecae](https://stephanus.tlg.uci.edu) (em inglês). [^2]: A associação Calfa (Paris, França) e o projeto GRE*g*ORI (Université Catholique de Louvain, Louvain-la-Neuve, Bélgica) desenvolvem conjuntamente sistemas de reconhecimento de caracteres e sistemas de análise automática de textos: lematização, rotulagem morfossintática, POS_tagging). Esses desenvolvimentos já foram adaptados, testados e utilizados para processar textos em arménio, em georgiano e em sírio. O projeto CGPG continua esses desenvolvimentos no domínio do grego, propondo um processamento completo (OCR e análise) de textos editados da PG. Para os exemplos de processamento morfossintático do grego antigo realizado em conjunto: Kindt, Bastien, Chahan Vidal-Gorène, Saulo Delle Donne. "Analyse automatique du grec ancien par réseau de neurones. 
Évaluation sur le corpus De Thessalonica Capta". *BABELAO*, 10-11 (2022), 525-550. [https://doi.org/10.14428/babelao.vol1011.2022.65073](https://doi.org/10.14428/babelao.vol1011.2022.65073) (em francês). @@ -841,7 +841,7 @@ Os dados gerados neste artigo e no âmbito do projeto CGPG estão disponíveis n [^38]: *Ibid.* -[^39]: Bastien Kindt e Vidal-Gorène Chahan, "From Manuscript to Tagged Corpora. An Automated Process for Ancient Armenian or Other Under-Resourced Languages of the Christian East". *Armeniaca. International Journal of Armenian Studies* 1, 73-96, 2022. [http://doi.org/10.30687/arm/9372-8175/2022/01/005]( http://doi.org/10.30687/arm/9372-8175/2022/01/005) (em inglês). +[^39]: Bastien Kindt e Vidal-Gorène Chahan, "From Manuscript to Tagged Corpora. An Automated Process for Ancient Armenian or Other Under-Resourced Languages of the Christian East". *Armeniaca. International Journal of Armenian Studies* 1, 73-96, 2022. [https://doi.org/10.30687/arm/9372-8175/2022/01/005]( https://doi.org/10.30687/arm/9372-8175/2022/01/005) (em inglês). [^40]: Vidal-Gorène, Lucas, Salah, Decours-Perez, e Dupin. "RASAM–A Dataset for the Recognition and Analysis of Scripts in Arabic Maghrebi", 265-281. diff --git a/pt/pesquisa.md b/pt/pesquisa.md index 527d700d4d..8a51e43c97 100755 --- a/pt/pesquisa.md +++ b/pt/pesquisa.md @@ -8,16 +8,16 @@ original: research A equipe do projeto e membros da comunidade em geral estão envolvidos em várias iniciativas académicas relacionadas com o nosso trabalho aqui no *Programming Historian em português*. Tal inclui eventos, artigos em periódicos, resenhas (da comunidade) e pósteres. Se estiver a desenvolver pesquisa académica usando este projeto, por favor contate a nossa assistente de publicação Anisa Hawes. ## *Programming Historian* original -* William J.
Turkel e Alan MacEachern, [_The Programming Historian_](http://niche-canada.org/wp-content/uploads/2013/09/programming-historian-1.pdf) 1a Edição (Network in Canadian History & Environment: 2007-2008). +* William J. Turkel e Alan MacEachern, [_The Programming Historian_](https://niche-canada.org/wp-content/uploads/2013/09/programming-historian-1.pdf) 1a Edição (Network in Canadian History & Environment: 2007-2008). * Tradução ao japonês de William J. Turkel e Alan MacEachern, [_The Programming Historian_](https://www.dh.ku-orcas.kansai-u.ac.jp/?cat=2) 1a Edição (Network in Canadian History & Environment: 2007-2008). ## Resenhas -* Björn Ekström, Elisa Tattersall Wallin e Hana Marčetić, '[_Programming Historian_: Novice-friendly tutorials on digital methods](http://www.diva-portal.org/smash/record.jsf?pid=diva2%3A1508542&dswid=7551)', _Tidskrift för ABM_, Vol. 5, no 1 (2020), pp. 71-75. +* Björn Ekström, Elisa Tattersall Wallin e Hana Marčetić, '[_Programming Historian_: Novice-friendly tutorials on digital methods](https://www.diva-portal.org/smash/record.jsf?pid=diva2%3A1508542&dswid=7551)', _Tidskrift för ABM_, Vol. 5, no 1 (2020), pp. 71-75. * Dries Daems, '[A Review and Roadmap of Online Learning Platforms and Tutorials in Digital Archaeology](https://doi.org/10.1017/aap.2019.47)', _Advances in Archaeological Practice_, vol. 8, no 1 (2020), pp. 87-92. * Martin Dröge, '[Review of: The Programming Historian](https://www.hsozkult.de/webreview/id/rezwww-184)', _H-Soz-Kult_ (2019). * Priscila Pilatowsky Goñi, '[Reseña a The programming historian](https://revistas.uned.es/index.php/RHD/article/view/22420)', _Revista de Humanidades Digitales_, vol. 2 (2018). -* Lincoln Mullen, '[Review of the Programming Historian](http://jah.oxfordjournals.org/content/103/1/299.2.full)', _The Journal of American History_, vol. 103, no. 1 (2016), pp. 299-301. 
-* Cameron Blevins, '[Review of the Programming Historian](http://jitp.commons.gc.cuny.edu/review-of-the-programming-historian/)', _The Journal of Interactive Technology & Pedagogy_, vol. 8 (2015). +* Lincoln Mullen, '[Review of the Programming Historian](https://jah.oxfordjournals.org/content/103/1/299.2.full)', _The Journal of American History_, vol. 103, no. 1 (2016), pp. 299-301. +* Cameron Blevins, '[Review of the Programming Historian](https://jitp.commons.gc.cuny.edu/review-of-the-programming-historian/)', _The Journal of Interactive Technology & Pedagogy_, vol. 8 (2015). ## Pesquisa publicada @@ -28,16 +28,16 @@ A equipe do projeto e membros da comunidade em geral estão envolvidos em vária * Jennifer Isasi, Riva Quiroga, Nabeel Siddiqui, Joana Vieira Paulino, Alex Wermer-Colan, [“A Model for Multilingual and Multicultural Digital Scholarship Methods Publishing”](https://www.taylorfrancis.com/chapters/edit/10.4324/9781003393696-3/model-multilingual-multicultural-digital-scholarship-methods-publishing-jennifer-isasi-riva-quiroga-nabeel-siddiqui-joana-vieira-paulino-alex-wermer-colan), em _Multilingual Digital Humanities_, editado por Viola, L., & Spence, P., Routledge, 2023. * Adam Crymble & Charlotte M. H. Im, ['Measuring digital humanities learning requirements in Spanish & English-speaking practitioner communities'](https://doi.org/10.1007/s42803-023-00066-x), International Journal of Digital Humanities, (2023). * Eric Brasil, '[_pyHDB - Ferramenta Heurística para a Hemeroteca Digital Brasileira: utilizando técnicas de web scraping para a pesquisa em História_](https://doi.org/10.15848/hh.v15i40.1904)', _História Da Historiografia: International Journal of Theory and History of Historiography_, 15(40) (2022), 186–217.
-* Matthew Lincoln, Sarah Melton, Jennifer Isasi, François Dominic Laramée, '[Relocating Complexity: The Programming Historian and Multilingual Static Site Generation](http://www.digitalhumanities.org/dhq/vol/16/2/000585/000585.html)', _Digital Humanities Quarterly_ 16, 2 (2022). +* Matthew Lincoln, Sarah Melton, Jennifer Isasi, François Dominic Laramée, '[Relocating Complexity: The Programming Historian and Multilingual Static Site Generation](https://www.digitalhumanities.org/dhq/vol/16/2/000585/000585.html)', _Digital Humanities Quarterly_ 16, 2 (2022). * Jennifer Isasi e Antonio Rojas Castro, ‘[¿Sin equivalencia? Una reflexión sobre la traducción al español de recursos educativos abiertos](https://muse.jhu.edu/article/842253)’, _Hispania_, 104, no. 4 (2021), 613-624. * Adam Crymble e Maria José Afanador Llach, ‘The Globally Unequal Promise of Digital Tools for History: UK and Colombia Case Study’ em _Teaching History for the Contemporary World_, editado por Adele Nye, 85-98, Springer, 2021. * Daniel Alves, ['Ensinar Humanidades Digitais sem as Humanidades Digitais: um olhar a partir das licenciaturas em História'](https://novaresearch.unl.pt/files/32228034/Ensinar_Humanidades_Digitais.pdf), _Revista EducaOnline_, v. 15, n. 2 (2021). * Adam Crymble, [_Technology & the Historian: Transformations in the Digital Age_](https://www.press.uillinois.edu/books/catalog/57hxp7wr9780252043710.html), (University of Illinois Press, 2021). * Anna-Maria Sichani, James Baker, Maria José Afanador Llach, e Brandon Walsh, [‘Diversity and Inclusion in Digital Scholarship and Pedagogy: The Case of The Programming Historian’](https://doi.org/10.1629/uksg.465), _Insights_, (2019). -* Katrina Navickas e Adam Crymble, ['From Chartist Newspaper to Digital Map of Grass-roots Meetings, 1841-44: Documenting Workflows'](http://www.tandfonline.com/doi/full/10.1080/13555502.2017.1301179), _Journal of Victorian Culture_, (2017). 
+* Katrina Navickas e Adam Crymble, ['From Chartist Newspaper to Digital Map of Grass-roots Meetings, 1841-44: Documenting Workflows'](https://www.tandfonline.com/doi/full/10.1080/13555502.2017.1301179), _Journal of Victorian Culture_, (2017). * Adam Crymble, ['Identifying and Removing Gender Barriers in Open Learning Communities: The Programming Historian'](https://www.herts.ac.uk/__data/assets/pdf_file/0016/138013/Blip-2016-Autumn-2016-Final-Autumn-2016.pdf), _Blended Learning in Practice_, (2016), 49-60. [[pre-print pdf](/researchpapers/openLearningCommunities2016.pdf)] -* Fred Gibbs, ['Editorial Sustainability and Open Peer Review at Programming Historian',](http://web.archive.org/web/20180713014622/http://dhcommons.org/journal/issue-1/editorial-sustainability-and-open-peer-review-programming-historian) _DH Commons_, Vol. 1 (2015). -* Shawn Graham, Ian Milligan, and Scott Weingart, [_Exploring Big Historical Data: The Historian's Macroscope_](http://www.themacroscope.org/2.0/), (Imperial College Press, 2015). +* Fred Gibbs, ['Editorial Sustainability and Open Peer Review at Programming Historian',](https://web.archive.org/web/20180713014622/https://dhcommons.org/journal/issue-1/editorial-sustainability-and-open-peer-review-programming-historian) _DH Commons_, Vol. 1 (2015). +* Shawn Graham, Ian Milligan, and Scott Weingart, [_Exploring Big Historical Data: The Historian's Macroscope_](https://www.themacroscope.org/2.0/), (Imperial College Press, 2015). ## Relatórios @@ -62,7 +62,7 @@ A equipe do projeto e membros da comunidade em geral estão envolvidos em vária * Alex Wermer-Colan, ['Learning Digital Methods with the _Programming Historian_'](https://charlesstudy.temple.edu/event/11953011), Temple University [Em linha], (22 de fevereiro de 2024). 
* Carlo Blum, Adam Crymble, Vicky Garnett, Timothée Giraud, Alíz Horváth, Stefan Krebs, Ralph Marschall, Sofia Papastamkou, & Lorella Viola, 'Invisible College of Digital History: Workshop on Multilingual Educational Resources', C²DH [Em linha], (8 de novembro de 2023). * Nabeel Siddiqui, 'Convolutional Neural Networks for Image Classification', University of Edinburgh [Em linha], (7 de novembro de 2023). -* Eric Brasil, '[História Digital e História Digital da Educação: Caminhos Cruzados](http://www.iea.usp.br/eventos/historia-digital-educacao-caminhos-cruzados)', Instituto de Estudos Avançados, USP, São Paulo, Brasil, (17 de outubro de 2023). +* Eric Brasil, '[História Digital e História Digital da Educação: Caminhos Cruzados](https://www.iea.usp.br/eventos/historia-digital-educacao-caminhos-cruzados)', Instituto de Estudos Avançados, USP, São Paulo, Brasil, (17 de outubro de 2023). * Scott Kleinman, Alex Wermer-Colan, Joana Vieira Paulino, Nabeel Siddiqui, Zoe LeBlanc, 'Developing a Digital Humanities Tutorial', [DH 2023](https://dh2023.adho.org/), Graz, Áustria, (10 de julho de 2023). * Daphné Mathelier, 'Atelier Markdown', [11e journées du réseau Medici](https://web.archive.org/web/20230629084307/https://medici2023.sciencesconf.org/resource/page/id/2), Université de Liège, Bélgica, (29 de junho de 2023). * María José Afanador Llach, Jennifer Isasi, Riva Quiroga, 'Sobre _Programming Historian en español_ y cómo contribuir a la publicación', Semana de Humanidades Digitales 2023 [Em linha], (10 de Maio de 2023). @@ -146,10 +146,10 @@ A equipe do projeto e membros da comunidade em geral estão envolvidos em vária * Adam Crymble, 'Facilitating Making in Digital Humanities', The Archaeology of Making, University of London, Reino Unido, 5 de Maio de 2021. 
* Daniel Alves, Jennifer Isasi, Sarah Melton, Sofia Papastamkou, Jessica Parr, Riva Quiroga, Nabeel Siddiqui, Brandon Walsh, '[The Programming Historian: A Global Case Study in Multilingual Open Access and DH Tutelage/Instruction](https://msuglobaldh.org/abstracts/#programming-historian)' (panel), _Global Digital Humanities Symposium_, Michigan State University, East Lansing, USA, 12 de Abril de 2021. * Jessica Parr, '[Cambridge Cultural Heritage Data School: Final plenary](https://www.cdh.cam.ac.uk/events/cambridge-cultural-heritage-data-school-final-plenary)', University of Cambridge, Reino Unido, 30 de Março de 2021. -* Jennifer Isasi & Riva Quiroga, ['_Programming Historian_: Un proyecto colaborativo para poner la programación al alcance de los humanistas'](http://ixa2.si.ehu.eus/intele/?q=webinars), _INTELE : INfraestructura de TEcnologías del LEnguaje_, España, 25 de Março de 2021. +* Jennifer Isasi & Riva Quiroga, ['_Programming Historian_: Un proyecto colaborativo para poner la programación al alcance de los humanistas'](https://ixa2.si.ehu.eus/intele/?q=webinars), _INTELE : INfraestructura de TEcnologías del LEnguaje_, España, 25 de Março de 2021. * Sofia Papastamkou, Jessica Parr & Riva Quiroga, 'Challenges for Digital Literacy in the Humanities: The Open, Community-Based and Multilinguistic Approach of _The Programming Historian_', NewsEye’s International Conference, Europe, 17 de Março de 2021. * Riva Quiroga, ['Multilingual Digital Humanities'](https://mediacentral.ucl.ac.uk/Play/59506), Digital Humanities Long View Seminar, UCLDH, UK & CESTA, USA, 10 de Março de 2021. -* Brandon Walsh, '[The Programming Historian and Editorial Process in Digital Publishing](http://walshbr.com/blog/the-programming-historian-and-editorial-process-in-digital-publishing/)', Modern Languages Association Conference 2021, 7-10 de Janeiro de 2021.
+* Brandon Walsh, '[The Programming Historian and Editorial Process in Digital Publishing](https://walshbr.com/blog/the-programming-historian-and-editorial-process-in-digital-publishing/)', Modern Languages Association Conference 2021, 7-10 de Janeiro de 2021. * Sofia Papastamkou, François Dominic Laramée, Martin Grandjean, '[Le Programming Historian en français: quelles ressources éducatives libres pour les méthodes numériques ?](https://zenodo.org/record/3819954)', *Humanistica 2020 Conference*, Bordeaux, France, 12-14 de Maio de 2020. * Sofia Papastamkou, 'A Beating Heart of Digital History: The Programming Historian', [Teaching Digital History Workshop](https://cas.au.dk/en/cedhar/events/show/artikel/teaching-digital-history-workshop), Center for Digital History Aarhus, University of Aarhus, Denmark, 23 de Outubro de 2019. * Jennifer Isasi, Maria José Afanador y Antonio Rojas Castro, 'Retos en la producción de tutoriales de HD en contexto hispanohablantes', Conferencia ACH 2019, The Association for Computers and the Humanities, Pittsburgh, 23-26 de Julho de 2019, Pittsburgh. @@ -162,7 +162,7 @@ A equipe do projeto e membros da comunidade em geral estão envolvidos em vária * Victor Gayol, 'La investigación del pasado y la historia digital: análisis de datos y cómo aprender (The Programming Historian en español)', _Humanidades Digitales_, IV Feria Internacional de Ciencias Sociales y Humanidades, Centro Universitario de Los Lagos - Universidad de Guadalajara, Lagos de Moreno, Jalisco (9 de Março de 2017). * Victor Gayol, 'The Programming Historian: un modelo colaborativo para la investigación y la enseñanza en ciencias sociales y humanidades digitales', _Mesa de Trabajo sobre Ciencias Sociales y Humanidades Digitales_, El Colegio De Michoacán, Mexico (21 de Fevereiro de 2017). * Adam Crymble, 'Bringing Digital Humanities into the University for Free', University of Cape Town, South Africa (27-28 de Junho de 2016).
-* Fred Gibbs, 'The Programming Historian' (Poster), _American Historical Association_, New York (Janeiro de 2015). +* Fred Gibbs, 'The Programming Historian' (Poster), _American Historical Association_, New York (Janeiro de 2015). * Adam Crymble, 'The Programming Historian 2', _Digital History Seminar_, Institute of Historical Research, London (13 de Outubro de 2013). * Adam Crymble, 'The Programming Historian 2', _Digital Humanities 2012_, Hamburg (Julho de 2012). @@ -173,11 +173,11 @@ A equipe do projeto e membros da comunidade em geral estão envolvidos em vária * Matthew Lincoln, 'Multilingual Jekyll: How The Programming Historian Does That', *matthewlincoln.net*, 1 de Março de 2020, . * Sue Levine, 'The Early-Stage Ph.D.'s Guide to Summer', _Inside Higher Education_, 10 de Junho, 2019, . * 'Championing open access with online digital history journal', _University of Sussex Press Office_, 9 de Outubro, 2018, . -* Adam Crymble, 'A Decade of Programming Historians', _Network in Canadian History & Environment_, 23 de Março, 2018, . +* Adam Crymble, 'A Decade of Programming Historians', _Network in Canadian History & Environment_, 23 de Março, 2018, . * Fred Gibbs, "Sustainable Publishing: Reflections of a Former Programming Historian Editor", FredGibbs.net, 2017, . -* Anaclet Pons, "The Programming Historian en español", _Clionauta: Blog de historia_, 14 de Junho, 2017, . +* Anaclet Pons, "The Programming Historian en español", _Clionauta: Blog de historia_, 14 de Junho, 2017, . * Seth Denbo, “Historian, Program! Self-Help for Digital Neophytes,” _Perspectives on History: The Newsmagazine of the American Historical Association_, Maio 2017, . -* Víctor Gayol, '*The Programming Historian* en español', *Blog de Humanidades Digitales*, 17 de Março, 2017, . +* Víctor Gayol, '*The Programming Historian* en español', *Blog de Humanidades Digitales*, 17 de Março, 2017, . 
## Projetos que utilizam o *Programming Historian* diff --git a/pt/ppi.md b/pt/ppi.md index d2a088537a..782b9efd35 100644 --- a/pt/ppi.md +++ b/pt/ppi.md @@ -51,7 +51,7 @@ Ao aderir ao Programa de Parceiros Institucionais receberá os seguintes benefí
    - + diff --git a/pt/reportar-um-erro.md b/pt/reportar-um-erro.md index aa75317f71..f89477737f 100755 --- a/pt/reportar-um-erro.md +++ b/pt/reportar-um-erro.md @@ -12,7 +12,7 @@ Seguiu as instruções de uma lição meticulosamente e ainda assim encontrou um Definimos erros/bugs como: "Um erro em um programa de computador que produz um resultado inesperado ou que se comporta de forma diferente das instruções numa lição." Observe que não podemos atender a erros causados pelo usuário ao editar o código ou alterar materiais (conjuntos de dados, ficheiros de entrada, etc.) -Primeiro, pedimos que você verifique no nosso [rastreador de questões](https://github.com/orgs/programminghistorian/projects/6) se alguém já sinalizou o erro e, nesse caso, deixe um comentário nessa questão. Se o problema não foi relatado, siga uma destas opções: +Primeiro, pedimos que você verifique no nosso [rastreador de questões](https://github.com/programminghistorian/jekyll/issues) se alguém já sinalizou o erro e, nesse caso, deixe um comentário nessa questão. Se o problema não foi relatado, siga uma destas opções:
    Por favor, não abra um "Pull Request" com a correção. diff --git a/pt/sobre.md b/pt/sobre.md index bb816f7fb8..c6ec5e9231 100755 --- a/pt/sobre.md +++ b/pt/sobre.md @@ -14,7 +14,7 @@ O processo de revisão é um componente essencial de um esforço colaborativo, p ## Código Aberto -A equipe do _Programming Historian em Português_ está comprometida com os princípios do código aberto. Sempre que possível, todas as lições apresentadas usam linguagens de programação e software de código aberto. Esta política visa minimizar custos para todas as partes envolvidas e permitir o mais amplo nível de participação. Acreditamos que todos devem se beneficiar destes tutoriais, não apenas aqueles que têm acesso a orçamentos de pesquisa elevados para software proprietário. Desde 2016, tem sido depositada no [Zenodo](https://zenodo.org/) uma versão citável do projeto _The Programming Historian_. A cópia de 2022 está disponível em [doi.org/10.5281/zenodo.7313045](https://doi.org/10.5281/zenodo.7313045). Desde 2018, o [UK Web Archive](https://www.webarchive.org.uk/) faz rastreamentos regulares ao Programming Historian. Estes são arquivados e disponibilizados ao público no seu [website](https://www.webarchive.org.uk/wayback/en/archive/*/http://programminghistorian.org/). +A equipe do _Programming Historian em Português_ está comprometida com os princípios do código aberto. Sempre que possível, todas as lições apresentadas usam linguagens de programação e software de código aberto. Esta política visa minimizar custos para todas as partes envolvidas e permitir o mais amplo nível de participação. Acreditamos que todos devem se beneficiar destes tutoriais, não apenas aqueles que têm acesso a orçamentos de pesquisa elevados para software proprietário. Desde 2016, tem sido depositada no [Zenodo](https://zenodo.org/) uma versão citável do projeto _The Programming Historian_. A cópia de 2022 está disponível em [doi.org/10.5281/zenodo.7313045](https://doi.org/10.5281/zenodo.7313045). 
Desde 2018, o [UK Web Archive](https://www.webarchive.org.uk/) faz rastreamentos regulares ao Programming Historian. Estes são arquivados e disponibilizados ao público no seu [website](https://www.webarchive.org.uk/wayback/en/archive/*/https://programminghistorian.org/). ## Acesso Aberto _Diamante_ @@ -25,7 +25,7 @@ Não cobramos taxas de processamento de artigos (APCs) ou assinaturas para bibli O _Programming Historian em Português_ (ISSN {{ site.data.snippets.issn[page.lang] }}) está indexado no [Directory of Open Access Journals](https://doaj.org/toc/2397-2068). ## Prémios -O _Programming Historian_ ganhou vários prémios que reconhecem as suas conquistas nas esferas das publicações em acesso aberto e da pesquisa digital. Em 2016 a nossa versão em inglês ganhou o [Digital Humanities Awards](http://dhawards.org/dhawards2016/results/) na categoria de _Best Series of Posts_. No ano seguinte, em 2017, o _Programming Historian en español_ ganhou o mesmo louvor e, no ano seguinte, venceu a 'Mejor iniciativa formativa desarrollada durante el año 2018', [Humanidades Digitales Hispánicas Association](http://humanidadesdigitaleshispanicas.es/). Ganhámos o Canadian Social Knowledge Institute's Open Scholarship Award em 2020 e, em 2021, foi-nos atribuído o [Coko Foundation's Open Publishing Award](https://web.archive.org/web/20220408041024/https://openpublishingawards.org/results/2021/index.html) na categoria _Open Content_. Em 2022, ganhámos a categoria de Melhor Material de Formação de DH do [Digital Humanities Awards](http://dhawards.org/dhawards2022/results/). +O _Programming Historian_ ganhou vários prémios que reconhecem as suas conquistas nas esferas das publicações em acesso aberto e da pesquisa digital. Em 2016 a nossa versão em inglês ganhou o [Digital Humanities Awards](https://dhawards.org/dhawards2016/results/) na categoria de _Best Series of Posts_. 
No ano seguinte, em 2017, o _Programming Historian en español_ ganhou o mesmo louvor e, no ano seguinte, venceu a 'Mejor iniciativa formativa desarrollada durante el año 2018', [Humanidades Digitales Hispánicas Association](https://humanidadesdigitaleshispanicas.es/). Ganhámos o Canadian Social Knowledge Institute's Open Scholarship Award em 2020 e, em 2021, foi-nos atribuído o [Coko Foundation's Open Publishing Award](https://web.archive.org/web/20220408041024/https://openpublishingawards.org/results/2021/index.html) na categoria _Open Content_. Em 2022, ganhámos a categoria de Melhor Material de Formação de DH do [Digital Humanities Awards](https://dhawards.org/dhawards2022/results/). ## Política de Diversidade @@ -35,9 +35,9 @@ A equipe do _Programming Historian em Português_ está empenhada com a diversid O _Programming Historian_ é um projeto internacional impulsionado por voluntários, cujas atividades financeiras são administradas pela ProgHist Limited, uma instituição de caridade registada na Inglaterra e País de Gales com o número ([1195875](https://register-of-charities.charitycommission.gov.uk/charity-search/-/charity-details/5181272/charity-overview)) e constituída como uma empresa limitada por garantia, na Inglaterra e no País de Gales, com o número ([12192946](https://find-and-update.company-information.service.gov.uk/company/12192946)). É um projeto publicado pelo _Conselho Editorial do Programming Historian_. -Veja a página ['Apoie o projeto']({{site.baseurl}}/pt/apoie-nos) para uma lista dos nossos financiadores e apoios. +Veja a página ['Apoie o projeto']({{site.baseurl}}/pt/ppi) para uma lista dos nossos financiadores e apoios. ## História do Projeto -O _Programming Historian_ foi fundado em 2008 por William J. Turkel e Alan MacEachern. Na altura, Turkel publicou uma entrada no [blog](http://digitalhistoryhacks.blogspot.com/2008/01/programming-historian.html), definindo as suas intenções para o projeto. 
Inicialmente concentrou-se na linguagem de programação Python e foi publicado em acesso aberto como um projeto de 'infraestrutura digital' da Network in Canadian History & Environment (NiCHE). Em 2012, o Programming Historian expandiu a sua equipe editorial e tornou-se numa revista académica de acesso aberto com revisão por pares sobre metodologia para historiadores digitais. Em 2016, adicionámos uma versão em espanhol à publicação inicial em inglês e, em 2017, começámos a publicar lições traduzidas sob o título _[Programming Historian en español]({{site.baseurl}}/es)_. Em 2018, [organizámos o nosso primeiro workshop de escrita em espanhol](/posts/bogota-workshop-report), publicámos uma chamada para [a contribuição de novas lições em espanhol](/posts/convocatoria-de-tutoriales) e iniciámos um plano para traduzir lições de espanhol para inglês. No mesmo ano adicionámos uma versão em francês e em 2019 lançámos o _[Programming Historian en français]({{site.baseurl}}/fr)_. Em 2021, adicionámos uma [versão em português]({{site.baseurl}}/pt). +O _Programming Historian_ foi fundado em 2008 por William J. Turkel e Alan MacEachern. Na altura, Turkel publicou uma entrada no [blog](https://digitalhistoryhacks.blogspot.com/2008/01/programming-historian.html), definindo as suas intenções para o projeto. Inicialmente concentrou-se na linguagem de programação Python e foi publicado em acesso aberto como um projeto de 'infraestrutura digital' da Network in Canadian History & Environment (NiCHE). Em 2012, o Programming Historian expandiu a sua equipe editorial e tornou-se numa revista académica de acesso aberto com revisão por pares sobre metodologia para historiadores digitais. Em 2016, adicionámos uma versão em espanhol à publicação inicial em inglês e, em 2017, começámos a publicar lições traduzidas sob o título _[Programming Historian en español]({{site.baseurl}}/es)_. 
Em 2018, [organizámos o nosso primeiro workshop de escrita em espanhol](/posts/bogota-workshop-report), publicámos uma chamada para [a contribuição de novas lições em espanhol](/posts/convocatoria-de-tutoriales) e iniciámos um plano para traduzir lições de espanhol para inglês. No mesmo ano adicionámos uma versão em francês e em 2019 lançámos o _[Programming Historian en français]({{site.baseurl}}/fr)_. Em 2021, adicionámos uma [versão em português]({{site.baseurl}}/pt). diff --git a/translation-concordance.md b/translation-concordance.md index d2a9fcb51b..ae38f87d0c 100644 --- a/translation-concordance.md +++ b/translation-concordance.md @@ -1,6 +1,8 @@ --- layout: blank title: Translation Concordance +permalink: /translation-concordance/ +lang: en --- An automatically-generated list of page translation relationships across our publications. @@ -20,7 +22,11 @@ An automatically-generated list of page translation relationships across our pub
    {% for l in site.data.snippets.language-list %} {% assign sp = page_versions | where: "lang", l | first %} - + {% endfor %} {% endfor %} @@ -41,7 +47,11 @@ An automatically-generated list of page translation relationships across our pub {% for l in site.data.snippets.language-list %} {% assign sp = page_versions | where: "lang", l | first %} - + {% endfor %} {% endfor %} diff --git a/troubleshooting.md b/troubleshooting.md index c5c27c63ad..66d95d7c51 100644 --- a/troubleshooting.md +++ b/troubleshooting.md @@ -136,30 +136,30 @@ HTML learning. Other programming languages have equally valuable sets of introductory texts and websites which you can find online. - [Python for - Non-programmers](http://wiki.python.org/moin/BeginnersGuide/NonProgrammers) -- [LearnPython.org](http://learnpython.org/) This tutorial offers + Non-programmers](https://wiki.python.org/moin/BeginnersGuide/NonProgrammers) +- [LearnPython.org](https://learnpython.org/) This tutorial offers in-browser coding windows. 
- [Non-Programmer's Tutorial for Python 2.6](https://en.wikibooks.org/wiki/Non-Programmer's_Tutorial_for_Python_2.6) - [W3 Schools HTML - Tutorial](http://www.w3schools.com/html/default.asp) + Tutorial](https://www.w3schools.com/html/default.asp) As you proceed (or if you already have some programming experience) you'll probably prefer more general references like: - [Python for - Programmers](http://wiki.python.org/moin/BeginnersGuide/Programmers) -- [Python documentation page](http://docs.python.org/) + Programmers](https://wiki.python.org/moin/BeginnersGuide/Programmers) +- [Python documentation page](https://docs.python.org/) - [Python tutorial](https://docs.python.org/3/tutorial/index.html) - [Python library reference](https://docs.python.org/3/library/index.html) -- Pilgrim, [Dive into Python](http://www.diveintopython.net) +- Pilgrim, [Dive into Python](https://www.diveintopython.net) We also like to have a few printed books ready-to-hand, especially -- Lutz, *[Learning Python](http://www.worldcat.org/oclc/156890981)* -- Lutz, *[Programming Python](http://www.worldcat.org/oclc/65765375)* +- Lutz, *[Learning Python](https://www.worldcat.org/oclc/156890981)* +- Lutz, *[Programming Python](https://www.worldcat.org/oclc/65765375)* - Martelli, Ravenscroft and Ascher, *[Python - Cookbook](http://www.worldcat.org/oclc/59007845)* + Cookbook](https://www.worldcat.org/oclc/59007845)* Bring On The Code! ------------------ @@ -167,6 +167,6 @@ Bring On The Code! Now that you have Step One (Troubleshooting) mastered, you are ready to dive in. Let's get programming. - [Stack Overflow]: http://stackoverflow.com/ - [Tutor]: http://mail.python.org/mailman/listinfo/tutor - [FAQ page]: http://web.archive.org/web/20130101093828/http://stackoverflow.com/faq + [Stack Overflow]: https://stackoverflow.com/ + [Tutor]: https://mail.python.org/mailman/listinfo/tutor + [FAQ page]: https://web.archive.org/web/20130101093828/https://stackoverflow.com/faq
    {{ sp.title }} + {% if sp %} + {{ sp.title }} + {% endif %} +
    {{ sp.title }} + {% if sp %} + {{ sp.title }} + {% endif %} +