diff --git a/doi2cite/Makefile b/doi2cite/Makefile new file mode 100644 index 00000000..8ce09052 --- /dev/null +++ b/doi2cite/Makefile @@ -0,0 +1,22 @@ +DIFF ?= diff --strip-trailing-cr -u + +test: + @pandoc --lua-filter=doi2cite.lua --wrap=preserve --output=output.md sample1.md + @$(DIFF) expected1.md output.md + @rm -f output.md + +expected1.md: sample1.md doi2cite.lua + pandoc --lua-filter=doi2cite.lua --wrap=preserve --output $@ $< + +expected1.pdf: sample1.md sample1.csl doi2cite.lua + pandoc --lua-filter=doi2cite.lua --filter=pandoc-crossref --citeproc --csl=sample1.csl --output $@ $< + +expected2.md: sample2.md doi2cite.lua + pandoc --lua-filter=doi2cite.lua --wrap=preserve --output $@ $< + +clean: + @rm -f expected1.md + @rm -f expected2.md + @rm -f expected1.pdf + +.PHONY: test diff --git a/doi2cite/README.md b/doi2cite/README.md new file mode 100644 index 00000000..4e3c0068 --- /dev/null +++ b/doi2cite/README.md @@ -0,0 +1,74 @@ +# pandoc-doi2cite +This pandoc Lua filter helps users insert references into a document +using DOI (Digital Object Identifier) tags. With this filter, users +do not need to make a bibtex file by themselves. Instead, the filter +automatically generates a bib file from the DOI tags and converts the DOI +tags into citation keys usable by --citeproc. + + + +What the filter does is as follows: +1. Search citations with DOI tags in the document +2. Search corresponding bibtex data from `__from_DOI.bib` file +3. If not found, get bibtex data of the DOI from + http://api.crossref.org +4. Add reference data to `__from_DOI.bib` file +5. Check duplications of reference keys +6. 
Replace DOI tags with the corresponding citation keys + +# Prerequisites +- Pandoc version 2.0 or newer +- This filter does not need any external dependencies +- This filter should be executed before `pandoc-crossref` or + `--citeproc` + +# DOI tags +The following DOI tags can be used: +- @https://doi.org/ +- @doi.org/ +- @DOI: +- @doi: + +The first one (@https://doi.org/) may be the most useful because it is +the same as the accessible URL. + +# YAML header +The file **name** of the auto-generated bibliography file **MUST** be +`__from_DOI.bib`, but the **place** of the file can be changed (e.g. +`'./refs/__from_DOI.bib'` or `'refs\\__from_DOI.bib'` for Windows). +You can designate the filepath in the document yaml header. The yaml key +is `bibliography`, which is also used by --citeproc. + +# Example +example1.md: +```{.md} +--- +bibliography: + - 'my_refs.bib' + - '__from_DOI.bib' +--- + +# Introduction +The Laemmli system is one of the most widely used gel systems for the +separation of proteins.[@LAEMMLI_1970] By the way, Einstein is genius. 
+[@https://doi.org/10.1002/andp.19053220607; @doi.org/10.1002/andp.1905 +3220806; @doi:10.1002/andp.19053221004] +``` + +Example command 1 (.md -\> .md) + +``` {.sh} +pandoc --lua-filter=doi2cite.lua --wrap=preserve \ +-s example1.md -o expected1.md +``` + +Example command 2 (.md -\> .pdf with +[ACS](https://pubs.acs.org/journal/jacsat) style): + +``` {.sh} +pandoc --lua-filter=doi2cite.lua --filter=pandoc-crossref --citeproc \ +--csl=sample1.csl -s example1.md -o expected1.pdf +``` + +Example result +![expected1](https://user-images.githubusercontent.com/30950088/119964566-4d952200-bfe4-11eb-90d9-ed2366c639e8.png) diff --git a/doi2cite/__from_DOI.bib b/doi2cite/__from_DOI.bib new file mode 100644 index 00000000..10ba5c25 --- /dev/null +++ b/doi2cite/__from_DOI.bib @@ -0,0 +1,36 @@ +@article{Einstein_1905, + doi = {10.1002/andp.19053220607}, + url = {https://doi.org/10.1002%2Fandp.19053220607}, + year = 1905, + publisher = {Wiley}, + volume = {322}, + number = {6}, + pages = {132--148}, + author = {A. Einstein}, + title = {Über einen die Erzeugung und Verwandlung des Lichtes betreffenden heuristischen Gesichtspunkt}, + journal = {Annalen der Physik} +} +@article{Einstein_1905_10.1002/andp.19053220806, + doi = {10.1002/andp.19053220806}, + url = {https://doi.org/10.1002%2Fandp.19053220806}, + year = 1905, + publisher = {Wiley}, + volume = {322}, + number = {8}, + pages = {549--560}, + author = {A. Einstein}, + title = {Über die von der molekularkinetischen Theorie der Wärme geforderte Bewegung von in ruhenden Flüssigkeiten suspendierten Teilchen}, + journal = {Annalen der Physik} +} +@article{Einstein_1905_10.1002/andp.19053221004, + doi = {10.1002/andp.19053221004}, + url = {https://doi.org/10.1002%2Fandp.19053221004}, + year = 1905, + publisher = {Wiley}, + volume = {322}, + number = {10}, + pages = {891--921}, + author = {A. 
Einstein}, + title = {Zur Elektrodynamik bewegter Körper}, + journal = {Annalen der Physik} +} diff --git a/doi2cite/doi2cite.lua b/doi2cite/doi2cite.lua new file mode 100644 index 00000000..48a2d041 --- /dev/null +++ b/doi2cite/doi2cite.lua @@ -0,0 +1,252 @@ +-------------------------------------------------------------------------------- +-- Copyright © 2021 Takuro Hosomi +-- This library is free software; you can redistribute it and/or modify it +-- under the terms of the MIT license. See LICENSE for details. +-------------------------------------------------------------------------------- + + +-------------------------------------------------------------------------------- +-- Global variables -- +-------------------------------------------------------------------------------- +base_url = "http://api.crossref.org" +mailto = "pandoc.doi2cite@gmail.com" +bibname = "__from_DOI.bib" +key_list = {}; +doi_key_map = {}; +doi_entry_map = {}; +error_strs = {}; +error_strs["Resource not found."] = 404 +error_strs["No acceptable resource available."] = 406 +error_strs["

503 Service Unavailable

\n" + .."No server is available to handle this request.\n" + ..""] = 503


--------------------------------------------------------------------------------
-- Private helpers --
--------------------------------------------------------------------------------
-- Write a diagnostic line to stderr.
-- stdout is avoided because pandoc writes the converted document there when
-- no --output option is given, and mixing messages into it corrupts the result.
local function log_message(msg)
    io.stderr:write(msg .. "\n")
end

-- Return the lower-cased DOI carried by a citation id, or nil when the id does
-- not start with one of the supported DOI tags (or carries an empty DOI).
local function extract_doi(id)
    local prefixes = {"https://doi.org/", "doi.org/", "DOI:", "doi:"}
    for _, prefix in ipairs(prefixes) do
        if id:sub(1, #prefix) == prefix then
            local doi = id:sub(#prefix + 1):lower()
            if doi == "" then
                return nil
            end
            return doi
        end
    end
    return nil
end

-- Fetch the bibtex entry of `doi`, append it to the bibliography file and
-- register its key. Returns the citation key, or nil when no usable entry
-- could be obtained (the citation is then left untouched by the caller).
local function register_doi(doi)
    local entry_str = get_bibentry(doi)
    if entry_str == nil or error_strs[entry_str] then
        log_message("Failed to get ref from DOI: " .. doi)
        return nil
    end
    entry_str = tex2raw(entry_str)
    local entry_key = get_entrykey(entry_str)
    if entry_key == "" then
        -- The response is not a bibtex entry (e.g. an unlisted error page);
        -- writing it would corrupt the bibliography file.
        log_message("Failed to get ref from DOI: " .. doi)
        return nil
    end
    if key_list[entry_key] then
        -- Key already used by another reference: make it unique with the DOI.
        entry_key = entry_key .. "_" .. doi
        entry_str = replace_entrykey(entry_str, entry_key)
    end
    local f = io.open(bibpath, "a+")
    if not f then
        error("Unable to open file: " .. bibpath)
    end
    f:write(entry_str .. "\n")
    f:close()
    key_list[entry_key] = true
    doi_key_map[doi] = entry_key
    return entry_key
end

-- True when `x` is an indexable value exposing a `text` field.
local function has_text(x)
    local kind = type(x)
    return (kind == "table" or kind == "userdata") and x.text ~= nil
end

-- Convert one metadata value to a path string.
-- Accepts a plain string or a list of inline elements whose first element
-- has a `text` field; returns nil for anything else (e.g. a list of values).
local function meta_to_path(value)
    if type(value) == "string" then
        return value
    end
    if type(value) ~= "table" or not has_text(value[1]) then
        return nil
    end
    local parts = {}
    for _, inline in ipairs(value) do
        if has_text(inline) then
            parts[#parts + 1] = inline.text
        elseif inline.t == "Space" then
            -- Keep blanks so that paths containing spaces survive.
            parts[#parts + 1] = " "
        end
    end
    return table.concat(parts)
end


--------------------------------------------------------------------------------
-- Pandoc Functions --
--------------------------------------------------------------------------------
-- Get bibliography filepath from yaml metadata and load the entries that the
-- bibliography file already contains. Creates the file when it is missing.
-- Sets the globals bibpath, doi_entry_map, doi_key_map and key_list.
function Meta(m)
    local bibpaths = get_paths_from(m.bibliography)
    -- bibpath stays global on purpose: Cite appends new entries to this file.
    bibpath = verify_path(find_filepath(bibname, bibpaths))
    local f = io.open(bibpath, "r")
    if f == nil then
        make_new_file(bibpath)
        return
    end
    local entries_str = f:read('*all')
    f:close()
    if entries_str then
        doi_entry_map = get_doi_entry_map(entries_str)
        doi_key_map = get_doi_key_map(entries_str)
        for _, key in pairs(doi_key_map) do
            key_list[key] = true
        end
    end
end

-- Get bibtex data of doi-based citation.id and make bibliography.
-- Then, replace "citation.id" with the bibtex key of the reference.
-- Citations that carry no DOI tag, or whose DOI cannot be resolved, are
-- returned unchanged.
function Cite(c)
    for _, citation in pairs(c.citations) do
        local id = citation.id:gsub('%s+', ''):gsub('%%2F', '/')
        local doi = extract_doi(id)
        if doi then
            local key = doi_key_map[doi] or register_doi(doi)
            if key then
                citation.id = key
            end
        end
    end
    return c
end


--------------------------------------------------------------------------------
-- Common Functions --
--------------------------------------------------------------------------------
-- Get bib of DOI from http://api.crossref.org
-- Returns the cached entry when the DOI is already in doi_entry_map,
-- otherwise the body fetched from the service, or nil when nothing was
-- obtained.
function get_bibentry(doi)
    local entry_str = doi_entry_map[doi]
    if entry_str == nil then
        log_message("Request DOI: " .. doi)
        local url = base_url.."/works/"
            ..doi.."/transform/application/x-bibtex"
            .."?mailto="..mailto
        -- fetch returns (mime type, contents). pcall guards against the call
        -- raising instead of returning nil -- NOTE(review): whether fetch
        -- raises on network/HTTP failure depends on the pandoc version.
        local ok, _, contents = pcall(pandoc.mediabag.fetch, url)
        if ok then
            entry_str = contents
        end
    end
    return entry_str
end

-- Extract designated filepaths from 1 or 2 dimensional metadata.
-- Returns a set (path -> true). A missing, empty or otherwise unusable
-- metadata value yields an empty set instead of raising.
function get_paths_from(metadata)
    local filepaths = {}
    local single = meta_to_path(metadata)
    if single then
        filepaths[single] = true
    elseif type(metadata) == "table" then
        for _, datum in pairs(metadata) do
            local path = meta_to_path(datum)
            if path then
                filepaths[path] = true
            end
        end
    end
    return filepaths
end

-- Extract filename and dirname from a given path.
-- "/" is used as the delimiter when present, otherwise "\"; a path without
-- any delimiter yields dirname = nil.
function split_path(filepath)
    local delim
    if filepath:find("/", 1, true) then
        delim = "/"
    elseif filepath:find([[\]], 1, true) then
        delim = [[\]]
    else
        return {filename = filepath, dirname = nil}
    end
    -- Position of the last delimiter, counted from the end of the path.
    local pos = filepath:reverse():find(delim, 1, true)
    local len = filepath:len()
    return {
        filename = filepath:sub(len - pos + 2),
        dirname = filepath:sub(1, len - pos),
    }
end

-- Find `filename` in a given filepath set and return the filepath if found,
-- nil otherwise.
function find_filepath(filename, filepaths)
    for path, _ in pairs(filepaths) do
        if split_path(path)["filename"] == filename then
            return path
        end
    end
    return nil
end

-- Make some TeX descriptions processable by citeproc.
-- Replaces the listed TeX macros with their Unicode characters and returns
-- the converted string.
function tex2raw(entry)
    -- Backslashes are doubled: "\t" alone would be read as a TAB character.
    local symbols = {
        ["{\\textendash}"] = "–",
        ["{\\textemdash}"] = "—",
        ["{\\textquoteright}"] = "’",
        ["{\\textquoteleft}"] = "‘",
    }
    for tex, raw in pairs(symbols) do
        -- Escape punctuation so the macro is matched literally.
        local pattern = tex:gsub("%p", "%%%0")
        entry = entry:gsub(pattern, raw)
    end
    return entry
end

-- get bibtex entry key from bibtex entry string ('' when there is none)
function get_entrykey(entry_string)
    return entry_string:match('@%w+{(.-),') or ''
end

-- get bibtex entry doi from bibtex entry string ('' when there is none)
function get_entrydoi(entry_string)
    return entry_string:match('doi%s*=%s*["{]*(.-)["}],?') or ''
end

-- Replace entry key of "entry_string" to newkey
function replace_entrykey(entry_string, newkey)
    -- "%" is special in a gsub replacement string, so it has to be doubled.
    local safe_key = newkey:gsub('%%', '%%%%')
    -- Only the first match is the entry key; later text is left alone.
    local replaced = entry_string:gsub(
        '(@%w+{).-(,)', '%1'..safe_key..'%2', 1)
    return replaced
end

-- Make hashmap which key = DOI, value = bibtex entry string.
-- DOIs are lower-cased because Cite looks them up in lower case.
function get_doi_entry_map(bibtex_string)
    local entries = {}
    for entry_str in bibtex_string:gmatch('@.-\n}\n') do
        entries[get_entrydoi(entry_str):lower()] = entry_str
    end
    return entries
end

-- Make hashmap which key = DOI, value = bibtex key string.
-- DOIs are lower-cased because Cite looks them up in lower case.
function get_doi_key_map(bibtex_string)
    local keys = {}
    for entry_str in bibtex_string:gmatch('@.-\n}\n') do
        keys[get_entrydoi(entry_str):lower()] = get_entrykey(entry_str)
    end
    return keys
end

-- function to make directories and files
-- Creates the parent directory (one level) and an empty file at `filepath`.
-- Raises an error when the file cannot be created; does nothing for nil.
function make_new_file(filepath)
    if not filepath then
        return
    end
    log_message("doi2cite: creating "..filepath)
    local dirname = split_path(filepath)["dirname"]
    if dirname and dirname ~= "" then
        -- Quoted so that directory names containing spaces reach mkdir as a
        -- single argument.
        os.execute('mkdir "'..dirname..'"')
    end
    local f = io.open(filepath, "w")
    if f then
        f:close()
    else
        error("Unable to make bibtex file: "..filepath..".\n"
            .."This error may come from the missing directory. \n"
        )
    end
end

-- Verify that the given filepath is correct.
-- Catch common Pandoc user mistakes about Windows-formatted filepath.
+function verify_path(bibpath) + if bibpath == nil then + print("[WARNING] doi2cite: " + .."The given file path is incorrect or empty. " + .."In Windows-formatted filepath, Pandoc recognizes " + .."double backslash ("..[[\\]]..") as the delimiters." + ) + return "__from_DOI.bib" + else + return bibpath + end +end + +-------------------------------------------------------------------------------- +-- The main function -- +-------------------------------------------------------------------------------- +return { + { Meta = Meta }, + { Cite = Cite } +} diff --git a/doi2cite/expected1.md b/doi2cite/expected1.md new file mode 100644 index 00000000..7843ea03 --- /dev/null +++ b/doi2cite/expected1.md @@ -0,0 +1,4 @@ +# Introduction + +The Laemmli system is one of the most widely used gel systems for the separation of proteins.[@LAEMMLI_1970] +By the way, Einstein is genius.[@Einstein_1905; @Einstein_1905_10.1002/andp.19053220806; @Einstein_1905_10.1002/andp.19053221004] diff --git a/doi2cite/expected1.pdf b/doi2cite/expected1.pdf new file mode 100644 index 00000000..b1ad6a46 Binary files /dev/null and b/doi2cite/expected1.pdf differ diff --git a/doi2cite/expected2.md b/doi2cite/expected2.md new file mode 100644 index 00000000..24e67793 --- /dev/null +++ b/doi2cite/expected2.md @@ -0,0 +1,3 @@ +# Introduction + +People sometimes make mistakes.[@DOI:10.1002/THIS.IS.NOT.VALID.DOI.SAMPLE] diff --git a/doi2cite/my_refs.bib b/doi2cite/my_refs.bib new file mode 100644 index 00000000..4022178e --- /dev/null +++ b/doi2cite/my_refs.bib @@ -0,0 +1,13 @@ +@article{LAEMMLI_1970, + doi = {10.1038/227680a0}, + url = {https://doi.org/10.1038%2F227680a0}, + year = 1970, + month = {aug}, + publisher = {Springer Science and Business Media {LLC}}, + volume = {227}, + number = {5259}, + pages = {680--685}, + author = {U. K. 
LAEMMLI}, + title = {Cleavage of Structural Proteins during the Assembly of the Head of Bacteriophage T4}, + journal = {Nature} +} diff --git a/doi2cite/sample1.csl b/doi2cite/sample1.csl new file mode 100644 index 00000000..e4283279 --- /dev/null +++ b/doi2cite/sample1.csl @@ -0,0 +1,279 @@ + + diff --git a/doi2cite/sample1.md b/doi2cite/sample1.md new file mode 100644 index 00000000..1d29c2b3 --- /dev/null +++ b/doi2cite/sample1.md @@ -0,0 +1,9 @@ +--- +bibliography: + - 'my_refs.bib' + - '__from_DOI.bib' +--- + +# Introduction +The Laemmli system is one of the most widely used gel systems for the separation of proteins.[@LAEMMLI_1970] +By the way, Einstein is genius.[@https://doi.org/10.1002/andp.19053220607; @doi.org/10.1002/andp.19053220806; @doi:10.1002/andp.19053221004] \ No newline at end of file diff --git a/doi2cite/sample2.md b/doi2cite/sample2.md new file mode 100644 index 00000000..79c0ab02 --- /dev/null +++ b/doi2cite/sample2.md @@ -0,0 +1,8 @@ +--- +bibliography: + - "my_refs.bib" + - "__from_DOI.bib" +--- + +# Introduction +People sometimes make mistakes.[@DOI:10.1002/THIS.IS.NOT.VALID.DOI.SAMPLE] \ No newline at end of file