From 88a19e82ee855d7ef2af5bbcd6c434f1c53ff92e Mon Sep 17 00:00:00 2001
From: Virginia Partridge <vcpartridge@gmail.com>
Date: Wed, 4 Jun 2025 11:32:31 -0400
Subject: [PATCH 1/2] Got rid of wildcard in documents path & fixed some typos

---
 dvc.lock                                 | 11 ++++++-----
 dvc.yaml                                 |  2 +-
 notebooks/word_count_prototype.ipynb     |  2 +-
 src/cdstemplate/corpus_counter_script.py | 15 ++++++++-------
 src/cdstemplate/word_count.py            |  2 +-
 5 files changed, 17 insertions(+), 15 deletions(-)

diff --git a/dvc.lock b/dvc.lock
index aa51e59..22d096e 100644
--- a/dvc.lock
+++ b/dvc.lock
@@ -2,7 +2,7 @@ schema: '2.0'
 stages:
   count-words:
     cmd: python src/cdstemplate/corpus_counter_script.py data/gutenberg_counts.csv
-      data/gutenberg/*.txt --case-insensitive
+      data/gutenberg --case-insensitive
     deps:
     - path: data/gutenberg
       md5: 41d960155f1a7f55480c03cea68ba2a7.dir
@@ -10,9 +10,10 @@ stages:
       nfiles: 11
     - path: src/cdstemplate/corpus_counter_script.py
       hash: md5
-      md5: a4bb400c0cfd7050ac4b761b550a0a56
-      size: 2582
+      md5: 0db1baa08c580811414d608bd98bd7d9
+      size: 2596
     outs:
     - path: data/gutenberg_counts.csv
-      md5: 74abc508b4e4015ab4136405df251a57
-      size: 4922
+      hash: md5
+      md5: 346733ebed78882d3b9871db6ef32043
+      size: 4519
diff --git a/dvc.yaml b/dvc.yaml
index ed00753..d02a37c 100644
--- a/dvc.yaml
+++ b/dvc.yaml
@@ -1,6 +1,6 @@
 stages:
   count-words:
-    cmd: python src/cdstemplate/corpus_counter_script.py data/gutenberg_counts.csv data/gutenberg/*.txt --case-insensitive
+    cmd: python src/cdstemplate/corpus_counter_script.py data/gutenberg_counts.csv data/gutenberg --case-insensitive
     deps:
     - src/cdstemplate/corpus_counter_script.py
     - data/gutenberg
diff --git a/notebooks/word_count_prototype.ipynb b/notebooks/word_count_prototype.ipynb
index 02f797b..48f594a 100644
--- a/notebooks/word_count_prototype.ipynb
+++ b/notebooks/word_count_prototype.ipynb
@@ -358,7 +358,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.13.3"
   },
   "orig_nbformat": 4
  },
diff --git a/src/cdstemplate/corpus_counter_script.py b/src/cdstemplate/corpus_counter_script.py
index f0f3ae7..63f45e1 100644
--- a/src/cdstemplate/corpus_counter_script.py
+++ b/src/cdstemplate/corpus_counter_script.py
@@ -24,9 +24,9 @@ def main_cli():
     parser.add_argument("csv", help="Path to the output CSV storing token counts. Required.")
 
     parser.add_argument(
-        "documents",
-        nargs="+",
-        help="Paths to at least one raw text document that make up the corpus. Required.",
+        "document_dir",
+        type=Path,
+        help="Path to folder containing raw .txt documents that make up the corpus. Required.",
     )
     parser.add_argument(
         "--case-insensitive",
@@ -38,19 +38,20 @@ def main_cli():
     args = parser.parse_args()
     utils.configure_logging()
     logger.info("Command line arguments: %s", args)
-    main(args.csv, args.documents, args.case_insensitive)
+    main(args.csv, args.document_dir, args.case_insensitive)
 
 
-def main(csv_out, documents, case_insensitive=False):
+def main(csv_out, document_dir, case_insensitive=False):
     """Determine cumulative word counts for a list of documents and write the results to a CSV file
 
     :param csv_out: output CSV file path
     :type csv_out: str or Path
-    :param documents: list of paths to documents to parse word counts from
-    :type documents: list of str
+    :param document_dir: Path to folder containing .txt files
+    :type document_dir: Path
     :param case_insensitive: Set to True to lowercase all words in cumulative counts, defaults to False
     :type case_insensitive: bool, optional
     """
+    documents = Path(document_dir).glob("*.txt")
     cc = word_count.CorpusCounter(case_insensitive=case_insensitive)
     for i, doc in enumerate(documents):
         if i % 2 == 0:
diff --git a/src/cdstemplate/word_count.py b/src/cdstemplate/word_count.py
index ce2e561..34521a5 100644
--- a/src/cdstemplate/word_count.py
+++ b/src/cdstemplate/word_count.py
@@ -99,7 +99,7 @@ def get_token_counts_as_dataframe(self):
         return dataframe
 
     def save_token_counts(self, csv_file):
-        """Saves the counts of tokens the corpus to a specified
+        """Saves the counts of tokens from the corpus to a specified
         CSV file in alphabetical order
 
         :param csv_file: Path to desired CSV output file

From 67127f72c7627835b2bba4f0d14325a6fa1b9e5e Mon Sep 17 00:00:00 2001
From: Virginia Partridge <vcpartridge@gmail.com>
Date: Wed, 4 Jun 2025 11:37:23 -0400
Subject: [PATCH 2/2] Changelog note about wildcard deps

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4396d07..d583675 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -12,7 +12,7 @@ You should also add project tags for each release in Github, see [Managing relea
 
 ### Removed
 - GitHub action to run flake8 for linting in build
-
+- Wildcard in the `count-words` stage command; the corpus-counter script now takes a document directory instead of a list of files
 
 ## [2.0.0] - 2024-05-29
 ### Added