UMassCDS · ginic · Jun 4, 2025 · Jun 4, 2025 · Jun 4, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,7 +12,7 @@ You should also add project tags for each release in Github, see [Managing relea
 
 ### Removed
 - GitHub action to run flake8 for linting in build
-
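+- Wildcard (`data/gutenberg/*.txt`) from the `count-words` DVC stage command; the corpus counter script now takes the document directory and finds the `.txt` files itself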
 
 ## [2.0.0] - 2024-05-29
 ### Added

diff --git a/dvc.lock b/dvc.lock
@@ -2,17 +2,18 @@ schema: '2.0'
 stages:
   count-words:
     cmd: python src/cdstemplate/corpus_counter_script.py data/gutenberg_counts.csv
-      data/gutenberg/*.txt --case-insensitive
+      data/gutenberg --case-insensitive
     deps:
     - path: data/gutenberg
       md5: 41d960155f1a7f55480c03cea68ba2a7.dir
       size: 10940
       nfiles: 11
     - path: src/cdstemplate/corpus_counter_script.py
       hash: md5
-      md5: a4bb400c0cfd7050ac4b761b550a0a56
-      size: 2582
+      md5: 0db1baa08c580811414d608bd98bd7d9
+      size: 2596
     outs:
     - path: data/gutenberg_counts.csv
-      md5: 74abc508b4e4015ab4136405df251a57
-      size: 4922
+      hash: md5
+      md5: 346733ebed78882d3b9871db6ef32043
+      size: 4519
diff --git a/dvc.yaml b/dvc.yaml
@@ -1,6 +1,6 @@
 stages:
   count-words:
-    cmd: python src/cdstemplate/corpus_counter_script.py data/gutenberg_counts.csv data/gutenberg/*.txt --case-insensitive
+    cmd: python src/cdstemplate/corpus_counter_script.py data/gutenberg_counts.csv data/gutenberg --case-insensitive
     deps:
     - src/cdstemplate/corpus_counter_script.py
     - data/gutenberg

diff --git a/notebooks/word_count_prototype.ipynb b/notebooks/word_count_prototype.ipynb
@@ -358,7 +358,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.13"
+   "version": "3.13.3"
   },
   "orig_nbformat": 4
  },

diff --git a/src/cdstemplate/corpus_counter_script.py b/src/cdstemplate/corpus_counter_script.py
@@ -24,9 +24,9 @@ def main_cli():
     parser.add_argument("csv", help="Path to the output CSV storing token counts. Required.")
 
     parser.add_argument(
-        "documents",
-        nargs="+",
-        help="Paths to at least one raw text document that make up the corpus. Required.",
+        "document_dir",
+        type=Path,
+        help="Path to folder containing raw .txt documents that make up the corpus. Required.",
     )
     parser.add_argument(
         "--case-insensitive",
@@ -38,19 +38,20 @@ def main_cli():
     args = parser.parse_args()
     utils.configure_logging()
     logger.info("Command line arguments: %s", args)
-    main(args.csv, args.documents, args.case_insensitive)
+    main(args.csv, args.document_dir, args.case_insensitive)
 
 
-def main(csv_out, documents, case_insensitive=False):
+def main(csv_out, document_dir, case_insensitive=False):
     """Determine cumulative word counts for a list of documents and write the results to a CSV file
 
     :param csv_out: output CSV file path
     :type csv_out: str or Path
-    :param documents: list of paths to documents to parse word counts from
-    :type documents: list of str
+    :param document_dir: Path to folder containing .txt files
+    :type document_dir: str or Path
     :param case_insensitive: Set to True to lowercase all words in cumulative counts, defaults to False
     :type case_insensitive: bool, optional
     """
+    documents = Path(document_dir).glob("*.txt")
     cc = word_count.CorpusCounter(case_insensitive=case_insensitive)
     for i, doc in enumerate(documents):
         if i % 2 == 0:

diff --git a/src/cdstemplate/word_count.py b/src/cdstemplate/word_count.py
@@ -99,7 +99,7 @@ def get_token_counts_as_dataframe(self):
         return dataframe
 
     def save_token_counts(self, csv_file):
-        """Saves the counts of tokens the corpus to a specified
+        """Saves the counts of tokens from the corpus to a specified
         CSV file in alphabetical order
 
         :param csv_file: Path to desired CSV output file