fix median divide by 0 warning?

conjuncts · Sep 4, 2024 · ffa24ac · ffa24ac
1 parent 9c9f1e4
commit ffa24ac
Show file tree

Hide file tree

Showing 2 changed files with 39 additions and 6 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,17 @@
+## v0.2.3-pre
+
+Bugfix:
+- Fixed a divide-by-zero warning when taking the median of an empty list in the row height estimate
+
+## v0.2.2
+
+- `is_projecting_row` is removed, with the information now available under `FormattedTable._projecting_indices`
+- Formally removed `timm` as a dependency
+- Slight tweak to captions, aiming to better reflect paragraph word height; still WIP. See #8 and be93159
+- Fix: return result so image can be used outside of notebook by @brycedrennan in https://github.com/conjuncts/gmft/pull/15
+
+**Full Changelog**: https://github.com/conjuncts/gmft/compare/v0.2.1...v0.2.2
+
 ## v0.2.1
 
 - GPU support, thank you @MathiasToftas!
@@ -30,10 +44,10 @@
 
 ## v0.1.1
 
-- Created AutoTableFormatter and AutoTableDetector for future flexibility
 
-## v0.1.0
+Older:
+- Created AutoTableFormatter and AutoTableDetector for future flexibility (v0.1.1, a840488)
+- Renamed is_spanning_row to is_projecting_row (v0.1.1, a840488)
+- Added support for rotated tables (v0.0.4, 5aeb80d)
+- Even better accuracy for large tables (v0.1.0, 8c537ed)
 
-- Added support for rotated tables (since v0.0.4)
-- Even better accuracy for large tables
-- Renamed is_spanning_row to is_projecting_row
diff --git a/gmft/table_function_algorithm.py b/gmft/table_function_algorithm.py
@@ -721,8 +721,27 @@ def extract_to_df(table: TATRFormattedTable, config: TATRFormatConfig=None):
                 bins[i].append(yavg)
         known_means = [np.mean(x) for x in bins if len(x)]
 
+        if not known_means:
+            # no text was detected
+            outliers['no text'] = True
+            table.effective_rows = []
+            table.effective_columns = []
+            table.effective_headers = []
+            table.effective_projecting = []
+            table.effective_spanning = []
+            table._top_header_indices = []
+            table._projecting_indices = []
+            table._hier_left_indices = []
+            table._df = pd.DataFrame()
+            table.outliers = outliers
+            return table._df
+
         differences = [known_means[i+1] - known_means[i] for i in range(len(known_means) - 1)]
-        known_height = np.median(differences)
+        if len(differences):
+            known_height = np.median(differences)
+        else:
+            # with only one row there are no row-to-row gaps to take a median of, so fall back to the table height.
+            known_height = bottom - top
 
         # means are within 0.2 * known_height of each other, consolidate them
         # actually no - use 0.6 * WORD_HEIGHT