2 This is a Optical-Character-Recognition program
3 Copyright (C) 2000-2007 Joerg Schulenburg
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License
7 as published by the Free Software Foundation; either version 2
8 of the License, or (at your option) any later version.
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 see README for EMAIL-address
21 sometimes I have written comments in german language, sorry for that
23 - look for ??? for preliminary code
24 - space: avX=22 11-13 (empirical estimated)
28 ToDo: - add filter (r/s mismatch) g300c1
29 - better get_line2 function (problems on high resolution)
30 - write parallelizable code!
31 - learnmode (optimize filter)
32 - use ispell for final control or if unsure
33 - better line scanning (if not even)
34 - step 5: same chars differ? => expert mode
35 - chars dx>dy and above 50% hor-crossing > 4 is char-group ?
36 - detect color of chars and background
37 - better word space calculation (look at the examples)
38 (distance: left-left, middle-middle, left-right, thickness of e *0.75)
40 GLOBAL DATA (mostly structures)
41 - pix : image - one byte per pixel bits0-2=working
42 - lines : rows of the text (points to pix)
43 - box : list of bounding box for character
44 - obj : objects (lines, splines, etc. building a character)
53 #include "../../config.h"
60 // #include "pcx.h" /* needed for writebmp (removed later) */
61 /* ocr1 is the test-engine - remember: this is development version */
70 /* wew: will be exceeded by capitals at 1200dpi */
71 #define MaxBox (100*200) // largest possible letter (buffersize)
72 #define MAX(a,b) ((a) >= (b) ? (a) : (b))
74 /* if the system does not know about wchar.h, define functions here */
76 /* typedef unsigned wchar_t; */
77 /* Find the first occurrence of WC in WCS. */
78 const wchar_t *wcschr (const wchar_t *wcs, const wchar_t wc) {
79 int i; for(i=0;wcs[i];i++) if (wcs[i]==wc) return wcs+i; return NULL;
81 const wchar_t *wcscpy (wchar_t *dest, const wchar_t *src) {
82 int i; for(i=0;src[i];i++) dest[i]=src[i]; dest[i]=0; return dest;
84 size_t wcslen (const wchar_t *s){
85 size_t i; for(i=0;s[i];i++); return i;
89 wchar_t * wcsdup (const wchar_t *WS) { /* its a gnu extension */
91 copy = (wchar_t *) malloc((wcslen(WS)+1)*sizeof(wchar_t));
92 if (!copy)return NULL;
98 // ------------------------ feature extraction -----------------
99 // -------------------------------------------------------------
100 // detect maximas in of line overlaps (return in %) and line coordinates
101 // this is for future use
102 #define HOR 1 // horizontal
103 #define VER 2 // vertical
104 #define RIS 3 // rising=steigend
105 #define FAL 4 // falling=fallend
107 /* exchange two variables */
108 static void swap(int *a, int *b) {
114 // calculate the overlapping of the line (0-1) with black points
115 // by recursive bisection
116 // line: y=dy/dx*x+b, implicit form: d=F(x,y)=dy*x-dx*y+b*dx=0
117 // incremental y(i+1)=m*(x(i)+1)+b, F(x+1,y+1)=f(F(x,y))
118 // ret & 1 => inverse pixel!
119 // d=2*F(x,y) integer numbers
120 int get_line(int x0, int y0, int x1, int y1, pix *p, int cs, int ret){
121 int dx,dy,incrE,incrNE,d,x,y,r0,r1,ty,tx,
122 *px,*py,*pdx,*pdy,*ptx,*pty,*px1;
123 dx=abs(x1-x0); tx=((x1>x0)?1:-1); // tx=x-spiegelung (new)
124 dy=abs(y1-y0); ty=((y1>y0)?1:-1); // ty=y-spiegelung (new)
125 // rotate coordinate system if dy>dx
126 /*bbg: can be faster if instead of pointers we use the variables and swaps? */
127 /*js: Do not know, I am happy that the current code is working and is small */
128 if(dx>dy){ pdx=&dx;pdy=&dy;px=&x;py=&y;ptx=&tx;pty=&ty;px1=&x1; }
129 else { pdx=&dy;pdy=&dx;px=&y;py=&x;ptx=&ty;pty=&tx;px1=&y1; }
130 if( *ptx<0 ){ swap(&x0,&x1);swap(&y0,&y1);tx=-tx;ty=-ty; }
131 d=((*pdy)<<1)-(*pdx); incrE=(*pdy)<<1; incrNE=((*pdy)-(*pdx))<<1;
132 x=x0; y=y0; r0=r1=0; /* dd=tolerance (store max drift) */
133 while( (*px)<=(*px1) ){
134 if( ((getpixel(p,x,y)<cs)?1:0)^(ret&1) ) r0++; else r1++;
135 (*px)++; if( d<=0 ){ d+=incrE; } else { d+=incrNE; (*py)+=(*pty); }
137 return (r0*(ret&~1))/(r0+r1); // ret==100 => percentage %
140 // this function should detect whether a direct connection between points
141 // exists or not, not finally implemented
142 // ret & 1 => inverse pixel!
143 // d=2*F(x,y) integer numbers, ideal line: ,I pixel: I@
144 // ..@ @@@ .@. ...,@2@. +1..+3 floodfill around line ???
145 // ..@ .@@ .@. ...,.@@@ +2..+4 <= that's not implemented yet
146 // ..@ ..@ .@. ...,.@@@ +2..+4
147 // @.@ @.. .@. ...,@@@. +1..+3
148 // @.@ @@. .@. ...I@@@. 0..+3
149 // @@@ @@@ .@. ..@1@@.. 0..+2
150 // 90% 0% 100% 90% r1-r2
151 // I am not satisfied with it
152 int get_line2(int x0, int y0, int x1, int y1, pix *p, int cs, int ret){
153 int dx,dy,incrE,incrNE,d,x,y,r0,r1,ty,tx,q,ddy,rx,ry,
154 *px,*py,*pdx,*pdy,*ptx,*pty,*px1;
155 dx=abs(x1-x0); tx=((x1>x0)?1:-1); // tx=x-spiegelung (new)
156 dy=abs(y1-y0); ty=((y1>y0)?1:-1); // ty=y-spiegelung (new)
157 // rotate coordinate system if dy>dx
158 if(dx>dy){ pdx=&dx;pdy=&dy;px=&x;py=&y;ptx=&tx;pty=&ty;px1=&x1;rx=1;ry=0; }
159 else { pdx=&dy;pdy=&dx;px=&y;py=&x;ptx=&ty;pty=&tx;px1=&y1;rx=0;ry=1; }
160 if( *ptx<0 ){ swap(&x0,&x1);swap(&y0,&y1);tx=-tx;ty=-ty; }
161 d=((*pdy)<<1)-(*pdx); incrE=(*pdy)<<1; incrNE=((*pdy)-(*pdx))<<1;
162 x=x0; y=y0; r0=r1=0; ddy=3; // tolerance = bit 1 + bit 0 = left+right
163 // int t=(*pdx)/16,tl,tr; // tolerance, left-,right delimiter
164 while( (*px)<=(*px1) ){ // not finaly implemented
165 q=((getpixel(p,x,y)<cs)?1:0)^(ret&1);
166 if ( !q ){ // tolerance one pixel perpenticular to the line
167 // what about 2 or more pixels tolerance???
168 ddy&=(~1)|(((getpixel(p,x+ry,y+rx)<cs)?1:0)^(ret&1));
169 ddy&=(~2)|(((getpixel(p,x-ry,y-rx)<cs)?1:0)^(ret&1))*2;
171 if( ddy ) r0++; else r1++;
172 (*px)++; if( d<=0 ){ d+=incrE; } else { d+=incrNE; (*py)+=(*pty); }
174 return (r0*(ret&~1))/(r0+r1); // ret==100 => percentage %
177 /* Look for dots in the rectangular region x0 <= x <= x1 and y0 <= y
178 <= y1 in pixmap p. The two low order bits in mask indicate the color
179 of dots to look for: If mask==1 then look for black dots (where a
180 pixel value less than cs is considered black). If mask==2 then look
181 for white dots. If mask==3 then look for both black and white dots.
182 If the dots are found, the corresponding bits are set in the returned
183 value. Heavily used by the engine ocr0*.cc */
184 char get_bw(int x0, int x1, int y0, int y1, pix * p, int cs, int mask) {
185 char rc = 0; // later with error < 2% (1 dot)
189 if (x1 >= p->x) x1 = p->x - 1;
191 if (y1 >= p->y) y1 = p->y - 1;
193 for ( y = y0; y <= y1; y++)
194 for ( x = x0; x <= x1; x++) {
195 rc |= ((getpixel(p, x, y) < cs) ? 1 : 2); // break if rc==3
196 if ((rc & mask) == mask)
197 return mask; // break loop
202 /* more general Mar2000 (x0,x1,y0,y1 instead of x0,y0,x1,y1! (history))
203 * look for black crossings throw a line from x0,y0 to x1,y1 and count them
204 * follow line and count crossings ([white]-black-transitions)
205 * ex: horizontal num_cross of 'm' would return 3 */
206 int num_cross(int x0, int x1, int y0, int y1, pix *p, int cs) {
207 int rc = 0, col = 0, k, x, y, i, d; // rc=crossings col=0=white
208 int dx = x1 - x0, dy = y1 - y0;
210 d = MAX(abs(dx), abs(dy));
211 for (i = 0, x = x0, y = y0; i <= d; i++) {
216 k = ((getpixel(p, x, y) < cs) ? 1 : 0); // 0=white 1=black
217 if (col == 0 && k == 1)
224 /* check if test matches pattern
225 * possible pattern: "a-zA-Z0-9+\-\\" (x-y dont work for c>127)
226 * ToDo: wchar_t cc + matching UTF-8 pattern for nonASCII
228 int my_strchr( char *pattern, wchar_t cc ) {
230 if (pattern==(char *)NULL) return 0;
232 /* if (!(cc&0x80)) s1=strchr(pattern,(char)cc); else */
233 s1=strstr(pattern,decode(cc, UTF8));
237 if ((!s1) || s1-pattern<1 || *(s1-1)!='\\') return 0;
240 if (s1) return 1; /* cc simply matches */
243 if ((!s1[0]) || (!s1[1])) return 0; /* end of string */
244 if (*(s1-1)!='\\' && *(s1-1)<=cc && *(s1+1)>=cc) return 1;
245 s1=strchr(s1+1,'-'); /* look for next '-' */
251 /* set alternate chars and its weight, called from the engine
252 if a char is recognized to (weight) percent
253 can be used for filtering (only numbers etc)
254 often usefull if Il1 are looking very similar
255 should this function stay in box.c ???
256 weight is between 0 and 100 in percent, 100 means absolutely sure
257 - not final, not time critical (js)
258 - replace it by a string-function setaobj(*b,"string",weight)
259 and let call setac the setas function
262 int setas(struct box *b, char *as, int weight){
264 if (b->num_ac > NumAlt || b->num_ac<0) {
265 fprintf(stderr,"\nDBG: There is something wrong with setas()!");
269 fprintf(stderr,"\nDBG: setas(NULL) makes no sense!"); return 0; }
271 fprintf(stderr,"\nDBG: setas(\"\") makes no sense!"
272 " x= %d %d", b->x0, b->y0);
277 /* char filter (ex: only numbers) ToDo: cfilter as UTF-8 */
278 if (JOB->cfg.cfilter) {
279 /* do not accept chars which are not in the cfilter string */
280 if ( as[0]>0 && as[1]==0 )
281 if ( !my_strchr(JOB->cfg.cfilter,as[0]) ) return 0;
283 #if 0 /* obsolete, done in setac */
284 /* not sure that this is the right place, but where else? */
285 if ( as[0]>0 && as[1]==0 )
286 if (b->modifier != SPACE && b->modifier != 0) {
288 newac = compose(as[0], b->modifier);
289 as = (char *)decode(newac, UTF8); /* was (const char *) */
290 if (newac == as[0]) { /* nothing composed */
291 fprintf(stderr, "\nDBG setas compose was useless %d %d",b->x0,b->y0);
297 /* only the first run gets the full weight */
298 weight=(100-JOB->tmp.n_run)*weight/100;
300 /* remove same entries from table */
301 for (i=0;i<b->num_ac;i++)
303 if (strcmp(as,b->tas[i])==0) break;
304 if (b->num_ac>0 && i<b->num_ac){
305 if (weight<=b->wac[i]) return 0; /* if found + less weight ignore it */
306 /* to insert the new weigth on the right place, we remove it first */
307 if (b->tas[i]) free(b->tas[i]);
308 for (j=i;j<b->num_ac-1;j++){ /* shift lower entries */
309 b->tac[j]=b->tac[j+1]; /* copy the char */
310 b->tas[j]=b->tas[j+1]; /* copy the pointer to the string */
311 b->wac[j]=b->wac[j+1]; /* copy the weight */
313 b->num_ac--; /* shrink table */
315 /* sorting and add it to the table */
316 for (i=0;i<b->num_ac;i++) if (weight>b->wac[i]) break;
317 if (b->num_ac<NumAlt-1) b->num_ac++; /* enlarge table */
318 for (j=b->num_ac-1;j>i;j--){ /* shift lower entries */
319 b->tac[j]=b->tac[j-1]; /* copy the char */
320 b->tas[j]=b->tas[j-1]; /* copy the pointer to the string */
321 b->wac[j]=b->wac[j-1]; /* copy the weight */
323 if (i<b->num_ac) { /* insert new entry */
324 b->tac[i]=0; /* insert the char=0 ... */
325 b->tas[i]=(char *)malloc(strlen(as)+1); /* ... string */
326 if (b->tas[i]) memcpy(b->tas[i],as,strlen(as)+1);
327 b->wac[i]=weight; /* ... and its weight */
329 if (i==0) b->c=b->tac[0]; /* char or 0 for string */
333 /* ToDo: this function will be replaced by a call of setas() later */
334 int setac(struct box *b, wchar_t ac, int weight){
336 if ((!b) || b->num_ac > NumAlt || b->num_ac<0) {
337 fprintf(stderr,"\nDBG: This is a bad call to setac()!");
340 if (ac==0 || ac==UNKNOWN) {
341 fprintf(stderr,"\nDBG: setac(0) makes no sense!");
344 /* char filter (ex: only numbers) ToDo: cfilter as UTF-8 */
345 if (JOB->cfg.cfilter) {
346 /* do not accept chars which are not in the cfilter string */
347 /* if ( ac>255 || !strchr(JOB->cfg.cfilter,(char)ac) ) return 0; */
348 if ( !my_strchr(JOB->cfg.cfilter,ac) ) return 0;
350 /* not sure that this is the right place, but where else? */
351 if (b->modifier != SPACE && b->modifier != 0) {
353 newac = compose(ac, b->modifier);
354 if (newac == ac) { /* nothing composed */
355 if(JOB->cfg.verbose & 7)
356 fprintf(stderr, "\nDBG setac(%s): compose was useless @ %d %d",
357 decode(ac,ASCII), b->x0, b->y0);
362 /* only the first run gets the full weight */
363 weight=(100-JOB->tmp.n_run)*weight/100;
365 /* remove same entries from table */
366 for (i=0;i<b->num_ac;i++) if (ac==b->tac[i]) break;
367 if (b->num_ac>0 && i<b->num_ac){
368 if (weight<=b->wac[i]) return 0;
369 if (b->tas[i]) free(b->tas[i]);
370 for (j=i;j<b->num_ac-1;j++){ /* shift lower entries */
371 b->tac[j]=b->tac[j+1]; /* copy the char */
372 b->tas[j]=b->tas[j+1]; /* copy the pointer to the string */
373 b->wac[j]=b->wac[j+1]; /* copy the weight */
375 b->num_ac--; /* shrink table */
377 /* sorting it to the table */
378 for (i=0;i<b->num_ac;i++) if (weight>b->wac[i]) break;
379 if (b->num_ac<NumAlt-1) b->num_ac++; /* enlarge table */
380 for (j=b->num_ac-1;j>i;j--){ /* shift lower entries */
381 b->tac[j]=b->tac[j-1]; /* copy the char */
382 b->tas[j]=b->tas[j-1]; /* copy the pointer to the string */
383 b->wac[j]=b->wac[j-1]; /* copy the weight */
385 if (i<b->num_ac) { /* insert new entry */
386 b->tac[i]=ac; /* insert the char ... */
387 b->tas[j]=NULL; /* ... no string (?) */
388 b->wac[i]=weight; /* ... and its weight */
390 if (i==0) b->c=ac; /* store best result to b->c (will be obsolete) */
395 /* test if ac in wac-table
396 usefull for contextcorrection and box-splitting
397 return 0 if not found
398 return wac if found (wac>0)
400 int testac(struct box *b, wchar_t ac){
402 if (b->num_ac > NumAlt || b->num_ac<0) {
403 fprintf(stderr,"\n#DEBUG: There is something wrong with testac()!");
406 /* search entries in table */
407 for (i=0;i<b->num_ac;i++) if (ac==b->tac[i]) return b->wac[i];
412 /* look for edges: follow a line from x0,y0 to x1,y1, record the
413 * location of each transition, and return their number.
414 * ex: horizontal num_cross of 'm' would return 6
415 * remark: this function is not used, obsolete? ToDo: remove?
417 int follow_path(int x0, int x1, int y0, int y1, pix *p, int cs, path_t *path) {
418 int rc = 0, prev, x, y, i, d, color; // rc=crossings col=0=white
419 int dx = x1 - x0, dy = y1 - y0;
421 d = MAX(abs(dx), abs(dy));
422 prev = getpixel(p, x0, y0) < cs; // 0=white 1=black
424 for (i = 1, x = x0, y = y0; i <= d; i++) {
429 color = getpixel(p, x, y) < cs; // 0=white 1=black
432 int n=path->max*2+10;
433 path->x = (int *) xrealloc(path->x, n*sizeof(int));
434 path->y = (int *) xrealloc(path->y, n*sizeof(int));
447 /* ToDo: only used in follow_path, which is obsolete, remove? */
448 void *xrealloc(void *ptr, size_t size){
450 p = realloc(ptr, size);
452 fprintf(stderr, "insufficient memory");
459 * -------------------------------------------------------------
461 * - first move forward until b/w-edge
462 * - more than 2 pixel?
464 * - if forward pixel : go up, rotate right
465 * - if forward no pixel : rotate left
466 * - stop if found first 2 pixel in same order
467 * go_along_the_right_wall strategy is very similar and used otherwhere
468 * --------------------------------------------------------------
469 * turmite game: inp: start-x,y, regel r_black=UP,r_white=RIght until border
472 * could be used to extract more features:
473 * by counting stepps, dead-end streets ,xmax,ymax,ro-,ru-,lo-,lu-edges
475 * use this little animal to find features, I first was happy about it
476 * but now I prefer the loop() function
479 void turmite(pix *p, int *x, int *y,
480 int x0, int x1, int y0, int y1, int cs, int rw, int rb) {
482 if (outbounds(p, x0, y0)) // out of pixmap
484 while (*x >= x0 && *y >= y0 && *x <= x1 && *y <= y1) {
485 r = ((getpixel(p, *x, *y) < cs) ? rb : rw); // select rule
487 case UP: (*y)--; break;
488 case DO: (*y)++; break;
489 case RI: (*x)++; break;
490 case LE: (*x)--; break;
494 if( r==ST ) break; /* leave the while-loop */
498 /* search a way from p0 to p1 without crossing pixels of type t
499 * only two directions, useful to test if there is a gap 's'
500 * labyrinth algorithm - do you know a faster way? */
501 int joined(pix *p, int x0, int y0, int x1, int y1, int cs){
502 int t,r,x,y,dx,dy,xa,ya,xb,yb;
504 if(x1>x0){xa=x0;xb=x1;} else {xb=x0;xa=x1;}
505 if(y1>y0){ya=y0;yb=y1;} else {yb=y0;ya=y1;}
506 t=((getpixel(p,x,y)<cs)?1:0);
508 if( t==((getpixel(p,x+dy,y-dx)<cs)?1:0) // right free?
509 && x+dy>=xa && x+dy<=xb && y-dx>=ya && y-dx<=yb) // wall
510 { r=dy;dy=-dx;dx=r;x+=dx;y+=dy; } // rotate right and step forward
511 else { r=dx;dx=-dy;dy=r; } // rotate left
512 // fprintf(stderr," path xy %d-%d %d-%d %d %d %d %d\n",xa,xb,ya,yb,x,y,dx,dy);
513 if( x==x1 && y==y1 ) return 1;
514 if( x==x0 && y==y0 && dx==1) return 0;
516 // return 0; // endless loop ?
519 /* move from x,y to direction r until pixel of color col is found
520 * or maximum of l steps
521 * return the number of steps done */
522 int loop(pix *p,int x,int y,int l,int cs,int col, DIRECTION r){
524 if(x>=0 && y>=0 && x<p->x && y<p->y){
527 for( ;i<l && y>=0;i++,y--)
528 if( (getpixel(p,x,y)<cs)^col )
532 for( ;i<l && y<p->y;i++,y++)
533 if( (getpixel(p,x,y)<cs)^col )
537 for( ;i<l && x>=0;i++,x--)
538 if( (getpixel(p,x,y)<cs)^col )
542 for( ;i<l && x<p->x;i++,x++)
543 if( (getpixel(p,x,y)<cs)^col )
552 /* Given a point, frames a rectangle containing all points of the same
553 * color surrounding it, and mark these points.
554 * ToDo: obsolate and replaced by frame_vector
556 * looking for better algo: go horizontally and look for upper/lower non_marked_pixel/nopixel
557 * use lowest three bits for mark
558 * - recursive version removed! AmigaOS has no Stack-OVL-Event
559 * run around the chape using laby-robot
560 * bad changes can lead to endless loop!
561 * - this is not absolutely sure but mostly works well
562 * diag - 0: only pi/2 direction, 1: pi/4 directions (diagonal)
563 * mark - 3 bit marker, mark each valid pixel with it
565 int frame_nn(pix *p, int x, int y,
566 int *x0, int *x1, int *y0, int *y1, // enlarge frame
567 int cs, int mark,int diag){
568 #if 1 /* flood-fill to detect black objects, simple and faster? */
569 int rc = 0, dx, col, maxstack=0; static int overflow=0;
570 int bmax=1024, blen=0, *buf; /* buffer as replacement for recursion stack */
573 if (outbounds(p, x, y)) return 0;
574 /* check if already marked (with mark since v0.4) */
575 if ((marked(p,x,y)&mark)==mark) return 0;
577 col = ((getpixel(p, x, y) < cs) ? 0 : 1);
578 buf=(int *)malloc(bmax*sizeof(int)*2);
579 if (!buf) { fprintf(stderr,"malloc failed (frame_nn)\n");return 0;}
584 g_debug(fprintf(stderr,"\nframe_nn x=%4d y=%4d",x,y);)
586 /* max stack depth is complexity of the object */
587 if (blen>maxstack) maxstack=blen;
588 blen--; /* reduce the stack */
591 if (y < *y0) *y0 = y;
592 if (y > *y1) *y1 = y;
593 /* first go to leftmost pixel */
594 for ( ; x>0 && (col == ((getpixel(p, x-1, y) < cs) ? 0 : 1)) ; x--);
595 if ((marked(p,x,y)&mark)==mark) continue; /* already scanned */
596 for (dx=-1;dx<2;dx+=2) /* look at upper and lower line, left */
597 if ( diag && x<p->x && x-1>0 && y+dx >=0 && y+dx < p->y
598 && col != ((getpixel(p, x , y+dx) < cs) ? 0 : 1)
599 && col == ((getpixel(p, x-1, y+dx) < cs) ? 0 : 1)
600 && !((marked(p,x-1,y+dx)&mark)==mark)
602 if (blen+1>=bmax) { overflow|=1; continue; }
607 if (x < *x0) *x0 = x;
608 /* second go right, mark and get new starting points */
609 for ( ; x<p->x && (col == ((getpixel(p, x , y) < cs) ? 0 : 1)) ; x++) {
610 p->p[x + y * p->x] |= (mark & 7); rc++; /* mark pixel */
612 if (x > *x1) *x1 = x;
613 for (dx=-1;dx<2;dx+=2) /* look at upper and lower line */
614 if ( col == ((getpixel(p, x , y+dx) < cs) ? 0 : 1)
616 col != ((getpixel(p, x-1, y ) < cs) ? 0 : 1)
617 || col != ((getpixel(p, x-1, y+dx) < cs) ? 0 : 1) )
618 && !((marked(p,x,y+dx)&mark)==mark) && y+dx<p->y && y+dx>=0
620 if (blen+1>=bmax) { overflow|=1; continue; }
626 for (dx=-1;dx<2;dx+=2) /* look at upper and lower line, right */
627 if ( diag && x<p->x && x-1>0 && y+dx >=0 && y+dx < p->y
628 && col == ((getpixel(p, x-1, y ) < cs) ? 0 : 1)
629 && col != ((getpixel(p, x , y ) < cs) ? 0 : 1)
630 && col != ((getpixel(p, x-1, y+dx) < cs) ? 0 : 1)
631 && col == ((getpixel(p, x , y+dx) < cs) ? 0 : 1)
632 && !((marked(p,x,y+dx)&mark)==mark)
634 if (blen+1>=bmax) { overflow|=1; continue; }
641 /* debug, ToDo: use info maxstack and pixels for image classification */
642 g_debug(fprintf(stderr," maxstack= %4d pixels= %6d",maxstack,rc);)
645 fprintf(stderr,"# Warning: frame_nn stack oerflow\n");
648 #else /* old version, ToDo: improve it for tmp04/005*.pgm.gz */
649 int i, j, d, dx, ox, oy, od, nx, ny, rc = 0, rot = 0, x2 = x, y2 = y, ln;
651 static const int d0[8][2] = { { 0, -1} /* up */, {-1, -1},
652 {-1, 0} /* left */, {-1, 1},
653 { 0, 1} /* down */, { 1, 1},
654 { 1, 0} /* right */, { 1, -1}};
657 if (outbounds(p, x, y))
659 /* check if already marked */
660 if ((marked(p,x,y)&mark)==mark)
663 i = ((getpixel(p, x, y) < cs) ? 0 : 1);
666 g_debug(fprintf(stderr," start frame:");)
668 for (ln = 0; ln < 2 && rot >= 0; ln++) { // repeat if right-loop
669 g_debug(fprintf(stderr," ln=%d diag=%d cs=%d x=%d y=%d - go to border\n",ln,diag,cs,x,y);)
671 od=d=(8+4*ln-diag)&7; // start robot looks up, right is a wall
672 // go to right (left) border
676 /* start on leftmost position */
677 for (dx = 1 - 2*ln; x + dx < p->x && x + dx >= 0 /* bounds */ &&
678 i == ((getpixel(p, x + dx, y) < cs) ? 0 : 1) /* color */;
681 g_debug(fprintf(stderr," ln=%d diag=%d cs=%d x=%d y=%d\n",ln,diag,cs,x,y);)
683 /* robot stores start-position */
685 for (rot = 0; abs(rot) <= 64; ) { /* for sure max. 8 spirals */
686 /* leftmost position */
687 if (ln == 0 && x < x2) {
691 g_debug(fprintf(stderr," x=%3d y=%3d d=%d i=%d p=%3d rc=%d\n",x,y,d,i,getpixel(p,x,y),rc);)
693 if ( abs(d0[d][1]) ) { /* mark left (right) pixels */
694 for (j = 0, dx = d0[d][1]; x + j >= 0 && x + j < p->x
695 && i == ((getpixel(p, x + j, y) < cs) ? 0 : 1); j += dx) {
696 if (!((marked(p, x + j, y)&mark)==mark))
698 p->p[x + j + y * p->x] |= (mark & 7);
701 /* look to the front of robot */
704 /* if right is a wall */
705 if ( outbounds(p, nx, ny) || i != ((getpixel(p,nx,ny)<cs) ? 0 : 1) ) {
707 d=(d+2-diag) & 7; rot-=2-diag;
709 else { /* if no wall, go, turn back and rotate left */
710 x=nx; y=ny; d=(d+4+2-diag) & 7; rot+=2-diag+4;
712 if (x < *x0) *x0 = x;
713 if (x > *x1) *x1 = x;
714 if (y < *y0) *y0 = y;
715 if (y > *y1) *y1 = y;
717 if(x==ox && y==oy && d==od) break; // round trip finished
720 g_debug(fprintf(stderr," rot=%d\n",rot);)
725 /* obsolete! replaced by vectors
726 * mark neighbouring pixel of same color, return number
727 * better with neighbours of same color (more general) ???
728 * parameters: (&~7)-pixmap, start-point, critical_value, mark
729 * recursion is removed */
730 int mark_nn(pix * p, int x, int y, int cs, int r) {
731 /* out of bounds or already marked? */
732 if (outbounds(p, x, y) || (marked(p, x, y)&r)==r)
737 y0 = y1 = y; // not used
738 return frame_nn(p, x, y, &x0, &x1, &y0, &y1, cs, r, JOB->tmp.n_run & 1);
743 /* ToDo: finish to replace old frame by this new one
745 * @...........#@@@@@@@. # = marked as already scanned black pixels
746 * @........@@@@@@@@@@@# only left and right border
747 * .......#@@@@@@@@@@@@@ left side on even y
748 * ......@@@@@@@@#.@@@@# right side on odd y
749 * .....#@@@@@......#@@@ no border is marked twice
750 * ....@@@@@#......@@@#. works also for thinn lines
751 * ...#@@@@........#@@@. - outer loop is stored as first
752 * ..@@@@#........@@@#.. - inner loop is stored as second
753 * .#@@@@........#@@@@.. 1st in an extra box (think on white chars)
754 * @@@@#.......@@@@#.... 2nd merge in an extra step
755 * #@@@@@....#@@@@@.....
756 * @@@@@@@@@@@@@@#......
757 * .#@@@@@@@@@@@@.......
759 * run around the chape using laby-robot
760 * - used for scanning boxes, look for horizontal b/w transitions
761 * with unmarked black pixels and call this routine
762 * - stop if crossing a marked box in same direction (left=up, right=down)
763 * box - char box, store frame_vectors and box
764 * x,y - starting point
765 * mark - 3 bit marker, mark each valid pixel with it
766 * diag - 0: only pi/2 direction, 1: pi/4 directions (diagonal)
767 * ds - start direction, 6=right of right border, 2=left of left border
768 * ret - 0=ok, -1=already marked, -2=max_num_frames_exceeded
769 * -7=no border in direction ds
775 /* grep keywords: scan_vectors frame_vector */
776 int frame_vector(struct box *box1, int x, int y,
777 int cs, int mark, int diag, int ds) {
779 new_x=1, /* flag for storing the vector x,y */
780 steps=1, /* steps between stored vectors, speedup for big frames */
782 ox, oy, /* starting point */
783 nx, ny, mx, my, /* used for simplification */
784 /* ToDo: add periphery to box (german: Umfang?) */
785 rc = 1, /* return code, circumference, sum vector lengths */
786 rot = 0, /* memory for rotation, rot=8 means one full rotation */
787 vol = 0; /* volume inside frame, negative for white inside black */
790 /* translate the 8 directions to (x,y) pairs,
791 * if only four directions are used, only every 2nd vector is accessed,
792 * +1 turn left, -1 turn right
794 static const int d0[8][2] =
795 { { 0, -1}, /* up */ {-1, -1}, /* up-le */
796 {-1, 0}, /* left */ {-1, 1}, /* do-le */
797 { 0, 1}, /* down */ { 1, 1}, /* do-ri */
798 { 1, 0}, /* right */ { 1, -1} }; /* up-ri */
801 if (outbounds(p, x, y))
804 /* pixel color we are looking for, 0=black, 1=white */
806 i1 = ((getpixel(p, x, y ) < cs) ? 0 : 1);
807 i2 = ((getpixel(p, x + d0[d][0], y + d0[d][1]) < cs) ? 0 : 1);
809 g_debug(fprintf(stderr,"\nLEV2 frame_vector @ %3d %3d d%d %2d %2d"
810 " %d-%d pix=%3d mark=%d cs=%d",\
811 x,y,ds,d0[ds][0],d0[ds][1],i1,i2,getpixel(p,x,y),mark,cs);)
814 fprintf(stderr,"ERROR frame_vector: no border\n");
815 return -7; /* no border detected */
818 /* initialize boxframe outside this function
823 /* initialize boxvector outside this function
825 num_frame_vectors[0]=0 ???
826 and store start value
828 if (box1->num_frames > MaxNumFrames) return -2;
829 /* index to next (x,y) */
830 i2o=i2=( (box1->num_frames==0)?0:
831 box1->num_frame_vectors[ box1->num_frames ] );
832 #if 0 // obsolete v0.43
833 box1->frame_vector[i2][0]=x;
834 box1->frame_vector[i2][1]=y;
836 box1->num_frame_vectors[ box1->num_frames ]=i2;
840 /* robot stores start-position */
841 ox = x; oy = y; /* look forward to white pixel */
843 for (;;) { /* stop if same marked pixel touched */
845 g_debug(fprintf(stderr,"\nLEV3: x= %3d %3d d= %d rot= %2d %3d",x,y,d,rot,i2);)
847 /* ToDo: store max. abs(rot) ??? for better recognition */
849 g_debug(fprintf(stderr,"\nLEV2: markB xy= %3d %3d ", x, y);)
850 p->p[x + y * p->x] |= (mark & 7); /* mark black pixel */
853 /* store a new vector or enlarge the predecessor */
854 if (new_x && (rc%steps)==0) { /* dont store everything on big chars */
855 if (i2>=MaxFrameVectors) {
856 box1->num_frame_vectors[ box1->num_frames-1 ]=i2;
857 reduce_vectors(box1,1); /* simplify loop */
858 i2=box1->num_frame_vectors[ box1->num_frames-1 ];
859 /* enlarge steps on big chars getting speedup */
860 steps=(box1->y1-box1->y0+box1->x1-box1->x0)/32+1;
862 /* store frame-vector */
863 if (i2<MaxFrameVectors) {
864 box1->frame_vector[i2][0]=x;
865 box1->frame_vector[i2][1]=y;
866 /* test if older vector points to the same direction */
868 /* get predecessor */
869 nx=box1->frame_vector[i2-1][0]-box1->frame_vector[i2-2][0];
870 ny=box1->frame_vector[i2-1][1]-box1->frame_vector[i2-2][1];
871 mx=x -box1->frame_vector[i2-1][0];
872 my=y -box1->frame_vector[i2-1][1];
873 /* same direction? */
874 if (nx*my-ny*mx==0 && nx*mx>=0 && ny*my>=0) {
875 /* simplify by removing predecessor */
877 box1->frame_vector[i2][0]=x;
878 box1->frame_vector[i2][1]=y;
879 } /* do not simplify */
882 box1->num_frame_vectors[ box1->num_frames-1 ]=i2;
884 g_debug(fprintf(stderr," stored @ %3d steps= %d", i2-1, steps);)
886 new_x=0; /* work for new pixel (x,y) done */
888 /* check if round trip is finished */
889 if (x==ox && y==oy && abs(rot)>=8) break;
891 /* look to the front of robot (turtle or ant) */
895 /* next step, if right is a wall turn the turtle left */
896 if ( outbounds(p, nx, ny) || i1 != ((getpixel(p,nx,ny)<cs) ? 0 : 1) ) {
897 if (y==ny && nx>=0 && nx<p->x) { /* if inbound */
898 g_debug(fprintf(stderr,"\nLEV2: markW xy= %3d %3d ", nx, ny);)
899 p->p[nx + ny * p->x] |= (mark & 7); /* mark white pixel */
901 /* rotate left 90 or 45 degrees */
902 d=(d+2-diag) & 7; rot+=2-diag;
903 /* calculate volume inside frame */
905 case 2+2: vol-=x-1; break;
906 case 6+2: vol+=x; break;
909 else { /* if no wall, go forward and turn right (90 or 45 degrees) */
911 /* turn back and rotate left */
912 d=(d+4+2-diag) & 7; rot+=2-diag-4;
913 rc++; /* counting steps, used for speedup */
916 if (x < box1->x0) box1->x0 = x;
917 if (x > box1->x1) box1->x1 = x;
918 if (y < box1->y0) box1->y0 = y;
919 if (y > box1->y1) box1->y1 = y;
925 /* to distinguish inner and outer frames, store volume as +v or -v */
926 box1->frame_vol[ box1->num_frames-1 ] = vol;
927 box1->frame_per[ box1->num_frames-1 ] = rc-1;
929 /* dont count and store the first vector twice */
931 i2--; rc--; box1->num_frame_vectors[ box1->num_frames-1 ]=i2;
933 /* output break conditions */
934 g_debug(fprintf(stderr,"\nLEV2 o= %3d %3d x= %3d %3d r=%d v=%d",ox,oy,x,y,rot,vol);)
935 /* rc=1 for a single point, rc=2 for a two pixel sized point */
936 g_debug(fprintf(stderr," steps= %3d vectors= %3d",rc,i2);)
937 /* out_x(box1); ToDo: output only the first thousend */
938 return rc; /* return number of bordering pixels = periphery? */
943 /* clear lowest 3 (marked) bits (they are used for marking) */
944 void clr_bits(pix * p, int x0, int x1, int y0, int y1) {
946 for ( y=y0; y <= y1; y++)
947 for ( x=x0; x <= x1; x++)
948 p->p[x+y*p->x] &= ~7;
951 /* look for white holes surrounded by black points
952 * at the moment look for white point with black in all four directions
953 * - store position of hole in coordinates relativ to box!
954 * ToDo: count only holes with vol>10% ???
955 * ToDo: rewrite for frame vectors (faster, no malloc)
956 * holes are frames rotating left hand
957 * obsolete, do it with vectors
959 int num_hole(int x0, int x1, int y0, int y1, pix * p, int cs, holes_t *holes) {
960 int num_holes = 0, x, y, hole_size;
961 pix b; // temporary mini-page
962 int dx = x1 - x0 + 1, dy = y1 - y0 + 1;
963 unsigned char *buf; // 2nd copy of picture, for working
965 if (holes) holes->num=0;
966 if(dx<3 || dy<3) return 0;
967 b.p = buf = (unsigned char *) malloc( dx * dy );
969 fprintf( stderr, "\nFATAL: malloc(%d) failed, skip num_hole", dx*dy );
972 if (copybox(p, x0, y0, dx, dy, &b, dx * dy))
973 { free(b.p); return -1;}
975 // printf(" num_hole(");
976 /* --- mark white-points connected with border */
977 for (x = 0; x < b.x; x++) {
978 if (getpixel(&b, x, 0) >= cs)
979 mark_nn(&b, x, 0, cs, AT);
980 if (getpixel(&b, x, b.y - 1) >= cs)
981 mark_nn(&b, x, b.y - 1, cs, AT);
983 for (y = 0; y < b.y; y++) {
984 if (getpixel(&b, 0, y) >= cs)
985 mark_nn(&b, 0, y, cs, AT);
986 if (getpixel(&b, b.x - 1, y) >= cs)
987 mark_nn(&b, b.x - 1, y, cs, AT);
990 //g_debug(out_b(NULL,&b,0,0,b.x,b.y,cs);)
991 // --- look for unmarked white points => hole
992 for (x = 0; x < b.x; x++)
993 for (y = 0; y < b.y; y++)
994 if (!((marked(&b, x, y)&AT)==AT)) // unmarked
995 if (getpixel(&b, x, y) >= cs) { // hole found
997 hole_size=mark_nn(&b, x, y, cs, AT); /* old version */
998 if (hole_size > 1 || dx * dy <= 40)
1001 { /* new version, for future store of hole characteristics */
1002 int x0, x1, y0, y1, i, j;
1004 y0 = y1 = y; // not used
1005 hole_size=frame_nn(&b, x, y, &x0, &x1, &y0, &y1, cs, AT, JOB->tmp.n_run & 1);
1006 // store hole for future use, num is initialized with 0
1007 if (hole_size > 1 || dx * dy <= 40){
1011 for (i=0;i<holes->num && i<MAX_HOLES;i++)
1012 if (holes->hole[i].size < hole_size) break;
1013 for (j=MAX_HOLES-2;j>=i;j--)
1014 holes->hole[j+1]=holes->hole[j];
1016 // printf(" i=%d size=%d\n",i,hole_size);
1017 holes->hole[i].size=hole_size;
1020 holes->hole[i].x0=x0;
1021 holes->hole[i].y0=y0;
1022 holes->hole[i].x1=x1;
1023 holes->hole[i].y1=y1;
1032 // printf(")=%d",num_holes);
1036 /* count for black nonconnected objects --- used for i,auml,ouml,etc. */
1037 /* ToDo: obsolete, replaced by vectors and box.num_boxes */
1038 int num_obj(int x0, int x1, int y0, int y1, pix * p, int cs) {
1039 int x, y, rc = 0; // rc=num_obj
1040 unsigned char *buf; // 2nd copy of picture, for working
1043 if(x1<x0 || y1<y0) return 0;
1044 b.p = buf = (unsigned char *) malloc( (x1-x0+1) * (y1-y0+1) );
1046 fprintf( stderr, "\nFATAL: malloc(%d) failed, skip num_obj",(x1-x0+1)*(y1-y0+1) );
1049 if (copybox(p, x0, y0, x1 - x0 + 1, y1 - y0 + 1, &b, (x1-x0+1) * (y1-y0+1)))
1050 { free(b.p); return -1; }
1051 // --- mark black-points connected with neighbours
1052 for (x = 0; x < b.x; x++)
1053 for (y = 0; y < b.y; y++)
1054 if (getpixel(&b, x, y) < cs)
1055 if (!((marked(&b, x, y)&AT)==AT)) {
1057 mark_nn(&b, x, y, cs, AT);
1064 // ----------------------------------------------------------------------
1065 // first idea for making recognition based on probability
1066 // - start with a list of all possible chars
1067 // - call recognition_of_char(box *)
1068 // - remove chars from list which could clearly excluded
1069 // - reduce probability of chars which have wrong features
1070 // - font types list could also build
1071 // at the moment it is only an idea, I should put it to the todo list
1073 char *list="0123456789,.\0xe4\0xf6\0xfc" // "a=228 o=246 u=252
1074 "abcdefghijklmnopqrstuvwxyz"
1075 "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
1077 int listlen=0,numrest=0;
1078 // initialize a new character list (for future)
1079 void ini_list(){ int i;
1080 for(i=0;list[i]!=0 && i<100;i++) wert[i]=0;
1081 numrest=listlen=i; }
1082 // exclude??? (for future) oh it was long time ago, I wrote that :/
1083 void exclude(char *filt){ int i,j;
1084 for(j=0;filt[j]!=0 && j<100;j++)
1085 for(i=0;list[i]!=0 && i<100;i++)
1086 if( filt[j]==list[i] ) { if(!wert[i])numrest--; wert[i]++; } }
1087 // get the result after all the work (for future)
1088 char getresult(){ int i;
1090 for(i=0;list[i]!=0 && i<100;i++) if(!wert[i]) return list[i];
1095 // look at the environment of the pixel too (contrast etc.)
1096 // detailed analysis only of diff pixels!
1098 // 100% * "distance", 0 is ideal fit
1099 // = similarity of two chars for recognition of garbled (verstuemmelter) chars
1100 // weight of pixels with only one same neighbour set to 0
1101 // look at contours too! v0.2.4: B==H
1102 // changed for v0.41, Mar06
1103 int distance( pix *p1, struct box *box1,
1104 pix *p2, struct box *box2, int cs){
1105 int rc=0,x,y,v1,v2,i1,i2,rgood=0,rbad=0,x1,y1,x2,y2,dx,dy,dx1,dy1,dx2,dy2;
1106 x1=box1->x0;y1=box1->y0;x2=box2->x0;y2=box2->y0;
1107 dx1=box1->x1-box1->x0+1; dx2=box2->x1-box2->x0+1; dx=((dx1>dx2)?dx1:dx2);
1108 dy1=box1->y1-box1->y0+1; dy2=box2->y1-box2->y0+1; dy=((dy1>dy2)?dy1:dy2);
1109 if(abs(dx1-dx2)>1+dx/16 || abs(dy1-dy2)>1+dy/16) return 100;
1110 // compare relations to baseline and upper line
1111 if(2*box1->y1>box1->m3+box1->m4 && 2*box2->y1<box2->m3+box2->m4) rbad+=128;
1112 if(2*box1->y0>box1->m1+box1->m2 && 2*box2->y0<box2->m1+box2->m2) rbad+=128;
1115 for( x=0;x<dx;x++ ) { // try global shift too ???
1116 v1 =((getpixel(p1,x1+x ,y1+y )<cs)?1:0); i1=8; // better gray?
1117 v2 =((getpixel(p2,x2+x ,y2+y )<cs)?1:0); i2=8; // better gray?
1118 if(v1==v2) { rgood+=8; continue; } // all things are right!
1119 // what about different pixel???
1120 // test overlap of 8 surounding pixels ??? bad if two nb. are bad
1122 for(i1=-1;i1<2;i1++)
1123 for(i2=-1;i2<2;i2++)if(i1!=0 || i2!=0){
1124 if( ((getpixel(p1,x1+x+i1*(1+dx/32),y1+y+i2*(1+dy/32))<cs)?1:0)
1125 !=((getpixel(p2,x2+x+i1*(1+dx/32),y2+y+i2*(1+dy/32))<cs)?1:0) ) v1++;
1127 if (v1>0) rbad+=16*v1;
1130 if(rgood+rbad) rc= (100*rbad+(rgood+rbad-1))/(rgood+rbad); else rc=99;
1131 if(rc<10 && JOB->cfg.verbose & 7){
1132 fprintf(stderr,"\n# distance rc=%d good=%d bad=%d",rc,rgood,rbad);
1133 // out_x(box1);out_x(box2);
1140 // ============================= call OCR engine ================== ;)
1141 // nrun=0 from outside, nrun=1 from inside (allows modifications, oobsolete)
1142 wchar_t whatletter(struct box *box1, int cs, int nrun){
1143 wchar_t bc=UNKNOWN; // best letter
1144 wchar_t um=SPACE; // umlaut? '" => modifier
1145 pix *p=box1->p; // whole image
1146 int x,y,dots,xa,ya,x0,x1,y0,y1,dx,dy,i;
1148 struct box bbuf=*box1; // restore after modifikation!
1150 if (box1->num_ac>0 && box1->wac[0]>=JOB->cfg.certainty && bc==UNKNOWN) {
1153 // if (bc!=UNKNOWN) return bc;
1154 // if whatletter() called again, only unknown chars are processed
1155 // bad for splitting!
1157 // store box data, which can be modified for modified chars in 2nd run
1158 bbuf.x0=box1->x0; bbuf.y0=box1->y0;
1159 bbuf.x1=box1->x1; bbuf.y1=box1->y1;
1161 xa=box1->x; ya=box1->y;
1162 x0=box1->x0; y0=box1->y0;
1163 x1=box1->x1; y1=box1->y1;
1164 // int vol=(y1-y0+1)*(x1-x0+1); // volume
1165 // crossed l-m , divided chars
1166 while( get_bw(x0,x1,y0,y0,p,cs,1)!=1 && y0+1<y1) y0++;
1167 while( get_bw(x0,x1,y1,y1,p,cs,1)!=1 && y0+1<y1) y1--;
1171 // better to proof the white frame too!!! ????
1172 // --- test for german umlaut and points above, not robust enough???
1173 // if three chars are connected i-dots (ari) sometimes were not detected
1174 // - therefore after division a test could be useful
1175 // modify y0 only in second run!?
1176 // we need it here to have the right copybox
1177 if (um==SPACE && dy>5 && box1->num_boxes>1)
1178 testumlaut(box1,cs,2,&um); /* set box1->modifier + new y0 */
1181 y0 =box1->y0; // dots==2 => y0 below double dots
1184 // move upper and lower border (for divided letters)
1185 while( get_bw(x0,x1,y0,y0,p,cs,1)==0 && y0+1<y1) y0++;
1186 while( get_bw(x0,x1,y1,y1,p,cs,1)==0 && y0+1<y1) y1--;
1187 while( get_bw(x0,x0,y0,y1,p,cs,1)==0 && x0+1<x1) x0++;
1188 while( get_bw(x1,x1,y0,y1,p,cs,1)==0 && x0+1<x1) x1--;
1191 box1->x0=x0; box1->y0=y0; // set reduced frame
1192 box1->x1=x1; box1->y1=y1;
1194 // set good startpoint (probably bad from division)?
1195 if( xa<x0 || xa>x1 || ya<y0 || ya>y1
1196 || getpixel(p,xa,ya)>=cs /* || 2*ya<y0+y1 */ || dots>0 ){
1197 // subfunction? also called after division of two glued chars?
1198 for(y=y1;y>=y0;y--) // low to high (not i-dot)
1199 for(x=(x0+x1)/2,i=0;x>=x0 && x<=x1;i++,x+=((2*i&2)-1)*i) /* is that ok? */
1200 if (getpixel(p,x,y)<cs && (getpixel(p,x+1,y)<cs
1201 || getpixel(p,x,y+1)<cs)){ xa=x;ya=y;y=-1;break; }
1202 /* should box1->x,y be set? */
1205 // ----- create char-only-box -------------------------------------
1206 // ToDo: this will be obsolete if vectors are used only
1207 if(dx<1 || dy<1) return bc; /* should not happen */
1208 b.p = (unsigned char *) malloc( dx * dy );
1209 if (!b.p) fprintf(stderr,"Warning: malloc failed L%d\n",__LINE__);
1210 if( copybox(p,x0,y0,dx,dy,&b,dx*dy) )
1211 { free(b.p); return bc; }
1212 // clr_bits(&b,0,b.x-1,0,b.y-1);
1213 // ------ use diagonal too (only 2nd run?)
1214 /* following code failes on ! and ? obsolete if vectors are used
1216 - mark pixels neighoured to pixels outside and remove them from &b
1218 will be replaced by list of edge vectors
1219 - mark accents, dots and remove them from &b
1221 #if 1 /* becomes obsolate by vector code */
1222 if (y0>0) // mark upper overlap
1223 for ( x=x0; x<=x1; x++) {
1224 if (getpixel(p,x,y0-1)<cs
1225 && getpixel(p,x,y0 )<cs && (marked(&b,x-x0,0)&1)!=1)
1226 mark_nn(&b,x-x0,0,cs,1);
1228 if (x0>0) // mark left overlap
1229 for ( y=y0; y<=y1; y++) {
1230 if (getpixel(p,x0-1,y)<cs
1231 && getpixel(p,x0 ,y)<cs && (marked(&b,0,y-y0 )&1)!=1)
1232 mark_nn(&b,0,y-y0,cs,1);
1234 if (x1<p->x-1) // mark right overlap
1235 for ( y=y0; y<=y1; y++) {
1236 if (getpixel(p,x1+1,y)<cs
1237 && getpixel(p,x1 ,y)<cs && (marked(&b,x1-x0,y-y0)&1)!=1)
1238 mark_nn(&b,x1-x0,y-y0,cs,1);
1240 mark_nn(&b,xa-x0,ya-y0,cs,2); // not glued chars
1243 if ( (marked(&b,x,y )&3)==1 && getpixel(&b,x,y )<cs )
1244 b.p[x+y*b.x] = 255&~7; /* reset pixel */
1248 // if (bc == UNKNOWN) // cause split to fail
1249 bc=ocr0(box1,&b,cs);
1251 /* ToDo: try to change pixels near cs?? or melt? */
1252 if (box1->num_ac>0 && box1->wac[0]>=JOB->cfg.certainty && bc==UNKNOWN) {
1256 if (um!=0 && um!=SPACE && bc<127) { /* ToDo: is that obsolete now? */
1258 newbc = compose(bc, um );
1259 if (newbc == bc) { /* nothing composed */
1260 if(JOB->cfg.verbose & 7)
1261 fprintf(stderr, "\nDBG whatletter: compose(%s) was useless (%d,%d)",
1262 decode(bc,ASCII), box1->x0, box1->y0);
1263 // if(JOB->cfg.verbose & 6) out_x(box1);
1267 // restore modified boxes
1268 box1->x0=bbuf.x0; box1->y0=bbuf.y0;
1269 box1->x1=bbuf.x1; box1->y1=bbuf.y1;
1270 // if (box1->c==UNKNOWN) out_b(box1,&b,0,0,dx,dy,cs); // test
1277 ** creates a list of boxes/frames around objects detected
1278 ** on the pixmap p for further work
1279 ** returns number of boxes created.
1280 ** - by the way: get average X, Y (avX=sumX/numC,..)
1282 int scan_boxes( pix *p ){
1283 int x, y, nx, cs, rc, ds;
1286 if (JOB->cfg.verbose)
1287 fprintf(stderr,"# scanning boxes");
1290 JOB->res.sumX = JOB->res.sumY = JOB->res.numC = 0;
1292 /* clear the lowest bits of each pixel, later used as "scanned"-marker */
1293 clr_bits( p, 0, p->x - 1, 0, p->y - 1);
1295 for (y=0; y < p->y; y++)
1296 for (x=0; x < p->x; x++)
1297 for (ds=2; ds<7; ds+=4) { // NO - dust of size 1 is not removed !!!
1298 nx=x+((ds==2)?-1:+1);
1299 if (nx<0 || nx>=p->x) continue; /* out of image, ex: recframe */
1300 if ( getpixel(p, x,y)>=cs || getpixel(p,nx,y)< cs) // b/w transition?
1302 if ((marked(p, x,y) & 1)&&(marked(p, nx, y) & 1))
1304 /* check (and mark) only horizontal b/w transitions */
1305 // --- insert new box in list
1306 box3 = (struct box *)malloc_box(NULL);
1307 box3->x0=box3->x1=box3->x=x;
1308 box3->y0=box3->y1=box3->y=y;
1312 box3->num_subboxes=0;
1313 box3->modifier='\0';
1314 box3->num=JOB->res.numC;
1315 box3->line=0; // not used here
1316 box3->m1=0; box3->m2=0; box3->m3=0; box3->m4=0;
1318 box3->num_ac=0; // for future use
1320 /* frame, vectorize and mark only odd/even horizontal b/w transitions
1321 * args: box, x,y, cs, mark, diag={0,1}, ds={2,6}
1322 * ds - start direction, 6=right of right border, 2=left of left border
1323 * ret - 0=ok, -1=already marked, -2=max_num_frames_exceeded
1324 * -7=no border in direction ds
1325 * ToDo: count errors and print out for debugging
1327 rc=frame_vector(box3, x, y, cs, 1, 1, ds);
1328 g_debug(fprintf(stderr,"\n# ... scan xy= %3d %3d rc= %2d", x, y, rc);)
1329 if (rc<0) { free_box(box3); continue; }
1330 if (box3->num_frames && !box3->num_frame_vectors[0])
1331 fprintf(stderr,"\nERROR scan_boxes: no vector in frame (%d,%d)",x,y);
1334 JOB->res.sumX += box3->x1 - box3->x0 + 1;
1335 JOB->res.sumY += box3->y1 - box3->y0 + 1;
1337 box3->c=(((box3->y1-box3->y0+1)
1338 *(box3->x1-box3->x0+1)>=MaxBox)? PICTURE : UNKNOWN);
1339 list_app(&(JOB->res.boxlist), box3); // append to list
1341 // if (JOB->cfg.verbose && box3->y0==29) out_x(box3);
1344 if (JOB->cfg.verbose)
1345 fprintf(stderr," nC= %3d avD= %2d %2d\n",JOB->res.numC,
1346 (JOB->res.sumX+JOB->res.numC/2)/JOB->res.numC,
1347 (JOB->res.sumY+JOB->res.numC/2)/JOB->res.numC);
1349 return JOB->res.numC;
1352 /* compare ints for sorting. Return -1, 0, or 1 according to
1353 whether *vr < *vs, vr == *vs, or *vr > *vs */
1355 intcompare (const void *vr, const void *vs)
1360 if (*r < *s) return -1;
1361 if (*r > *s) return 1;
1366 * measure_pitch - detect monospaced font and measure the pitch
1367 * measure overall pitch for difficult lines,
1368 * after that measure pitch per line
1369 * dists arrays are limited to 1024 elements to reduce
1370 * cpu usage for qsort on images with extreme high number of objects
1371 * insert space if dist>=pitch in list_insert_spaces()
1373 * - min/max distance-matrix a-a,a-b,a-c,a-d ... etc; td,rd > ie,el,es
1374 * - OR measuring distance as min. pixel distance instead of box distance
1375 * especially useful for italic font!
1377 void measure_pitch( job_t *job ){
1378 int numdists=0, spc=0, /* number of stored distances */
1379 pitch_p=2, pdist, pdists[1024], /* proportional distances */
1380 pitch_m=6, mdist, mdists[1024], /* monospaced distances */
1382 struct box *box2, *prev=NULL;
1384 if(job->cfg.verbose){ fprintf(stderr,"# check for word pitch"); }
1385 for (l1=0; l1<job->res.lines.num; l1++)
1386 { /* 0 means all lines */
1387 if(job->cfg.verbose){ fprintf(stderr,"\n# line %2d",l1); }
1388 numdists = 0; /* clear distance lists */
1389 for_each_data(&(job->res.boxlist)) {
1390 box2 = (struct box *)list_get_current(&(job->res.boxlist));
1391 if (l1>0 && box2->line!=l1) continue; /* ignore other lines */
1392 /* ignore dots and pictures (min. font is 4x6) */
1393 if (box2->y1 - box2->y0 + 1 < 4 || box2->c==PICTURE) { prev=NULL; }
1394 if (!prev) { prev=box2; continue; } /* we need a predecessor */
1395 /* use center distance for monospaced fonts */
1396 mdist = ((box2->x0 + box2->x1) - (prev->x0 + prev->x1) + 1)/2;
1397 /* use gap for proportional fonts */
1398 pdist = box2->x0 - prev->x1 + 1;
1399 /* ToDo: better take 3 instead of 2 neighbours?, smallest font 4x6 */
1400 /* fonts are expected to be 6 to 60 pixels high, which is about
1401 4 to 50 pixels wide. We allow some extra margin. */
1402 if (3 < mdist && mdist < 150) { /* better mdist < 3*Xaverage ? */
1403 /* two options for overflow: 1) ignore, 2) store randomly */
1404 if (numdists<1024) { /* we do ignore here */
1405 mdists[numdists] = mdist;
1406 pdists[numdists] = pdist;
1411 } end_for_each(&(job->res.boxlist));
1413 if(job->cfg.verbose){ fprintf(stderr," num_gaps= %2d",numdists); }
1415 if (job->cfg.verbose && l1==0) /* only for all lines */
1416 fprintf(stderr," (WARNING num_gaps<8)");
1419 int i,diff,ni_min,max,best_m,best_p,ni; double v;
1420 /* aware: takes long time for big data sets */
1421 /* dilute? (german: ausduennen?) */
1422 qsort (mdists, numdists, sizeof (int), intcompare);
1423 qsort (pdists, numdists, sizeof (int), intcompare);
1424 /* the new method, div0? */
1425 v = (mdists[numdists*7/10]-mdists[numdists/5])
1426 /(double)mdists[numdists/5];
1427 /* measurements showed v=.09 for Courier and .44 for Times-Roman */
1428 if (l1==0) monospaced = (v < .22);
1430 best_p=4*numdists/5;
1431 /* try to find better pitch for monospaced font (ok for prop) */
1432 for (i=numdists/5+1;i<numdists;i++) {
1433 if (2*mdists[i]>=3*mdists[best_m]) { best_m=i-1; break; }
1435 /* try to find better pitch for proportional font */
1436 // the largest diff could be the best, if diff is always 1,
1437 // take the diff with the lowest weight
1438 for (ni=ni_min=1024,max=0,i=numdists/2+1;i<numdists-numdists/16;i++) {
1439 diff=pdists[i]-pdists[i-1];
1441 max=diff; best_p=i-1;
1442 if ((job->cfg.verbose&(32+16))==48)
1443 fprintf(stderr," best_p=%d maxdiff=%d\n# ...", pdists[best_p], max);
1444 if (max>3 && 3*pdists[i]>=4*pdists[i-1]) { break; }
1448 // do not try to divide one word per line
1449 ni_min=ni; if (max<=1 && numdists>16) best_p=i-1;
1450 if ((job->cfg.verbose&(32+16))==48)
1451 fprintf(stderr," best_p=%d ni_min=%d\n# ...", pdists[best_p], ni_min);
1456 if (numdists<16 && max<=1 && ni_min>1) best_p=numdists-1; // one word
1457 #if 1 /* debugging */
1458 if ((job->cfg.verbose&(32+16))==48) {
1459 fprintf(stderr,"\n# ...");
1460 for (i=0;i<numdists;i++) fprintf(stderr," %2d",mdists[i]);
1461 fprintf(stderr," <- mdist[%d]\n# ...",l1);
1462 for (i=0;i<numdists;i++) fprintf(stderr," %2d",pdists[i]);
1463 fprintf(stderr," <- pdist[%d]\n# ...",l1);
1464 fprintf(stderr," maxdiff=%d min_samediffs=%d\n# ...",max,ni_min);
1467 /* we measure spaces in two different ways (mono, prop) */
1468 /* prop: gap between boxes, mono: distance of middle */
1469 if (best_p<numdists-1) pitch_p = ((pdists[best_p]+pdists[best_p+1])/2+1);
1470 else pitch_p = (pdists[best_p]+1 );
1471 pitch_m = (mdists[best_m]*4/3);
1473 if ( pdists[numdists-1]*2 <= pdists[0]*3
1474 || pdists[numdists-1] <= pdists[0]+3) {
1475 /* line is just a single word */
1476 pitch_p = pdists[numdists-1]+10;
1478 if (l1>0 && job->cfg.spc==0) {
1479 job->res.lines.pitch[l1]=(monospaced?pitch_m:pitch_p);
1480 job->res.lines.mono[l1]=monospaced;
1482 if (job->cfg.verbose) {
1483 fprintf(stderr,"\n# ..."
1484 " mono: v=%f (v<0.22) line=%d numdists=%d\n# ...",
1486 fprintf(stderr," mono: min=%3d max=%3d pitch=%3d @ %2d%%\n# ...",
1487 mdists[0],mdists[numdists-1],pitch_m,best_m*100/numdists);
1488 fprintf(stderr," prop: min=%3d max=%3d pitch=%3d @ %2d%%\n# ...",
1489 pdists[0],pdists[numdists-1],pitch_p,best_p*100/numdists);
1490 fprintf(stderr," result: distance >= %d considered space\n# ...",
1491 job->res.lines.pitch[l1]);
1493 } /* if (not) enough spaces */
1494 if (l1==0) { /* set default spaces to each line */
1497 if (spc==0) /* set only if not set by option */
1498 spc = ((monospaced)?pitch_m:pitch_p);
1499 for (l2=0; l2<job->res.lines.num; l2++ )
1500 job->res.lines.pitch[l2]=spc;
1503 if (job->cfg.spc==0)
1505 if (job->cfg.verbose)
1506 fprintf(stderr," overall space width is %d %s\n",
1507 spc, ((monospaced)?"monospaced":"proportional"));
1512 /* ---- count subboxes (white holes within black area) --------
1513 * new: count boxes lying inside another box (usually holes, ex: "aeobdg")
1514 * needed for glue_boxes, dont glue textboxes, tables and other complex
1516 * ToDo: count only frames of invers spin? do we need sorted list here? -> no
1518 int count_subboxes( pix *pp ){
1519 int ii=0, num_mini=0, num_same=0, cnt=0;
1520 struct box *box2,*box4;
1521 progress_counter_t *pc = NULL;
1522 if (JOB->cfg.verbose) { fprintf(stderr,"# count subboxes\n# ..."); }
1524 pc = open_progress(JOB->res.boxlist.n,"count_subboxes");
1525 for_each_data(&(JOB->res.boxlist)) {
1526 box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
1527 box2->num_subboxes=0;
1529 if ( (box2->x1 - box2->x0)<2
1530 || (box2->y1 - box2->y0)<2) continue; /* speedup for dotted bg */
1531 // holes inside box2 char, aoebdqg, 0.41
1532 for_each_data(&(JOB->res.boxlist)) {
1533 box4=(struct box *)list_get_current(&(JOB->res.boxlist));
1534 if (box4->y0 > box2->y1) break; // faster, but boxes need to be sorted
1535 // ToDo: better use binary tree (above/below x) to find near boxes?
1536 if (box4==box2) continue;
1537 if( box4->x0==box2->x0 && box4->x1==box2->x1
1538 && box4->y0==box2->y0 && box4->y1==box2->y1)
1539 num_same++; /* erroneous!? */
1540 if ( box4->x0 >= box2->x0 && box4->x1 <= box2->x1
1541 && box4->y0 >= box2->y0 && box4->y1 <= box2->y1
1542 && box4->num_subboxes==0 ) /* box4 inside box2? */
1544 box2->num_subboxes++; ii++;
1545 if ((box4->x1 - box4->x0 + 1)
1546 *(box4->y1 - box4->y0 + 1)<17) num_mini++;
1548 } end_for_each(&(JOB->res.boxlist));
1550 if (cnt < 1000 && JOB->cfg.verbose)
1551 fprintf(stderr," %4d box %4d %4d %+3d %+3d subboxes %4d\n# ...",
1552 cnt, box2->x0, box2->y0, box2->x1-box2->x0,
1553 box2->y1-box2->y0, box2->num_subboxes);
1555 } end_for_each(&(JOB->res.boxlist));
1557 if (JOB->cfg.verbose)
1558 fprintf(stderr," %3d subboxes counted (mini=%d, same=%d) nC= %d\n",
1559 ii, num_mini, num_same/2 /* counted twice */, cnt);
1563 /* ---- glue holes tochars( before step1 ) v0.42 -----------------------
1564 glue boxes lying inside another box (usually holes, ex: "aeobdg46890")
1565 Dont add dust to a char!
1566 lines are not detected yet
1568 int glue_holes_inside_chars( pix *pp ){
1569 int ii, cs, x0, y0, x1, y1, cnt=0,
1570 glued_same=0, glued_holes=0;
1571 struct box *box2, *box4;
1572 progress_counter_t *pc = NULL;
1575 count_subboxes( pp ); /* move to pgm2asc() later */
1577 pc = open_progress(JOB->res.boxlist.n,"glue_holes_inside_chars");
1578 if (JOB->cfg.verbose)
1579 fprintf(stderr,"# glue holes to chars nC= %d\n# ...",JOB->res.numC);
1581 for_each_data(&(JOB->res.boxlist)) {
1582 // get the smaller box which may be extended by bigger boxes around it
1583 box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
1584 x0 = box2->x0; x1 = box2->x1;
1585 y0 = box2->y0; y1 = box2->y1;
1589 // would it better than moving vectors to build a sub-box-tree?
1591 // do not remove chars inside pictures (car plates on photos)
1592 if( box2->c == PICTURE || box2->num_subboxes > 7) continue;
1594 // holes inside char, aoebdqg, 0.41
1595 // dont merge boxes which have subboxes by itself!
1596 // search boxes inside box2
1597 // if (x1-x0+1>2 || y1-y0+1>2) /* skip tiny boxes, bad for 4x6 */
1598 for_each_data(&(JOB->res.boxlist)) {
1599 box4=(struct box *)list_get_current(&(JOB->res.boxlist));
1600 if(box4!=box2 && box4->c != PICTURE )
1602 // ToDo: dont glue, if size differs by big factors (>16?)
1603 if ( ( box4->x0==x0 && box4->x1==x1
1604 && box4->y0==y0 && box4->y1==y1 ) /* do not happen !? */
1605 || ( box4->x0>=x0 && box4->x1<=x1
1606 && box4->y0>=y0 && box4->y1<=y1
1607 && box4->num_subboxes==0 ) ) /* no or very small subboxes? */
1608 { // fkt melt(box2,box4)
1609 // same box, if very small but hollow char (4x5 o)
1610 if( box4->x0==x0 && box4->x1==x1
1611 && box4->y0==y0 && box4->y1==y1) glued_same++; else glued_holes++;
1612 // fprintf(stderr,"\n# DEBUG merge:");
1613 // out_x(box2); // small
1614 // out_x(box4); // big
1615 if ((JOB->cfg.verbose & 7)==7) // LEV3
1616 fprintf(stderr," glue hole (%4d %4d %+3d %+3d %+4d)"
1617 " (%4d %4d %+3d %+3d %+4d) %d\n# ...",
1618 x0, y0, x1-x0+1, y1-y0+1, box2->frame_vol[0],
1620 box4->x1-box4->x0+1, box4->y1-box4->y0+1,
1621 box4->frame_vol[0], glued_same);
1622 if ((box4->x1-box4->x0+1)< 8*(x1-x0+1)
1623 || (box4->y1-box4->y0+1)<12*(y1-y0+1)) // skip dust
1624 merge_boxes( box2, box4 ); // add box4 to box2
1626 x0 = box2->x0; x1 = box2->x1;
1627 y0 = box2->y0; y1 = box2->y1;
1628 JOB->res.numC--; // dont count fragments as chars
1629 ii++; // count removed
1630 list_del(&(JOB->res.boxlist), box4); // remove box4
1632 // now search another hole inside box2
1635 } end_for_each(&(JOB->res.boxlist));
1637 } end_for_each(&(JOB->res.boxlist));
1639 if (JOB->cfg.verbose)
1640 fprintf(stderr," glued: %3d holes, %3d same, nC= %d\n",
1641 glued_holes, glued_same, JOB->res.numC);
1648 /* ---- glue broken chars ( before step1 ??? ) -----------------------
1649 use this carefully, do not destroy previous detection ~fi, broken K=k' g
1650 glue if boxes are near or diagonally connected
1651 other strategy: mark boxes for deleting and delete in extra loop at end
1652 faster: check only next two following boxes because list is sorted!
1653 ToDo: store m4 of upper line to m4_of_prev_line, and check that "-points are below
1654 done: glue boxes lying inside another box (usually holes, ex: "aeobdg")
1655 Dont add dust to a char!
1656 lines should be detected already (Test it for m1-m4 unknown)
1657 ToDo: divide in glue_idots, glue_thin_chars etc. and optimize it
1659 int glue_broken_chars( pix *pp ){
1660 int ii, y, cs, x0, y0, x1, y1, cnt=0,
1661 num_frags=0, glued_frags=0, glued_hor=0;
1662 struct box *box2, *box4;
1663 progress_counter_t *pc = NULL;
1666 count_subboxes( pp ); /* move to pgm2asc() later */
1668 pc = open_progress(JOB->res.boxlist.n,"glue_broken_chars");
1669 if (JOB->cfg.verbose)
1670 fprintf(stderr,"# glue broken chars nC= %d\n# ...",JOB->res.numC);
1672 for_each_data(&(JOB->res.boxlist)) {
1673 // get the box which may be extended by boxes around it
1674 box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
1675 x0 = box2->x0; x1 = box2->x1;
1676 y0 = box2->y0; y1 = box2->y1;
1680 // vertical broken (g965T umlauts etc.)
1683 // would it better than moving vectors to build a sub-box-tree?
1685 // do not remove chars inside pictures (car plates on photos)
1686 if( box2->c == PICTURE || box2->num_subboxes > 7) continue;
1688 /* continue loop if box is below or above line */
1689 if( box2->m4>0 && y0>box2->m4 ) continue; /* dust outside ? */
1690 if( box2->m1>0 && y0<box2->m1-(box2->m3-box2->m2) ) continue;
1692 * - check that y0 is greater as m3 of the char/line above
1695 // check small boxes (box2) whether they belong
1696 // to near same size or bigger boxes (box4)
1697 if( 2*(y1-y0) < box2->m4 - box2->m1 // care for dots etc.
1698 && ( 2*y1<=(box2->m3+box2->m2) // upper fragments
1699 || 2*y0>=(box2->m3+box2->m2)) ) { // lower fragments
1700 struct box *box5=NULL, *box6=NULL; // nearest and next nearest box
1702 num_frags++; /* count for debugging */
1703 // get the [2nd] next x-nearest box in the same line
1704 for_each_data(&(JOB->res.boxlist)) {
1705 box4=(struct box *)list_get_current(&(JOB->res.boxlist));
1706 if (box4 == box2 || box4->c == PICTURE) continue;
1707 /* 0.42 speed up for backround pixel pattern, box4 to small */
1708 if ( box4->x1 - box4->x0 + 1 < x1-x0+1
1709 && box4->y1 - box4->y0 + 1 < y1-y0+1 ) continue;
1710 // have in mind that line number may be wrong for dust
1711 if (box4->line>=0 && box2->line>=0 && box4->line==box2->line)
1713 if (!box5) box5=box4;
1714 if ( abs(box4->x0 + box4->x1 - 2*box2->x0)
1715 <abs(box5->x0 + box5->x1 - 2*box2->x0))
1716 { box6=box5; box5=box4; }
1718 } end_for_each(&(JOB->res.boxlist));
1719 box4=box5; // next nearest box within the same line
1721 #if 0 /* set this to 1 for debugging of melting bugs */
1722 if (JOB->cfg.verbose & 7) {
1723 fprintf(stderr,"\n# next two boxes are candidates for melting ");
1727 if( /* umlaut "a "o "u, ij; box2 is the small dot, box4 the body */
1729 && box4->y1 >= box2->m2 // dont melt dots together
1730 && 2* y1 < box4->y1 + box4->y0 // box2 above box4
1731 && box4->x1+JOB->res.avX/2>=x0
1732 && box4->x0-JOB->res.avX/2<=x1
1733 && (y1 < box4->y0 || x0 < box4->x1) // dont melt "d'"
1734 && 3* ( y1 - box4->y0)
1735 <= 2* (box4->y1 - box4->y0) // too far away? dust!
1736 && 8* ( x1 - x0 + 1)
1737 >= (box4->x1 - box4->x0 + 1) // dot must have minimum size
1738 && 10* ( y1 - y0 + 1)
1739 >= (box4->y1 - box4->y0 + 1) // dot must have minimum size
1740 ) || ( 0 && /* broken T */
1741 3*(box2->x1 - box2->x0) > 2*JOB->res.avX
1742 && 4*box4->x0>3*box2->x0+box2->x1
1743 && 4*box4->x1<box2->x0+3*box2->x1
1745 || /* !?; box2 is the dot, box4 the body */
1746 ( 2*box4->x1>=x0+x1 /* test if box4 is around box2 */
1747 && 2*box4->x0<=2*x1 /* +x0+1 Jan00 */
1748 && ( x1-x0 <= box4->x1-box4->x0+2 )
1749 && 2*y0>=box2->m2+box2->m3
1750 && 4*y1>=box2->m2+3*box2->m3
1751 && 4*(y1-y0)<box2->m4-box2->m1
1752 && (8*box4->y1 < box4->m2+7*box4->m3
1753 || box4->m4-box4->m1<16) /* Jan00 */
1755 || /* =;: box2 is the upper box, box4 the lower box */
1756 ( 2*box4->x1>=x0+x1 /* test if box4 is around box2 */
1757 && 2*box4->x0<=2*x1 /* +x0+1 */
1758 && ( x1-x0 <= box4->x1-box4->x0+4 )
1759 && ( 4*x0 <= 3*box4->x1+box4->x0 )
1760 && (( box2->m2 && box4->m2
1762 && 2*box4->y1 > box4->m3+box4->m2 // can be bigger than m3
1763 && 4*box4->y0 >= 3*box4->m2+box4->m3
1764 && 2*box2->y0 < box2->m3+box2->m2
1766 || ( (!box2->m2) || (!box4->m2) )
1770 { // fkt melt(box2,box4)
1771 if (JOB->cfg.verbose & 7)
1772 fprintf(stderr," glue objects (%3d %3d %+3d %+3d)"
1773 " (%3d %3d %+3d %+3d)\n# ...",
1774 x0, y0, x1-x0+1, y1-y0+1, box4->x0, box4->y0,
1775 box4->x1-box4->x0+1, box4->y1-box4->y0+1);
1776 // fprintf(stderr,"\n# DEBUG merge:"); // d=7x34 @ (109,51) ???
1779 merge_boxes( box2, box4 ); // add box4 to box2
1780 x0 = box2->x0; x1 = box2->x1;
1781 y0 = box2->y0; y1 = box2->y1;
1782 // if (JOB->cfg.verbose & 4) out_x(box2);
1783 // JOB->res.numC--; // dont count fragments as chars
1784 ii++; glued_frags++; // remove
1785 // output_list(JOB);
1786 list_del(&(JOB->res.boxlist), box4); /* ret&1: error-message ??? */
1787 // output_list(JOB);
1794 // horizontally broken w' K'
1795 if( 2*y1 < (box2->m3+box2->m2) )
1796 if( 2*(y1-y0) < (box2->m3+box2->m2) ) // fragment
1797 for_each_data(&(JOB->res.boxlist)) {
1798 box4=(struct box *)list_get_current(&(JOB->res.boxlist));
1799 if(box4!=box2 && box4->c != PICTURE )
1801 if( box4->line>=0 && box4->line==box2->line
1802 && box4->x1>=x0-1 && box4->x1<x0 // do not glue 6-
1803 && box4->x0+3*box4->x1<4*x0)
1804 if( get_bw(x0 ,x0 ,y1,y1 ,pp,cs,1) == 1)
1805 if( get_bw(x0-2,x0-1,y1,y1+2,pp,cs,1) == 1)
1806 { // fkt melt(box2,box4)
1807 put(pp,x0,y1+1,~(128+64),0);
1808 merge_boxes( box2, box4 );
1809 x0 = box2->x0; x1 = box2->x1;
1810 y0 = box2->y0; y1 = box2->y1;
1811 JOB->res.numC--; ii++; // remove
1813 list_del(&(JOB->res.boxlist), box4);
1817 } end_for_each(&(JOB->res.boxlist));
1819 // horizontally broken n h (h=l_) v0.2.5 Jun00
1820 if( abs(box2->m2-y0)<=(y1-y0)/8 )
1821 if( abs(box2->m3-y1)<=(y1-y0)/8 )
1822 if( num_cross(x0, x1,(y0+ y1)/2,(y0+ y1)/2,pp,cs) == 1)
1823 if( num_cross(x0, x1,(y0+3*y1)/4,(y0+3*y1)/4,pp,cs) == 1)
1824 if( get_bw((3*x0+x1)/4,(3*x0+x1)/4,(3*y0+y1)/4,y1,pp,cs,1) == 0)
1825 if( get_bw(x0,(3*x0+x1)/4,(3*y0+y1)/4,(y0+3*y1)/4,pp,cs,1) == 0)
1826 if( get_bw(x0, x0, y0,(3*y0+y1)/4,pp,cs,1) == 1)
1827 for_each_data(&(JOB->res.boxlist)) {
1828 box4=(struct box *)list_get_current(&(JOB->res.boxlist));
1829 if(box4!=box2 && box4->c != PICTURE )
1831 if( box4->line>=0 && box4->line==box2->line
1832 && box4->x1>x0-3 && box4->x1-2<x0
1833 && abs(box4->y1-box2->m3)<2)
1834 { // fkt melt(box2,box4)
1835 y=loop(pp,x0,y0,y1-y0,cs,0,DO);if(2*y>y1-y0) continue;
1836 put(pp,x0-1,y0+y ,~(128+64),0);
1837 put(pp,x0-1,y0+y+1,~(128+64),0);
1838 merge_boxes( box2, box4 ); // add box4 to box2
1839 x0 = box2->x0; x1 = box2->x1;
1840 y0 = box2->y0; y1 = box2->y1;
1841 JOB->res.numC--; ii++; // remove
1843 list_del(&(JOB->res.boxlist), box4);
1847 } end_for_each(&(JOB->res.boxlist));
1848 } end_for_each(&(JOB->res.boxlist));
1849 if (JOB->cfg.verbose)
1850 fprintf(stderr," glued: %3d fragments (found %3d), %3d rest, nC= %d\n",
1851 glued_frags, num_frags, glued_hor, JOB->res.numC);
1858 ** this is a simple way to improve results on noisy images:
1859 ** - find similar chars (build cluster of same chars)
1860 ** - analyze clusters (could be used for generating unknown font-base)
1861 ** - the quality of the result depends mainly on the distance function
1863 // ---- analyse boxes, compare chars, compress picture ------------
1864 // ToDo: - error-correction only on large chars!
1865 int find_same_chars( pix *pp){
1866 int i,k,d,cs,dist,n1,dx; struct box *box2,*box3,*box4,*box5;
1870 if(JOB->cfg.verbose)fprintf(stderr,"# packing");
1871 i = list_total(&(JOB->res.boxlist));
1872 for_each_data(&(JOB->res.boxlist)) {
1873 box4 = box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
1874 dist=1000; // 100% maximum
1875 dx = box2->x1 - box2->x0 + 1;
1877 if(JOB->cfg.verbose)fprintf(stderr,"\r# packing %5d",i);
1879 for(box3=(struct box *)list_next(&(JOB->res.boxlist),box2);box3;
1880 box3=(struct box *)list_next(&(JOB->res.boxlist),box3)) {
1881 if(box2->num!=box3->num){
1882 int d=distance(&p,box2,&p,box3,cs);
1883 if ( d<dist ) { dist=d; box4=box3; } // best fit
1884 if ( d<5 ){ // good limit = 5% ???
1885 i--;n1=box3->num; // set all num==box2.num to box2.num
1886 for_each_data(&(JOB->res.boxlist)) {
1887 box5=(struct box *)(struct box *)list_get_current(&(JOB->res.boxlist));
1889 if( box5->num==n1 ) box5->num=box2->num;
1890 } end_for_each(&(JOB->res.boxlist));
1891 // out_x2(box2,box5);
1892 // fprintf(stderr," dist=%d\n",d);
1896 // nearest dist to box2 has box4
1897 // out_b2(box2,box4);
1898 // fprintf(stderr," dist=%d\n",dist);
1899 } end_for_each(&(JOB->res.boxlist));
1901 if(JOB->cfg.verbose)fprintf(stderr," %d different chars",i);
1902 for_each_data(&(JOB->res.boxlist)) {
1903 struct box *box3,*box4;
1905 box2=(struct box *)list_get_current(&(JOB->res.boxlist));
1906 for(box3=(struct box *)list_get_header(&(JOB->res.boxlist));
1907 box3!=box2 && box3!=NULL;
1908 box3=(struct box *)list_next(&(JOB->res.boxlist), box3))
1909 if(box3->num==box2->num)break;
1910 if(box3!=box2 && box3!=NULL)continue;
1912 // count number of same chars
1915 for(box3=box2,j=0;box3;
1916 box3=(struct box *)list_next(&(JOB->res.boxlist), box3)) {
1917 if(box3->num==box2->num){
1919 d=distance(&p,box2,&p,box3,cs);
1920 if ( d>dist ) { dist=d; box4=box3; } // worst fit
1923 if(JOB->cfg.verbose&8){
1924 fprintf(stderr," no %d char %4d %5d times maxdist=%d\n",i,box2->num,j,dist);
1926 // calculate mean-char (error-correction)
1927 // ToDo: calculate maxdist in group
1930 // out_b(box1,NULL,0,0,0,0,cs);
1931 if(JOB->cfg.verbose&8)
1932 fprintf(stderr," no %d char %4d %5d times sum=%d\n",i,box2->num,j,k);
1933 } end_for_each(&(JOB->res.boxlist));
1934 if(JOB->cfg.verbose)fprintf(stderr," ok\n");
1940 ** call the first engine for all boxes and set box->c=result;
1943 int char_recognition( pix *pp, int mo){
1944 int i,ii,ni,cs,x0,y0,x1,y1;
1946 progress_counter_t *pc;
1949 // ---- analyse boxes, find chars ---------------------------------
1950 if (JOB->cfg.verbose)
1951 fprintf(stderr,"# char recognition");
1953 for_each_data(&(JOB->res.boxlist)) { /* count boxes */
1954 box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
1955 /* wew: isn't this just JOB->res.numC? */
1956 /* js: The program is very complex. I am not sure anymore
1957 wether numC is the number of boxes or the number of valid
1959 Because its not time consuming I count the boxes here. */
1960 if (box2->c==UNKNOWN) i++;
1961 if (box2->c==PICTURE) ii++;
1963 } end_for_each(&(JOB->res.boxlist));
1964 if(JOB->cfg.verbose)
1965 fprintf(stderr," unknown= %d picts= %d boxes= %d\n# ",i,ii,ni);
1968 pc = open_progress(ni,"char_recognition");
1969 for_each_data(&(JOB->res.boxlist)) {
1970 box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
1971 x0=box2->x0;x1=box2->x1;
1972 y0=box2->y0;y1=box2->y1; // box
1974 if (cc==PICTURE) continue;
1976 if ((mo&256)==0) { /* this case should be default (main engine) */
1977 if(cc==UNKNOWN || box2->num_ac==0 || box2->wac[0]<JOB->cfg.certainty)
1978 cc=whatletter(box2,cs ,0);
1982 if(cc==UNKNOWN || box2->num_ac==0 || box2->wac[0]<JOB->cfg.certainty)
1986 // box2->c=cc; bad idea (May03 removed)
1987 // set(box2,cc,95); ToDo: is that better?
1993 if(JOB->cfg.verbose&8) {
1994 fprintf(stderr,"\n# code= %04lx %c",(long)cc,(char)((cc<255)?cc:'_'));
1995 //out_b(box2,pp,x0,y0,x1-x0+1,y1-y0+1,cs);
1997 progress(ii,pc); /* ii = 0..ni */
1999 } end_for_each(&(JOB->res.boxlist));
2001 if(JOB->cfg.verbose)fprintf(stderr," %d of %d chars unidentified\n",i,ii);
2007 ** compare unknown with known chars,
2008 ** very similar to the find_similar_char_function but here only to
2009 ** improve the result
2011 int compare_unknown_with_known_chars(pix * pp, int mo) {
2012 int i, cs = JOB->cfg.cs, dist, d, ad, wac, ni, ii;
2013 struct box *box2, *box3, *box4;
2014 progress_counter_t *pc=NULL;
2016 i = ii = 0; // ---- -------------------------------
2017 if (JOB->cfg.verbose)
2018 fprintf(stderr, "# try to compare unknown with known chars !(mode&8)");
2022 for_each_data(&(JOB->res.boxlist)) { ni++; } end_for_each(&(JOB->res.boxlist));
2023 pc = open_progress(ni,"compare_chars");
2024 for_each_data(&(JOB->res.boxlist)) {
2025 box2 = (struct box *)list_get_current(&(JOB->res.boxlist)); ii++;
2026 if (box2->c == UNKNOWN || (box2->num_ac>0 && box2->wac[0]<97))
2027 if (box2->y1 - box2->y0 > 4 && box2->x1 - box2->x0 > 1) { // no dots!
2028 box4 = (struct box *)list_get_header(&(JOB->res.boxlist));;
2029 dist = 1000; /* 100% maximum */
2030 bc = UNKNOWN; /* best fit char */
2031 for_each_data(&(JOB->res.boxlist)) {
2032 box3 = (struct box *)list_get_current(&(JOB->res.boxlist));
2033 wac=((box3->num_ac>0)?box3->wac[0]:100);
2034 if (box3 == box2 || box3->c == UNKNOWN
2035 || wac<JOB->cfg.certainty) continue;
2036 if (box2->y1 - box2->y0 < 5 || box2->x1 - box2->x0 < 3) continue;
2037 d = distance(pp, box2, pp, box3, cs);
2039 dist = d; bc = box3->c; box4 = box3;
2041 } end_for_each(&(JOB->res.boxlist));
2043 /* sureness can be maximal of box3 */
2044 if (box4->num_ac>0) ad = box4->wac[0];
2046 ad-=dist; if(ad<1) ad=1;
2047 /* ToDo: ad should depend on ad of bestfit */
2048 setac(box2,(wchar_t)bc,ad);
2050 } // limit as option???
2051 // => better max distance('e','e') ???
2052 if (dist < 50 && (JOB->cfg.verbose & 7)) { // only for debugging
2053 fprintf(stderr,"\n# L%02d best fit was %04x=%c dist=%3d%% i=%d",
2054 box2->line, (int)bc, (char)((bc<128)?bc:'_'), dist, i);
2055 if(box4->num_ac>0)fprintf(stderr," w= %3d%%",box4->wac[0]);
2059 } end_for_each(&(JOB->res.boxlist));
2062 if (JOB->cfg.verbose)
2063 fprintf(stderr, " - found %d (nC=%d)\n", i, ii);
2068 // ---- divide overlapping chars which !strchr("_,.:;",c);
2069 // block-splitting (two ore three glued chars)
2070 // division if dots>0 does not work properly! ???
2072 // what about glued "be"?
2073 // what about recursive division?
2074 // ToDo: mark divided boxes to give the engine a chance to
2075 // handle wrong divisions
2077 int try_to_divide_boxes( pix *pp, int mo){
2078 struct box *box2, boxa, boxb;
2079 int cs=JOB->cfg.cs, ad=100,
2080 a2[8], ar, // certainty of each part, ar = product of all certainties
2081 cbest; // best certainty, skip search of certainty<cbest-1 for speed
2082 wchar_t ci[8], // split max. 8 chars
2083 s1[]={ UNKNOWN, '_', '.', ',', '\'', '!', ';', '?', ':', '-',
2084 '=', '(', ')', '/', '\\', '\0' }; // not accepted chars, \0-terminated!
2086 xi[8+1]; // cutting positions
2087 int i, ii, n1, dy, dx;
2088 // pix p=(*pp); // remove!
2089 if (JOB->cfg.verbose)
2090 fprintf(stderr,"# try to divide unknown chars !(mode&16)");
2091 if(!(mo&16)) // put this to the caller
2092 for_each_data(&(JOB->res.boxlist)) {
2093 box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
2094 // don't try to split simple structures (ex: 400x30 square)
2095 if ((!box2->num_frames)
2096 || box2->num_frame_vectors[ box2->num_frames-1 ]<9) continue;
2097 if((box2->c==UNKNOWN || (box2->num_ac && box2->wac[0]<JOB->cfg.certainty))
2098 && box2->x1-box2->x0>5 && box2->y1-box2->y0>4){
2099 x0=box2->x0; x1=box2->x1;
2100 y0=box2->y0; y1=box2->y1;
2104 /* get minimum vertical lines */
2105 n1 = num_cross(x0,x1,( y1+y0)/2,( y1+y0)/2,pp,cs);
2106 ii = num_cross(x0,x1,(3*y1+y0)/4,(3*y1+y0)/4,pp,cs); if (ii<n1) n1=ii;
2107 if (box2->m2 && box2->m3 > box2->m2+2)
2108 for (i=box2->m2+1;i<=box2->m3-1;i++) {
2109 if (loop(pp,x0+1,i,x1-x0,cs,1,RI) > (x1-x0-2)) continue; // ll
2110 ii = num_cross(x0,x1,i,i,pp,cs); if (ii<n1) n1=ii;
2111 } if (n1<2) continue; // seems to make no sense to divide
2112 if (n1<4) ad=99*ad/100; // not to strong because m2+m3 could be wrong
2113 if (n1<3) ad=99*ad/100;
2115 if( 2*y1 < box2->m3+box2->m4 /* baseline char ? */
2116 && num_cross(x0,x1,y1-1,y1-1,pp,cs)==1 // -1 for slopes
2117 && num_cross((x0+2*x1)/3,(x0+3*x1)/4,y0,y1,pp,cs)<3 // not exclude tz
2118 && num_cross((3*x0+x1)/4,(2*x0+x1)/3,y0,y1,pp,cs)<3 // not exclude zl
2119 && loop(pp,x0,y1-(y1-y0)/32,x1-x0,cs,0,RI)
2120 +loop(pp,x1,y1-(y1-y0)/32,x1-x0,cs,0,LE) > (x1-x0+1)/2
2121 ) continue; /* do not try on bvdo"o etc. */
2123 // one vertical line can not be two glued chars, lc?
2124 if ( num_cross(x0,x1,(y1+y0)/2,(y1+y0)/2,pp,cs)<=1 ) continue;
2125 { // doublet = 2 letters
2126 // char buf[4]="\0\0\0"; // 4th byte is string end == \0
2127 // buf[0]=c1; // c1 is wchar_t! (0xbf00 to 0) failes
2129 char buf[64]=""; // end == \0
2130 if (JOB->cfg.verbose&2){
2131 fprintf(stderr, "\n#\n# divide box: %4d %4d %3d %3d\n",
2132 x0, y0, x1-x0+1, y1-y0+1);
2134 // it would be better if testing is only if most right and left char
2135 // is has no horizontal gap (below m2) ex: be
2136 i=0; // num splittet chars
2137 xi[0]=x0; xi[1]=x0+3; xi[2]=x1;
2138 for ( ; ; xi[i+1]++) { // x[i] .. x[i+1], slower? but better v0.42
2139 /* break if x is to near to the right border */
2140 if (xi[i+1]>x1-3) { if (i==0) break; i--; xi[i+2]=x1; continue; }
2141 // ToDo: skip if not a local dy-min for speedup
2142 { int ymin=y1, ymax=y0, bow=0, // min max at cutting point
2143 max0=y0, max1=y0, // max y on left and right side
2144 min0=y1, min1=y1; // min y on left and right side
2145 for (dy=0,ii=0;ii<box2->num_frame_vectors[ 0 ];ii++) {
2146 int pre=ii-1, next=(ii+1)%box2->num_frame_vectors[ 0 ];
2147 if (pre<0) pre=box2->num_frame_vectors[ 0 ]-1;
2148 // check if vector is inside box to cut
2149 if ( box2->frame_vector[ii ][0]<=xi[i ]) continue;
2150 if ( box2->frame_vector[ii ][0]> xi[i+2]) continue;
2151 // 2nd derivation of y(x)
2152 if (abs(box2->frame_vector[ii ][0]-xi[i+1])<2) {
2153 dy= 2*box2->frame_vector[ii ][1]
2154 -box2->frame_vector[next][1]
2155 -box2->frame_vector[pre ][1];
2156 dx= box2->frame_vector[next][0]
2157 -box2->frame_vector[pre ][0];
2158 // rotate 180 degree if dx<0
2159 if (((dx>0)?dy:-dy)<-abs(dx)/2) { bow=1; }
2161 // its not the best if we think on glued fi fo etc.
2162 if (( box2->frame_vector[pre ][0]<=xi[i+1]
2163 && box2->frame_vector[next][0]>=xi[i+1])
2164 || ( box2->frame_vector[pre ][0]>=xi[i+1]
2165 && box2->frame_vector[next][0]<=xi[i+1])) {
2166 if ( box2->frame_vector[ii ][1]>ymax)
2167 ymax= box2->frame_vector[ii ][1];
2168 if ( box2->frame_vector[ii ][1]<ymin)
2169 ymin= box2->frame_vector[ii ][1];
2171 // min and max of left and right side
2172 if ( box2->frame_vector[ii ][1]>max0
2173 && box2->frame_vector[ii ][0]<=xi[i+1])
2174 max0=box2->frame_vector[ii ][1];
2175 if ( box2->frame_vector[ii ][1]>max1
2176 && box2->frame_vector[ii ][0]> xi[i+1])
2177 max1=box2->frame_vector[ii ][1];
2178 if ( box2->frame_vector[ii ][1]<min0
2179 && box2->frame_vector[ii ][0]<=xi[i+1])
2180 min0=box2->frame_vector[ii ][1];
2181 if ( box2->frame_vector[ii ][1]<min1
2182 && box2->frame_vector[ii ][0]> xi[i+1])
2183 min1=box2->frame_vector[ii ][1];
2185 if(JOB->cfg.verbose&2)
2186 fprintf(stderr,"\n# test if to split at x%d= %2d %2d %2d"
2187 " bow,(max-min)[i,0,1] %d %3d %3d %3d"
2188 , i, xi[i]-x0, xi[i+1]-x0, xi[i+2]-x0, bow, ymax-ymin, max0-min0, max1-min1);
2189 /* skip if no local minimum at xi[i+1] or if its not thin enough */
2190 if (bow==0 || 4*(ymax-ymin)>2*(y1-y0)) continue;
2191 // cuttet parts should have about the same height (max-min)
2192 // we dont want to cut an 'n' in three parts!
2193 if (2*(max0-min0+1)<(y1-y0+1)) continue; // left height
2194 if (2*(max1-min1+1)<(y1-y0+1)) continue; // right height
2195 // ToDo: thickness on xi[i+1]?
2197 // try to split successive right box if left box is recognised,
2198 // else shift the splitting point further to the right border
2199 // removing ->dots if dot only above one char !!! ??? not implemented
2200 if(JOB->cfg.verbose&2)
2201 fprintf(stderr,"\n# try to split, newbox[%d].x= %2d ... %2d "
2202 "dy= %d ", i, xi[i]-x0, xi[i+1]-x0, dy);
2203 boxa=*box2; // copy contents, ToDo: reset ac-list (in cut_box?)
2204 boxa.x=xi[i]; boxa.y=y0; // obsolete? mark pixel, overlap?
2205 boxa.x0=xi[i];boxa.x1=xi[i+1]; // new horizontal box range
2206 cut_box(&boxa); boxa.num_ac=0;
2208 // get wchar + certainty
2209 ci[i]=whatletter(&boxa,cs,0); a2[i]=testac(&boxa,ci[i]);
2210 if(JOB->cfg.verbose&2)
2211 fprintf(stderr,"\n# certainty %d limit= %d cbest= %d ",
2212 a2[i], JOB->cfg.certainty, cbest);
2213 if (a2[i]<JOB->cfg.certainty || a2[i]<cbest-1
2214 || wcschr(s1,ci[i]) ) { continue; } // dont split here
2216 for (ar=ad,ii=0;ii<=i;ii++) {
2217 ar=a2[ii]*ar/100; } // multiply all probabilities
2218 if (ar<98*JOB->cfg.certainty/100 || ar<cbest) {
2219 continue; } // dont go deeper, no longer string
2221 i++; if (i==8) break; // maximum splits
2222 if (i==4) break; // at the moment its to slow to go further
2223 if (i+1<8) xi[i+1]=x1; // right border of next box
2224 if (i+2<8) xi[i+2]=x1;
2226 if(JOB->cfg.verbose&2)
2227 fprintf(stderr,"\n try end split [%d]=%d [%d]=%d ",
2228 i, xi[i]-x0, i+1, xi[i+1]-x0);
2229 boxb=*box2; // try rest if it has to be split again
2230 boxb.x=xi[i]+1; boxb.y=y0;
2231 boxb.x0=xi[i]+1;boxb.x1=xi[i+1];
2232 cut_box(&boxb); boxb.num_ac=0;
2233 ci[i]=whatletter(&boxb,cs,0); a2[i]=testac(&boxb,ci[i]);
2234 if (a2[i]<JOB->cfg.certainty || a2[i]<cbest-1
2235 || wcschr(s1,ci[i]) ) { xi[i+1]=xi[i]+2; continue; } // split rest
2236 // now we have everything splittet
2238 if(JOB->cfg.verbose&2) {
2239 fprintf(stderr,"\n split at/to: ");
2240 for (ii=0;ii<=i;ii++)
2241 fprintf(stderr," %2d %s (%3d)", xi[ii+1]-x0,
2242 decode(ci[ii],ASCII), a2[ii]);
2243 fprintf(stderr,"\n");
2245 // boxa..c changed!!! dots should be modified!!!
2246 // Question: cut it into boxes v0.40 or set a string v0.41?
2247 // new way of building a string v0.41 (can call setas multiple)
2248 // usefull if compare unknown with known strings (except barcode?)
2249 // ToDo: also create alternate variants? ex: I <-> l
2250 for (buf[0]=0,ar=ad,ii=0;ii<=i;ii++) {
2251 ar=a2[ii]*ar/100; // multiply all probabilities
2252 if (i>0 && ci[ii]=='n' && ci[ii-1]=='r') ar--; // m == rn
2253 strncat(buf,decode(ci[ii],JOB->cfg.out_format),20);
2256 if (ar>cbest) cbest=ar; // best (highest) certainty found
2257 // reduce, but not if we cross certainty border
2258 if (99*ar/100 > JOB->cfg.certainty) ar=99*ar/100;
2259 if (JOB->cfg.verbose&2)
2260 fprintf(stderr,"\n split result= %s (%3d) ",buf, ar);
2261 setas(box2,buf,ar); // char *, does it disturb further splitting?
2267 } end_for_each(&(JOB->res.boxlist));
2268 if(JOB->cfg.verbose)fprintf(stderr,", numC %d\n",JOB->res.numC);
2273 // ---- divide vertical glued boxes (ex: g above T);
2275 int divide_vert_glued_boxes( pix *pp, int mo){
2276 struct box *box2,*box3,*box4;
2277 int y0,y1,y,dy,flag_found,dx;
2278 if(JOB->cfg.verbose)fprintf(stderr,"# divide vertical glued boxes");
2279 for_each_data(&(JOB->res.boxlist)) {
2280 box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
2281 if (box2->c != UNKNOWN) continue; /* dont try on pictures */
2282 y0=box2->y0; y1=box2->y1; dy=y1-y0+1;
2283 dx=4*(JOB->res.avX+box2->x1-box2->x0+1); // we want to be sure to look at 4ex distance
2284 if ( dy>2*JOB->res.avY && dy<6*JOB->res.avY && box2->m1
2285 && y0<=box2->m2+2 && y0>=box2->m1-2
2286 && y1>=box2->m4+JOB->res.avY-2)
2287 { // test if lower end fits one of the other lines?
2288 box4=box2; flag_found=0;
2289 for_each_data(&(JOB->res.boxlist)) {
2290 box4 = (struct box *)list_get_current(&(JOB->res.boxlist));
2291 if (box4->c != UNKNOWN) continue; /* dont try on pictures */
2292 if (box4->x1<box2->x0-dx || box4->x0>box2->x1+dx) continue; // ignore far boxes
2293 if (box4->line==box2->line ) flag_found|=1; // near char on same line
2294 if (box4->line==box2->line+1) flag_found|=2; // near char on next line
2295 if (flag_found==3) break; // we have two vertical glued chars
2296 } end_for_each(&(JOB->res.boxlist));
2297 if (flag_found!=3) continue; // do not divide big chars or special symbols
2298 y=box2->m4; // lower end of the next line
2299 if(JOB->cfg.verbose&2){
2300 fprintf(stderr,"\n# divide box below y=%4d",y-y0);
2302 // --- insert box3 before box2
2303 box3= (struct box *) malloc_box(box2);
2305 box2->y0=y+1; box2->line++; // m1..m4 should be corrected!
2306 if (box4->line == box2->line){
2307 box2->m1=box4->m1; box2->m2=box4->m2;
2308 box2->m3=box4->m3; box2->m4=box4->m4;
2310 box3->num=JOB->res.numC;
2311 if (list_ins(&(JOB->res.boxlist), box2, box3)) {
2312 fprintf(stderr,"ERROR list_ins\n"); };
2315 } end_for_each(&(JOB->res.boxlist));
2316 if(JOB->cfg.verbose)fprintf(stderr,", numC %d\n",JOB->res.numC);
2322 on some systems isupper(>255) cause a segmentation fault SIGSEGV
2323 therefore this function
2324 ToDo: should be replaced (?) by wctype if available on every system
2326 int wisupper(wchar_t cc){ return ((cc<128)?isupper(cc):0); }
2327 int wislower(wchar_t cc){ return ((cc<128)?islower(cc):0); }
2328 int wisalpha(wchar_t cc){ return ((cc<128)?isalpha(cc):0); }
2329 int wisdigit(wchar_t cc){ return ((cc<128)?isdigit(cc):0); }
2330 int wisspace(wchar_t cc){ return ((cc<128)?isspace(cc):0); }
2332 /* set box2->c to cc if cc is in the ac-list of box2, return 1 on success */
2333 int setc(struct box *box2, wchar_t cc){
2335 w1=((box2->num_ac) ? box2->wac[0] : 0); // weight of replaced char
2337 if (JOB->cfg.verbose)
2338 fprintf(stderr, "\n# change %s (%d) to %s (%d) at (%d,%d)",
2339 decode(box2->c,ASCII), w1, decode(cc,ASCII), w2, box2->x0, box2->y0);
2340 if (w2) { if (box2->c!=cc) { ret=1; setac(box2,cc,(100+w2)/2); } }
2341 // if(JOB->cfg.verbose & 4) out_x(box2);
2342 // ToDo: modify per setac (shift ac)
2347 /* ---- proof difficult chars Il1 by context view ----
2348 context: separator, number, vowel, nonvowel, upper case ????
2349 could be also used to find unknown chars if the environment (nonumbers)
2350 can be found in other places!
2352 - box->tac[] as set of possible chars, ac set by engine, example:
2353 ac="l/" (not "Il|/\" because serifs detected and slant>0)
2354 correction only to one of the ac-set (alternative chars)!
2355 - should be language-settable; Unicode compatible
2356 - box2->ad and wac should be changed? (not proper yet)
2358 int context_correction( job_t *job ) {
2359 // const static char
2360 char *l_vowel="aeiouy";
2361 // *l_Vowel="AEIOU",chars if the environment (nonumbers)
2362 char *l_nonvo = "bcdfghjklmnpqrstvwxz";
2363 struct box *box4, *box3, *box2, *prev, *next;
2364 // pix *pp = &(job->src.p);
2365 int nc=0, ns=0; // num corrections
2367 if (job->cfg.verbose)
2368 fprintf(stderr, "# context correction Il1 0O");
2370 for_each_data(&(job->res.boxlist)) {
2371 box2 = (struct box *)list_get_current(&(job->res.boxlist));
2372 if (box2->c > 0xFF) continue; // temporary UNICODE fix
2373 prev = (struct box *)list_get_cur_prev(&(job->res.boxlist));
2374 next = (struct box *)list_get_cur_next(&(job->res.boxlist));
2375 if( (prev) && (prev->c > 0xFF)) continue; // temporary UNICODE fix 2
2376 if( (next) && (next->c > 0xFF)) continue; // temporary UNICODE fix 3
2377 if (box2->num_ac<2) continue; // no alternatives
2378 if (box2->wac[0]==100 && box2->wac[1]<100) continue;
2379 if (box2->num_ac && box2->tas[0]) continue; // buggy space_remove 0.42
2381 /* check for Il1| which are general difficult to distinguish */
2382 /* bbg: not very good. Should add some tests to check if is preceded by '.',
2384 /* ToDo: only correct if not 100% sure (wac[i]<100)
2385 and new char is in wat[] */
2386 if (strchr("Il1|", box2->c) && next && prev) {
2387 // if( strchr(" \n",prev->c) // SPC
2388 // && strchr(" \n",next->c) ) box2->c='I'; else // bad idea! I have ...
2389 if (wisalpha(next->c) && next->c!='i' &&
2390 ( prev->c == '\n' ||
2392 ( box4=(struct box *)list_prev(&(job->res.boxlist), prev)) &&
2393 box4->c == '.' ) ) ) { nc+=setc(box2,(wchar_t)'I'); }
2394 else if (box2->c!='1' && strchr(l_nonvo,next->c) &&
2395 strchr("\" \n",prev->c)) /* lnt => Int, but 1st */
2396 /* do not change he'll to he'Il! */
2397 { nc+=setc(box2,(wchar_t)'I'); } // set box2->c to 'I' if 'I' is in the ac-list
2398 else if (strchr(l_vowel,next->c)) /* unusual? Ii Ie Ia Iy Iu */
2399 /* && strchr("KkBbFfgGpP",prev->c)) */ /* kle Kla Kli */
2400 { nc+=setc(box2,(wchar_t)'l'); }
2401 else if (wisupper(next->c)
2402 && !strchr("O0I123456789",next->c)
2403 && !strchr("O0I123456789",prev->c)) /* avoid lO => IO (10) */
2404 { nc+=setc(box2,(wchar_t)'I'); }
2405 else if (wislower(prev->c))
2406 { nc+=setc(box2,(wchar_t)'l'); }
2407 else if (wisdigit(prev->c) || wisdigit(next->c)
2408 || (next->c=='O' && !wisalpha(prev->c))) /* lO => 10 */
2409 { nc+=setc(box2,(wchar_t)'1'); }
2413 else if (strchr("O0", box2->c) && next && prev) {
2414 if (wisspace(prev->c) && wisalpha(next->c)) /* initial letter */
2415 { nc+=setc(box2,(wchar_t)'O'); }
2416 else if (wisalpha(prev->c) && wisalpha(next->c)
2417 && wisupper(next->c)) /* word in upper case */
2418 { nc+=setc(box2,(wchar_t)'O'); }
2419 else if (wisdigit(prev->c) || wisdigit(next->c))
2420 { nc+=setc(box2,(wchar_t)'0'); }
2424 else if (strchr("5S", box2->c) && next && prev) {
2425 if (wisspace(prev->c) && wisalpha(next->c)) /* initial letter */
2426 { nc+=setc(box2,(wchar_t)'S'); }
2427 else if (wisalpha(prev->c) && wisalpha(next->c)
2428 && wisupper(next->c)) /* word in upper case */
2429 { nc+=setc(box2,(wchar_t)'S'); }
2430 else if (wisdigit(prev->c) || wisdigit(next->c))
2431 { nc+=setc(box2,(wchar_t)'5'); }
2434 /* was a space not found? xXx => x Xx ??? */
2435 if (wisupper(box2->c) && next && prev) {
2436 if (wislower(prev->c) && wislower(next->c)
2437 && 2 * (box2->x0 - prev->x1) > 3 * (next->x0 - box2->x1)) {
2438 struct box *box3 = malloc_box((struct box *) NULL);
2439 box3->x0 = prev->x1 + 2;
2440 box3->x1 = box2->x0 - 2;
2441 box3->y0 = box2->y0;
2442 box3->y1 = box2->y1;
2443 box3->x = box2->x0 - 1;
2446 box3->num_boxes = 0;
2447 box3->num_subboxes = 0;
2450 setac(box3,' ',99); /* ToDo: weight depends from distance */
2452 box3->line = prev->line;
2453 box3->m1 = box3->m2 = box3->m3 = box3->m4 = 0;
2454 box3->p = &(job->src.p);
2455 list_ins(&(job->res.boxlist), box2, box3);
2459 /* a space before punctuation? but not " ./file" */
2461 if (prev->c == ' ' && strchr(" \n" , next->c)
2462 && strchr(".,;:!?)", box2->c))
2463 if (prev->x1 - prev->x0 < 2 * job->res.avX) { // carefully on tables
2465 if ( !list_del(&(job->res.boxlist), box3) ) free_box(box3);
2466 prev = (struct box *)list_get_cur_prev(&(job->res.boxlist));
2472 if ( (prev->c == '`' || prev->c == '\'')
2473 && (box2->c == '`' || box2->c == '\'') )
2474 if (prev->x1 - box2->x0 < job->res.avX) { // carefully on tables
2477 list_del(&(job->res.boxlist), box3);
2480 } end_for_each(&(job->res.boxlist));
2481 if (job->cfg.verbose)
2482 fprintf(stderr, " num_corrected= %d removed_spaces= %d\n", nc, ns);
2487 /* ---- insert spaces ----
2488 * depends strongly from the outcome of measure_pitch()
2489 * ------------------------ */
2490 int list_insert_spaces( pix *pp, job_t *job ) {
2491 int i=0, j1, j2, i1, maxline=-1, dy=0; char cc;
2492 struct box *box2, *box3=NULL, *box4=NULL;
2494 // measure mean line height
2495 for(i1=1;i1<job->res.lines.num;i1++) {
2496 dy+=job->res.lines.m4[i1]-job->res.lines.m1[i1]+1;
2497 } if (job->res.lines.num>1) dy/=(job->res.lines.num-1);
2499 for(i1=1;i1<job->res.lines.num;i1++) {
2500 j1=job->res.lines.m4[i1]-job->res.lines.m1[i1]+1;
2501 if (j1>dy*120/100 || j1<dy*80/100) continue; // only most frequently
2503 } if (i>0 && j2/i>7) dy=j2/i;
2504 if( job->cfg.verbose&1 )
2505 fprintf(stderr,"# insert space between words (dy=%d) ...",dy);
2506 if (!dy) dy=(job->res.avY)*110/100+1;
2509 for_each_data(&(job->res.boxlist)) {
2510 box2 =(struct box *)list_get_current(&(job->res.boxlist));
2512 if (box2->line>maxline) { // lines and chars must be sorted!
2513 if (maxline>=0) cc='\n'; // NL
2516 if((box3 = (struct box *)list_prev(&(job->res.boxlist), box2))){
2517 if (maxline && !box2->line && cc==0) cc=' ';
2518 if (box2->line<=maxline && cc==0) { // lines and chars must be sorted!
2519 int thispitch = job->res.lines.pitch[box2->line];
2520 int thismono = job->res.lines.mono[box2->line];
2521 int mdist = (box2->x1 + box2->x0 - (box3->x1 + box3->x0) + 1)/2;
2522 int pdist = box2->x0 - box3->x1 + 1;
2523 if (box2->x1 - box2->x0 < thispitch) pdist=pdist*4/3;
2524 /* allow extra pixels around small characters .,'!: etc */
2525 // fprintf(stderr,"#\n ... mono= %2d pitch= %2d mdist= %2d pdist= %2d",
2526 // thismono, thispitch, mdist, pdist);
2527 if ((thismono!=0 && mdist >= thispitch)
2528 || (thismono==0 && pdist >= thispitch))
2529 cc=' '; // insert SPACE
2533 box4=(struct box *)list_prev(&(job->res.boxlist), box2);
2534 box3=(struct box *)malloc_box(NULL);
2535 box3->x0=box2->x0-2; box3->x1=box2->x0-2;
2536 box3->y0=box2->y0; box3->y1=box2->y1;
2537 if(cc!='\n' && box4)
2538 box3->x0=box4->x1+2;
2539 if(cc=='\n' || !box4)
2540 box3->x0=job->res.lines.x0[box2->line];
2541 if(cc=='\n' && box4){
2542 box3->y0=box4->y1; // better use lines.y1[box2->pre] ???
2545 box3->x =box2->x0-1; box3->y=box2->y0;
2546 box3->dots=0; box3->c=cc;
2547 box3->num_boxes = 0;
2548 box3->num_subboxes = 0;
2549 box3->modifier='\0';
2550 box3->num=-1; box3->line=box2->line;
2551 box3->m1=box2->m1; box3->m2=box2->m2;
2552 box3->m3=box2->m3; box3->m4=box2->m4;
2554 setac(box3,cc,99); /* ToDo: weight depends from distance */
2555 list_ins(&(job->res.boxlist),box2,box3);
2556 if( job->cfg.verbose&1 ) {
2557 fprintf(stderr,"\n# insert space &%d; at x= %4d %4d box= %p",
2558 (int)cc, box3->x0, box3->y0, (void*)box3);
2563 } end_for_each(&(job->res.boxlist));
2564 if( job->cfg.verbose&1 ) fprintf(stderr," found %d\n",i);
2570 add infos where the box is positioned to the box
2571 this is useful for better recognition
2573 int add_line_info(/* List *boxlist2 */){
2574 // pix *pp=&JOB->src.p;
2575 struct tlines *lines = &JOB->res.lines;
2577 int i,xx,m1,m2,m3,m4,num_line_members=0,num_rest=0;
2578 if( JOB->cfg.verbose&1 ) fprintf(stderr,"# add line infos to boxes ...");
2579 for_each_data(&(JOB->res.boxlist)) {
2580 box2 =(struct box *)list_get_current(&(JOB->res.boxlist));
2581 for(i=1;i<JOB->res.lines.num;i++) /* line 0 is a place holder */
2583 if (lines->dx) xx=lines->dy*((box2->x1+box2->x0)/2)/lines->dx; else xx=0;
2588 // fprintf(stderr," test line %d m1=%d %d %d %d\n",i,m1,m2,m3,m4);
2589 if (m4-m1==0) continue; /* no text line (line==0) */
2591 if( box2->y1+2*JOB->res.avY >= m1
2592 && box2->y0-2*JOB->res.avY <= m4 ) /* not to far away */
2594 /* give also a comma behind the line a chance */
2595 if( box2->x0 >= lines->x0[i] && box2->x1 <= lines->x1[i]+JOB->res.avX )
2596 if( box2->m2==0 || abs(box2->y0-box2->m2) > abs(box2->y0-m2) )
2597 { /* found nearest line */
2605 if( box2->y1+2 < box2->m1
2606 || box2->y0 < box2->m1 - (box2->m3-box2->m1)/2
2607 || box2->y0-2 > box2->m4
2608 || box2->y1 > box2->m3 + (box2->m3-box2->m1)
2617 } else num_line_members++;
2618 } end_for_each(&(JOB->res.boxlist));
2619 if( JOB->cfg.verbose&1 )
2620 fprintf(stderr," done, num_line_chars=%d rest=%d\n",
2621 num_line_members, num_rest);
2627 * bring the boxes in right order
2628 * add_line_info must be executed first!
2630 int sort_box_func (const void *a, const void *b) {
2631 struct box *boxa, *boxb;
2633 boxa = (struct box *)a;
2634 boxb = (struct box *)b;
2636 if ( ( boxb->line < boxa->line ) ||
2637 ( boxb->line == boxa->line && boxb->x0 < boxa->x0 ) )
2642 // -------------------------------------------------------------
2643 // ------ use this for entry from other programs
2644 // include pnm.h pgm2asc.h
2645 // -------------------------------------------------------------
2646 // entry point for gocr.c or if it is used as lib
2647 // better name is call_ocr ???
2648 // jb: OLD COMMENT: not removed due to set_options_* ()
2649 // args after pix *pp should be removed and new functions
2650 // set_option_mode(int mode), set_option_spacewidth() .... etc.
2651 // should be used instead, before calling pgm2asc(pix *pp)
2652 // ! change if you can ! - used by X11 frontend
2653 int pgm2asc(job_t *job)
2656 progress_counter_t *pc;
2659 /* FIXME jb: remove pp */
2662 if( job->cfg.verbose )
2663 fprintf(stderr, "# db_path= %s\n", job->cfg.db_path);
2665 pc = open_progress(100,"pgm2asc_main");
2666 progress(0,pc); /* start progress output 0% 0% */
2668 /* ----- count colors ------ create histogram -------
2669 - this should be used to create a upper and lower limit for cs
2670 - cs is the optimum gray value between cs_min and cs_max
2671 - also inverse scans could be detected here later */
2673 job->cfg.cs=otsu( pp->p,pp->y,pp->x,0,0,pp->x,pp->y, job->cfg.verbose & 1 );
2674 /* renormalize the image and set the normalized threshold value */
2675 job->cfg.cs=thresholding( pp->p,pp->y,pp->x,0,0,pp->x,pp->y, job->cfg.cs );
2676 if( job->cfg.verbose )
2677 fprintf(stderr, "# thresholding new_threshold= %d\n", job->cfg.cs);
2679 progress(5,pc); /* progress is only estimated */
2681 #if 0 /* dont vast memory */
2682 /* FIXME jb: malloc */
2683 if ( job->cfg.verbose & 32 ) {
2684 // generate 2nd imagebuffer for debugging output
2685 job->tmp.ppo.p = (unsigned char *)malloc(job->src.p.y * job->src.p.x);
2687 assert(job->tmp.ppo.p);
2688 copybox(&job->src.p,
2689 0, 0, job->src.p.x, job->src.p.y,
2691 job->src.p.x * job->src.p.y);
2694 job->tmp.ppo=job->src.p; /* temporarely, removed later */
2697 /* load character data base */
2698 if ( job->cfg.mode&2 )
2701 /* this is first step for reorganize the PG
2702 ---- look for letters, put rectangular frames around letters
2703 letter = connected points near color F
2704 should be used by dust removing (faster) and line detection!
2705 ---- 0..cs = black letters, last change = Mai99 */
2707 progress(8,pc); /* progress is only estimated */
2710 if ( !job->res.numC ){
2711 fprintf( stderr,"# no boxes found - stopped\n" );
2712 //if(job->cfg.verbose&32) debug_img("out01",job,8);
2713 /***** should free stuff, etc) */
2716 // if (job->cfg.verbose&32) debug_img("out00",job,4+8);
2718 progress(10,pc); /* progress is only estimated */
2719 // if(job->cfg.verbose&32) debug_img("out01",job,4+8);
2720 // output_list(job); // for debugging
2721 // ToDo: matrix printer preprocessing
2723 remove_dust( job ); /* from the &(job->res.boxlist)! */
2724 // if(job->cfg.verbose&32) debug_img("out02",job,4+8);
2725 // output_list(job); // for debugging
2726 smooth_borders( job ); /* only for big chars */
2727 progress(12,pc); /* progress is only estimated */
2728 // if(job->cfg.verbose&32) debug_img("out03",job,4+8);
2729 // output_list(job); // for debugging
2731 //detect_barcode( job ); /* mark barcode */
2732 // if(job->cfg.verbose&32) debug_img("out04",job,4+8);
2733 // output_list(job); // for debugging
2735 detect_pictures( job ); /* mark pictures */
2736 // if(job->cfg.verbose&32) debug_img("out05",job,4+8);
2737 // output_list(job); // for debugging
2739 remove_pictures( job ); /* do this as early as possible, before layout */
2740 // if(job->cfg.verbose&32) debug_img("out06",job,4+8);
2741 // output_list(job); // for debugging
2743 glue_holes_inside_chars( pp ); /* including count subboxes (holes) */
2745 detect_rotation_angle( job );
2747 #if 1 /* Rotate the whole picture! move boxes */
2748 if( job->res.lines.dy!=0 ){ // move down lowest first, move up highest first
2749 // in work! ??? (at end set dy=0) think on ppo!
2752 detect_text_lines( pp, job->cfg.mode ); /* detect and mark JOB->tmp.ppo */
2753 // if(job->cfg.verbose&32) debug_img("out07",job,4+8);
2754 progress(20,pc); /* progress is only estimated */
2756 add_line_info(/* &(job->res.boxlist) */);
2757 //if (job->cfg.verbose&32) debug_img("out10",job,4+8);
2759 divide_vert_glued_boxes( pp, job->cfg.mode); /* after add_line_info, before list_sort! */
2760 // if(job->cfg.verbose&32) debug_img("out11",job,0);
2762 remove_melted_serifs( pp ); /* make some corrections on pixmap */
2763 /* list_ins seems to sort in the boxes on the wrong place ??? */
2764 // if(job->cfg.verbose&32) debug_img("out12",job,4+8);
2766 glue_broken_chars( pp ); /* 2nd glue */
2767 // if(job->cfg.verbose&32) debug_img("out14",job,4+8);
2769 remove_rest_of_dust( );
2770 // if(job->cfg.verbose&32) debug_img("out15",job,4+8);
2772 /* better sort after dust is removed (slow for lot of pixels) */
2773 list_sort(&(job->res.boxlist), sort_box_func);
2775 measure_pitch( job );
2777 if(job->cfg.mode&64) find_same_chars( pp );
2778 progress(30,pc); /* progress is only estimated */
2779 // if(job->cfg.verbose&32) debug_img("out16",job,4+8);
2781 char_recognition( pp, job->cfg.mode);
2782 progress(60,pc); /* progress is only estimated */
2783 // if(job->cfg.verbose&32) debug_img("out17",job,4+8);
2785 if ( adjust_text_lines( pp, job->cfg.mode ) ) { /* correct using chars */
2786 /* may be, characters/pictures have changed line number */
2787 list_sort(&(job->res.boxlist), sort_box_func);
2788 // 2nd recognition call if lines are adjusted
2789 char_recognition( pp, job->cfg.mode);
2792 #define BlownUpDrawing 1 /* german: Explosionszeichnung, temporarly */
2793 #if BlownUpDrawing == 1 /* german: Explosionszeichnung */
2794 { /* just for debugging */
2795 int i,ii,ni; struct box *box2;
2797 for_each_data(&(JOB->res.boxlist)) { /* count boxes */
2798 box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
2799 if (box2->c==UNKNOWN) i++;
2800 if (box2->c==PICTURE) ii++;
2802 } end_for_each(&(JOB->res.boxlist));
2803 if (JOB->cfg.verbose)
2804 fprintf(stderr,"# debug: unknown= %d picts= %d boxes= %d\n",i,ii,ni);
2807 // ----------- write out20.pgm ----------- mark lines + boxes
2808 //if (job->cfg.verbose&32) debug_img("out20",job,1+4+8);
2810 compare_unknown_with_known_chars( pp, job->cfg.mode);
2811 progress(70,pc); /* progress is only estimated */
2813 try_to_divide_boxes( pp, job->cfg.mode);
2814 progress(80,pc); /* progress is only estimated */
2816 /* --- list output ---- for debugging --- */
2817 //if (job->cfg.verbose&6) output_list(job);
2819 /* ---- insert spaces ---- */
2820 list_insert_spaces( pp , job );
2822 // ---- proof difficult chars Il1 by context view ----
2823 if (JOB->cfg.verbose)
2824 fprintf(stderr,"# context correction if !(mode&32)\n");
2825 if (!(job->cfg.mode&32)) context_correction( job );
2827 store_boxtree_lines( job->cfg.mode );
2828 progress(90,pc); /* progress is only estimated */
2830 /* 0050002.pgm.gz ca. 109 digits, only 50 recognized (only in lines?)
2831 * ./gocr -v 39 -m 56 -e - -m 4 -C 0-9 -f XML tmp0406/0050002.pbm.gz
2832 * awk 'BEGIN{num=0}/1<\/box>/{num++;}END{print num}' o
2833 * 15*0 24*1 18*2 19*3 15*4 6*5 6*6 6*7 4*8 8*9 sum=125digits counted boxes
2834 * 9*0 19*1 14*2 15*3 11*4 6*5 5*6 6*7 4*8 8*9 sum=97digits recognized
2835 * 1*1 1*7 not recognized (Oct04)
2836 * 33*SPC 76*NL = 109 spaces + 36*unknown sum=241 * 16 missed
2838 #if BlownUpDrawing == 1 /* german: Explosionszeichnung */
2839 { /* just for debugging */
2840 int i,ii,ni; struct box *box2; const char *testc="0123456789ABCDEFGHIJK";
2842 for_each_data(&(JOB->res.boxlist)) { /* count boxes */
2843 box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
2844 if (box2->c==UNKNOWN) i++;
2845 if (box2->c==PICTURE) ii++;
2846 if (box2->c>' ' && box2->c<='z') ni++;
2847 } end_for_each(&(JOB->res.boxlist));
2848 if(JOB->cfg.verbose)
2849 fprintf(stderr,"# debug: (_)= %d picts= %d chars= %d",i,ii,ni);
2850 for (i=0;i<20;i++) {
2852 for_each_data(&(JOB->res.boxlist)) { /* count boxes */
2853 box2 = (struct box *)list_get_current(&(JOB->res.boxlist));
2854 if (box2->c==testc[i]) ni++;
2855 } end_for_each(&(JOB->res.boxlist));
2856 if(JOB->cfg.verbose && ni>0)
2857 fprintf(stderr," (%c)=%d",testc[i],ni);
2859 if(JOB->cfg.verbose)
2860 fprintf(stderr,"\n");
2864 // ---- frame-size-histogram
2865 // ---- (my own defined) distance between letters
2866 // ---- write internal picture of textsite
2867 // ----------- write out30.pgm -----------
2868 //if( job->cfg.verbose&32 ) debug_img("out30",job,2+4);
2870 progress(100,pc); /* progress is only estimated */
2874 return 0; /* what should I return? error-state? num-of-chars? */