-
Notifications
You must be signed in to change notification settings - Fork 154
Expand file tree
/
Copy pathmarkSystem.cpp
More file actions
1990 lines (1807 loc) · 75.6 KB
/
markSystem.cpp
File metadata and controls
1990 lines (1807 loc) · 75.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// markSystem.cpp - annotates the dictionary with what words/concepts are active in the current sentence
#include "common.h"
#ifdef INFORMATION
For every word in a sentence, the word knows it can be found somewhere in the sentence, and there is a 64-bit field of where it can be found in that sentence.
The field is in a hashmap and NOT in the dictionary word, where it would take up excessive memory.
Adjectives occur before nouns EXCEPT:
1. object complement (with some special verbs)
2. adjective participle (sometimes before and sometimes after)
In a pattern, an author can request:
1. a simple word like bottle
2. a form of a simple word non - canonicalized like bottled or apostrophe bottle
3. a WordNet concept like bottle~1
4. a set like ~dead or : dead
For #1 "bottle", the system should chase upward through all sets of the word itself, and all
WordNet parents of the synset it belongs to and all sets those are in.
Marking should be done for the original and canonical forms of the word.
For #2 "bottled", the system should only chase the original form.
For #3 "bottle~1", this means all words BELOW this in the wordnet hierarchy not including the word
"bottle" itself. This, in turn, means all words below the particular synset head it corresponds to
and so instead becomes a reference to the synset head : (char*)"0173335n" or some such.
For #4 "~dead", this means all words encompassed by the set ~dead, not including the word ~dead.
So each word in an input sentence is scanned for marking.
the actual word gets to see what sets it is in directly.
Thereafter the system chases up the synset hierarchy fanning out to sets marked from synset nodes.
#endif
#pragma warning(disable: 4068)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-value"
// Value passed as the "tried" bit set to clear it (see its use in MarkWordHit).
#define GENERIC_MEANING 0 // not a specific meaning of the word
// Sentence position compared against the start index in MarkWordHit; -1 by default.
int verbwordx = -1;
// Set once the mark-limit bug report has been issued so it is only reported one time.
static bool failFired = false;
bool trustpos = false;
// Running count of marks made by MarkWordHit; marking is refused once it exceeds 5000.
int marklimit = 0;
// Maps a dictionary word to the heap index of its tried-bits + sentence-reference block.
// Emptied by ClearWhereInSentence.
std::map <WORDP, HEAPINDEX> triedData; // per volley index into heap space
// List of (fact, start, end) entries queued by AddPendingConcept and consumed by ProcessPendingConcepts.
static HEAPREF pendingConceptList = NULL;
static int MarkSetPath(int depth, int exactWord, MEANING M, unsigned int start, unsigned int end, unsigned int level, int kind); // walks set hierarchy
// mark debug tracing
// When true, MarkWordHit echoes its trace output (ECHOUSERLOG instead of USERLOG).
bool showMark = false;
static unsigned int markLength = 0; // prevent long lines in mark listing trace
// Trace line length after which MarkWordHit starts a new log line.
#define MARK_LINE_LIMIT 80
int upperCount, lowerCount;
ExternalTaggerFunction externalPostagger = NULL;
// Indexed by sentence position; a nonzero entry makes GetNextSpot skip that word.
// GetNextSpot also reads unmarked[0] as an "any word is unmarked" flag.
char unmarked[MAX_SENTENCE_LENGTH]; // can completely disable a word from mark recognition
char oldunmarked[MAX_SENTENCE_LENGTH]; // cached version of marks for ^mark()/^unmark()
/**************************************/
/* Word Reference in Sentence system */
/**************************************/
// Wheredata is a 64bit tried by meaning field (aligned)
// + Sentence references for the word.
// Each dictionary word can have up thru 63 different addressable meanings
// as well as the generic word as a whole. These are the "tried" bits, used
// when marking meanings to avoid redundant sweeps.
// Sentence references are where the word/concept is found in the sentence and
// is used by pattern matching.
// Each reference is 8 bytes
// byte0: start index into sentence
// byte1: end index into sentence
// byte2: if fundamental meaning this is where start of subject in sentence is (fundamental start/end is on the verb)
// byte3: which exact dictionary index (capitalization) matched if exact match is involved
// bytes4-7: int index of dictionary word
// Deletes the first sentence reference of D whose start index equals position.
// The reference block is cloned onto fresh heap space first and D is re-pointed
// at the clone, so the previously stored block is left untouched (keeps a later
// sentence restore safe).
// Returns true when a reference was found and removed, false otherwise.
bool RemoveMatchValue(WORDP D, int position)
{
    HEAPINDEX access = GetAccess(D);
    if (!access) return false;
    unsigned char* refs = (unsigned char*) (Index2Heap(access) + 8); // step past the 64bit tried by meaning field

    // locate the first reference that starts at the requested position
    int slot = 0;
    while (slot < MAXREFSENTENCE_BYTES && refs[slot] != position) slot += REF_ELEMENT_SIZE;
    if (slot >= MAXREFSENTENCE_BYTES) return false; // nothing starts there

    // edit a private copy, never the original block
    HEAPINDEX cloneAccess = CopyWhereInSentence(access);
    unsigned char* edited = (unsigned char*)(Index2Heap(cloneAccess) + 8);
    SetTried(D, cloneAccess);

    // close the gap left by the removed reference
    memmove(edited + slot, edited + slot + REF_ELEMENT_SIZE, (MAXREFSENTENCE_BYTES - slot - REF_ELEMENT_SIZE));
    // the final slot is now stale; force it to be the terminator
    edited[MAXREFSENTENCE_BYTES - REF_ELEMENT_SIZE] = END_OF_REFERENCES;
    return true;
}
// Looks in an already-fetched reference table for an entry that begins exactly
// at sentence position start. Returns the end byte of that entry, or 0 when
// data is NULL or no entry begins there. Entries are ordered by start, so the
// scan stops at the first entry whose start is not below the one requested
// (unused slots hold END_OF_REFERENCES, 0xff).
static unsigned int WhereWordHitWithData(WORDP D,unsigned int start,unsigned char* data)
{
    if (!data) return 0;
    for (unsigned int offset = 0; offset < MAXREFSENTENCE_BYTES; offset += REF_ELEMENT_SIZE)
    {
        unsigned char refStart = data[offset];
        if (refStart < start) continue; // still before the wanted position
        if (refStart == start) return data[offset + 1]; // exact hit: hand back its end
        return 0; // already past it, cannot match later
    }
    return 0;
}
// Returns the end byte of the reference of D that begins exactly at sentence
// position start, or 0 when D has no reference table or no entry begins there.
// (Phrases and other things can be hit multiply; only the entry at start is reported.)
static unsigned int WhereWordHit(WORDP D, unsigned int start)
{
    unsigned char* refs = GetWhereInSentence(D);
    if (!refs) return 0;

    // skip every entry that starts before the wanted position
    unsigned int offset = 0;
    while (offset < MAXREFSENTENCE_BYTES && refs[offset] < start) offset += REF_ELEMENT_SIZE;

    // either we ran off the table, landed on the position, or went past it
    if (offset < MAXREFSENTENCE_BYTES && refs[offset] == start) return refs[offset + 1];
    return 0;
}
bool HasMarks(int start)
{
WORDP D = FindWord(wordStarts[start]); // nominal word there
if (!D) return false;
unsigned char* data = GetWhereInSentence(D); // has 2 hidden int fields before this point
if (!data) return false;
int whereHitEnd = WhereWordHitWithData(D, start, data); // word in sentence index
return whereHitEnd != 0;
}
// Debug aid: prints the start-end byte pair of every slot in the reference
// table of the named word (including unused slots), followed by a line break.
// Prints nothing if the word is unknown or has no reference table.
void ShowMarkData(char* word)
{
    WORDP D = FindWord(word);
    unsigned char* refs = (D) ? GetWhereInSentence(D) : NULL;
    if (!refs) return;

    unsigned char* const limit = refs + MAXREFSENTENCE_BYTES;
    for (unsigned char* ref = refs; ref < limit; ref += REF_ELEMENT_SIZE)
    {
        printf("%u-%u ", ref[0], ref[1]);
    }
    printf("\r\n");
}
// Records that dictionary entry D (a word, or a concept/topic when its name begins with '~')
// is found in the current sentence from position start through position end.
//   depth        - nesting level, used only to indent trace output
//   exactWord    - MEANING stored in the reference as the exact matching word; (MEANING)-1 means use D itself
//   D            - entry being marked; NULL (or an entry without a name) is rejected
//   meaningIndex - which meaning of a real word is being marked (bit index into the tried bits)
//   start, end   - sentence span; end is clamped to wordCount
//   prefix       - when nonzero, an extra entry (start byte 0, second byte = prefix) is inserted ahead of the reference
//   kind         - only consulted for the "(raw)" label in trace output
// Returns true when a reference was inserted, or an existing reference at the same start
// was lengthened. Returns false when D is unusable, the table cannot be allocated,
// this meaning was already tried, the mark limit was exceeded, or nothing new was stored.
bool MarkWordHit(int depth, MEANING exactWord, WORDP D, int meaningIndex, unsigned int start,unsigned int end,unsigned int prefix,unsigned int kind)
{ // keep closest to start at bottom, when run out, drop later ones
if (!D || !D->word) return false;
if (end > wordCount) end = wordCount;
// NOTE(review): special-case refusal for the literal word "verify" at verbwordx - purpose not evident here, confirm
if (start == (unsigned int)verbwordx && !stricmp(wordStarts[start], "verify"))
return false;
// if (*D->word == '~') D->inferMark = inferMark; // we have marked this concept in this position, avoid rescan
// but that code must account for start AND end, like Morphine Sulphate which both single and double trigger stuff
//
// been here before?
unsigned char* data = GetWhereInSentence(D); // has 2 hidden int fields before this point
if (!data) data = (unsigned char*)AllocateWhereInSentence(D);
if (!data) return false; // allocate failure
unsigned int whereHitEnd = WhereWordHitWithData(D, start,data); // word in sentence index
if (*D->word != '~') // real word, not a concept
{
uint64 meaningBit = 1ull << meaningIndex; // convert index into bit
// no reference at start reaches end yet: wipe the tried bits (GENERIC_MEANING is 0)
if (whereHitEnd < end) SetTriedMeaningWithData(GENERIC_MEANING,(unsigned int*)data);
uint64 triedBits = GetTriedMeaning(D);
if (meaningBit & triedBits) return false; // did this meaning already
SetTriedMeaningWithData(triedBits | meaningBit, (unsigned int*)data); // update more
}
//else if (whereHitEnd >= end) return false; // no need since already covering concept in this area
// safety valve against runaway marking; the bug report is issued only once
if (++marklimit > 5000)
{
if (!failFired) ReportBug("INFO: Mark limit hit");
failFired = true;
return false;
}
// diff < 0 means peering INSIDE a multiword token before last word
// we label END as the word before it (so we can still see next word) and START as the actual multiword token
bool added = false;
// the table is kept ordered by start byte; find the slot for this start
for (int i = 0; i < MAXREFSENTENCE_BYTES; i += REF_ELEMENT_SIZE)
{
unsigned char begin = data[i];
if (begin < start) continue; // skip over it
else if (begin == start) // we have already marked this somewhat
{
if (end > data[i+1]) added = true; // prefer the longer match
}
else // insert here
{
if (prefix) // split verbal
{
// no room to store 2 things? we are on last element
if (i == (MAXREFSENTENCE_BYTES - REF_ELEMENT_SIZE)) break;
memmove(data + i + REF_ELEMENT_SIZE, data + i, MAXREFSENTENCE_BYTES - i - REF_ELEMENT_SIZE); // create a hole for entry
// prefix entry: start byte 0, second byte carries the prefix (GetNextSpot reads it back as "disjoint")
data[i] = 0;
data[i + 1] = (unsigned char)prefix;
i += REF_ELEMENT_SIZE;
}
// shifting right drops whatever was in the final slot
memmove(data+i+ REF_ELEMENT_SIZE,data+i,MAXREFSENTENCE_BYTES - i - REF_ELEMENT_SIZE); // create a hole for entry
data[i] = (unsigned char)start;
added = true;
}
if (added)
{
// end is split into a low byte (i+1) and a high byte (i+2)
data[i + 1] = (unsigned char)(end & 0x00ff);
data[i + 2] = (unsigned char) (end >> 8); // for fundamental meanings this is the 3rd field reference
data[i + 3] = 0;
// bytes 4..7 of the reference hold the exact-word MEANING
MEANING* exact = (MEANING*)(data + i + 4);
if (exactWord == (MEANING)-1)
exactWord = Word2Index(D); // stay with this word
*exact = exactWord; // -1 means dont use an alternative lower case, stay upper
}
break; // have location
}
if (added)
{
if (*D->word == '~')// track the actual sets done matching start word location (good for verbs, not so good for nouns)
{
// concepts[i] and topics[i] are lists of word indices
if (!(D->internalBits & TOPIC)) Add2ConceptTopicList(concepts, D, start, end, false); // DOESNT need to be be marked as concept
else Add2ConceptTopicList(topics, D, start, end, false);
}
// trace output; entries named '~' followed by a digit are not traced
if ((trace & (TRACE_PREPARE | TRACE_HIERARCHY) || prepareMode == PREPARE_MODE || showMark) && (D->word[0] != '~' || !IsDigit(D->word[1])))
{
markLength += WORDLENGTH(D);
if (markLength > MARK_LINE_LIMIT)
{
markLength = 0;
Log(USERLOG,"\r\n");
Log(USERLOG,"");
}
// indent by depth+1 units
int d = depth;
while (d-- >= 0) Log((showMark) ? ECHOUSERLOG : USERLOG, " ");
char which[20];
*which = 0;
which[1] = 0;
if (exactWord && D->internalBits & UPPERCASE_HASH) which[0] = '^';
char* kindlabel = "";
char other[MAX_WORD_SIZE];
*other = 0;
// top-level real words also report their language bits
if (depth == 0 && *D->word != '~')
{
char lang[20];
char* status;
status = "";
if (D->foreignFlags) status = "universal multilanguage";
else if (!GET_LANGUAGE_INDEX(D)) status = "universal all";
*lang = 0;
if (multidict) sprintf(lang, "%x", GET_LANGUAGE_INDEX(D));
sprintf(other, " (languagebits 0x%s %s):\r\n", lang, status);
}
if (kind == RAW || kind == RAWCASE) kindlabel = "(raw)";
Log((showMark) ? ECHOUSERLOG : USERLOG, (D->internalBits & TOPIC) ? "+T%s%s " : (char*)" +%s%s", D->word, which);
char* exactd = "";
if (exactWord && D->word[0] == '~') exactd = Meaning2Word(exactWord)->word;
if (prefix) Log((showMark) ? ECHOUSERLOG : USERLOG, " (%d,%d-%d)\r\n", prefix,start, end);
else Log((showMark) ? ECHOUSERLOG : USERLOG," (%d-%d) %s %s %s\r\n", start, end,kindlabel,other,exactd);
markLength = 0;
}
}
return added;
}
// Returns the heap index of the tried-bits/reference block registered for D
// in triedData, or 0 when D has no block.
HEAPINDEX GetAccess(WORDP D)
{
    std::map<WORDP, HEAPINDEX>::iterator found = triedData.find(D);
    if (found != triedData.end()) return found->second; // heap index
    return 0;
}
// Returns a pointer to the sentence-reference bytes of D, i.e. the block
// registered for D advanced past its leading 8-byte tried-by-meaning field.
// Returns NULL when D is NULL or has no block.
unsigned char* GetWhereInSentence(WORDP D)
{
    if (!D) return NULL;
    HEAPINDEX access = GetAccess(D);
    if (!access) return NULL;
    return (unsigned char*)Index2Heap(access) + 8; // skip over 64bit tried by meaning field
}
// Duplicates the tried-bits + sentence-reference block found at heap index
// oldindex into newly allocated heap space (TRIEDDATA_WORDSIZE ints).
// Returns 0 when oldindex resolves to no block; otherwise returns the heap
// index computed for the new allocation.
HEAPINDEX CopyWhereInSentence(int oldindex)
{
    unsigned int* source = (unsigned int*)Index2Heap(oldindex); // original location
    if (!source) return 0;

    // tried-by-meaning field followed by the sentence references
    unsigned int* clone = (unsigned int*)AllocateHeap(NULL, TRIEDDATA_WORDSIZE, 4, false);
    if (clone)
    {
        memcpy((char*)clone, source, TRIEDDATA_WORDSIZE * sizeof(int));
    }
    return Heap2Index((char*)clone);
}
// Forgets all per-sentence marking state: drops every word's tried-bits and
// reference block registration, zeroes the concepts and topics lists, clears
// the unmarked flags and invalidates the cached unmarked copy.
void ClearWhereInSentence()
{
    const size_t listBytes = sizeof(unsigned int) * MAX_SENTENCE_LENGTH;
    memset(concepts, 0, listBytes);
    memset(topics, 0, listBytes);
    triedData.clear(); // erases the WHEREINSENTENCE and the TRIEDBITS registrations
    memset(unmarked, 0, MAX_SENTENCE_LENGTH);
    oldunmarked[255] = 0; // no cache of unmarked in progress
}
// Allocates a fresh tried-bits + sentence-reference block for D, registers it
// in triedData, and returns a pointer just past the two leading tried-bits
// ints (the same position GetWhereInSentence reports). Every reference byte
// starts out as END_OF_REFERENCES and the tried bits start out as zero.
// Returns NULL when the heap allocation fails.
unsigned int* AllocateWhereInSentence(WORDP D)
{
    unsigned int* block = (unsigned int*)AllocateHeap(NULL, TRIEDDATA_WORDSIZE, sizeof(int), false);
    if (!block) return NULL;

    // mark every reference slot as unused, then zero the tried meanings
    memset((char*)block, END_OF_REFERENCES, TRIEDDATA_WORDSIZE * sizeof(int));
    block[0] = 0;
    block[1] = 0;

    // remember where this word's block lives
    int heapIndex = Heap2Index((char*)block); // original index!
    triedData[D] = heapIndex;

    return block + 2; // callers see the reference area, the tried bits stay hidden in front
}
// Stores the 64-bit tried-meanings mask into the two ints that sit directly
// in front of data (high 32 bits first, then low 32 bits). data must be a
// reference-area pointer as returned by GetWhereInSentence/AllocateWhereInSentence.
void SetTriedMeaningWithData(uint64 bits, unsigned int* data)
{
    unsigned int* tried = data - 2; // back up to the tried meaning area
    tried[0] = (unsigned int)(bits >> 32);
    tried[1] = (unsigned int)(bits & 0xffffffff);
}
// Stores the 64-bit tried-meanings mask for D, allocating D's block first if
// it has none. Silently does nothing when that allocation fails.
void SetTriedMeaning(WORDP D, uint64 bits)
{
    unsigned int* refs = (unsigned int*)GetWhereInSentence(D);
    if (!refs) refs = AllocateWhereInSentence(D); // returns past the tried bits of chunk
    if (!refs) return; // failed to allocate

    // the tried bits live in the two ints immediately before the reference area
    refs[-2] = (unsigned int)(bits >> 32);
    refs[-1] = (unsigned int)(bits & 0xffffffff);
}
// Returns the 64-bit mask of meanings already tried for D (high 32 bits come
// from the first int of D's block, low 32 bits from the second), or 0 when D
// has no block.
uint64 GetTriedMeaning(WORDP D)
{
    std::map<WORDP, HEAPINDEX>::iterator found = triedData.find(D);
    if (found == triedData.end()) return 0;

    unsigned int* tried = (unsigned int*)Index2Heap(found->second); // start of block = tried bits
    if (!tried) return 0;

    uint64 high = ((uint64)(tried[0])) << 32;
    uint64 low = (uint64)tried[1];
    return high | low;
}
// Fetches the i-th (0-based) sentence reference of D.
//   start - receives the start position of the reference
//   end   - receives the end position; the extra byte stored with the reference
//           (written by MarkWordHit as end >> 8) is placed in bits 8..15, the
//           same encoding GetNextSpot reports in hitdata->end
// Returns the start position, or 0 when D is NULL, D has no references, i is
// out of range, or the i-th slot is unused. start may already have been
// overwritten when 0 is returned for an unused slot.
unsigned int GetIthSpot(WORDP D, int i, unsigned int& start,unsigned int& end)
{
    if (!D) return 0; // not in sentence
    unsigned char* data = GetWhereInSentence(D);
    if (!data) return 0;
    if (i < 0) return 0; // never index in front of the table
    i *= REF_ELEMENT_SIZE;
    if (i >= MAXREFSENTENCE_BYTES) return 0; // at end
    start = (unsigned char) data[i];
    if (start == END_OF_REFERENCES) return 0;
    end = (unsigned char) data[i + 1];
    if (end > wordCount)
    {
        // report the inconsistency only once per run
        static bool did = false;
        if (!did) ReportBug((char*)"INFO: Getith out of range %s at %d\r\n", D->word, volleyCount);
        did = true;
    }
    // fundamental Meaning extra value: belongs in the high byte, not OR-ed over the low byte
    if (data[i + 2]) end |= ((unsigned int)data[i + 2]) << 8;
    return start;
}
// Builds sentence references for a composite name such as ~pet~tasty (two or
// three pieces joined by '~'): the result keeps only those references of the
// first piece whose start position also appears among the references of the
// second piece (and of the third piece, when present).
// The result is stored in a newly allocated block registered for D.
// Returns the reference data, or 0 (NULL) when D is not such a composite, any
// piece has no references, allocation fails, or nothing is in common.
static unsigned char* DataIntersect(WORDP D)
{
    char* ptr = D->word;
    char* at = strchr(ptr + 1, '~'); // joiner of disparate concepts, like ~pet~tasty
    unsigned char* data = NULL;
    // dont do word~1~concept or word~n~concept
    if (at && at[2]) // word with ~casemarking data added, has no data on its own, not trial~n or trial~1
    {
        WORDP first = FindWord(ptr, (at - ptr)); // the first piece
        data = GetWhereInSentence(first);
        if (!data) return 0;
        size_t len = strlen(at);
        char* at1 = strchr(at + 1, '~'); // and a 3rd piece
        if (at1) len = at1 - at;
        WORDP second = FindWord(at, len); // the 2nd piece
        unsigned char* seconddata = GetWhereInSentence(second);
        if (!seconddata) return 0; // word not found so conjoin cant either
        unsigned char* thirddata = NULL;
        if (at1)
        {
            WORDP third = FindWord(at1);
            thirddata = GetWhereInSentence(third);
            if (!thirddata) return 0; // not there
        }
        unsigned char* commonData = (unsigned char*)AllocateWhereInSentence(D);
        if (!commonData) return 0; // allocate failure
        memcpy(commonData, data, REFSTRIEDDATA_WORDSIZE * sizeof(int)); // starts with the base
        // keep common positions of this second word (and optionally third) with existing first
        for (int i = 0; i < MAXREFSENTENCE_BYTES; i += REF_ELEMENT_SIZE) // walk commondata
        {
            if (commonData[i] == END_OF_REFERENCES) break; // no more data in base
            bool found1 = false;
            bool found2 = false;
            for (int j = 0; j < MAXREFSENTENCE_BYTES; j += REF_ELEMENT_SIZE)
            {
                if (seconddata[j] == END_OF_REFERENCES) break; // end of this piece
                if (seconddata[j] == commonData[i]) // found here
                {
                    found1 = true;
                    break;
                }
            }
            if (thirddata) for (int j = 0; j < MAXREFSENTENCE_BYTES; j += REF_ELEMENT_SIZE)
            {
                if (thirddata[j] == END_OF_REFERENCES) break; // end of this piece
                if (thirddata[j] == commonData[i]) // found here
                {
                    found2 = true;
                    break;
                }
            }
            else found2 = true; // dont need a third
            if (!found1 || !found2) // our common data is not common to all
            {
                memmove(commonData + i, commonData + i + REF_ELEMENT_SIZE, (MAXREFSENTENCE_BYTES - i - REF_ELEMENT_SIZE));
                // The shift leaves a stale duplicate in the final slot. When the table was
                // completely full that duplicate would never go away and this loop could
                // spin forever, so terminate the table explicitly (as RemoveMatchValue does).
                commonData[MAXREFSENTENCE_BYTES - REF_ELEMENT_SIZE] = END_OF_REFERENCES;
                i -= REF_ELEMENT_SIZE; // re-examine the entry that slid into this slot
            }
        }
        if (commonData[0] == END_OF_REFERENCES) return 0; // nothing in common
        data = commonData; // the common data of word and concept
    }
    return data;
}
// Finds where D is next found in the sentence relative to position start.
//   D        - word or concept to look for; NULL yields 0
//   start    - reference position; forward search wants a match beginning after it,
//              reverse search wants the last match beginning before it
//   reverse  - true to search backward
//   legalgap - when nonzero, the maximum allowed distance from start (forward marked-data
//              search and both directions of the unmarked fallback; the reverse marked-data
//              path below does not test it)
//   hitdata  - optional; receives start, end (with the reference's extra byte in bits 8..15),
//              the exact word MEANING, and the "disjoint" prefix value if the preceding
//              table entry is a prefix entry (start byte 0)
// Returns the start position of the match, or 0 when there is none.
unsigned int GetNextSpot(WORDP D, int start, bool reverse, unsigned int legalgap,MARKDATA* hitdata)
{// spot can be 1-31, range can be 0-7 -- 7 means its a string, set last marker back before start so can rescan
// BUG - we should note if match is literal or canonical, so can handle that easily during match eg
// '~shapes matches square but not squares (whereas currently literal fails because it is not ~shapes)
if (!D) return 0; // not in sentence
unsigned char* data = GetWhereInSentence(D);
if (!data)
{
// D itself was never marked: it may be a composite name whose pieces were
const char* at = strchr(D->word + 1, '~');
if (at && at[2]) data = DataIntersect(D); // word with ~casemarking data added, has no data on its own, not trial~n or trial~1 - or concept intersect: ~pet~tasty - but BUG for trial~12
if (!concepts[1]) // marking has not happened, we are in input substitution mode
{
// fallback: compare D's text case-insensitively against the raw sentence words
char* find = D->word;
unsigned int separation = 0;
if (!reverse)
{
for (unsigned int at = start+1; at <= wordCount; ++at)
{
if (unmarked[at]) continue;
// only words that are not unmarked count toward the gap
++separation;
if (legalgap && separation > legalgap) return 0;
if (!stricmp(wordStarts[at], find))
{
if (hitdata)
{
hitdata->start = at;
hitdata->end = at;
hitdata->word = (int)MakeMeaning(D);
hitdata->disjoint = 0;
}
return at;
}
}
return 0;
}
else // reverse
{
for (int at = start-1; at > 0; --at)
{
if (unmarked[at]) continue;
++separation;
if (legalgap && separation > legalgap) return 0;
if (!stricmp(wordStarts[at], find))
{
if (hitdata)
{
hitdata->start = at;
hitdata->end = at;
hitdata->word = (int)MakeMeaning(D);
hitdata->disjoint = 0;
}
return at;
}
}
return 0;
}
}
}
if (!data) return 0;
// now perform the real analysis from marked data
if (hitdata) hitdata->word = 0;
int i;
int startPosition = 0;
for (i = 0; i < MAXREFSENTENCE_BYTES; i += REF_ELEMENT_SIZE) // each 8 byte ref is start,end,extra,unused, 4byte exact,
{
unsigned char at = data[i];
if (!at) continue; // skip over disjoint data
if (at == 0xff) break; // end of data
unsigned char end = data[i + 1];
// unmarked[0] acts as the "some word is unmarked" flag; only then is the span scanned
bool unmarkedWords = false;
if (unmarked[0]) for (int j = at; j <= end; ++j)
{
if (unmarked[j])
{
unmarkedWords = true;
break;
}
}
// a reference touching any unmarked word is ignored
if (unmarkedWords) { ; }
else if (reverse)
{
if (at < start) // valid. but starts far from where we are
{
// NOTE(review): legalgap is not tested on this reverse path - confirm that is intended
startPosition = at; // bug fix backward gaps as well
if (hitdata)
{
unsigned char fundamentalExtra = data[i + 2]; // usually 0, except for fundamental meaning matches
hitdata->start = at;
hitdata->end = end | (fundamentalExtra << 8); // hidden subject data for fundamental meanings
MEANING* exact = (MEANING*)(data + i + 4);
hitdata->word = (int)*exact;
if (i > 0 && !data[i - REF_ELEMENT_SIZE]) hitdata->disjoint = data[i - REF_ELEMENT_SIZE + 1];// disjoint data
else hitdata->disjoint = 0;
}
continue; // find the CLOSEST without going over
}
else if (at >= start) break; // getting worse
}
else if (at > start) // scanning forward
{
// the first reference past start decides the outcome, whether accepted or too far
if (legalgap && (at - start) > legalgap) startPosition = 0; // too far away and optional
else
{
startPosition = at;
if (hitdata)
{
hitdata->start = at;
unsigned char fundamentalExtra = data[i + 2]; // usually 0, except for fundamental meaning matches
hitdata->end = end | (fundamentalExtra << 8); // hidden subject data for fundamental meanings
MEANING* exact = (MEANING*)(data + i + 4);
MEANING MM = *exact;
// DD is not used afterwards
WORDP DD = Meaning2Word(MM);
hitdata->word = (int) MM;
if (i > 0 && !data[i - REF_ELEMENT_SIZE]) hitdata->disjoint = data[i - REF_ELEMENT_SIZE + 1];// disjoint data
else hitdata->disjoint = 0;
}
}
break;
}
}
return startPosition; // we have a closest or we dont
}
/**************************************/
/* End of Word Reference in Sentence system */
/**************************************/
// When hierarchy tracing is active, writes fact F to the user log, followed by
// msg in parentheses when msg is non-NULL. globalDepth is temporarily forced
// to 5 for the log call and restored afterwards.
static void TraceHierarchy(FACT* F,char* msg)
{
    if (!(TraceHierarchyTest(trace))) return; // tracing is off

    char* buffer = AllocateBuffer();
    char* text = WriteFact(F, false, buffer); // just so we can see it

    unsigned int savedDepth = globalDepth;
    globalDepth = 4 + 1;
    if (msg) Log(USERLOG,"%s (%s)\r\n", text,msg);
    else Log(USERLOG,"%s\r\n", text);
    globalDepth = savedDepth;

    FreeBuffer();
}
// Queues fact F together with the sentence span start..end at the head of
// pendingConceptList for later review by ProcessPendingConcepts, and traces
// the fact as "delayed".
static void AddPendingConcept(FACT* F, unsigned int start, unsigned int end)
{
    uint64 factIndex = (uint64)Fact2Index(F);
    pendingConceptList = AllocateHeapval(HV1_FACTI|HV2_INT|HV3_INT,pendingConceptList, factIndex, start, end);
    TraceHierarchy(F,"delayed");
}
// Evaluates one pending fact (subject ... object) for the span start..end.
// Returns true when the object is already marked at start reaching at least
// end, or when the subject is marked on exactly start..end (the fact is then
// traced). Returns false when neither holds, i.e. the outcome is still unknown.
static bool ProcessPendingFact(FACT* F, unsigned int start, unsigned int end)
{
    WORDP target = Meaning2Word(F->object);
    if (WhereWordHit(target, start) >= (int)end) return true; // already marked this set

    // FACT wanted to map subject to concept object, but it had an exclude on a set that needed completion
    WORDP excluded = Meaning2Word(F->subject);
    MARKDATA hit;
    if (!GetNextSpot(excluded, start - 1,false,0,&hit)) return false; // unmarked. we dont know
    if (hit.start != start || hit.end != end) return false; // marked, but not on this exact span

    TraceHierarchy(F,"");
    return true; // not allowed to proceed
}
// Settles the concept memberships that MarkSetPath postponed (via AddPendingConcept)
// because the concept excludes another concept whose own marking was not yet known.
// Phase 1 sweeps the pending list repeatedly until a full sweep changes nothing;
// phase 2 drains the list, marking every entry whose fact slot was not zeroed in phase 1.
static void ProcessPendingConcepts()
{
// (~setmember exclude ~set)
// has subject been marked at this position, if so, we cannot trigger concept for this position
if (!pendingConceptList) return;
HEAPREF startList = pendingConceptList; // F start|end
HEAPREF begin = startList;
bool changed = false;
while (1)
{
if (!startList) // we will be cycling list trying to get changes until no list or no changes
{
if (!changed) break; // no improvement
// something changed during the last sweep: go around again from the head
startList = begin;
changed = false;
}
uint64 start;
uint64 end;
// keep a raw view of this list node so its fact slot can be zeroed below
uint64* currentEntry = (uint64*) startList;
uint64 Fx;
startList = UnpackHeapval(startList, Fx, start,end);
FACT* F = (FACT*)Index2Fact((unsigned int)Fx);
WORDP concept = NULL; // set we want to trigger
// walk the consecutive exclude facts beginning at F
while (F) // will be NULL if we have already finished with it
{
// expect exit here because writing a concept with NO members is nuts
if (F->verb != Mexclude) break; // ran out of set restrictions
// before we can trigger this set membership
concept = Meaning2Word(F->object);
if (ProcessPendingFact(F, (unsigned int)start, (unsigned int)end)) // failed
{
// NOTE(review): assumes element 1 of the list node holds the fact index - confirm against AllocateHeapval
currentEntry[1] = 0; // kill fact use
changed = true;
WORDP S = Meaning2Word(F->subject);
if (*S->word != '~') break; // specific words are a hard exclusion
}
F = GetObjectNondeadNext(F);
}
// now flow path of this set upwards since all excludes have been considered
// only attempted while no change has been recorded in the current sweep
if (!changed && F && F->verb != Mexclude)
{
TraceHierarchy(F,"resume");
// concept is still NULL if no exclude fact was walked; MarkWordHit rejects a NULL entry
if (MarkWordHit(4, EXACTNOTSET, concept, 0, (int)start, (int)end)) // new ref added
{
if (MarkSetPath(4 + 1, EXACTNOTSET, F->object, (int)start, (int)end, 4 + 1, FIXED) != -1) changed = true; // someone marked
}
}
}
// activate anything not already deactivated now
while (pendingConceptList) // one entry per pending set that had excludes
{
// exact and canonical are not used below
bool exact = false;
bool canonical = false;
uint64 Fx;
uint64 start;
uint64 end;
pendingConceptList = UnpackHeapval(pendingConceptList, Fx,start,end);
// a zero fact slot means phase 1 cancelled this entry
if (!Fx) continue;
FACT* F = (FACT*)Index2Fact((unsigned int)Fx);
WORDP E = (F) ? Meaning2Word(F->object) : NULL;
// mark all members of the link
TraceHierarchy(F,"defaulting");
// MarkWordHit returns false for a NULL E, so F->object is only read when F is valid
if (MarkWordHit(4, EXACTNOTSET, E, 0, (int)start, (int)end)) // new ref added
{
MarkSetPath(4 + 1, EXACTNOTSET, F->object, (int)start, (int)end, 4 + 1, FIXED);
}
}
}
static bool IsValidStart(int start)
{
if (start == 1) return true;
if (!(tokenControl & DO_INTERJECTION_SPLITTING))
{
// if not splitting interjections into their own sentence, then could be a valid start
// if all previous words are an interjection
WORDP D = FindWord("~interjections");
if (!D) return false;
int i = 0;
while (++i<start)
{
if (*wordStarts[i] == ',') { ; }
else if (*wordStarts[i] == '-') { ; }
else
{
int end = WhereWordHit(D,i);
if (end == 0 || end >= start) return false;
i = end;
}
}
return true;
}
return false;
}
// Converts the textual form of a concept member into a MEANING.
// Names starting with '~' are lowercased; anything else is burst with
// CONTRACTIONS and rejoined. Underscores in the result become spaces.
// A leading '' (two quotes) on word adds RAWCASE_ONLY to flags and a single
// leading ' adds ORIGINAL_ONLY; the quote characters are skipped before the
// name is stored AS_IS in the dictionary.
MEANING EncodeConceptMember(char* word,int& flags)
{
    char hold[MAX_WORD_SIZE];
    if (*word == '~') MakeLowerCopy(hold, word); // require concept name be lower case
    else strcpy(hold, JoinWords(BurstWord(word, CONTRACTIONS)));

    // change _ to spaces
    for (char* scan = hold; *scan; ++scan)
    {
        if (*scan == '_') *scan = ' ';
    }

    char* name = hold;
    if (*word == '\'')
    {
        if (word[1] == '\'') // 2 quotes mean case sensitive user typing
        {
            flags |= RAWCASE_ONLY;
            name += 2;
        }
        else // 1 quote means not canonical (allowing FIXED or RAW)
        {
            flags |= ORIGINAL_ONLY;
            ++name;
        }
    }
    return MakeMeaning(StoreWord(name,AS_IS));
}
// Propagates a mark upward through concept membership: for every "member" fact whose
// subject is M's word, decides whether the fact applies to the span start..end and, if so,
// marks the fact's object (MarkWordHit) and recurses on it.
//   depth     - trace indentation level, passed on to MarkWordHit
//   exactWord - exact-word value passed through to MarkWordHit
//   M         - meaning to start from; its type restriction limits the POS allowed (BASIC_POS if none)
//   start,end - sentence span being marked
//   level     - incremented on each recursion; not otherwise used in this function
//   kind      - compared against RAWCASE and CANONICAL to honour RAWCASE_ONLY / ORIGINAL_ONLY facts
// Returns 1 if some recursive call was made after a new mark, otherwise NOPROBLEM_BIT.
// NOTE(review): callers test the result against -1, but no path in this function returns -1.
static int MarkSetPath(int depth,int exactWord,MEANING M, unsigned int start, unsigned int end, unsigned int level, int kind) // walks set hierarchy
{// travels up concept/class sets only, though might start out on a synset node or a regular word
unsigned int flags = GETTYPERESTRICTION(M);
if (!flags) flags = BASIC_POS; // what POS we allow from Meaning
WORDP D = Meaning2Word(M);
int index = Meaning2Index(M); // always 0 for a synset or set
// check for any repeated accesses of this synset or set or word
// offset is computed but not used below
uint64 offset = 1ull << index;
int result = NOPROBLEM_BIT;
FACT* H = GetSubjectNondeadHead(D); // thisword/concept member y
while (H)
{
// advance H first so every 'continue' below moves to the next fact
FACT* F = H;
H = GetSubjectNondeadNext(H);
if (F->verb != Mmember) continue;
// C is not used below; 'concept' holds the same word
WORDP C = Meaning2Word(F->object);
// if (C->inferMark == inferMark) continue; // already scanned this concept
// ~concept members and word equivalent
if (trace & TRACE_HIERARCHY) TraceHierarchy(F,"");
WORDP concept = Meaning2Word(F->object);
if (concept->internalBits & OVERRIDE_CONCEPT) // override by ^testpattern, is this legal fact?
{
if (!(F->flags & OVERRIDE_MEMBER_FACT)) break; // pretend he and earlier facts doesnt exist
}
// if subject has type restriction, it must pass
unsigned int restrict = GETTYPERESTRICTION(F->subject );
if (!restrict && index) restrict = GETTYPERESTRICTION(GetMeaning(D,index)); // new (may be unneeded)
// reasons we cant use this fact
// for true interjection, END_ONLY can mean wordCount or next word is , or -
bool block = false;
if (kind != RAWCASE && F->flags & RAWCASE_ONLY) { block = true; } // incoming is not raw correctly cased words and must be
else if (kind == CANONICAL && F->flags & ORIGINAL_ONLY) { block = true; } // incoming is not original words and must be
else if (restrict && !(restrict & flags)) { block = true; } // type restriction in effect for this concept member
else if (F->flags & (START_ONLY | END_ONLY))
{
if (F->flags & START_ONLY && !IsValidStart(start)) { block = true; } // must begin the sentence
else if (F->flags & END_ONLY && end != wordCount && !(F->flags & START_ONLY)) { block = true; } // END_ONLY alone: must end the sentence
// both flags and a valid start: the span must reach the sentence end or be followed by , or -
else if ((F->flags & (START_ONLY | END_ONLY)) == (START_ONLY | END_ONLY) && IsValidStart(start))
{
if (end == wordCount) { ; }
else if (end > wordCount) block = true;
else if (*wordStarts[end + 1] == ',') { ; }
else if (*wordStarts[end + 1] == '-') { ; }
else block = true;
}
}
int mindex = Meaning2Index(F->subject);
// index meaning restriction (0 means all)
if (!block && index == mindex) // match generic or exact subject
{
bool mark = true;
// test for word not included in set
if (index)
{
unsigned int pos = GETTYPERESTRICTION(GetMeaning(Meaning2Word(F->subject), index));
if (!(flags & pos))
mark = false; // we cannot be that meaning because type is wrong
}
if (!mark)
{
if (trace & TRACE_HIERARCHY) TraceHierarchy(F, "");
}
// concept might not be concept if member is to a word, not a concept
else if (*concept->word == '~' && WhereWordHit(concept, start) >= end) mark = false; // already marked this set
else //does set has some members it does not want
{
// scan the facts that have this concept as object, looking for excludes
FACT* G = GetObjectNondeadHead(concept);
while (G)
{
// all simple excludes will be first
// all set excludes will be second
// actual values of set will be third
// User can defeat that by runtime addition of set members. Too bad for now.
// so technically we could free up the HAS_EXCLUDE bit
if (G->verb == Mexclude) // see if this is marked for this position, if so, DONT trigger topic
{
WORDP S = Meaning2Word(G->subject); // what to exclude
MARKDATA hitdata;
// need to test for original only as well - BUG
// excluded item is marked on exactly this span: the concept must not fire here
if (GetNextSpot(S,start-1,false,0,&hitdata) && hitdata.start == start && hitdata.end == end)
{
if (trace & TRACE_HIERARCHY) TraceHierarchy(F,"");
mark = false;
break;
}
// if we see a concept exclude, regular ones already passed
if (*S->word == '~') // concept exclusion - status unknown
{
AddPendingConcept(G, start, end); // must review later
mark = false; // dont mark now
break; // revisit all exclusions later
}
}
else if (G->verb == Mmember) // ran out of excludes
{
break;
}
G = GetObjectNondeadNext(G);
}
}
if (mark)
{
// only climb further when this mark was actually new
if (MarkWordHit(depth, exactWord, concept, index,start, end)) // new ref added
{
if (MarkSetPath(depth+1, exactWord, F->object, start, end, level + 1, kind) != -1) result = 1; // someone marked
}
}
}
// reached when the fact was blocked, or when index and mindex differ
else if (!index && mindex) // we are all meanings (limited by pos use) and he is a specific meaning
{
WORDP J = Meaning2Word(F->subject);
MEANING M1 = GetMeaning(J, mindex);
unsigned int pos = GETTYPERESTRICTION(M1);
if (flags & pos) // && start == end wont work if spanning multiple words revised due to "to fish" noun infinitive
{
if (MarkWordHit(depth, exactWord, Meaning2Word(F->object), Meaning2Index(F->object),start, end)) // new ref added
{
if (MarkSetPath(depth+1, exactWord, F->object, start, end, level + 1, kind) != -1) result = 1; // someone marked
}
}
}
}
return result;
}
// RiseUp: walk the wordnet hierarchy above a synset node.
//   M                - meaning to rise from; always a synset head. The SYNSET_MARKER bit is cleared before use.
//   depth, exactWord - passed through to MarkWordHit / MarkSetPath; depth grows by 1 per recursion.
//   start, end       - span being marked, passed through unchanged.
//   level            - passed to MarkSetPath; grows by 1 per recursion.
//   kind             - passed through unchanged.
// Returns nothing. Stops early when the meaning index exceeds MAX_MEANING, when M has no word,
// or when MarkSetPath reports (-1) that this path was already done.
static void RiseUp(int depth, int exactWord,MEANING M,unsigned int start, unsigned int end,unsigned int level,int kind)
{
	M &= -1 ^ SYNSET_MARKER; // drop the synset marker bit
	unsigned int index = Meaning2Index(M);
	if (index > MAX_MEANING) return; // cant use this
	WORDP D = Meaning2Word(M);
	if (!D) return;

	// some meaning is directly referenced? build "word~index" and mark it.
	// Bounded formatting: if the name would not fit in the buffer, skip this mark
	// rather than overrun the buffer or look up a truncated name.
	char word[MAX_WORD_SIZE];
	int written = snprintf(word, sizeof(word), "%s~%u", D->word, index);
	if (written > 0 && (size_t)written < sizeof(word))
	{
		// NOTE(review): the FindWord result is passed straight to MarkWordHit, as before;
		// presumably MarkWordHit tolerates a missing entry - confirm.
		MarkWordHit(depth, exactWord, FindWord(word), 0, start, end); // direct reference in a pattern
	}

	// now spread and rise up
	if (MarkSetPath(depth, exactWord, M, start, end, level, kind) == -1) return; // did the path already

	for (FACT* F = GetSubjectNondeadHead(D); F; F = GetSubjectNondeadNext(F))
	{
		if (F->verb != Mis) continue; // only "is" facts lead upward
		// generic meaning (index 0) follows every "is" fact; a specific meaning only its own
		if (index == 0 || F->subject == M)
		{
			RiseUp(depth + 1, exactWord, F->object, start, end, level + 1, kind); // allowed up
		}
	}
}
static void MarkAllMeaningAndImplications(int depth, MEANING M, int start, int end, int kind, bool sequence,bool once)
{ // M is always a word or sequence from a sentence
// but if uppercase, there may be multiple forms, so handle all
if (!M) return;
WORDP D = Meaning2Word(M);
unsigned int len = WORDLENGTH(D);
unsigned int hash = (D->hash % maxHashBuckets); // mod by the size of the table
int uindex = 0; // lowercase bucket
WORDP X = Index2Word(hashbuckets[hash + 1]); // look in uppercase bucket for this word
while (X && X != dictionaryBase) // all entries matching in upper case bucket
{
if (WORDLENGTH(D) != len) { ; }
else if (!IsValidLanguage(X)) { ; }
else if (!StricmpUTF(D->word, X->word, len))
{
MEANING M1 = M;
MEANING windex = MakeMeaning(X);
if ((M & MEANING_BASE) != windex) // alternate spelling to what was passed in
{
M1 = windex | (M & TYPE_RESTRICTION); // no idea what meaning, go generic
}
MarkMeaningAndImplications(depth, windex, M1, start, end, kind, sequence, once);
}
X = Index2Word( GETNEXTNODE(X));
}
}
void MarkMeaningAndImplications(int depth, MEANING exactWord,MEANING M,int start, int end,int kind,bool sequence,bool once,int prefix)
{ // M is always a word or sequence from a sentence
if (!M) return;
WORDP D = Meaning2Word(M);
if (D->properties & NOUN_TITLE_OF_WORK && kind == CANONICAL) return; // accidental canonical match of a title. not intended
// We want to avoid wandering fact relationships for meanings we have already scanned.
if (!exactWord) // has not been defined yet how to treat this word (original vs canonical)
{
if (D->internalBits & UPPERCASE_HASH) MarkAllMeaningAndImplications(depth, M, start, end, kind, sequence, once);
else if (kind == CANONICAL) exactWord = (MEANING)-1; // dont use concept word or lowercase word as the match
}
// We mark words/phrases and concepts and words/concepts implied by them.
// We mark words by meaning (63) + generic. They always have a fixed size match.
// We mark concepts by size match at a start position. You might match 1 word or several in a row.
// For match variable retrieval we want the longest match at a position.
// Because we can come in here with a general word with and without a type restriction,
// we have scan out from the word because we cant mark the different ways we scanned before,
int index = Meaning2Index(M);
//int whereHit = WhereWordHit(D, start);
unsigned int restrict = GETTYPERESTRICTION(M);
unsigned int size = GetMeaningCount(D);
if (size == 0)
{
M = MakeMeaning(D); // remove restriction
restrict = 0;
}
if (*D->word == '~')
{
//if (whereHit >= end) return; // already have best concept storage
}
else
{
//if (whereHit < end) SetTriedMeaning(D, 0); // found nothing at this index, insure nothing to start
}
// we mark word hit before using MarkSetPath, so that exclude is supported
// words we dont know we dont bother marking
if (!once || D->properties & (PART_OF_SPEECH | NOUN_TITLE_OF_WORK | NOUN_HUMAN) || D->systemFlags & PATTERN_WORD || *D->word == '~')
{
MarkWordHit(depth, exactWord, D, 0, start, end,0,(unsigned int) kind);
if (!failFired) MarkSetPath(depth + 2, exactWord, M, start, end, 0, kind); // generic membership of this word all the way to top
}
// we dont mark random junk discovered, only significant sequences
else if (sequence) // phrase is not a pattern word, maybe it goes to some concepts
{
FACT* F = GetSubjectNondeadHead(D);
while (F)
{
// mark sequence if someone cared about it
if (F->verb == Mmember) // ~concept members and word equivalent
{
MarkWordHit(depth, exactWord, D, 0, start, end); // we found something to relate to, so mark us
MarkSetPath(depth + 2, exactWord, M, start, end, 0, kind); // generic membership of this word all the way to top
break;
}
F = GetSubjectNext(F);
}
}
// check for POS restricted forms of this word
char word[MAX_WORD_SIZE];
if (*D->word != '~' && !once) // words, not concepts
{
if (restrict & NOUN && !(posValues[start] & NOUN_INFINITIVE)) // BUG- this wont work up the ontology, only at the root of what the script requests - doesnt accept "I like to *fish" as a noun, so wont refer to the animal
{
sprintf(word, (char*)"%s~n", D->word);
MarkWordHit(depth, exactWord, FindWord(word, 0, PRIMARY_CASE_ALLOWED), 0, start, end); // direct reference in a pattern
}
if ((restrict & VERB) || posValues[start] & NOUN_INFINITIVE)// accepts "I like to *swim as not a verb meaning"
{
sprintf(word, (char*)"%s~v", D->word);
MarkWordHit(depth, exactWord, FindWord(word, 0, PRIMARY_CASE_ALLOWED), 0, start, end); // direct reference in a pattern
}
if (restrict & ADJECTIVE) // and adverb
{
sprintf(word, (char*)"%s~a", D->word);
MarkWordHit(depth, exactWord, FindWord(word, 0, PRIMARY_CASE_ALLOWED), 0, start, end); // direct reference in a pattern
}
}
// now follow out the allowed synset hierarchies
if (!restrict)