Очень простой парсер

Предположим, что мы хотим создать свой парсер, который выделяет только один тип токена — слово (3, word, Word), и подключить его к полнотекстовому поиску. Для этого нам понадобится ещё один тип токена — разделитель (12, blank, Space symbols). Идентификаторы типов (3 и 12) выбраны так, чтобы они совпадали с идентификаторами стандартного парсера: это позволяет использовать стандартную функцию headline.

Поместите файлы test_parser.c, Makefile и test_parser.sql.in в директорию contrib/test_parser, затем соберите парсер и загрузите его в базу данных (в данном примере — regression).

make
make install
psql regression < test_parser.sql

Мы создали тестовую FTS конфигурацию testcfg, для которой определен парсер testparser.

Для написания своего парсера необходимо разработать как минимум 4 функции, см. SQL-команду CREATE FULLTEXT PARSER.

=# SELECT * FROM parse('testparser','That''s my first own parser');
 tokid | token
-------+--------
     3 | That's
    12 |
     3 | my
    12 |
     3 | first
    12 |
     3 | own
    12 |
     3 | parser

=# SELECT to_tsvector('testcfg','That''s my first own parser');
                   to_tsvector
-------------------------------------------------
 'my':2 'own':4 'first':3 'parser':5 'that''s':1

=# SELECT headline('testcfg','Supernovae stars are the brightest phenomena in galaxies', to_tsquery('testcfg', 'star'));
                            headline
-----------------------------------------------------------------
 Supernovae <b>stars</b> are the brightest phenomena in galaxies

Файл test_parser.c

/*
 * Emit the module "magic block" when the server headers provide it.
 *
 * NOTE(review): no #include lines are visible in this listing.  The code
 * below uses palloc/pfree/pstrdup and the PG_* function-manager macros, so
 * the file presumably has to start with
 *     #include "postgres.h"
 *     #include "fmgr.h"
 * -- confirm against the original test_parser.c.
 */
#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;
#endif

/* * types */

/* self-defined type */ typedef struct { char * buffer; /* text to parse */ int len; /* length of the text in buffer */ int pos; /* position of the parser */ } ParserState;

/* copy-paste from wparser.h of tsearch2 */ typedef struct { int lexid; char *alias; char *descr; } LexDescr;

/*
 * prototypes
 *
 * Each of the four parser entry points is announced with
 * PG_FUNCTION_INFO_V1 and declared with the PG_FUNCTION_ARGS signature.
 */

/* creates the ParserState for a text buffer */
PG_FUNCTION_INFO_V1(testprs_start);
Datum		testprs_start(PG_FUNCTION_ARGS);

/* returns the next token and its type id */
PG_FUNCTION_INFO_V1(testprs_getlexeme);
Datum		testprs_getlexeme(PG_FUNCTION_ARGS);

/* frees the ParserState */
PG_FUNCTION_INFO_V1(testprs_end);
Datum		testprs_end(PG_FUNCTION_ARGS);

/* returns the table of token types known to this parser */
PG_FUNCTION_INFO_V1(testprs_lextype);
Datum		testprs_lextype(PG_FUNCTION_ARGS);

/*
 * functions
 */

/*
 * testprs_start
 *
 * Set up a fresh parser state for one input text.
 *
 * Argument 0 is a pointer to the text, argument 1 its length.  The text is
 * not copied; the state only keeps the pointer.  Returns the newly
 * palloc'ed ParserState, positioned at offset 0.
 */
Datum
testprs_start(PG_FUNCTION_ARGS)
{
	ParserState *state;

	state = (ParserState *) palloc(sizeof(ParserState));

	/* start scanning at the very beginning of the caller's buffer */
	state->pos = 0;
	state->len = PG_GETARG_INT32(1);
	state->buffer = (char *) PG_GETARG_POINTER(0);

	PG_RETURN_POINTER(state);
}

/*
 * testprs_getlexeme
 *
 * Return the next token of the text being parsed.
 *
 * Argument 0 is the ParserState, argument 1 receives a pointer to the start
 * of the token inside the original buffer (no copy is made), argument 2
 * receives the token length.
 *
 * The return value is the token type: 12 for a run of spaces, 3 for a run
 * of non-space characters, or 0 when the end of the text has been reached
 * (in which case the reported length is 0).
 */
Datum
testprs_getlexeme(PG_FUNCTION_ARGS)
{
	ParserState *pst = (ParserState *) PG_GETARG_POINTER(0);
	char	  **t = (char **) PG_GETARG_POINTER(1);
	int		   *tlen = (int *) PG_GETARG_POINTER(2);
	int			startpos = pst->pos;
	int			type;

	*t = pst->buffer + startpos;

	/*
	 * Always test pos < len BEFORE touching buffer[pos]: the buffer comes
	 * with an explicit length, so buffer[len] must not be read.
	 */
	if (pst->pos < pst->len && pst->buffer[pst->pos] == ' ')
	{
		/* blank type */
		type = 12;
		/* go to the next non-white-space character */
		while (pst->pos < pst->len && pst->buffer[pst->pos] == ' ')
			pst->pos++;
	}
	else
	{
		/* word type */
		type = 3;
		/* go to the next white-space character */
		while (pst->pos < pst->len && pst->buffer[pst->pos] != ' ')
			pst->pos++;
	}

	*tlen = pst->pos - startpos;

	/* nothing consumed means we are at the end of the text: we are finished */
	if (*tlen == 0)
		type = 0;

	PG_RETURN_INT32(type);
}

/*
 * testprs_end
 *
 * Release the ParserState (argument 0) allocated by testprs_start.  The
 * text buffer itself is not freed here.
 */
Datum
testprs_end(PG_FUNCTION_ARGS)
{
	ParserState *pst = (ParserState *) PG_GETARG_POINTER(0);

	pfree(pst);
	PG_RETURN_VOID();
}

Datum testprs_lextype(PG_FUNCTION_ARGS) { /* Remarks: - we have to return the blanks for headline reason - we use the same lexids like Teodor in the default word parser; in this way we can reuse the headline function of the default word parser. */ LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (2+1));

/* there are only two types in this parser */ descr[0].lexid = 3; descr[0].alias = pstrdup("word"); descr[0].descr = pstrdup("Word"); descr[1].lexid = 12; descr[1].alias = pstrdup("blank"); descr[1].descr = pstrdup("Space symbols"); descr[2].lexid = 0;

PG_RETURN_POINTER(descr); }

Файл Makefile

# Look for headers in the module's own directory first.
override CPPFLAGS := -I. $(CPPFLAGS)

# The module is built from a single object file.
MODULE_big = test_parser
OBJS = test_parser.o

# NOTE(review): test_parser.sql is presumably generated from
# test_parser.sql.in during the build -- confirm against the PGXS docs.
DATA_built = test_parser.sql
DATA =
DOCS = README.test_parser
REGRESS = test_parser

# With USE_PGXS set, the build rules come from the installed PostgreSQL
# (located through pg_config); otherwise the module is built inside the
# source tree as contrib/test_parser.
ifdef USE_PGXS
PGXS := $(shell pg_config --pgxs)
include $(PGXS)
else
subdir = contrib/test_parser
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
include $(top_srcdir)/contrib/contrib-global.mk
endif

Файл test_parser.sql.in

-- Installs the test parser: the four C entry points, the parser built from
-- them, and a configuration (testcfg) that uses it.
SET search_path = public;

BEGIN;

-- Start function: takes the text buffer and its length.
CREATE FUNCTION testprs_start(internal,int4)
    RETURNS internal
    AS 'MODULE_PATHNAME'
    LANGUAGE 'C' with (isstrict);

-- Token function: takes the parser state plus two output pointers
-- (token start, token length).
CREATE FUNCTION testprs_getlexeme(internal,internal,internal)
    RETURNS internal
    AS 'MODULE_PATHNAME'
    LANGUAGE 'C' with (isstrict);

-- End function: releases the parser state.
CREATE FUNCTION testprs_end(internal)
    RETURNS void
    AS 'MODULE_PATHNAME'
    LANGUAGE 'C' with (isstrict);

-- Returns the table of token types (word, blank).
CREATE FUNCTION testprs_lextype(internal)
    RETURNS internal
    AS 'MODULE_PATHNAME'
    LANGUAGE 'C' with (isstrict);

-- Tie the four functions together into a parser.
CREATE FULLTEXT PARSER testparser
    START 'testprs_start'
    GETTOKEN 'testprs_getlexeme'
    END 'testprs_end'
    LEXTYPES 'testprs_lextype'
;

-- Test configuration using the new parser; only tokens of type "word" are
-- mapped (to the simple dictionary), no mapping is defined for "blank".
CREATE FULLTEXT CONFIGURATION testcfg PARSER 'testparser' LOCALE NULL;
CREATE FULLTEXT MAPPING ON testcfg FOR word WITH simple;

END;

Содержание раздела