From 683cfe06150e0be16f61fd5f9628c48e4535d894 Mon Sep 17 00:00:00 2001 From: delthas Date: Mon, 21 Feb 2022 19:44:56 +0100 Subject: [PATCH] Add support for the SEARCH extension --- doc/ext/search.md | 104 ++++++++++++++++++++++++++++++++++++++++++++++ downstream.go | 87 ++++++++++++++++++++++++++++++++++++++ msgstore.go | 18 ++++++++ msgstore_fs.go | 71 ++++++++++++++++++++++++------- 4 files changed, 265 insertions(+), 15 deletions(-) create mode 100644 doc/ext/search.md diff --git a/doc/ext/search.md b/doc/ext/search.md new file mode 100644 index 0000000..cf45504 --- /dev/null +++ b/doc/ext/search.md @@ -0,0 +1,104 @@ +# search + +This is a work-in-progress specification. + +## Description + +This document describes the format of the `search` extension. This enables clients to run a server-side search of messages according to specified selectors. + +This specification lets clients run an efficient search query on a bouncer or server that has quick access to the client message history, instead of having to download all logs and run the search locally. + +The server as mentioned in this document may refer to either an IRC server or an IRC bouncer. + +## Implementation + +The `search` extension uses the `soju.im/search` capability and introduces a new command, `SEARCH`, and batch type, `soju.im/search`. + +Full support for this extension requires support for the batch, server-time and message-tags capabilities. However, limited functionality is available to clients without support for these CAPs. Servers SHOULD NOT enforce that clients support all related capabilities before using the search extension. + +The `soju.im/search` capability MUST be negotiated. + +### `SEARCH` Command + +The client can request a message search by sending the `SEARCH` command to the server. This command has the following general syntax: + + SEARCH <attributes> + +`<attributes>` is a semicolon-separated list of `name=value` pairs, using the same syntax as message tags. If the batch capability was negotiated, the server MUST reply to a successful SEARCH command using a batch with batch type `soju.im/search`. 
If no content exists to return, the server SHOULD return an empty batch in order to avoid the client waiting for a reply. + +The server then replies with a batch of batch type `soju.im/search` containing messages matching all the specified attributes. These messages MUST be `PRIVMSG` or `NOTICE` messages. + +### Returned message notes + +The order of returned messages within the batch is implementation-defined, but SHOULD be ascending time order or some approximation thereof, regardless of the attributes used. The server-time tag on each message SHOULD be the time at which the message was received by the IRC server. When provided, the msgid tag that identifies each individual message in a response MUST be the msgid tag as originally sent by the IRC server. + +Servers SHOULD provide clients with a consistent message order that is valid across the lifetime of a single connection, and which deterministically orders any two messages (even if they share a timestamp). This order SHOULD coincide with the order in which messages are returned within a response batch. It need not coincide with the delivery order of messages when they were relayed on any particular server. + +#### Errors and Warnings + +Errors are returned using the standard replies syntax. + +If the selectors were invalid, the `INVALID_PARAMS` error code SHOULD be returned. + + FAIL SEARCH INVALID_PARAMS [invalid_parameters] :Invalid parameters + +If the search cannot be run due to an internal error, the `INTERNAL_ERROR` error code SHOULD be returned. + + FAIL SEARCH INTERNAL_ERROR [extra_context] :The search could not be run + +### Standard search attributes + +Servers MUST recognise the following attributes. + +The following attributes are considered a match when: +* `in`: the message was sent to this target (channel or user). +* `from`: the message was sent with this nick. +* `after`: the message was sent at or after this time (same format as the `server-time` specification). 
+* `before`: the message was sent at or before this time (same format as the `server-time` specification). +* `text`: the message text matches the specified text. The actual algorithm used for matching the text is implementation defined. + +If `after` is specified, messages SHOULD be searched from that time. Otherwise, messages SHOULD be searched from the `before` time, which defaults to the current server time. + +Additionally, the following attributes MUST be recognized: +* `limit`: a number representing an upper bound on the count of messages to return. The server MAY return fewer messages than this number. + +### Examples + +Searching messages sent by `jackie` in `#chan` +~~~~ +[c] SEARCH from=jackie;in=#chan +[s] :irc.host BATCH +ID soju.im/search +[s] @batch=ID;msgid=1234;time=2019-01-04T14:33:26.123Z :jackie!indent@host PRIVMSG #chan :Be what you want +[s] @batch=ID;msgid=1234;time=2019-01-04T14:35:26.123Z :jackie!indent@host PRIVMSG #chan :Want what you be +[s] :irc.host BATCH -ID +~~~~ + +Searching messages matching the text `fast` in `#chan`, returning up to 2 messages +~~~~ +[c] SEARCH text=fast;in=#chan;limit=2 +[s] :irc.host BATCH +ID soju.im/search +[s] @batch=ID;msgid=1234;time=2019-01-04T14:33:26.123Z :bill!indent@host PRIVMSG #chan :That was fast! +[s] @batch=ID;msgid=1234;time=2019-01-04T14:35:26.123Z :jackie!indent@host PRIVMSG #chan :Fasting is hard. +[s] :irc.host BATCH -ID +~~~~ + +Searching messages when none match +~~~~ +[c] SEARCH before=2010-01-01T00:00:00.000Z;in=#chan +[s] :irc.host BATCH +ID soju.im/search +[s] :irc.host BATCH -ID +~~~~ + +## Use Cases + +Clients can run a fast server-side search across months of history and channels without having to download all their logs and run the search locally. + +This enables client interfaces to provide a search feature with quick matches. Additional context can be fetched thanks to the separate `CHATHISTORY` extension. 
+ +## Implementation Considerations + +Server implementations may use different algorithms for matching messages against the specified `text`. Some implementations may choose to match by substrings, by whole words, or by other algorithms such as what is offered by their database (e.g. SQLite full-text search). The comparison may be case-insensitive or case-sensitive. + +## Security Considerations + +Processing logs can be slow, and arbitrary regular expressions can take a virtually infinite amount of time when maliciously crafted, even on small input sizes. Servers offering this feature should implement a timeout on their total request time, including regular expression compile time, as well as message fetching, parsing and selecting. diff --git a/downstream.go b/downstream.go index eae4cdd..8f6ef2c 100644 --- a/downstream.go +++ b/downstream.go @@ -361,6 +361,7 @@ func newDownstreamConn(srv *Server, ic ircConn, id uint64) *downstreamConn { // chatHistoryMessageStore if srv.Config().LogPath != "" { dc.caps.Available["draft/chathistory"] = "" + dc.caps.Available["soju.im/search"] = "" } return dc } @@ -2961,6 +2962,92 @@ func (dc *downstreamConn) handleMessageRegistered(ctx context.Context, msg *irc. 
}) } }) + case "SEARCH": + store, ok := dc.user.msgStore.(searchMessageStore) + if !ok { + return ircError{&irc.Message{ + Command: irc.ERR_UNKNOWNCOMMAND, + Params: []string{dc.nick, "SEARCH", "Unknown command"}, + }} + } + var attrsStr string + if err := parseMessageParams(msg, &attrsStr); err != nil { + return err + } + attrs := irc.ParseTags(attrsStr) + + var uc *upstreamConn + const searchMaxLimit = 100 + opts := searchOptions{ + limit: searchMaxLimit, + } + for name, v := range attrs { + value := string(v) + switch name { + case "before", "after": + timestamp, err := time.Parse(serverTimeLayout, value) + if err != nil { + return ircError{&irc.Message{ + Command: "FAIL", + Params: []string{"SEARCH", "INVALID_PARAMS", name, "Invalid criteria"}, + }} + } + switch name { + case "after": + opts.start = timestamp + case "before": + opts.end = timestamp + } + case "from": + opts.from = value + case "in": + u, upstreamName, err := dc.unmarshalEntity(value) + if err != nil { + return ircError{&irc.Message{ + Command: "FAIL", + Params: []string{"SEARCH", "INVALID_PARAMS", name, "Invalid criteria"}, + }} + } + uc = u + opts.in = u.network.casemap(upstreamName) + case "text": + opts.text = value + case "limit": + limit, err := strconv.Atoi(value) + if err != nil || limit <= 0 { + return ircError{&irc.Message{ + Command: "FAIL", + Params: []string{"SEARCH", "INVALID_PARAMS", name, "Invalid limit"}, + }} + } + opts.limit = limit + } + } + if uc == nil { + return ircError{&irc.Message{ + Command: "FAIL", + Params: []string{"SEARCH", "INVALID_PARAMS", "in", "The in parameter is mandatory"}, + }} + } + if opts.limit > searchMaxLimit { + opts.limit = searchMaxLimit + } + + messages, err := store.Search(ctx, &uc.network.Network, opts) + if err != nil { + dc.logger.Printf("failed fetching messages for search: %v", err) + return ircError{&irc.Message{ + Command: "FAIL", + Params: []string{"SEARCH", "INTERNAL_ERROR", "Messages could not be retrieved"}, + }} + } + + 
dc.SendBatch("soju.im/search", nil, nil, func(batchRef irc.TagValue) { + for _, msg := range messages { + msg.Tags["batch"] = batchRef + dc.SendMessage(dc.marshalMessage(msg, uc.network)) + } + }) case "BOUNCER": var subcommand string if err := parseMessageParams(msg, &subcommand); err != nil { diff --git a/msgstore.go b/msgstore.go index deb63b4..dced6d9 100644 --- a/msgstore.go +++ b/msgstore.go @@ -51,6 +51,24 @@ type chatHistoryMessageStore interface { LoadAfterTime(ctx context.Context, network *Network, entity string, start, end time.Time, limit int, events bool) ([]*irc.Message, error) } +type searchOptions struct { + start time.Time + end time.Time + limit int + from string + in string + text string +} + +// searchMessageStore is a message store that supports server-side search +// operations. +type searchMessageStore interface { + messageStore + + // Search returns messages matching the specified options. + Search(ctx context.Context, network *Network, search searchOptions) ([]*irc.Message, error) +} + type msgIDType uint const ( diff --git a/msgstore_fs.go b/msgstore_fs.go index b7bd850..14a79cf 100644 --- a/msgstore_fs.go +++ b/msgstore_fs.go @@ -88,6 +88,7 @@ type fsMessageStore struct { var _ messageStore = (*fsMessageStore)(nil) var _ chatHistoryMessageStore = (*fsMessageStore)(nil) +var _ searchMessageStore = (*fsMessageStore)(nil) func newFSMessageStore(root string, user *User) *fsMessageStore { return &fsMessageStore{ @@ -398,7 +399,7 @@ func (ms *fsMessageStore) parseMessage(line string, network *Network, entity str return msg, t, nil } -func (ms *fsMessageStore) parseMessagesBefore(network *Network, entity string, ref time.Time, end time.Time, events bool, limit int, afterOffset int64) ([]*irc.Message, error) { +func (ms *fsMessageStore) parseMessagesBefore(network *Network, entity string, ref time.Time, end time.Time, events bool, limit int, afterOffset int64, selector func(m *irc.Message) bool) ([]*irc.Message, error) { path := 
ms.logPath(network, entity, ref) f, err := os.Open(path) if err != nil { @@ -430,6 +431,9 @@ func (ms *fsMessageStore) parseMessagesBefore(network *Network, entity string, r } else if !t.Before(ref) { break } + if selector != nil && !selector(msg) { + continue + } historyRing[cur%limit] = msg cur++ @@ -454,7 +458,7 @@ func (ms *fsMessageStore) parseMessagesBefore(network *Network, entity string, r } } -func (ms *fsMessageStore) parseMessagesAfter(network *Network, entity string, ref time.Time, end time.Time, events bool, limit int) ([]*irc.Message, error) { +func (ms *fsMessageStore) parseMessagesAfter(network *Network, entity string, ref time.Time, end time.Time, events bool, limit int, selector func(m *irc.Message) bool) ([]*irc.Message, error) { path := ms.logPath(network, entity, ref) f, err := os.Open(path) if err != nil { @@ -476,6 +480,9 @@ func (ms *fsMessageStore) parseMessagesAfter(network *Network, entity string, re } else if !t.Before(end) { break } + if selector != nil && !selector(msg) { + continue + } history = append(history, msg) } @@ -486,14 +493,18 @@ func (ms *fsMessageStore) parseMessagesAfter(network *Network, entity string, re return history, nil } -func (ms *fsMessageStore) LoadBeforeTime(ctx context.Context, network *Network, entity string, start time.Time, end time.Time, limit int, events bool) ([]*irc.Message, error) { - start = start.In(time.Local) +func (ms *fsMessageStore) getBeforeTime(ctx context.Context, network *Network, entity string, start time.Time, end time.Time, limit int, events bool, selector func(m *irc.Message) bool) ([]*irc.Message, error) { + if start.IsZero() { + start = time.Now() + } else { + start = start.In(time.Local) + } end = end.In(time.Local) - history := make([]*irc.Message, limit) + messages := make([]*irc.Message, limit) remaining := limit tries := 0 for remaining > 0 && tries < fsMessageStoreMaxTries && end.Before(start) { - buf, err := ms.parseMessagesBefore(network, entity, start, end, events, remaining, 
-1) + buf, err := ms.parseMessagesBefore(network, entity, start, end, events, remaining, -1, selector) if err != nil { return nil, err } @@ -502,7 +513,7 @@ func (ms *fsMessageStore) LoadBeforeTime(ctx context.Context, network *Network, } else { tries = 0 } - copy(history[remaining-len(buf):], buf) + copy(messages[remaining-len(buf):], buf) remaining -= len(buf) year, month, day := start.Date() start = time.Date(year, month, day, 0, 0, 0, 0, start.Location()).Add(-1) @@ -512,17 +523,25 @@ func (ms *fsMessageStore) LoadBeforeTime(ctx context.Context, network *Network, } } - return history[remaining:], nil + return messages[remaining:], nil } -func (ms *fsMessageStore) LoadAfterTime(ctx context.Context, network *Network, entity string, start time.Time, end time.Time, limit int, events bool) ([]*irc.Message, error) { +func (ms *fsMessageStore) LoadBeforeTime(ctx context.Context, network *Network, entity string, start time.Time, end time.Time, limit int, events bool) ([]*irc.Message, error) { + return ms.getBeforeTime(ctx, network, entity, start, end, limit, events, nil) +} + +func (ms *fsMessageStore) getAfterTime(ctx context.Context, network *Network, entity string, start time.Time, end time.Time, limit int, events bool, selector func(m *irc.Message) bool) ([]*irc.Message, error) { start = start.In(time.Local) - end = end.In(time.Local) - var history []*irc.Message + if end.IsZero() { + end = time.Now() + } else { + end = end.In(time.Local) + } + var messages []*irc.Message remaining := limit tries := 0 for remaining > 0 && tries < fsMessageStoreMaxTries && start.Before(end) { - buf, err := ms.parseMessagesAfter(network, entity, start, end, events, remaining) + buf, err := ms.parseMessagesAfter(network, entity, start, end, events, remaining, selector) if err != nil { return nil, err } @@ -531,7 +550,7 @@ func (ms *fsMessageStore) LoadAfterTime(ctx context.Context, network *Network, e } else { tries = 0 } - history = append(history, buf...) 
+ messages = append(messages, buf...) remaining -= len(buf) year, month, day := start.Date() start = time.Date(year, month, day+1, 0, 0, 0, 0, start.Location()) @@ -540,7 +559,11 @@ func (ms *fsMessageStore) LoadAfterTime(ctx context.Context, network *Network, e return nil, err } } - return history, nil + return messages, nil +} + +func (ms *fsMessageStore) LoadAfterTime(ctx context.Context, network *Network, entity string, start time.Time, end time.Time, limit int, events bool) ([]*irc.Message, error) { + return ms.getAfterTime(ctx, network, entity, start, end, limit, events, nil) } func (ms *fsMessageStore) LoadLatestID(ctx context.Context, network *Network, entity, id string, limit int) ([]*irc.Message, error) { @@ -569,7 +592,7 @@ func (ms *fsMessageStore) LoadLatestID(ctx context.Context, network *Network, en offset = afterOffset } - buf, err := ms.parseMessagesBefore(network, entity, t, time.Time{}, false, remaining, offset) + buf, err := ms.parseMessagesBefore(network, entity, t, time.Time{}, false, remaining, offset, nil) if err != nil { return nil, err } @@ -670,6 +693,24 @@ func (ms *fsMessageStore) ListTargets(ctx context.Context, network *Network, sta return targets, nil } +func (ms *fsMessageStore) Search(ctx context.Context, network *Network, opts searchOptions) ([]*irc.Message, error) { + text := strings.ToLower(opts.text) + selector := func(m *irc.Message) bool { + if opts.from != "" && m.User != opts.from { + return false + } + if text != "" && !strings.Contains(strings.ToLower(m.Params[1]), text) { + return false + } + return true + } + if !opts.start.IsZero() { + return ms.getAfterTime(ctx, network, opts.in, opts.start, opts.end, opts.limit, false, selector) + } else { + return ms.getBeforeTime(ctx, network, opts.in, opts.end, opts.start, opts.limit, false, selector) + } +} + func (ms *fsMessageStore) RenameNetwork(oldNet, newNet *Network) error { oldDir := filepath.Join(ms.root, escapeFilename(oldNet.GetName())) newDir := filepath.Join(ms.root, 
escapeFilename(newNet.GetName()))