uast.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533
  1. package hercules
  2. import (
  3. "bytes"
  4. "context"
  5. "errors"
  6. "fmt"
  7. "io"
  8. goioutil "io/ioutil"
  9. "os"
  10. "path"
  11. "runtime"
  12. "strings"
  13. "sync"
  14. "time"
  15. "github.com/gogo/protobuf/proto"
  16. "github.com/jeffail/tunny"
  17. "gopkg.in/bblfsh/client-go.v2"
  18. "gopkg.in/bblfsh/sdk.v1/protocol"
  19. "gopkg.in/bblfsh/sdk.v1/uast"
  20. "gopkg.in/src-d/enry.v1"
  21. "gopkg.in/src-d/go-git.v4"
  22. "gopkg.in/src-d/go-git.v4/plumbing"
  23. "gopkg.in/src-d/go-git.v4/plumbing/object"
  24. "gopkg.in/src-d/go-git.v4/utils/ioutil"
  25. "gopkg.in/src-d/go-git.v4/utils/merkletrie"
  26. "gopkg.in/src-d/hercules.v3/pb"
  27. )
// UASTExtractor retrieves UASTs from Babelfish server which correspond to changed files in a commit.
// It is a PipelineItem.
type UASTExtractor struct {
	// Endpoint is the Babelfish server address passed to bblfsh.NewClient().
	Endpoint string
	// Context produces the context (and optional cancel function, which may be nil)
	// used for every parse request.
	Context func() (context.Context, context.CancelFunc)
	// PoolSize is the number of Babelfish clients and pool workers to create.
	// Zero means runtime.NumCPU().
	PoolSize int
	// Languages is the set of language names (as reported by enry) to parse;
	// files in other languages are skipped.
	Languages map[string]bool
	// FailOnErrors makes Consume() return an error if any extraction failed;
	// otherwise the errors are only printed to stderr.
	FailOnErrors bool
	// ProcessedFiles maps a file name to the number of times it was submitted
	// for parsing, or to uastExtractionSkipped if its language is not enabled.
	ProcessedFiles map[string]int

	// clients holds one Babelfish client per pool worker.
	clients []*bblfsh.Client
	// pool runs the extraction tasks.
	pool *tunny.WorkPool
}
const (
	// uastExtractionSkipped is the marker stored in UASTExtractor.ProcessedFiles
	// for files whose detected language is not in UASTExtractor.Languages.
	uastExtractionSkipped = -(1 << 31)
	// ConfigUASTEndpoint is the name of the configuration option (UASTExtractor.Configure())
	// which sets the Babelfish server address.
	ConfigUASTEndpoint = "ConfigUASTEndpoint"
	// ConfigUASTTimeout is the name of the configuration option (UASTExtractor.Configure())
	// which sets the maximum amount of time to wait for a Babelfish server response.
	ConfigUASTTimeout = "ConfigUASTTimeout"
	// ConfigUASTPoolSize is the name of the configuration option (UASTExtractor.Configure())
	// which sets the number of goroutines to run for UAST parse queries.
	ConfigUASTPoolSize = "ConfigUASTPoolSize"
	// ConfigUASTFailOnErrors is the name of the configuration option (UASTExtractor.Configure())
	// which enables early exit in case of any Babelfish UAST parsing errors.
	ConfigUASTFailOnErrors = "ConfigUASTFailOnErrors"
	// ConfigUASTLanguages is the name of the configuration option (UASTExtractor.Configure())
	// which sets the list of languages to parse. Language names are at
	// https://doc.bblf.sh/languages.html Names are joined with a comma ",".
	ConfigUASTLanguages = "ConfigUASTLanguages"
	// FeatureUast is the name of the Pipeline feature which activates all the items related to UAST.
	FeatureUast = "uast"
	// DependencyUasts is the name of the dependency provided by UASTExtractor.
	DependencyUasts = "uasts"
)
// uastTask is the unit of work sent to the worker pool: parse one file and
// store the outcome in the shared containers.
type uastTask struct {
	// Client is the Babelfish client to use; worker.TunnyJob overwrites it
	// with the worker's own client.
	Client *bblfsh.Client
	// Lock is taken by extractTask before it writes to Dest or Errors.
	Lock *sync.RWMutex
	// Dest receives the parsed UAST keyed by the file's hash.
	Dest map[plumbing.Hash]*uast.Node
	// File is the file to parse.
	File *object.File
	// Errors collects extraction failures.
	Errors *[]error
	// Status receives a value when the task has finished, successfully or not.
	Status chan int
}
// worker is a pool worker bound to a single Babelfish client.
type worker struct {
	// Client is injected into every task this worker handles.
	Client *bblfsh.Client
	// Job is the function which actually processes the task.
	Job func(interface{}) interface{}
}
// TunnyReady always reports true.
// NOTE(review): presumably this tells the tunny pool that the worker can accept
// a job at any time - confirm against the tunny documentation.
func (w worker) TunnyReady() bool {
	return true
}
  78. func (w worker) TunnyJob(data interface{}) interface{} {
  79. task := data.(uastTask)
  80. task.Client = w.Client
  81. return w.Job(task)
  82. }
// Name returns the name of this pipeline item, "UAST".
func (exr *UASTExtractor) Name() string {
	return "UAST"
}
  86. func (exr *UASTExtractor) Provides() []string {
  87. arr := [...]string{DependencyUasts}
  88. return arr[:]
  89. }
  90. func (exr *UASTExtractor) Requires() []string {
  91. arr := [...]string{DependencyTreeChanges, DependencyBlobCache}
  92. return arr[:]
  93. }
  94. func (exr *UASTExtractor) Features() []string {
  95. arr := [...]string{FeatureUast}
  96. return arr[:]
  97. }
  98. func (exr *UASTExtractor) ListConfigurationOptions() []ConfigurationOption {
  99. options := [...]ConfigurationOption{{
  100. Name: ConfigUASTEndpoint,
  101. Description: "How many days there are in a single band.",
  102. Flag: "bblfsh",
  103. Type: StringConfigurationOption,
  104. Default: "0.0.0.0:9432"}, {
  105. Name: ConfigUASTTimeout,
  106. Description: "Babelfish's server timeout in seconds.",
  107. Flag: "bblfsh-timeout",
  108. Type: IntConfigurationOption,
  109. Default: 20}, {
  110. Name: ConfigUASTPoolSize,
  111. Description: "Number of goroutines to extract UASTs.",
  112. Flag: "bblfsh-pool-size",
  113. Type: IntConfigurationOption,
  114. Default: runtime.NumCPU()}, {
  115. Name: ConfigUASTFailOnErrors,
  116. Description: "Panic if there is a UAST extraction error.",
  117. Flag: "bblfsh-fail-on-error",
  118. Type: BoolConfigurationOption,
  119. Default: false}, {
  120. Name: ConfigUASTLanguages,
  121. Description: "Programming languages from which to extract UASTs. Separated by comma \",\".",
  122. Flag: "languages",
  123. Type: StringConfigurationOption,
  124. Default: "Python,Java"},
  125. }
  126. return options[:]
  127. }
  128. func (exr *UASTExtractor) Configure(facts map[string]interface{}) {
  129. if val, exists := facts[ConfigUASTEndpoint].(string); exists {
  130. exr.Endpoint = val
  131. }
  132. if val, exists := facts[ConfigUASTTimeout].(int); exists {
  133. exr.Context = func() (context.Context, context.CancelFunc) {
  134. return context.WithTimeout(context.Background(),
  135. time.Duration(val)*time.Second)
  136. }
  137. }
  138. if val, exists := facts[ConfigUASTPoolSize].(int); exists {
  139. exr.PoolSize = val
  140. }
  141. if val, exists := facts[ConfigUASTLanguages].(string); exists {
  142. exr.Languages = map[string]bool{}
  143. for _, lang := range strings.Split(val, ",") {
  144. exr.Languages[strings.TrimSpace(lang)] = true
  145. }
  146. }
  147. if val, exists := facts[ConfigUASTFailOnErrors].(bool); exists {
  148. exr.FailOnErrors = val
  149. }
  150. }
  151. func (exr *UASTExtractor) Initialize(repository *git.Repository) {
  152. if exr.Context == nil {
  153. exr.Context = func() (context.Context, context.CancelFunc) {
  154. return context.Background(), nil
  155. }
  156. }
  157. poolSize := exr.PoolSize
  158. if poolSize == 0 {
  159. poolSize = runtime.NumCPU()
  160. }
  161. var err error
  162. exr.clients = make([]*bblfsh.Client, poolSize)
  163. for i := 0; i < poolSize; i++ {
  164. client, err := bblfsh.NewClient(exr.Endpoint)
  165. if err != nil {
  166. panic(err)
  167. }
  168. exr.clients[i] = client
  169. }
  170. if exr.pool != nil {
  171. exr.pool.Close()
  172. }
  173. workers := make([]tunny.TunnyWorker, poolSize)
  174. for i := 0; i < poolSize; i++ {
  175. workers[i] = worker{Client: exr.clients[i], Job: exr.extractTask}
  176. }
  177. exr.pool, err = tunny.CreateCustomPool(workers).Open()
  178. if err != nil {
  179. panic(err)
  180. }
  181. exr.ProcessedFiles = map[string]int{}
  182. if exr.Languages == nil {
  183. exr.Languages = map[string]bool{}
  184. }
  185. }
  186. func (exr *UASTExtractor) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  187. cache := deps[DependencyBlobCache].(map[plumbing.Hash]*object.Blob)
  188. treeDiffs := deps[DependencyTreeChanges].(object.Changes)
  189. uasts := map[plumbing.Hash]*uast.Node{}
  190. lock := sync.RWMutex{}
  191. errs := make([]error, 0)
  192. status := make(chan int)
  193. pending := 0
  194. submit := func(change *object.Change) {
  195. {
  196. reader, err := cache[change.To.TreeEntry.Hash].Reader()
  197. if err != nil {
  198. errs = append(errs, err)
  199. return
  200. }
  201. defer ioutil.CheckClose(reader, &err)
  202. buf := new(bytes.Buffer)
  203. if _, err := buf.ReadFrom(reader); err != nil {
  204. errs = append(errs, err)
  205. return
  206. }
  207. lang := enry.GetLanguage(change.To.Name, buf.Bytes())
  208. if _, exists := exr.Languages[lang]; !exists {
  209. exr.ProcessedFiles[change.To.Name] = uastExtractionSkipped
  210. return
  211. }
  212. exr.ProcessedFiles[change.To.Name]++
  213. }
  214. pending++
  215. exr.pool.SendWorkAsync(uastTask{
  216. Lock: &lock,
  217. Dest: uasts,
  218. File: &object.File{Name: change.To.Name, Blob: *cache[change.To.TreeEntry.Hash]},
  219. Errors: &errs, Status: status}, nil)
  220. }
  221. for _, change := range treeDiffs {
  222. action, err := change.Action()
  223. if err != nil {
  224. return nil, err
  225. }
  226. switch action {
  227. case merkletrie.Insert:
  228. submit(change)
  229. case merkletrie.Delete:
  230. continue
  231. case merkletrie.Modify:
  232. submit(change)
  233. }
  234. }
  235. for i := 0; i < pending; i++ {
  236. _ = <-status
  237. }
  238. if len(errs) > 0 {
  239. msgs := make([]string, len(errs))
  240. for i, err := range errs {
  241. msgs[i] = err.Error()
  242. }
  243. joined := strings.Join(msgs, "\n")
  244. if exr.FailOnErrors {
  245. return nil, errors.New(joined)
  246. }
  247. fmt.Fprintln(os.Stderr, joined)
  248. }
  249. return map[string]interface{}{DependencyUasts: uasts}, nil
  250. }
  251. func (exr *UASTExtractor) extractUAST(
  252. client *bblfsh.Client, file *object.File) (*uast.Node, error) {
  253. request := client.NewParseRequest()
  254. contents, err := file.Contents()
  255. if err != nil {
  256. return nil, err
  257. }
  258. request.Content(contents)
  259. request.Filename(file.Name)
  260. ctx, cancel := exr.Context()
  261. if cancel != nil {
  262. defer cancel()
  263. }
  264. response, err := request.DoWithContext(ctx)
  265. if err != nil {
  266. if strings.Contains("missing driver", err.Error()) {
  267. return nil, nil
  268. }
  269. return nil, err
  270. }
  271. if response.Status != protocol.Ok {
  272. return nil, errors.New(strings.Join(response.Errors, "\n"))
  273. }
  274. if err != nil {
  275. return nil, err
  276. }
  277. return response.UAST, nil
  278. }
  279. func (exr *UASTExtractor) extractTask(data interface{}) interface{} {
  280. task := data.(uastTask)
  281. defer func() { task.Status <- 0 }()
  282. node, err := exr.extractUAST(task.Client, task.File)
  283. task.Lock.Lock()
  284. defer task.Lock.Unlock()
  285. if err != nil {
  286. *task.Errors = append(*task.Errors, errors.New(task.File.Name+": "+err.Error()))
  287. return nil
  288. }
  289. if node != nil {
  290. task.Dest[task.File.Hash] = node
  291. }
  292. return nil
  293. }
// UASTChange is the type of the items in the list of changes which is provided by UASTChanges.
type UASTChange struct {
	// Before is the UAST of the file prior to the change; nil for insertions
	// or when no UAST was cached for the old blob.
	Before *uast.Node
	// After is the UAST of the file after the change; nil for deletions
	// or when no UAST was extracted for the new blob.
	After *uast.Node
	// Change is the underlying tree change.
	Change *object.Change
}
const (
	// DependencyUastChanges is the name of the dependency provided by UASTChanges.
	// Its value is a []UASTChange.
	DependencyUastChanges = "changed_uasts"
)
// UASTChanges is a structured analog of TreeDiff: it provides UASTs for every logical change
// in a commit. It is a PipelineItem.
type UASTChanges struct {
	// cache remembers the UAST of every blob seen so far, keyed by blob hash,
	// so that later commits can look up the "before" state.
	cache map[plumbing.Hash]*uast.Node
}
// Name returns the name of this pipeline item, "UASTChanges".
func (uc *UASTChanges) Name() string {
	return "UASTChanges"
}
  312. func (uc *UASTChanges) Provides() []string {
  313. arr := [...]string{DependencyUastChanges}
  314. return arr[:]
  315. }
  316. func (uc *UASTChanges) Requires() []string {
  317. arr := [...]string{DependencyUasts, DependencyTreeChanges}
  318. return arr[:]
  319. }
  320. func (uc *UASTChanges) Features() []string {
  321. arr := [...]string{FeatureUast}
  322. return arr[:]
  323. }
// ListConfigurationOptions returns an empty list: UASTChanges has no options.
func (uc *UASTChanges) ListConfigurationOptions() []ConfigurationOption {
	return []ConfigurationOption{}
}
// Configure does nothing: UASTChanges has no configuration options.
func (uc *UASTChanges) Configure(facts map[string]interface{}) {}
  328. func (uc *UASTChanges) Initialize(repository *git.Repository) {
  329. uc.cache = map[plumbing.Hash]*uast.Node{}
  330. }
  331. func (uc *UASTChanges) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  332. uasts := deps[DependencyUasts].(map[plumbing.Hash]*uast.Node)
  333. treeDiffs := deps[DependencyTreeChanges].(object.Changes)
  334. commit := make([]UASTChange, 0, len(treeDiffs))
  335. for _, change := range treeDiffs {
  336. action, err := change.Action()
  337. if err != nil {
  338. return nil, err
  339. }
  340. switch action {
  341. case merkletrie.Insert:
  342. hashTo := change.To.TreeEntry.Hash
  343. uastTo := uasts[hashTo]
  344. commit = append(commit, UASTChange{Before: nil, After: uastTo, Change: change})
  345. uc.cache[hashTo] = uastTo
  346. case merkletrie.Delete:
  347. hashFrom := change.From.TreeEntry.Hash
  348. commit = append(commit, UASTChange{Before: uc.cache[hashFrom], After: nil, Change: change})
  349. delete(uc.cache, hashFrom)
  350. case merkletrie.Modify:
  351. hashFrom := change.From.TreeEntry.Hash
  352. hashTo := change.To.TreeEntry.Hash
  353. uastTo := uasts[hashTo]
  354. commit = append(commit, UASTChange{Before: uc.cache[hashFrom], After: uastTo, Change: change})
  355. delete(uc.cache, hashFrom)
  356. uc.cache[hashTo] = uastTo
  357. }
  358. }
  359. return map[string]interface{}{DependencyUastChanges: commit}, nil
  360. }
// UASTChangesSaver dumps changed files and corresponding UASTs for every commit.
// It is a LeafPipelineItem.
type UASTChangesSaver struct {
	// OutputPath points to the target directory with UASTs
	OutputPath string

	// repository is used to read the source blobs when dumping the files.
	repository *git.Repository
	// result accumulates one list of changes per Consume() call.
	result [][]UASTChange
}
const (
	// ConfigUASTChangesSaverOutputPath is the name of the configuration option
	// (UASTChangesSaver.Configure()) which sets the target directory where the
	// files are saved.
	ConfigUASTChangesSaverOutputPath = "UASTChangesSaver.OutputPath"
)
// Name returns the name of this pipeline item, "UASTChangesSaver".
func (saver *UASTChangesSaver) Name() string {
	return "UASTChangesSaver"
}
// Provides returns an empty list: the saver does not produce any dependency.
func (saver *UASTChangesSaver) Provides() []string {
	return []string{}
}
  380. func (saver *UASTChangesSaver) Requires() []string {
  381. arr := [...]string{DependencyUastChanges}
  382. return arr[:]
  383. }
  384. func (saver *UASTChangesSaver) Features() []string {
  385. arr := [...]string{FeatureUast}
  386. return arr[:]
  387. }
  388. func (saver *UASTChangesSaver) ListConfigurationOptions() []ConfigurationOption {
  389. options := [...]ConfigurationOption{{
  390. Name: ConfigUASTChangesSaverOutputPath,
  391. Description: "The target directory where to store the changed UAST files.",
  392. Flag: "changed-uast-dir",
  393. Type: StringConfigurationOption,
  394. Default: "."},
  395. }
  396. return options[:]
  397. }
// Flag returns "dump-uast-changes".
// NOTE(review): presumably this is the command line flag which enables the
// item - confirm against the LeafPipelineItem contract.
func (saver *UASTChangesSaver) Flag() string {
	return "dump-uast-changes"
}
  401. func (saver *UASTChangesSaver) Configure(facts map[string]interface{}) {
  402. if val, exists := facts[ConfigUASTChangesSaverOutputPath]; exists {
  403. saver.OutputPath = val.(string)
  404. }
  405. }
// Initialize remembers the repository, which dumpFiles later uses to load the
// source blobs, and clears the accumulated results.
func (saver *UASTChangesSaver) Initialize(repository *git.Repository) {
	saver.repository = repository
	saver.result = [][]UASTChange{}
}
  410. func (saver *UASTChangesSaver) Consume(deps map[string]interface{}) (map[string]interface{}, error) {
  411. changes := deps[DependencyUastChanges].([]UASTChange)
  412. saver.result = append(saver.result, changes)
  413. return nil, nil
  414. }
// Finalize returns the accumulated results: a [][]UASTChange with one inner
// slice per Consume() call.
func (saver *UASTChangesSaver) Finalize() interface{} {
	return saver.result
}
  418. func (saver *UASTChangesSaver) Serialize(result interface{}, binary bool, writer io.Writer) error {
  419. saverResult := result.([][]UASTChange)
  420. fileNames := saver.dumpFiles(saverResult)
  421. if binary {
  422. return saver.serializeBinary(fileNames, writer)
  423. }
  424. saver.serializeText(fileNames, writer)
  425. return nil
  426. }
  427. func (saver *UASTChangesSaver) dumpFiles(result [][]UASTChange) []*pb.UASTChange {
  428. fileNames := []*pb.UASTChange{}
  429. for i, changes := range result {
  430. for j, change := range changes {
  431. if change.Before == nil || change.After == nil {
  432. continue
  433. }
  434. record := &pb.UASTChange{FileName: change.Change.To.Name}
  435. bs, _ := change.Before.Marshal()
  436. record.UastBefore = path.Join(saver.OutputPath, fmt.Sprintf(
  437. "%d_%d_before_%s.pb", i, j, change.Change.From.TreeEntry.Hash.String()))
  438. goioutil.WriteFile(record.UastBefore, bs, 0666)
  439. blob, _ := saver.repository.BlobObject(change.Change.From.TreeEntry.Hash)
  440. s, _ := (&object.File{Blob: *blob}).Contents()
  441. record.SrcBefore = path.Join(saver.OutputPath, fmt.Sprintf(
  442. "%d_%d_before_%s.src", i, j, change.Change.From.TreeEntry.Hash.String()))
  443. goioutil.WriteFile(record.SrcBefore, []byte(s), 0666)
  444. bs, _ = change.After.Marshal()
  445. record.UastAfter = path.Join(saver.OutputPath, fmt.Sprintf(
  446. "%d_%d_after_%s.pb", i, j, change.Change.To.TreeEntry.Hash.String()))
  447. goioutil.WriteFile(record.UastAfter, bs, 0666)
  448. blob, _ = saver.repository.BlobObject(change.Change.To.TreeEntry.Hash)
  449. s, _ = (&object.File{Blob: *blob}).Contents()
  450. record.SrcAfter = path.Join(saver.OutputPath, fmt.Sprintf(
  451. "%d_%d_after_%s.src", i, j, change.Change.To.TreeEntry.Hash.String()))
  452. goioutil.WriteFile(record.SrcAfter, []byte(s), 0666)
  453. fileNames = append(fileNames, record)
  454. }
  455. }
  456. return fileNames
  457. }
  458. func (saver *UASTChangesSaver) serializeText(result []*pb.UASTChange, writer io.Writer) {
  459. for _, sc := range result {
  460. kv := [...]string{
  461. "file: " + sc.FileName,
  462. "src0: " + sc.SrcBefore, "src1: " + sc.SrcAfter,
  463. "uast0: " + sc.UastBefore, "uast1: " + sc.UastAfter,
  464. }
  465. fmt.Fprintf(writer, " - {%s}\n", strings.Join(kv[:], ", "))
  466. }
  467. }
  468. func (saver *UASTChangesSaver) serializeBinary(result []*pb.UASTChange, writer io.Writer) error {
  469. message := pb.UASTChangesSaverResults{Changes: result}
  470. serialized, err := proto.Marshal(&message)
  471. if err != nil {
  472. return err
  473. }
  474. writer.Write(serialized)
  475. return nil
  476. }
// init registers the three UAST pipeline items defined in this file with Registry.
func init() {
	Registry.Register(&UASTExtractor{})
	Registry.Register(&UASTChanges{})
	Registry.Register(&UASTChangesSaver{})
}
  481. }